In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from functools import lru_cache as memoize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix

In [2]:
data = pd.read_csv('/Users/camillebustalinio/Desktop/Camille/term 2 project/airbnb data/final_data.csv')

In [3]:
#set X,y
comments_SA = data[(data.language=='en')].comments
tags_SA = data[(data.language=='en')].price_tag
#tags_SA_list = ["pos" if tag == 'exp' else 'neg' for tag in tags_SA]
comments_SA.head(), tags_SA.head()

(0    Host was excellent and was contactable / respo...
 1    The place was clean and spacious and the guy t...
 2    This place wasn't as pleasant as we hoped. Had...
 4    Nice room, cool neighbourhood. Note that this ...
 5    The listing was accurate and the location was ...
 Name: comments, dtype: object, 0    neutral
 1        exp
 2        exp
 4        exp
 5        exp
 Name: price_tag, dtype: object)

In [4]:
@memoize(maxsize=128)
def negate_sequence(text):
    from nltk.tokenize import word_tokenize
    negation = False
    delims = "?.,!:;"
    result = []
    words = word_tokenize(text)
    for word in words:
        stripped = word.strip(delims).lower()
        negated = "not_" + stripped if negation else stripped
        result.append(negated)

        if word in ("not", "n't", "no"):
            negation = not negation
        
        if any(c in word for c in delims):
            negation = False

    return result

In [5]:
train_data, test_data, train_target, test_target = train_test_split(comments_SA,tags_SA)

In [6]:
len(train_data), len(test_data)

(121485, 40496)

In [7]:
vectorizer_count = CountVectorizer(tokenizer=negate_sequence,
                                binary=True)
vectorized_count_train_data = vectorizer_count.fit_transform(train_data)

In [8]:
clf = MultinomialNB()
# Call fit method
clf.fit(vectorized_count_train_data, train_target)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [140]:
predict_test = list(clf.predict(vectorizer_count.transform(test_data)))
predict_test[:5]

['cheap', 'neutral', 'neutral', 'neutral', 'neutral']

In [None]:
predict_test[:5], test_target[:5]

In [None]:
vectorized_count_test_data = vectorizer_count.transform(test_data)
clf.score(vectorized_count_test_data, test_target)

In [None]:
sum(test_target==predict_test)

In [142]:
print("Accuracy:", sum(test_target==predict_test)/len(predict_test))
print("Precision:", precision_score(test_target, predict_test, average="macro") )
print("Recall:", recall_score(test_target, predict_test, average="macro") )
print("F1:", f1_score(test_target, predict_test, average="macro") )


Accuracy: 0.737628407744
Precision: 0.704330535417
Recall: 0.468873177603
F1: 0.492312840675


In [143]:
#weighted metrics, since groups are not balanced
print("Accuracy:", sum(test_target==predict_test)/len(predict_test))
print("Precision:", precision_score(test_target, predict_test, average="weighted") )
print("Recall:", recall_score(test_target, predict_test, average="weighted") )
print("F1:", f1_score(test_target, predict_test, average="weighted") )


Accuracy: 0.737628407744
Precision: 0.725852247656
Recall: 0.737628407744
F1: 0.70254498592


In [144]:
print("Rows: Actual, Columns: Predicted")
print('cheap', 'neutral', 'exp')
confusion_matrix(test_target, predict_test, labels = ['cheap', 'neutral', 'exp'])

Rows: Actual, Columns: Predicted
cheap neutral exp


array([[ 4260,  6491,    21],
       [ 1937, 25417,    65],
       [  168,  1943,   194]])