In [1]:
import numpy as np
import pickle
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.model_selection import train_test_split
from gensim.sklearn_api import W2VTransformer

  from numpy.core.umath_tests import inner1d


In [2]:
nv_df = pickle.load(open('data_clean/cleaned_nv.p', 'rb'))

In [3]:
nv_df.reset_index(inplace=True)
nv_df.drop(['state', 'user_id'], axis=1, inplace=True)

In [4]:
nv_df

Unnamed: 0,text,stars_x
0,ive been rooting for this place since it opene...,2.0
1,my partner i arrived first time customers the...,5.0
2,stayed here during new years stayed on the fl...,2.0
3,visited the pharmacy a few times in the last c...,5.0
4,first of all this is one massive strip club th...,5.0
...,...,...
999995,very great hotel overall the staff knows how t...,4.0
999996,with yelps off for iphone screen repair i w...,5.0
999997,i toured over different location and this was...,5.0
999998,this place can get packed drinks are great and...,4.0


In [22]:
total_vocabulary = set(word for text in nv_df.text for word in text)

In [51]:
glove = {}
with open('datasets/glove.6B.100d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in total_vocabulary:
            vector = np.array(parts[1:], dtype=np.float32)
            glove[word] = vector

In [52]:
class W2vVectorizer(object):
    
    def __init__(self, w2v):
        # Takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            self.dimensions = len(w2v[next(iter(glove))])
    
    # Note: Even though it doesn't do anything, it's required that this object implement a fit method or else
    # it can't be used in a scikit-learn pipeline  
    def fit(self, X, y):
        return self
            
    def transform(self, X):
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [54]:
rf =  Pipeline([('Word2Vec Vectorizer', W2vVectorizer(glove)),
              ('Random Forest', RandomForestClassifier(n_estimators=100, verbose=True))])
lr = Pipeline([('Word2Vec Vectorizer', W2vVectorizer(glove)),
                ('Logistic Regression', LogisticRegression())])
nb = Pipeline([('W2V Vec', W2vVectorizer(glove)), ('NB Bernoulli', BernoulliNB())])

models = [('Random Forest', rf),
         ('Logistic Regression', lr),
         ('NB Bernoulli', nb)]

scores = [(name, cross_val_score(model, X_train, y_train, cv=2).mean()) for name, model, in models]
scores

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  9.1min finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   14.1s finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  9.1min finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   14.2s finished


[('Random Forest', 0.489052238101637),
 ('Logistic Regression', 0.47854477876510737),
 ('NB Bernoulli', 0.4664149254254013)]

In [65]:
nv_df['text1'] = nv_df.text.astype('str')

In [66]:
nv_df

Unnamed: 0,text,stars_x,text1
0,ive been rooting for this place since it opene...,2.0,ive been rooting for this place since it opene...
1,my partner i arrived first time customers the...,5.0,my partner i arrived first time customers the...
2,stayed here during new years stayed on the fl...,2.0,stayed here during new years stayed on the fl...
3,visited the pharmacy a few times in the last c...,5.0,visited the pharmacy a few times in the last c...
4,first of all this is one massive strip club th...,5.0,first of all this is one massive strip club th...
...,...,...,...
999995,very great hotel overall the staff knows how t...,4.0,very great hotel overall the staff knows how t...
999996,with yelps off for iphone screen repair i w...,5.0,with yelps off for iphone screen repair i w...
999997,i toured over different location and this was...,5.0,i toured over different location and this was...
999998,this place can get packed drinks are great and...,4.0,this place can get packed drinks are great and...


In [15]:
vectorizer = CountVectorizer()

In [43]:
word_list = list(nv_df['text'])[0:100000]

In [45]:
vectorized = vectorizer.fit_transform(word_list)
data_matrix = pd.DataFrame(vectorized.toarray(), columns=vectorizer.get_feature_names())
X_train, X_test, y_train, y_test = train_test_split(data_matrix, nv_df.stars_x[0:100000], test_size=0.33)

In [46]:
rf = RandomForestClassifier(n_estimators=32, verbose=True)
rf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed: 23.8min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=32, n_jobs=1,
            oob_score=False, random_state=None, verbose=True,
            warm_start=False)

In [47]:
y_hat_train = rf.predict(X_train)
y_hat_test = rf.predict(X_test)
acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:    4.7s finished
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:    2.5s finished
[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed:    2.5s finished


55.92

In [48]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [49]:
print('Decision Tree:\n 1. train 2. test')
print(classification_report(y_train, y_hat_train), 
      classification_report(y_test, y_hat_test), 
      sep='\n-------------------------------------------------------\n')

Decision Tree:
 1. train 2. test
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00     10029
        2.0       1.00      1.00      1.00      5046
        3.0       1.00      1.00      1.00      6988
        4.0       1.00      1.00      1.00     13582
        5.0       1.00      1.00      1.00     31355

avg / total       1.00      1.00      1.00     67000

-------------------------------------------------------
             precision    recall  f1-score   support

        1.0       0.65      0.55      0.60      5063
        2.0       0.29      0.02      0.03      2562
        3.0       0.36      0.05      0.09      3423
        4.0       0.35      0.16      0.22      6682
        5.0       0.57      0.94      0.71     15270

avg / total       0.50      0.56      0.48     33000



In [50]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_hat_train = bnb.predict(X_train)
y_hat_test = bnb.predict(X_test)
acc_bnb = round(bnb.score(X_train, y_train) * 100, 2)
acc_bnb

59.06

In [51]:
print('Decision Tree:\n 1. train 2. test')
print(classification_report(y_train, y_hat_train), 
      classification_report(y_test, y_hat_test), 
      sep='\n-------------------------------------------------------\n')

Decision Tree:
 1. train 2. test
             precision    recall  f1-score   support

        1.0       0.62      0.55      0.58     10029
        2.0       0.60      0.15      0.24      5046
        3.0       0.57      0.24      0.33      6988
        4.0       0.48      0.40      0.44     13582
        5.0       0.61      0.84      0.71     31355

avg / total       0.58      0.59      0.56     67000

-------------------------------------------------------
             precision    recall  f1-score   support

        1.0       0.58      0.52      0.55      5063
        2.0       0.25      0.05      0.09      2562
        3.0       0.30      0.14      0.19      3423
        4.0       0.38      0.31      0.34      6682
        5.0       0.60      0.81      0.69     15270

avg / total       0.49      0.54      0.50     33000

