In [2]:
import numpy as np
import pickle
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.model_selection import train_test_split
from gensim.sklearn_api import W2VTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.naive_bayes import BernoulliNB

  from numpy.core.umath_tests import inner1d


# Alberta

In [3]:
ab_df = pickle.load(open('post_eda/eda_ab.p', 'rb'))

In [9]:
word_list = list(ab_df['text'])
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(word_list)
matrix_ab = pd.DataFrame(vectorized.toarray(), columns=vectorizer.get_feature_names())
X_train, X_test, y_train, y_test = train_test_split(matrix_ab, ab_df.review_rating, test_size=0.33)

rf = RandomForestClassifier(n_estimators=16, verbose=True)
rf.fit(X_train, y_train)

y_hat_train = rf.predict(X_train)
y_hat_test = rf.predict(X_test)

acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    2.4s finished
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Done  16 out of  16 | elapsed:    1.2s finished


46.75

In [10]:
print('Decision Tree:\n 1. train 2. test')
print(classification_report(y_train, y_hat_train), 
      classification_report(y_test, y_hat_test), 
      sep='\n-------------------------------------------------------\n')

Decision Tree:
 1. train 2. test
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00      9361
        2.0       1.00      1.00      1.00      5816
        3.0       1.00      1.00      1.00      8727
        4.0       1.00      1.00      1.00     18445
        5.0       1.00      1.00      1.00     21301

avg / total       1.00      1.00      1.00     63650

-------------------------------------------------------
             precision    recall  f1-score   support

        1.0       0.57      0.58      0.57      4565
        2.0       0.26      0.06      0.09      2860
        3.0       0.31      0.12      0.18      4297
        4.0       0.40      0.47      0.43      9055
        5.0       0.52      0.67      0.58     10573

avg / total       0.44      0.47      0.44     31350



In [None]:
y_score = random_forest.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:,1])

print(auc(fpr,tpr), threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(auc(fpr, tpr)))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

In [11]:
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_hat_train = bnb.predict(X_train)
y_hat_test = bnb.predict(X_test)
acc_bnb = round(bnb.score(X_train, y_train) * 100, 2)
acc_bnb

53.75

In [12]:
print('Naive Bayes:\n 1. train 2. test')
print(classification_report(y_train, y_hat_train), 
      classification_report(y_test, y_hat_test), 
      sep='\n-------------------------------------------------------\n')

Naive Bayes:
 1. train 2. test
             precision    recall  f1-score   support

        1.0       0.66      0.57      0.61      9361
        2.0       0.68      0.18      0.29      5816
        3.0       0.60      0.28      0.39      8727
        4.0       0.54      0.47      0.51     18445
        5.0       0.49      0.78      0.60     21301

avg / total       0.56      0.54      0.52     63650

-------------------------------------------------------
             precision    recall  f1-score   support

        1.0       0.60      0.53      0.56      4565
        2.0       0.32      0.07      0.12      2860
        3.0       0.33      0.17      0.22      4297
        4.0       0.43      0.37      0.40      9055
        5.0       0.46      0.73      0.56     10573

avg / total       0.44      0.46      0.43     31350



In [None]:
y_score = gaussian.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:,1])

print(auc(fpr,tpr), threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(auc(fpr, tpr)))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

# Nevada

In [2]:
nv_df = pickle.load(open('post_eda/eda_nv.p', 'rb'))

In [8]:
nv_df.shape

(95000, 12)

In [14]:
word_list = list(nv_df['text'])

vectorized = vectorizer.fit_transform(word_list)
data_matrix = pd.DataFrame(vectorized.toarray(), columns=vectorizer.get_feature_names())
X_train, X_test, y_train, y_test = train_test_split(data_matrix, nv_df.review_rating, test_size=0.33)

rf = RandomForestClassifier(n_estimators=16, verbose=True)
rf.fit(X_train, y_train)

y_hat_train = rf.predict(X_train)
y_hat_test = rf.predict(X_test)
acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

[Parallel(n_jobs=1)]: Done  32 out of  32 | elapsed: 22.6min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=32, n_jobs=1,
            oob_score=False, random_state=None, verbose=True,
            warm_start=False)

In [17]:
print('Random Forest:\n 1. train 2. test')
print(classification_report(y_train, y_hat_train), 
      classification_report(y_test, y_hat_test), 
      sep='\n-------------------------------------------------------\n')

Decision Tree:
 1. train 2. test
             precision    recall  f1-score   support

        1.0       1.00      1.00      1.00      9634
        2.0       1.00      1.00      1.00      4774
        3.0       1.00      1.00      1.00      6534
        4.0       1.00      1.00      1.00     12574
        5.0       1.00      1.00      1.00     30134

avg / total       1.00      1.00      1.00     63650

-------------------------------------------------------
             precision    recall  f1-score   support

        1.0       0.65      0.53      0.59      4610
        2.0       0.35      0.02      0.03      2555
        3.0       0.38      0.05      0.09      3191
        4.0       0.34      0.13      0.19      6312
        5.0       0.57      0.95      0.71     14682

avg / total       0.50      0.56      0.47     31350



In [None]:
y_score = random_forest.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:,1])

print(auc(fpr,tpr), threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(auc(fpr, tpr)))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks([i/20.0 for i in range(21)])a
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

In [18]:
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
y_hat_train = bnb.predict(X_train)
y_hat_test = bnb.predict(X_test)
acc_bnb = round(bnb.score(X_train, y_train) * 100, 2)
acc_bnb

59.43

In [19]:
print('Naive Bayes:\n 1. train 2. test')
print(classification_report(y_train, y_hat_train), 
      classification_report(y_test, y_hat_test), 
      sep='\n-------------------------------------------------------\n')

Decision Tree:
 1. train 2. test
             precision    recall  f1-score   support

        1.0       0.63      0.56      0.59      9634
        2.0       0.60      0.17      0.26      4774
        3.0       0.58      0.23      0.33      6534
        4.0       0.48      0.39      0.43     12574
        5.0       0.62      0.84      0.71     30134

avg / total       0.59      0.59      0.56     63650

-------------------------------------------------------
             precision    recall  f1-score   support

        1.0       0.57      0.52      0.54      4610
        2.0       0.24      0.05      0.09      2555
        3.0       0.26      0.11      0.16      3191
        4.0       0.37      0.29      0.32      6312
        5.0       0.59      0.82      0.69     14682

avg / total       0.48      0.53      0.49     31350



In [None]:
y_score = gaussian.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:,1])

print(auc(fpr,tpr), threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(auc(fpr, tpr)))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
#