In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.parsing.preprocessing import remove_stopwords
from gensim.sklearn_api import W2VTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

  from numpy.core.umath_tests import inner1d


# Nevada

In [None]:
# Load the pickled Nevada frame, build bag-of-words counts, and fit a random forest.
RANDOM_STATE = 42  # fixed seed so the split and the forest are reproducible on re-run

# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('post_eda/eda_nv.p', 'rb') as pickle_file:  # context manager closes the handle
    nv_df = pickle.load(pickle_file)
word_list = list(nv_df['text'])

vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(word_list)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE(review): toarray() densifies the count matrix; memory grows with rows x vocabulary.
data_matrix = pd.DataFrame(vectorized.toarray(), columns=feature_names)
X_train, X_test, y_train, y_test = train_test_split(
    data_matrix, nv_df.review_rating, test_size=0.33, random_state=RANDOM_STATE)

rf = RandomForestClassifier(n_estimators=16, verbose=True, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Predictions on both splits are reused by the classification-report cell.
y_hat_train = rf.predict(X_train)
y_hat_test = rf.predict(X_test)
# Held-out accuracy as a percentage, shown as the cell output.
acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

In [None]:
# Random forest per-class metrics: training split first, then the held-out split.
print('Random Forest:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the random forest on the held-out split.
# NOTE(review): roc_curve expects binary labels and column 1 of predict_proba is used as
# the positive-class score -- confirm review_rating has exactly two classes.
y_score = rf.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)  # computed once and reused by both prints

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Ticks every 0.05 on both axes.
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

In [None]:
# Bernoulli naive Bayes baseline, fit on the same split as the random forest.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
# These overwrite the forest's predictions; the report cell below reads them.
y_hat_train = bnb.predict(X_train)
y_hat_test = bnb.predict(X_test)
# Score on the held-out split so acc_bnb is comparable with acc_random_forest,
# which is also measured on X_test (training accuracy would be optimistic).
acc_bnb = round(bnb.score(X_test, y_test) * 100, 2)
acc_bnb

In [None]:
# Naive Bayes per-class metrics: training split first, then the held-out split.
print('Naive Bayes:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the Bernoulli naive Bayes model on the held-out split.
y_score = bnb.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))

lw = 2
ticks = [step / 20.0 for step in range(21)]  # 0.00, 0.05, ..., 1.00

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks(ticks)
plt.xticks(ticks)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

# Arizona

In [None]:
# Load the pickled Arizona frame, build bag-of-words counts, and fit a random forest.
RANDOM_STATE = 42  # fixed seed so the split and the forest are reproducible on re-run

# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('post_eda/eda_az.p', 'rb') as pickle_file:  # context manager closes the handle
    az_df = pickle.load(pickle_file)
word_list = list(az_df['text'])

# Fresh vectorizer so this cell does not depend on the Nevada cell having been run.
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(word_list)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE(review): toarray() densifies the count matrix; memory grows with rows x vocabulary.
data_matrix = pd.DataFrame(vectorized.toarray(), columns=feature_names)
X_train, X_test, y_train, y_test = train_test_split(
    data_matrix, az_df.review_rating, test_size=0.33, random_state=RANDOM_STATE)

rf = RandomForestClassifier(n_estimators=16, verbose=True, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Predictions on both splits are reused by the classification-report cell.
y_hat_train = rf.predict(X_train)
y_hat_test = rf.predict(X_test)
# Held-out accuracy as a percentage, shown as the cell output.
acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

In [None]:
# Random forest per-class metrics: training split first, then the held-out split.
print('Random Forest:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the random forest on the held-out split.
# NOTE(review): roc_curve expects binary labels and column 1 of predict_proba is used as
# the positive-class score -- confirm review_rating has exactly two classes.
y_score = rf.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)  # computed once and reused by both prints

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Ticks every 0.05 on both axes.
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

In [None]:
# Bernoulli naive Bayes baseline, fit on the same split as the random forest.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
# These overwrite the forest's predictions; the report cell below reads them.
y_hat_train = bnb.predict(X_train)
y_hat_test = bnb.predict(X_test)
# Score on the held-out split so acc_bnb is comparable with acc_random_forest,
# which is also measured on X_test (training accuracy would be optimistic).
acc_bnb = round(bnb.score(X_test, y_test) * 100, 2)
acc_bnb

In [None]:
# Naive Bayes per-class metrics: training split first, then the held-out split.
print('Naive Bayes:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the Bernoulli naive Bayes model on the held-out split.
y_score = bnb.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))

lw = 2
ticks = [step / 20.0 for step in range(21)]  # 0.00, 0.05, ..., 1.00

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks(ticks)
plt.xticks(ticks)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

# North Carolina

In [None]:
# Load the pickled North Carolina frame, build bag-of-words counts, and fit a random forest.
RANDOM_STATE = 42  # fixed seed so the split and the forest are reproducible on re-run

# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('post_eda/eda_nc.p', 'rb') as pickle_file:  # context manager closes the handle
    nc_df = pickle.load(pickle_file)
word_list = list(nc_df['text'])

# Fresh vectorizer so this cell does not depend on the Nevada cell having been run.
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(word_list)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE(review): toarray() densifies the count matrix; memory grows with rows x vocabulary.
data_matrix = pd.DataFrame(vectorized.toarray(), columns=feature_names)
X_train, X_test, y_train, y_test = train_test_split(
    data_matrix, nc_df.review_rating, test_size=0.33, random_state=RANDOM_STATE)

rf = RandomForestClassifier(n_estimators=16, verbose=True, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Predictions on both splits are reused by the classification-report cell.
y_hat_train = rf.predict(X_train)
y_hat_test = rf.predict(X_test)
# Held-out accuracy as a percentage, shown as the cell output.
acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

In [None]:
# Random forest per-class metrics: training split first, then the held-out split.
print('Random Forest:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the random forest on the held-out split.
# NOTE(review): roc_curve expects binary labels and column 1 of predict_proba is used as
# the positive-class score -- confirm review_rating has exactly two classes.
y_score = rf.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)  # computed once and reused by both prints

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Ticks every 0.05 on both axes.
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

In [None]:
# Bernoulli naive Bayes baseline, fit on the same split as the random forest.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
# These overwrite the forest's predictions; the report cell below reads them.
y_hat_train = bnb.predict(X_train)
y_hat_test = bnb.predict(X_test)
# Score on the held-out split so acc_bnb is comparable with acc_random_forest,
# which is also measured on X_test (training accuracy would be optimistic).
acc_bnb = round(bnb.score(X_test, y_test) * 100, 2)
acc_bnb

In [None]:
# Naive Bayes per-class metrics: training split first, then the held-out split.
print('Naive Bayes:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the Bernoulli naive Bayes model on the held-out split.
y_score = bnb.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))

lw = 2
ticks = [step / 20.0 for step in range(21)]  # 0.00, 0.05, ..., 1.00

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks(ticks)
plt.xticks(ticks)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

# Ohio

In [None]:
# Load the pickled Ohio frame, build bag-of-words counts, and fit a random forest.
RANDOM_STATE = 42  # fixed seed so the split and the forest are reproducible on re-run

# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('post_eda/eda_oh.p', 'rb') as pickle_file:  # context manager closes the handle
    oh_df = pickle.load(pickle_file)
word_list = list(oh_df['text'])

# Fresh vectorizer so this cell does not depend on the Nevada cell having been run.
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(word_list)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE(review): toarray() densifies the count matrix; memory grows with rows x vocabulary.
data_matrix = pd.DataFrame(vectorized.toarray(), columns=feature_names)
X_train, X_test, y_train, y_test = train_test_split(
    data_matrix, oh_df.review_rating, test_size=0.33, random_state=RANDOM_STATE)

rf = RandomForestClassifier(n_estimators=16, verbose=True, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Predictions on both splits are reused by the classification-report cell.
y_hat_train = rf.predict(X_train)
y_hat_test = rf.predict(X_test)
# Held-out accuracy as a percentage, shown as the cell output.
acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

In [None]:
# Random forest per-class metrics: training split first, then the held-out split.
print('Random Forest:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the random forest on the held-out split.
# NOTE(review): roc_curve expects binary labels and column 1 of predict_proba is used as
# the positive-class score -- confirm review_rating has exactly two classes.
y_score = rf.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)  # computed once and reused by both prints

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Ticks every 0.05 on both axes.
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

In [None]:
# Bernoulli naive Bayes baseline, fit on the same split as the random forest.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
# These overwrite the forest's predictions; the report cell below reads them.
y_hat_train = bnb.predict(X_train)
y_hat_test = bnb.predict(X_test)
# Score on the held-out split so acc_bnb is comparable with acc_random_forest,
# which is also measured on X_test (training accuracy would be optimistic).
acc_bnb = round(bnb.score(X_test, y_test) * 100, 2)
acc_bnb

In [None]:
# Naive Bayes per-class metrics: training split first, then the held-out split.
print('Naive Bayes:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the Bernoulli naive Bayes model on the held-out split.
y_score = bnb.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))

lw = 2
ticks = [step / 20.0 for step in range(21)]  # 0.00, 0.05, ..., 1.00

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks(ticks)
plt.xticks(ticks)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

# Ontario

In [None]:
# Load the pickled Ontario frame, build bag-of-words counts, and fit a random forest.
RANDOM_STATE = 42  # fixed seed so the split and the forest are reproducible on re-run

# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('post_eda/eda_on.p', 'rb') as pickle_file:  # context manager closes the handle
    on_df = pickle.load(pickle_file)
word_list = list(on_df['text'])

# Fresh vectorizer so this cell does not depend on the Nevada cell having been run.
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(word_list)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE(review): toarray() densifies the count matrix; memory grows with rows x vocabulary.
data_matrix = pd.DataFrame(vectorized.toarray(), columns=feature_names)
X_train, X_test, y_train, y_test = train_test_split(
    data_matrix, on_df.review_rating, test_size=0.33, random_state=RANDOM_STATE)

rf = RandomForestClassifier(n_estimators=16, verbose=True, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Predictions on both splits are reused by the classification-report cell.
y_hat_train = rf.predict(X_train)
y_hat_test = rf.predict(X_test)
# Held-out accuracy as a percentage, shown as the cell output.
acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

In [None]:
# Random forest per-class metrics: training split first, then the held-out split.
print('Random Forest:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the random forest on the held-out split.
# NOTE(review): roc_curve expects binary labels and column 1 of predict_proba is used as
# the positive-class score -- confirm review_rating has exactly two classes.
y_score = rf.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)  # computed once and reused by both prints

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Ticks every 0.05 on both axes.
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

In [None]:
# Bernoulli naive Bayes baseline, fit on the same split as the random forest.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
# These overwrite the forest's predictions; the report cell below reads them.
y_hat_train = bnb.predict(X_train)
y_hat_test = bnb.predict(X_test)
# Score on the held-out split so acc_bnb is comparable with acc_random_forest,
# which is also measured on X_test (training accuracy would be optimistic).
acc_bnb = round(bnb.score(X_test, y_test) * 100, 2)
acc_bnb

In [None]:
# Naive Bayes per-class metrics: training split first, then the held-out split.
print('Naive Bayes:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the Bernoulli naive Bayes model on the held-out split.
y_score = bnb.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))

lw = 2
ticks = [step / 20.0 for step in range(21)]  # 0.00, 0.05, ..., 1.00

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks(ticks)
plt.xticks(ticks)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

# Pennsylvania

In [None]:
# Load the pickled Pennsylvania frame, build bag-of-words counts, and fit a random forest.
RANDOM_STATE = 42  # fixed seed so the split and the forest are reproducible on re-run

# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('post_eda/eda_pa.p', 'rb') as pickle_file:  # context manager closes the handle
    pa_df = pickle.load(pickle_file)
word_list = list(pa_df['text'])

# Fresh vectorizer so this cell does not depend on the Nevada cell having been run.
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(word_list)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE(review): toarray() densifies the count matrix; memory grows with rows x vocabulary.
data_matrix = pd.DataFrame(vectorized.toarray(), columns=feature_names)
X_train, X_test, y_train, y_test = train_test_split(
    data_matrix, pa_df.review_rating, test_size=0.33, random_state=RANDOM_STATE)

rf = RandomForestClassifier(n_estimators=16, verbose=True, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Predictions on both splits are reused by the classification-report cell.
y_hat_train = rf.predict(X_train)
y_hat_test = rf.predict(X_test)
# Held-out accuracy as a percentage, shown as the cell output.
acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

In [None]:
# Random forest per-class metrics: training split first, then the held-out split.
print('Random Forest:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the random forest on the held-out split.
# NOTE(review): roc_curve expects binary labels and column 1 of predict_proba is used as
# the positive-class score -- confirm review_rating has exactly two classes.
y_score = rf.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)  # computed once and reused by both prints

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Ticks every 0.05 on both axes.
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

In [None]:
# Bernoulli naive Bayes baseline, fit on the same split as the random forest.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
# These overwrite the forest's predictions; the report cell below reads them.
y_hat_train = bnb.predict(X_train)
y_hat_test = bnb.predict(X_test)
# Score on the held-out split so acc_bnb is comparable with acc_random_forest,
# which is also measured on X_test (training accuracy would be optimistic).
acc_bnb = round(bnb.score(X_test, y_test) * 100, 2)
acc_bnb

In [None]:
# Naive Bayes per-class metrics: training split first, then the held-out split.
print('Naive Bayes:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the Bernoulli naive Bayes model on the held-out split.
y_score = bnb.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))

lw = 2
ticks = [step / 20.0 for step in range(21)]  # 0.00, 0.05, ..., 1.00

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks(ticks)
plt.xticks(ticks)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

# Quebec

In [None]:
# Load the pickled Quebec frame, build bag-of-words counts, and fit a random forest.
RANDOM_STATE = 42  # fixed seed so the split and the forest are reproducible on re-run

# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('post_eda/eda_qc.p', 'rb') as pickle_file:  # context manager closes the handle
    qc_df = pickle.load(pickle_file)
word_list = list(qc_df['text'])

# Fresh vectorizer so this cell does not depend on the Nevada cell having been run.
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(word_list)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE(review): toarray() densifies the count matrix; memory grows with rows x vocabulary.
data_matrix = pd.DataFrame(vectorized.toarray(), columns=feature_names)
X_train, X_test, y_train, y_test = train_test_split(
    data_matrix, qc_df.review_rating, test_size=0.33, random_state=RANDOM_STATE)

rf = RandomForestClassifier(n_estimators=16, verbose=True, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Predictions on both splits are reused by the classification-report cell.
y_hat_train = rf.predict(X_train)
y_hat_test = rf.predict(X_test)
# Held-out accuracy as a percentage, shown as the cell output.
acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

In [None]:
# Random forest per-class metrics: training split first, then the held-out split.
print('Random Forest:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the random forest on the held-out split.
# NOTE(review): roc_curve expects binary labels and column 1 of predict_proba is used as
# the positive-class score -- confirm review_rating has exactly two classes.
y_score = rf.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)  # computed once and reused by both prints

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Ticks every 0.05 on both axes.
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

In [None]:
# Bernoulli naive Bayes baseline, fit on the same split as the random forest.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
# These overwrite the forest's predictions; the report cell below reads them.
y_hat_train = bnb.predict(X_train)
y_hat_test = bnb.predict(X_test)
# Score on the held-out split so acc_bnb is comparable with acc_random_forest,
# which is also measured on X_test (training accuracy would be optimistic).
acc_bnb = round(bnb.score(X_test, y_test) * 100, 2)
acc_bnb

In [None]:
# Naive Bayes per-class metrics: training split first, then the held-out split.
print('Naive Bayes:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the Bernoulli naive Bayes model on the held-out split.
y_score = bnb.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))

lw = 2
ticks = [step / 20.0 for step in range(21)]  # 0.00, 0.05, ..., 1.00

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks(ticks)
plt.xticks(ticks)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

# Wisconsin

In [None]:
# Load the pickled Wisconsin frame, build bag-of-words counts, and fit a random forest.
RANDOM_STATE = 42  # fixed seed so the split and the forest are reproducible on re-run

# NOTE(review): pickle.load can execute arbitrary code; only load trusted artifacts.
with open('post_eda/eda_wi.p', 'rb') as pickle_file:  # context manager closes the handle
    wi_df = pickle.load(pickle_file)
word_list = list(wi_df['text'])

# Fresh vectorizer so this cell does not depend on the Nevada cell having been run.
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(word_list)
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# NOTE(review): toarray() densifies the count matrix; memory grows with rows x vocabulary.
data_matrix = pd.DataFrame(vectorized.toarray(), columns=feature_names)
X_train, X_test, y_train, y_test = train_test_split(
    data_matrix, wi_df.review_rating, test_size=0.33, random_state=RANDOM_STATE)

rf = RandomForestClassifier(n_estimators=16, verbose=True, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Predictions on both splits are reused by the classification-report cell.
y_hat_train = rf.predict(X_train)
y_hat_test = rf.predict(X_test)
# Held-out accuracy as a percentage, shown as the cell output.
acc_random_forest = round(rf.score(X_test, y_test) * 100, 2)
acc_random_forest

In [None]:
# Random forest per-class metrics: training split first, then the held-out split.
print('Random Forest:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the random forest on the held-out split.
# NOTE(review): roc_curve expects binary labels and column 1 of predict_proba is used as
# the positive-class score -- confirm review_rating has exactly two classes.
y_score = rf.predict_proba(X_test)

fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)  # computed once and reused by both prints

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))
plt.figure(figsize=(10, 8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
# Ticks every 0.05 on both axes.
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();

In [None]:
# Bernoulli naive Bayes baseline, fit on the same split as the random forest.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
# These overwrite the forest's predictions; the report cell below reads them.
y_hat_train = bnb.predict(X_train)
y_hat_test = bnb.predict(X_test)
# Score on the held-out split so acc_bnb is comparable with acc_random_forest,
# which is also measured on X_test (training accuracy would be optimistic).
acc_bnb = round(bnb.score(X_test, y_test) * 100, 2)
acc_bnb

In [None]:
# Naive Bayes per-class metrics: training split first, then the held-out split.
print('Naive Bayes:\n 1. train 2. test')
print('\n-------------------------------------------------------\n'.join([
    classification_report(y_train, y_hat_train),
    classification_report(y_test, y_hat_test),
]))

In [None]:
# ROC curve for the Bernoulli naive Bayes model on the held-out split.
y_score = bnb.predict_proba(X_test)
fpr, tpr, threshold = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)

print(roc_auc, threshold)

sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

print('AUC: {}'.format(roc_auc))

lw = 2
ticks = [step / 20.0 for step in range(21)]  # 0.00, 0.05, ..., 1.00

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks(ticks)
plt.xticks(ticks)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show();