In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import re
import sklearn
%matplotlib inline

[Classifier 1](#Classifier-1) <br>
[Classifier 2](#Classifier-2) <br>
[Classifier 3](#Classifier-3) <br>
[Classifier 3 Holdout Group/Cross-Val](#Classifier-3-Holdout-Group-Cross-Val) <br>
[Classifier 4](#Classifier-4) <br>
[Classifier 5](#Classifier-5) <br>
[Holdout Groups](#Holdout-Groups)

In [2]:
text = pd.read_table("imdb_labelled.txt", sep = '\t', names = ['Review', 'Rating'])
amazon = pd.read_table("amazon_cells_labelled.txt", sep = '\t', names = ['Review', 'Rating'])
yelp = pd.read_table("yelp_labelled.txt", sep = '\t', names = ['Review', 'Rating'])

<a id = "Classifier-1"></a>
### [Classifier 1](#Classifier-1)

In [3]:
# Creating the keyword lists:

positive_keywords = ['love', 'liked', 'best', 'great', 'incredible', 'beautiful', 'cool', 'wonderful']

In [4]:
# Applying those lists to the text of the reviews:

for key in positive_keywords:
    text[str(key)] = text.Review.str.contains(
    ' ' + str(key) + ' ',
        case = False)

In [5]:
# Changing the Rating column so it contains boolean values:

text['Rating'] = (text['Rating'] == 1)

In [6]:
# Setting the data and target columns:

data = text[positive_keywords]
target = text['Rating']

In [7]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

bnb.fit(data, target)

y_pred = bnb.predict(data)

print("Number of total columns: {}.  Number of mislabeled columns: {}".format(
        data.shape[0],
        (target != y_pred).sum()))

Number of total columns: 748.  Number of mislabeled columns: 329


In [8]:
TN = ((text['Rating'] == False) & (y_pred == False)).sum()
FP = ((text['Rating'] == False) & (y_pred == True)).sum()
TP = ((text['Rating'] == True) & (y_pred == True)).sum()
FN = ((text['Rating'] == True) & (y_pred == False)).sum()

In [9]:
matrix = [[TN, FP],
           [FN, TP]]
matrix

[[350, 12], [317, 69]]

In [10]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

array([[350,  12],
       [317,  69]])

In [11]:
# Percentage of positives correctly identified:
sensitivity = (TP/(FN+TP))

# Percentage of negatives correctly identified:
specificity = (TN/(TN+FP))

print("The Sensitivity is {:.2%}. The Specificity is {:.2%}.".format(sensitivity, specificity))

The Sensitivity is 17.88%. The Specificity is 96.69%.


In [12]:
target.value_counts()

True     386
False    362
Name: Rating, dtype: int64

In [13]:
X_test = data[:74]
X_test2 = data[75:150]
X_train = data[75:]
X_train2 = pd.concat([X_test, data[151:]])

y_test = target[:74]
y_test2 = target[75:150]
y_train = target[75:]
y_train2 = pd.concat([y_test, target[151:]])

In [14]:
bnb.fit(X_train2, y_train2)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [15]:
bnb.score(X_train2, y_train2)


0.5499254843517138

In [16]:
bnb.score(X_test2, y_test2)

0.22666666666666666

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

In [18]:
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.5733333333333334
Testing on Sample: 0.5601604278074866


In [19]:
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.5       , 0.56578947, 0.54666667, 0.53333333, 0.56      ,
       0.54666667, 0.66216216, 0.54054054, 0.58108108, 0.54054054])

This classifier was biased towards negative labeling.  Given that the list of positive terms was so short, the classifer would classify anything not containing those words as negative, even though they were positive.  As a result, the false negatives were overly prevalent, while the negative columns were well predicted.  

<a = id = "Classifier-2"></a>
### [Classifier 2](#Classifier-2)

In [57]:
negative_keywords = ['terrible', 'hated', 'bad', 'too', 'awful', 'slow', 'bore', 'boring']

In [58]:
for key in negative_keywords:
    text[str(key)] = text.Review.str.contains(
    ' ' + str(key) + ' ',
        case = False)

In [59]:
data = text[negative_keywords]
text['bad_rating'] = (text['Rating'] == False)
target = text.bad_rating

In [60]:
bnb = BernoulliNB()

bnb.fit(data, target)

y_pred = bnb.predict(data)

In [61]:
print("Number of total columns: {}.  Number of mislabeled columns: {}".format(
        data.shape[0],
        (target != y_pred).sum()))

Number of total columns: 748.  Number of mislabeled columns: 321


In [62]:
TN = ((target == True) & (y_pred == True)).sum()
FP = ((target == True) & (y_pred == False)).sum()
FN = ((target == False) & (y_pred == True)).sum()
TP = ((target == False) & (y_pred == False)).sum()

matrix = [[TN, FP],
           [FN, TP]]
matrix

[[48, 314], [7, 379]]

In [63]:
confusion_matrix(target, y_pred)

array([[379,   7],
       [314,  48]])

In [64]:
# Percentage of positives correctly identified:
sensitivity = (TP/(FN+TP))

# Percentage of negatives correctly identified:
specificity = (TN/(TN+FP))

print("The Sensitivity is {:.2%}. The Specificity is {:.2%}.".format(sensitivity, specificity))

The Sensitivity is 98.19%. The Specificity is 13.26%.


In [67]:
# Holdout Testing
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = .2)
print('Twenty Percent Holdout Score:' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))

Twenty Percent Holdout Score:0.4866666666666667


In [68]:
# Cross-Valuation Testing:
cross_val_score(bnb, data, target, cv = 10)

array([0.52631579, 0.56578947, 0.56      , 0.6       , 0.6       ,
       0.57333333, 0.55405405, 0.58108108, 0.52702703, 0.59459459])

This was trying out the negative list, which correctly identified 8 more inputs than the last classifier.  The Sensitivity and Specificity also switched...which means that we are identifying more positive ratings correctly, but our negative identifiers have lost their accuracy.

<a id = "Classifier-3"></a>
### [Classifier 3](#Classifier-3)

In [69]:
positive_keywords = ['love', 'liked', 'best', 'great', 'incredible', 'beautiful', 'cool', 'wonderful']
negative_keywords = ['terrible', 'hated', 'bad', 'too', 'awful', 'slow', 'bore', 'boring']

for key in positive_keywords:
    text[str(key)] = text.Review.str.contains(
    ' ' + str(key) + ' ',
        case = False)
    
for key in negative_keywords:
    text[str(key)] = text.Review.str.contains(
    ' ' + str(key) + ' ',
        case = False)

In [70]:
data = text[negative_keywords + positive_keywords]
target = text['Rating'] 

In [71]:
bnb.fit(data, target)

y_pred = bnb.predict(data)

In [72]:
print("Number of total columns: {}.  Number of mislabeled columns: {}".format(
        data.shape[0],
        (target != y_pred).sum()))

Number of total columns: 748.  Number of mislabeled columns: 320


In [73]:
TN = ((text['Rating'] == False) & (y_pred == False)).sum()
FP = ((text['Rating'] == False) & (y_pred == True)).sum()
FN = ((text['Rating'] == True) & (y_pred == False)).sum()
TP = ((text['Rating'] == True) & (y_pred == True)).sum()

matrix = [[TN, FP],
           [FN, TP]]
matrix

[[47, 315], [5, 381]]

In [74]:
confusion_matrix(target, y_pred)

array([[ 47, 315],
       [  5, 381]])

In [75]:
# Percentage of positives correctly identified:
sensitivity = (TP/(FN+TP))

# Percentage of negatives correctly identified:
specificity = (TN/(TN+FP))

print("The Sensitivity is {:.2%}. The Specificity is {:.2%}.".format(sensitivity, specificity))

The Sensitivity is 98.70%. The Specificity is 12.98%.


In [77]:
# With Holdout Testing:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = .2)
print('Twenty Percent Holdout Score:' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))

Twenty Percent Holdout Score:0.5266666666666666


In [78]:
# Cross Validation testing:
cross_val_score(bnb, data, target, cv = 10)

array([0.55263158, 0.55263158, 0.54666667, 0.6       , 0.57333333,
       0.57333333, 0.55405405, 0.58108108, 0.52702703, 0.60810811])

This was trying out both of the negatives and positives together, which got us one more correctly identified review.

<a id = "Classifier-4"></a>
### [Classifier 4](#Classifier-4)

In [89]:
pos_keys = ['great', 'incredible', 'like', 'liked', 'love', 'loved', 'awesome', 'I would recommend', 'I recommend',
           'best', 'wonderful']
neg_keys = ['I hated', 'I hate', 'bad', 'terrible', 'disgusting', 'boring', 'bore', 'waste', 'wasted',
           'awful', 'slow', 'worse', 'worst']

In [90]:
for key in pos_keys:
    text[str(key)] = text.Review.str.contains(
    ' ' + str(key) + ' ',
        case = False)
    
for key in neg_keys:
    text[str(key)] = text.Review.str.contains(
    ' ' + str(key) + ' ',
        case = False)

In [91]:
data = text[pos_keys + neg_keys]
target = text['Rating']

In [92]:
bnb.fit(data, target)

y_pred = bnb.predict(data)

In [93]:
print("Number of total columns: {}.  Number of mislabeled columns: {}".format(
        data.shape[0],
        (target != y_pred).sum()))

Number of total columns: 748.  Number of mislabeled columns: 302


In [94]:
TN = ((text['Rating'] == False) & (y_pred == False)).sum()
FP = ((text['Rating'] == False) & (y_pred == True)).sum()
TP = ((text['Rating'] == True) & (y_pred == True)).sum()
FN = ((text['Rating'] == True) & (y_pred == False)).sum()

matrix = [[TN, FP],
           [FN, TP]]
matrix

[[64, 298], [4, 382]]

In [95]:
confusion_matrix(target, y_pred)

array([[ 64, 298],
       [  4, 382]])

In [96]:
# Percentage of positives correctly identified:
sensitivity = (TP/(FN+TP))

# Percentage of negatives correctly identified:
specificity = (TN/(TN+FP))

print("The Sensitivity is {:.2%}. The Specificity is {:.2%}.".format(sensitivity, specificity))

The Sensitivity is 98.96%. The Specificity is 17.68%.


In [101]:
# Holdout Testing:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = .2)
print('Twenty Percent Holdout Score' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))

Twenty Percent Holdout Score0.6533333333333333


In [102]:
# Cross Val testing:
cross_val_score(bnb, data, target, cv = 10)

array([0.51315789, 0.59210526, 0.54666667, 0.62666667, 0.62666667,
       0.54666667, 0.58108108, 0.60810811, 0.54054054, 0.66216216])

<a id = "Classifier-5"></a>
### [Classifier 5](#Classifier-5)

In [103]:
pos_keys = ['great', 'incredible', 'like', 'liked', 'love', 'loved', 'awesome', 'I would recommend', 'I recommend',
           'best', 'wonderful', 'would', 'should']
neg_keys = ['I hated', 'I hate', 'bad', 'terrible', 'disgusting', 'boring', 'bore', 'waste', 'wasted',
           'awful', 'slow', 'worse', 'worst', 'cheesey', 'cheesy', 'raunchy', 'flaw', 'flawed', 'wouldn\'t', 
           'don\'t', 'shouldn\'t', 'don\'t go']

In [104]:
for key in pos_keys:
    text[str(key)] = text.Review.str.contains(
    ' ' + str(key) + ' ',
        case = False)
    
for key in neg_keys:
    text[str(key)] = text.Review.str.contains(
    ' ' + str(key) + ' ',
        case = False)

In [105]:
data = text[pos_keys + neg_keys]
target = text['Rating']

In [106]:
bnb.fit(data, target)

y_pred = bnb.predict(data)

In [107]:
print("Number of total columns: {}.  Number of mislabeled columns: {}".format(
        data.shape[0],
        (target != y_pred).sum()))

Number of total columns: 748.  Number of mislabeled columns: 285


In [108]:
TN = ((text['Rating'] == False) & (y_pred == False)).sum()
FP = ((text['Rating'] == False) & (y_pred == True)).sum()
TP = ((text['Rating'] == True) & (y_pred == True)).sum()
FN = ((text['Rating'] == True) & (y_pred == False)).sum()

matrix = [[TN, FP],
           [FN, TP]]
matrix

[[92, 270], [15, 371]]

In [109]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

array([[ 92, 270],
       [ 15, 371]])

In [110]:
# Percentage of positives correctly identified:
sensitivity = (TP/(FN+TP))

# Percentage of negatives correctly identified:
specificity = (TN/(TN+FP))

print("The Sensitivity is {:.2%}. The Specificity is {:.2%}.".format(sensitivity, specificity))

The Sensitivity is 96.11%. The Specificity is 25.41%.


In [114]:
# Twenty Percent Holdout:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size = .2)
print('20% Holdout Score' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))

20% Holdout Score0.6333333333333333


In [113]:
#Cross_val
cross_val_score(bnb, data, target, cv = 10)

array([0.51315789, 0.60526316, 0.53333333, 0.64      , 0.64      ,
       0.62666667, 0.59459459, 0.63513514, 0.56756757, 0.68918919])

<a id = "Holdout-Groups"></a>
### [Holdout Groups](#Holdout-Groups)

In [55]:
text_holdout = text[450:747]
text_normal = text[0:449]