In [2]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
imdb = pd.read_csv('imdb_labelled.txt', delimiter= '\t', header=None)
imdb.head(25)

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
5,"The rest of the movie lacks art, charm, meanin...",0
6,Wasted two hours.,0
7,Saw the movie today and thought it was a good ...,1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the scien...,1


In [4]:
imdb.columns = ['review', 'pos_neg']

In [5]:
keywords = ['best', 'loved', 'good', 'great', 'beautiful', 'love', 'enjoyed', 'perfect', 'wonderful', 'nice', 'cool', 'treat', 'worth', 'classic', '10']

for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(' '+str(key)+ ' ', case=False)

In [6]:
data = imdb[keywords]
target = imdb['pos_neg']

In [7]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()

y_pred = bnb.fit(data, target).predict(data)

print("Accuracy of model: {}".format((target == y_pred).sum()/data.shape[0]*100))

Accuracy of model: 60.6951871657754


In [8]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

array([[338,  24],
       [270, 116]])

From this we see that the majority of the error is coming from failure to predict the positive reviews, with 270/294 of our errors failing to identify positive reviews. So, we want to get better at predicting the positive reviews. 

In [9]:
# Test your model with different holdout groups.

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.56
Testing on Sample: 0.606951871657754


In [10]:
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.57894737, 0.57894737, 0.64      , 0.61333333, 0.64      ,
       0.57333333, 0.68918919, 0.59459459, 0.58108108, 0.54054054])

After receiving better performance on the whole sample compared to our holdout, I'll attempt to add a few features to improve the performance of the model.

In [20]:
imdb.iloc[np.where(imdb['pos_neg'] == 1)]['review']

4      The best scene in the movie was when Gerardo i...
7      Saw the movie today and thought it was a good ...
9      Loved the casting of Jimmy Buffet as the scien...
10                  And those baby owls were adorable.  
11     The movie showed a lot of Florida at it's best...
12     The Songs Were The Best And The Muppets Were S...
13                                     It Was So Cool.  
14     This is a very "right on case" movie that deli...
16     This review is long overdue, since I consider ...
17     I'll put this gem up against any movie in term...
18     It's practically perfect in all of them  a tr...
20     This if the first movie I've given a 10 to in ...
21     If there was ever a movie that needed word-of-...
22     Overall, the film is interesting and thought-p...
23     Plus, it was well-paced and suited its relativ...
24                               Give this one a look.  
25                                      I gave it a 10  
26     The Wind and the Lion is

In [21]:
keywords = ['best', 'loved', 'good', 'great', 'beautiful', 'love', 'enjoyed', 'perfect', 'wonderful', 'nice', 'cool', 'treat', 'worth', 'classic', '10', 'terrific', 'superb', 'well']

for key in keywords:
    imdb[str(key)] = imdb.review.str.contains(' '+str(key)+ ' ', case=False)

In [22]:
data = imdb[keywords]
target = imdb['pos_neg']

In [24]:
from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3, random_state=20)
print('With 30% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 30% Holdout: 0.6044444444444445
Testing on Sample: 0.6163101604278075


In [25]:
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.61842105, 0.59210526, 0.65333333, 0.6       , 0.64      ,
       0.58666667, 0.68918919, 0.58108108, 0.59459459, 0.56756757])

We do see better performance with the addition of more features so we will keep these additional features. 