In [104]:
import numpy as np 
import pandas as pd
import sklearn

train_docs = pd.read_csv("movie_review_train.csv")
train_docs.head()

Unnamed: 0,class,text
0,Pos,a common complaint amongst film critics is ...
1,Pos,whew this film oozes energy the kind of b...
2,Pos,steven spielberg s amistad which is bas...
3,Pos,he has spent his entire life in an awful litt...
4,Pos,being that it is a foreign language film with...


In [105]:
train_docs['class'].value_counts()

Pos    800
Neg    800
Name: class, dtype: int64

In [106]:
print("Positive rate is about {0}%".format(round((((train_docs[train_docs['class'] == 'Pos']).shape[0])/(train_docs.shape[0]))*100,2)))

Positive rate is about 50.0%


In [107]:
train_docs['class'] = train_docs['class'].map({'Pos':1,'Neg':0})
train_docs

Unnamed: 0,class,text
0,1,a common complaint amongst film critics is ...
1,1,whew this film oozes energy the kind of b...
2,1,steven spielberg s amistad which is bas...
3,1,he has spent his entire life in an awful litt...
4,1,being that it is a foreign language film with...
...,...,...
1595,0,if anything stigmata should be taken as...
1596,0,john boorman s zardoz is a goofy cinemati...
1597,0,the kids in the hall are an acquired taste ...
1598,0,there was a time when john carpenter was a gr...


In [108]:
# renaming 'class' to label
train_docs.rename(columns={'class':'label'},inplace=True)

In [109]:
# convert to X and y 

X = train_docs.text
y = train_docs.label

print(X.shape)
print(Y.shape)


(1600,)
(1600,)


In [110]:
# # splitting into test and train
# from sklearn.model_selection  import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)

In [111]:
X.head()

0     a common complaint amongst film critics is   ...
1     whew   this film oozes energy   the kind of b...
2     steven spielberg s   amistad     which is bas...
3     he has spent his entire life in an awful litt...
4     being that it is a foreign language film with...
Name: text, dtype: object

In [112]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

In [113]:
# vectorizing the sentences with stop words removed
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(stop_words='english',min_df=.03, max_df=.8)

In [114]:
#fitting the training data to vetorize
vect.fit(X)

CountVectorizer(max_df=0.8, min_df=0.03, stop_words='english')

In [115]:
# printing the vocabulary
vect.vocabulary_

{'common': 264,
 'critics': 323,
 'aren': 78,
 'available': 101,
 'gives': 618,
 'hope': 693,
 'art': 81,
 'writing': 1632,
 'isn': 753,
 'dead': 342,
 'hollywood': 690,
 'need': 970,
 'look': 853,
 'films': 549,
 'content': 287,
 'paul': 1037,
 'script': 1248,
 'takes': 1429,
 'thriller': 1471,
 'late': 805,
 'delivers': 357,
 'telling': 1449,
 'post': 1089,
 'war': 1571,
 'american': 59,
 'dream': 413,
 'tv': 1512,
 'radio': 1145,
 'heavy': 673,
 'direction': 386,
 'robert': 1205,
 'performances': 1043,
 'john': 766,
 'rob': 1204,
 'perfectly': 1041,
 'usually': 1537,
 'quality': 1136,
 'sets': 1272,
 'camera': 192,
 'work': 1618,
 'recent': 1165,
 'century': 213,
 'period': 1044,
 'pieces': 1056,
 'years': 1638,
 'old': 1001,
 'images': 716,
 'true': 1502,
 'era': 464,
 'generation': 605,
 'gone': 625,
 '15': 4,
 'world': 1623,
 'themes': 1461,
 'good': 626,
 'life': 831,
 'family': 508,
 'match': 894,
 'father': 521,
 'fame': 506,
 'audience': 99,
 'appear': 72,
 'familiar': 507,
 

In [116]:
#checking the length of the vocabulary
len(vect.vocabulary_.keys())

1643

In [134]:
# transforming the train and test datasets
X_train_transformed = vect.transform(X)
# X_test_transformed = vect.transform(y)

In [135]:
X_train_transformed

<1600x1643 sparse matrix of type '<class 'numpy.int64'>'
	with 217396 stored elements in Compressed Sparse Row format>

In [136]:
print(X_train_transformed)

  (0, 4)	1
  (0, 59)	2
  (0, 72)	1
  (0, 78)	1
  (0, 81)	1
  (0, 99)	3
  (0, 101)	1
  (0, 192)	1
  (0, 211)	1
  (0, 213)	1
  (0, 220)	1
  (0, 264)	2
  (0, 287)	1
  (0, 316)	1
  (0, 323)	1
  (0, 328)	1
  (0, 340)	1
  (0, 342)	1
  (0, 357)	1
  (0, 386)	1
  (0, 413)	2
  (0, 445)	2
  (0, 464)	1
  (0, 503)	1
  (0, 506)	1
  :	:
  (1599, 1247)	1
  (1599, 1258)	2
  (1599, 1267)	1
  (1599, 1271)	1
  (1599, 1331)	1
  (1599, 1335)	1
  (1599, 1339)	1
  (1599, 1366)	3
  (1599, 1371)	1
  (1599, 1375)	1
  (1599, 1379)	1
  (1599, 1413)	1
  (1599, 1421)	1
  (1599, 1429)	2
  (1599, 1447)	1
  (1599, 1501)	1
  (1599, 1533)	1
  (1599, 1550)	1
  (1599, 1555)	1
  (1599, 1556)	1
  (1599, 1570)	1
  (1599, 1579)	2
  (1599, 1589)	1
  (1599, 1609)	1
  (1599, 1616)	1


In [137]:
print(X_train_transformed.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


### 2. Building and Evaluating the Model

In [141]:
from sklearn.naive_bayes import BernoulliNB

# instantiating bernoulli NB class
bnb=BernoulliNB()

In [143]:
# fitting the model
bnb.fit(X_train_transformed, y)

X_test=X_test_transformed.toarray()

#predict class
# bnb.predict_proba(X_test)
prob_bnb = bnb.predict_proba(X_test)
pred_class = bnb.predict(X_test)

### Model Evaluation

In [147]:
#printing the overall accuracy 
from sklearn import metrics
metrics.accuracy_score(y_test,pred_class)

0.79

In [148]:
#Confusion Metrics
metrics.confusion_matrix(y_test,pred_class)

array([[177,  23],
       [ 61, 139]], dtype=int64)

In [90]:
confusion = metrics.confusion_matrix(y_test,y_pred_class)

In [91]:
# Extracting TN,FP,FN,TP values evaluate other metrics
TN = confusion[0,0]
FP = confusion[0,1]
FN = confusion[1,0]
TP = confusion[1,1]

In [92]:
#Calculating Sensivity
Sensivity = TP/(TP+FN)
Sensivity

0.8214285714285714

In [93]:
#Calculating Specificity
Specificity = TN/(TN+FP)
Specificity

0.7990196078431373

In [94]:
#Calculating precision
precision = TP / float(TP + FP)
precision

0.7970297029702971

In [95]:
print("precision",precision)
print("PRECISION SCORE :",metrics.precision_score(y_test, y_pred_class))
print("RECALL SCORE :", metrics.recall_score(y_test, y_pred_class))
print("F1 SCORE :",metrics.f1_score(y_test, y_pred_class))

precision 0.7970297029702971
PRECISION SCORE : 0.7970297029702971
RECALL SCORE : 0.8214285714285714
F1 SCORE : 0.8090452261306533


In [96]:
# Creating ROC Curve
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve , auc
import matplotlib.pyplot as plt

In [97]:
fpr, tpr, threshold = roc_curve(y_test, predic_prob[:,1])
roc_auc = auc(fpr, tpr)

In [98]:
# Area under the curve 
print(roc_auc)

0.8947829131652661


In [99]:
len(y_test)

400

In [101]:
# matrix of thresholds, tpr, fpr
pd.DataFrame({'Threshold': threshold, 
              'TPR': tpr, 
              'FPR':fpr
             })

Unnamed: 0,Threshold,TPR,FPR
0,2.000000e+00,0.000000,0.000000
1,1.000000e+00,0.454082,0.049020
2,1.000000e+00,0.464286,0.049020
3,1.000000e+00,0.474490,0.049020
4,1.000000e+00,0.484694,0.049020
...,...,...,...
82,9.553986e-23,0.989796,0.833333
83,7.646669e-23,0.994898,0.833333
84,4.092430e-34,0.994898,0.960784
85,8.016768e-35,1.000000,0.960784


In [120]:
train_docs_test = pd.read_csv("movie_review_test.csv")
train_docs.head()

Unnamed: 0,label,text
0,1,a common complaint amongst film critics is ...
1,1,whew this film oozes energy the kind of b...
2,1,steven spielberg s amistad which is bas...
3,1,he has spent his entire life in an awful litt...
4,1,being that it is a foreign language film with...


In [121]:
train_docs_test['class'] = train_docs_test['class'].map({'Pos':1,'Neg':0})
train_docs_test

Unnamed: 0,class,text
0,1,films adapted from comic books have had plent...
1,1,every now and then a movie comes along from a...
2,1,you ve got mail works alot better than it des...
3,1,jaws is a rare film that grabs your atte...
4,1,moviemaking is a lot like being the general m...
...,...,...
395,0,one of the first films of 1999 is this mtv pi...
396,0,susan granger s review of america s sweethe...
397,0,susan granger s review of jeepers creepers ...
398,0,this independent film written and directed by...


In [122]:
# renaming 'class' to label
train_docs_test.rename(columns={'class':'label'},inplace=True)

In [125]:

X_test = train_docs_test.text
y_test = train_docs_test.label

print(X_test.shape)
print(y_test.shape)


(400,)
(400,)


In [126]:
X_test_transformed = vect.transform(X_test)

In [130]:
X_test_transformed.shape

(400, 1643)

In [131]:
X_train_transformed.shape

(1600, 1643)

In [133]:
X_test_transformed.size

51663

In [140]:
print(X_test_transformed.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 1 0]
 [0 0 0 ... 0 2 0]]


In [146]:
print(y_test.size)

400
