# Importing the libraries

In [94]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset

In [95]:
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as ordinary text.
# NOTE(review): header=None is not passed, so the first line of the file is
# consumed as the column names (see the header row in the output below) and
# is not part of the data. Changing that also changes the row count that the
# later cells rely on.
dataset = pd.read_csv(
    'imdb_reviews.csv',
    sep='\t',
    quoting=3,
)
dataset

Unnamed: 0,"A very, very, very slow-moving, aimless movie about a distressed, drifting young man.",0
0,Not sure who was more lost - the flat characte...,0
1,Attempting artiness with black & white and cle...,0
2,Very little music or anything to speak of.,0
3,The best scene in the movie was when Gerardo i...,1
4,"The rest of the movie lacks art, charm, meanin...",0
5,Wasted two hours.,0
6,Saw the movie today and thought it was a good ...,1
7,A bit predictable.,0
8,Loved the casting of Jimmy Buffet as the scien...,1
9,And those baby owls were adorable.,1


# Cleaning the texts

In [96]:
# Raw values of the loaded frame as a NumPy array (column names are dropped).
imdb_array = dataset.values

In [97]:
# Wrap the raw array in a fresh DataFrame with readable column names.
# NOTE(review): the array mixes text and integers, so both columns are
# presumably object dtype here - confirm before using 'label' numerically.
new_imdb = pd.DataFrame(
    data=imdb_array,
    columns=['title', 'label'],
)

In [98]:
# Display the relabelled frame to confirm the 'title' / 'label' column names.
new_imdb

Unnamed: 0,title,label
0,Not sure who was more lost - the flat characte...,0
1,Attempting artiness with black & white and cle...,0
2,Very little music or anything to speak of.,0
3,The best scene in the movie was when Gerardo i...,1
4,"The rest of the movie lacks art, charm, meanin...",0
5,Wasted two hours.,0
6,Saw the movie today and thought it was a good ...,1
7,A bit predictable.,0
8,Loved the casting of Jimmy Buffet as the scien...,1
9,And those baby owls were adorable.,1


In [99]:
import re
import nltk
# Make sure the NLTK stop-word lists are available locally; the log below
# shows this reports "already up-to-date" when they are already installed.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/thevirusx3/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [100]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string
# per review.
#
# The stemmer and the stop-word set are created once, outside the loop. The
# original rebuilt the stop-word set for every single word and the stemmer
# for every review, which is needlessly slow.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over every row instead of a hard-coded range(0, 999), so the cell
# keeps producing one entry per review if the number of rows changes.
for text in new_imdb['title']:
    # Replace every character that is not an ASCII letter with a space,
    # then lower-case the result.
    review = re.sub('[^a-zA-Z]', ' ', text).lower()
    # Drop stop words and reduce the remaining words to their stems.
    stems = [ps.stem(word) for word in review.split() if word not in english_stopwords]
    corpus.append(' '.join(stems))

In [101]:
# Preview only the first few cleaned reviews; echoing the whole list floods
# the notebook with one output line per review.
corpus[:20]

['sure lost flat charact audienc nearli half walk',
 'attempt arti black white clever camera angl movi disappoint becam even ridicul act poor plot line almost non exist',
 'littl music anyth speak',
 'best scene movi gerardo tri find song keep run head',
 'rest movi lack art charm mean empti work guess empti',
 'wast two hour',
 'saw movi today thought good effort good messag kid',
 'bit predict',
 'love cast jimmi buffet scienc teacher',
 'babi owl ador',
 'movi show lot florida best made look appeal',
 'song best muppet hilari',
 'cool',
 'right case movi deliv everyth almost right face',
 'averag act main person low budget clearli see',
 'review long overdu sinc consid tale two sister singl greatest film ever made',
 'put gem movi term screenplay cinematographi act post product edit direct aspect film make',
 'practic perfect true masterpiec sea faux masterpiec',
 'structur film easili tightli construct histori cinema',
 'think film someth vital import occur everi minut',
 'word con

# Creating the Bag of Words model

In [102]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
# Convert the vectorizer output to a regular NumPy array: one row per review,
# one column per vocabulary term.
X = term_counts.toarray()

# Target labels are read from the second column of the originally loaded frame.
y = dataset.iloc[:, 1].values

# Splitting the dataset into the Training set and Test set

In [129]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split lives in sklearn.model_selection (available since 0.18).
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0
)

# Fitting Naive Bayes to the Training set

In [130]:
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes model on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
classifier = GaussianNB().fit(X_train, y_train)
classifier  # echo the fitted estimator as the cell output

GaussianNB(priors=None)

In [131]:
# Predicting the Test set results
# y_pred holds the Naive Bayes model's predicted label for each row of X_test.
y_pred = classifier.predict(X_test)

In [132]:
# Print the raw predicted labels for a quick visual check.
print(y_pred)

[0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1
 0 1 1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 1 1 0 1 1 0 1
 0 0 1 1 0 1 1 0 0 0 1 1 0 0 1 1 1 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0 1 1 0
 0 0 0 1 0 1 1 1 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 1 0 1 0
 0 1 0 0 0 0 1 1 0 1 1 0 1 0 0]


In [133]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix

# Rows correspond to the true classes, columns to the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[75 12]
 [55 58]]


In [134]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the Naive Bayes predictions.
gnb_report = classification_report(y_test, y_pred)
print(gnb_report)

             precision    recall  f1-score   support

          0       0.58      0.86      0.69        87
          1       0.83      0.51      0.63       113

avg / total       0.72      0.67      0.66       200



In [135]:
# Mean accuracy of the Naive Bayes model on the data it was fitted on and on
# the held-out test split.
gnb_train_acc = classifier.score(X_train, y_train)
gnb_test_acc = classifier.score(X_test, y_test)

print(f'Accuracy of GNB classifier on training set: {gnb_train_acc}')
print(f'Accuracy of GNB classifier on test set: {gnb_test_acc}')

Accuracy of GNB classifier on training set: 0.8986232790988736
Accuracy of GNB classifier on test set: 0.665


# Fitting Logistic Regression to the Training set

In [136]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression model with library defaults on the same
# training split used for Naive Bayes.
logclassifier = LogisticRegression().fit(X_train, y_train)
logclassifier  # echo the fitted estimator as the cell output

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [137]:
# Predicted label from the logistic-regression model for each row of X_test.
log_predict = logclassifier.predict(X_test)

In [138]:
# Print the raw logistic-regression predictions for a quick visual check.
print(log_predict)

[0 1 1 0 1 1 1 0 0 1 1 0 1 0 0 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 0 0 0 0 1 0 1
 0 1 0 1 0 1 0 1 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 0 1
 1 1 1 0 1 1 1 1 0 0 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0
 0 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1 0 0 1 1 1 0 1 1 0 0 0 0 1 1 0 1 0 0 1 1 0
 0 0 0 1 1 1 1 1 0 0 1 1 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 1 1 1 1 1 0
 0 1 0 1 0 0 1 0 1 1 1 1 1 1 0]


In [139]:
# Confusion matrix for the logistic-regression predictions
# (rows: true classes, columns: predicted classes).
new_cm = confusion_matrix(y_true=y_test, y_pred=log_predict)
print(new_cm)

[[67 20]
 [25 88]]


In [140]:
# Per-class precision, recall, F1 and support for the logistic-regression
# predictions.
log_report = classification_report(y_test, log_predict)
print(log_report)

             precision    recall  f1-score   support

          0       0.73      0.77      0.75        87
          1       0.81      0.78      0.80       113

avg / total       0.78      0.78      0.78       200



In [141]:
# Mean accuracy of the logistic-regression model on the data it was fitted on
# and on the held-out test split.
log_train_acc = logclassifier.score(X_train, y_train)
log_test_acc = logclassifier.score(X_test, y_test)

print(f'Accuracy of Logistic regression classifier on training set: {log_train_acc}')
print(f'Accuracy of Logistic regression classifier on test set: {log_test_acc}')

Accuracy of Logistic regression classifier on training set: 0.9662077596996246
Accuracy of Logistic regression classifier on test set: 0.775
