In [75]:
# Import Modules

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [76]:
# Load the training and testing data

df_train = pd.read_csv(r"C:\Users\manch\OneDrive\Desktop\Data Science Live Course\Movie Rating using Naive Bayes\Dataset\Train.csv")
df_test = pd.read_csv(r"C:\Users\manch\OneDrive\Desktop\Data Science Live Course\Movie Rating using Naive Bayes\Dataset\Test.csv")

# NLTK

* Bag of words pipline
    - Getting the corpus
    - Tokenization
    - Removing Stopwords
    - Vectorization
    - Building the classification model

In [77]:
# Import modules for NLT Preprocessing

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [78]:
# Initalise the objects

tokenizer = RegexpTokenizer("[a-zA-z0-9]+")
ps = PorterStemmer()
en_stopwords = set(stopwords.words('English'))
tfidf = TfidfVectorizer()

In [79]:
# 1) Getting the corpus

df_train.head(n=10)

Unnamed: 0,review,label
0,mature intelligent and highly charged melodram...,pos
1,http://video.google.com/videoplay?docid=211772...,pos
2,Title: Opera (1987) Director: Dario Argento Ca...,pos
3,I think a lot of people just wrote this off as...,pos
4,This is a story of two dogs and a cat looking ...,pos
5,Steve Carell comes into his own in his first s...,pos
6,I'm only going to write more because it's requ...,neg
7,"OK, it was a ""risky"" move to rent this flick, ...",neg
8,"Cannibalism, a pair of cinematic references to...",pos
9,This is one of the great modern kung fu films....,pos


In [80]:
X = df_train['review'].values
y = df_train['label'].values

In [81]:
X.shape, y.shape

((40000,), (40000,))

In [22]:
X[1]

'http://video.google.com/videoplay?docid=211772166650071408&hl=en Distribution was tried.<br /><br />We opted for mass appeal.<br /><br />We want the best possible viewing range so, we forgo profit and continue our manual labor jobs gladly to entertain you for working yours.<br /><br />View Texas tale, please write about it... If you like it or not, if you like Alex or not, if you like Stuie, Texas or Texas tale... Just write about it.<br /><br />Your opinion rules.'

In [102]:
# NLTK Preprocessing
def NLTK_preprocessing(review):
    
    words = []
    for i in range(review.shape[0]):
        # 2 Tokenization using Regular Expression
        words.append(tokenizer.tokenize(review[i]))

    useful_words = []
    # 3 Remove stopwords
    for i in range(len(words)):
        useful_text = [w for w in words[i] if w not in en_stopwords]
        useful_words.append(useful_text)
    
    # 4 Stemming
    for i in range(len(useful_words)):
        for j in range(len(useful_words[i])):
            # Stem the words
            useful_words[i][j] = ps.stem(useful_words[i][j])

    # Join the words
    for i in range(len(useful_words)):
        useful_words[i] = " ".join(useful_words[i])
        
    # 5 Vectorization
    VectorizedCorpus = tfidf.fit_transform(useful_words)
    
    return VectorizedCorpus

In [103]:
vc = NLTK_preprocessing(X)

In [104]:
vc.shape

(40000, 64377)

In [106]:
print(vc)

  (0, 55639)	0.07684381064085315
  (0, 34560)	0.15982311717744793
  (0, 50061)	0.058321332120231696
  (0, 40990)	0.3217183297518254
  (0, 51579)	0.11020267915811242
  (0, 58050)	0.1999433719427246
  (0, 33856)	0.07203606426840088
  (0, 10173)	0.3217183297518254
  (0, 42767)	0.0838985395420125
  (0, 54449)	0.29842737764167104
  (0, 62016)	0.5428362876041004
  (0, 418)	0.21900108687785186
  (0, 11052)	0.18262662783195674
  (0, 20446)	0.08967426547136485
  (0, 59055)	0.3217183297518254
  (0, 36303)	0.1741713963354127
  (0, 10634)	0.16107091134223578
  (0, 26195)	0.12278073628212646
  (0, 28472)	0.13171927796418542
  (0, 35611)	0.1688249625042704
  (1, 48424)	0.10938566763734439
  (1, 40962)	0.09125251511187199
  (1, 63807)	0.12270950248787452
  (1, 30041)	0.08825342761017835
  (1, 54441)	0.21788881926230955
  :	:
  (39999, 49319)	0.06557702422070845
  (39999, 21948)	0.08000096010890395
  (39999, 26785)	0.090170939690557
  (39999, 42754)	0.10051817181347081
  (39999, 43641)	0.0649312353104

In [107]:
# label Encode the target column
def label_encoder(target_col):
    
    for i in range(target_col.shape[0]):
        
        # Encode the target columns: 1 for positive class and 0 for negative
        if target_col[i] == 'pos':
            
            target_col[i] = 1
        else:
            
            target_col[i] = 0
    
    return target_col.astype('int')

In [108]:
y = label_encoder(y)

In [110]:
y.shape

(40000,)

# Train_Test_Split 

In [111]:
from sklearn.model_selection import train_test_split

In [112]:
 X_train, X_test, y_train, y_test = train_test_split(vc, y, test_size=0.3, random_state=42)

In [113]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((28000, 64377), (28000,), (12000, 64377), (12000,))

# Classification

   * Multinomial Naive Bayes
   * Bernoulli Naive Bayes

In [114]:
# Load the models
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [118]:
# Initialise the object
mnb = MultinomialNB()
bnm = BernoulliNB()

In [119]:
# 1) Multinomial NB model

mnb.fit(X_train, y_train)

MultinomialNB()

In [138]:
# Make Predictions
training_predictions = mnb.predict(X_test)

In [139]:
# Import Cross Validation score, Confusion matrix, precesion and recall
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score

print("Cross Validation Score : " + str(cross_val_score(mnb, X_train, y_train, cv = 4, scoring = "accuracy")))
print("Precision Score :" + str(precision_score(y_test, training_predictions)))
print("Recall Score : " + str(recall_score(y_test, training_predictions)))
print("Confusion Matrix : " + str(confusion_matrix(y_test, training_predictions)))

Cross Validation Score : [0.85957143 0.85985714 0.85142857 0.85057143]
Precision Score :0.878735232800556
Recall Score : 0.8354806739345887
Confusion Matrix : [[5248  698]
 [ 996 5058]]


In [155]:
# 2) Bernoulli NB model
bnm.fit(X_train, y_train)

BernoulliNB()

In [157]:
# Make predictions
training_predictions_bnm = bnm.predict(X_test)

In [158]:
# Import Cross Validation score, Confusion matrix, precesion and recall
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score

print("Cross Validation Score : " + str(cross_val_score(bnm, X_train, y_train, cv = 4, scoring = "accuracy")))
print("Precision Score :" + str(precision_score(y_test, training_predictions_bnm)))
print("Recall Score : " + str(recall_score(y_test, training_predictions_bnm)))
print("Confusion Matrix : " + str(confusion_matrix(y_test, training_predictions_bnm)))

Cross Validation Score : [0.84785714 0.84528571 0.841      0.83542857]
Precision Score :0.8761377833303587
Recall Score : 0.8108688470432772
Confusion Matrix : [[5252  694]
 [1145 4909]]


# Test the model on testing data

In [131]:
df_test.head(n=10)

Unnamed: 0,review
0,Remember those old kung fu movies we used to w...
1,This movie is another one on my List of Movies...
2,How in the world does a thing like this get in...
3,"""Queen of the Damned"" is one of the best vampi..."
4,The Caprica episode (S01E01) is well done as a...
5,I usually really enjoy Steven Seagal movies. T...
6,JiÃ¸Ã­ Trnka made his last animated short an i...
7,This is so bad it will be my contribution to t...
8,Watching this hilariously retro but very enter...
9,"Excellent political thriller, played much quie..."


In [137]:
test = df_test['review'].values

In [135]:
test.shape

(10000,)

In [140]:
# NLTK Preprocessing
def NLTK_preprocessing_testing(review):
    
    words = []
    for i in range(review.shape[0]):
        # 2 Tokenization using Regular Expression
        words.append(tokenizer.tokenize(review[i]))

    useful_words = []
    # 3 Remove stopwords
    for i in range(len(words)):
        useful_text = [w for w in words[i] if w not in en_stopwords]
        useful_words.append(useful_text)
    
    # 4 Stemming
    for i in range(len(useful_words)):
        for j in range(len(useful_words[i])):
            # Stem the words
            useful_words[i][j] = ps.stem(useful_words[i][j])

    # Join the words
    for i in range(len(useful_words)):
        useful_words[i] = " ".join(useful_words[i])
    
    return useful_words

In [141]:
testing_words = NLTK_preprocessing_testing(test)

In [144]:
len(testing_words)

10000

In [145]:
testing_vc = tfidf.transform(testing_words)

In [146]:
testing_vc.shape

(10000, 64377)

In [147]:
# Make Predictions

testing_predictions = mnb.predict(testing_vc)

In [152]:
def convert_to_csv(arr):
    
    # Convert integer classes into strings
    arr = arr.astype('str')
    
    for i in range(arr.shape[0]):
        
        if arr[i] == "1":
            arr[i] = 'pos'
        else:
            arr[i] = 'neg'
            
    # Create a dataframe
    df = pd.DataFrame(arr, columns = ['label'])
    df.index.name = 'Id'
    
    # Convert to csv
    
    final_submissions_1 = df.to_csv(r"C:\Users\manch\OneDrive\Desktop\Data Science Live Course\Movie Rating using Naive Bayes\Dataset\final_submissions_1.csv")

In [153]:
convert_to_csv(testing_predictions)

In [159]:
testing_predictions_bnb = bnm.predict(testing_vc)

In [160]:
convert_to_csv(testing_predictions_bnb)