# STEP 1: IMPORT REQUIRED LIBRARIES

In [16]:
import pandas as pd
import numpy as np
import nltk
import timeit
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# STEP 2: SET RANDOM SEED

In [17]:
# This is used to reproduce the same result every time if the script is kept consistent.
# Otherwise, each run will produce different results. The seed can be set to any number.
np.random.seed(500)

# STEP 3: LOAD IMDB MOVIE REVIEW DATASET

In [18]:
# STEP 3: ADD CORPUS
print("Loading data...")
dataset = pd.read_csv("Datasets/IMDB.csv", encoding='latin-1')
print("Data successfully loaded!")
print(dataset)
# positive sentiment = 0, negative sentiment = 1

Loading data...
Data successfully loaded!
                                                    text sentiment
0      Now, I won't deny that when I purchased this o...       neg
1      The saddest thing about this "tribute" is that...       neg
2      Last night I decided to watch the prequel or s...       neg
3      I have to admit that i liked the first half of...       neg
4      I was not impressed about this film especially...       neg
...                                                  ...       ...
49995  For one thing, he produced this movie. It has ...       neg
49996  The title comes from an alteration an adolesce...       pos
49997  Christopher Nolan's first film is a 'no budget...       pos
49998  The story is shortly about the faith-lacking b...       neg
49999  I found parts of this movie rather slow, espec...       pos

[50000 rows x 2 columns]


# STEP 4: DATA PROCESSING

In [19]:
# Step 4a: Remove blank rows if any.
print("\nStart of Data Pre-processing...")
print("- Removing blanks...")
dataset['text'].dropna(inplace=True)
print("- Blanks removed.")


Start of Data Pre-processing...
- Removing blanks...
- Blanks removed.


In [20]:
# Step 4b : Change all the text to lower case. 
print("- Converting to lowercase...")
dataset['text'] = [entry.lower() for entry in dataset['text']]
print("- Converted to lowercase.")

- Converting to lowercase...
- Converted to lowercase.


In [21]:
# Step 4c: Tokenization to divide each movie review into a set of words (unigrams)
print("- Starting tokenization...")
dataset['text'] = [word_tokenize(entry) for entry in dataset['text']]
print("- Tokenization is complete")
print(dataset['text'])
print("- End of data pre-processing.")
print("\n***** Current View of Corpus *****\n")
print(dataset)

- Starting tokenization...
- Tokenization is complete
0        [now, ,, i, wo, n't, deny, that, when, i, purc...
1        [the, saddest, thing, about, this, ``, tribute...
2        [last, night, i, decided, to, watch, the, preq...
3        [i, have, to, admit, that, i, liked, the, firs...
4        [i, was, not, impressed, about, this, film, es...
                               ...                        
49995    [for, one, thing, ,, he, produced, this, movie...
49996    [the, title, comes, from, an, alteration, an, ...
49997    [christopher, nolan, 's, first, film, is, a, '...
49998    [the, story, is, shortly, about, the, faith-la...
49999    [i, found, parts, of, this, movie, rather, slo...
Name: text, Length: 50000, dtype: object
- End of data pre-processing.

***** Current View of Corpus *****

                                                    text sentiment
0      [now, ,, i, wo, n't, deny, that, when, i, purc...       neg
1      [the, saddest, thing, about, this, ``, tribute..

# STEP 5: PREPARE TRAIN & TEST DATASETS

In [22]:
print("- Splitting dataset (80% train, 20% test)...")
train_x, test_x, train_y, test_y = model_selection.train_test_split(dataset['text'], dataset['sentiment'], test_size=0.2)
print("- Data splitting is complete.")
print(train_x)
print(train_y)

- Splitting dataset (80% train, 20% test)...
- Data splitting is complete.
543      [i, have, read, all, of, the, love, come, soft...
25470    [i, was, in, my, mid, teens, when, i, saw, thi...
29798    [while, the, camerawork, was, certainly, ``, f...
28698    [if, you, are, looking, for, a, modern, film, ...
18438    [this, film, got, terrible, reviews, but, beca...
                               ...                        
19389    [the, howling, ii, starts, as, it, means, to, ...
3790     [this, was, a, new, alltime, low, among, weste...
41233    [heart, of, darkness, was, terrible, ., the, n...
44865    [the, three, stooges, has, always, been, some,...
17335    [look, ,, i, 'm, reading, and, reading, this, ...
Name: text, Length: 40000, dtype: object
543      neg
25470    pos
29798    pos
28698    pos
18438    pos
        ... 
19389    neg
3790     neg
41233    neg
44865    pos
17335    neg
Name: sentiment, Length: 40000, dtype: object


# STEP 6: DATA ENCODING

In [23]:
print("- Encoding data if not already done (i.e. categorical data converted to numerical values)...\n")
Encoder = LabelEncoder()
print("Original Train_Y: ", train_y)
train_y = Encoder.fit_transform(train_y)
print("New Train_Y:      ", train_y)
print("Original Test_Y:  ", test_y)
test_y = Encoder.fit_transform(test_y)
print("New Test_Y:       ", test_y)
print("\n- Data encoding is complete.")

- Encoding data if not already done (i.e. categorical data converted to numerical values)...

Original Train_Y:  543      neg
25470    pos
29798    pos
28698    pos
18438    pos
        ... 
19389    neg
3790     neg
41233    neg
44865    pos
17335    neg
Name: sentiment, Length: 40000, dtype: object
New Train_Y:       [0 1 1 ... 0 1 0]
Original Test_Y:   26124    pos
8562     neg
43159    pos
41935    neg
8867     pos
        ... 
37047    pos
40066    pos
33888    neg
27835    pos
1304     neg
Name: sentiment, Length: 10000, dtype: object
New Test_Y:        [1 0 1 ... 0 1 0]

- Data encoding is complete.


# STEP 7: TF-IDF Feature Vectorization

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)

tfidf.fit(dataset['text'])
tfidf.vocabulary_

{'now': 107301,
 ',': 6266,
 'i': 77891,
 'wo': 165187,
 "n't": 103403,
 'deny': 46722,
 'that': 149932,
 'when': 163791,
 'purchased': 121533,
 'this': 150626,
 'off': 108441,
 'ebay': 53177,
 'had': 71132,
 'high': 74533,
 'expectations': 57748,
 '.': 6986,
 'was': 162198,
 'an': 17111,
 'incredible': 79600,
 'out-of-print': 110584,
 'work': 165654,
 'from': 64521,
 'the': 149994,
 'master': 95659,
 'of': 108413,
 'comedy': 38656,
 'so': 139617,
 'enjoy': 55238,
 'however': 76953,
 'soon': 140426,
 'to': 152004,
 'be': 23800,
 'disappointed': 48797,
 'apologies': 18668,
 'those': 150756,
 'who': 164104,
 'enjoyed': 55259,
 'it': 82146,
 'but': 31081,
 'just': 84472,
 'found': 63569,
 'compleat': 39226,
 'al': 15490,
 'very': 160025,
 'difficult': 48106,
 'watch': 162325,
 'got': 68881,
 'a': 12489,
 'few': 60617,
 'smiles': 139097,
 'sure': 146492,
 'majority': 94006,
 'funny': 65023,
 'came': 31924,
 'music': 102884,
 'videos': 160340,
 '(': 5393,
 'which': 163837,
 "'ve": 5114,
 'o

In [26]:
# These will now contain for each row a list of unique integer number and
# its associated importance as calculated by TF-IDF.
train_x_tfidf = tfidf.transform(train_x)
test_x_tfidf = tfidf.transform(test_x)
print("Word vectorization is complete.")
print("\n- Tfidf vectorized format of training set: ")
print(train_x_tfidf)
print("\n- Tfidf vectorized format of testing set: ")
print(test_x_tfidf)

Word vectorization is complete.

- Tfidf vectorized format of training set: 
  (0, 167122)	0.07964176797888631
  (0, 164139)	0.06528437786785464
  (0, 163045)	0.04575004974678714
  (0, 162198)	0.14276749576037187
  (0, 160869)	0.1135938381204398
  (0, 158852)	0.07541086601202374
  (0, 155383)	0.11711980458250261
  (0, 152441)	0.05034290237328452
  (0, 152004)	0.021256661923826824
  (0, 150626)	0.044002503983757615
  (0, 150463)	0.058200880542624706
  (0, 150375)	0.07361558164657424
  (0, 150102)	0.050353847855037694
  (0, 149994)	0.28248077266901955
  (0, 149932)	0.19406297749430287
  (0, 143925)	0.04469554372737391
  (0, 139967)	0.17647442783051487
  (0, 139617)	0.03558095640065858
  (0, 130542)	0.06043055114736108
  (0, 130087)	0.17524974561519838
  (0, 129114)	0.1199896207643265
  (0, 124134)	0.15101381709540648
  (0, 123460)	0.08965372589547715
  (0, 117015)	0.06627073253433186
  (0, 115920)	0.1585533672640553
  :	:
  (39999, 14745)	0.030053981547781904
  (39999, 13321)	0.055856684

# STEP 8: TRAIN & TEST ML MODELS FOR PREDICTING SENTIMENT OF IMDB MOVIE REVIEWS

In [27]:
# --- STEP 8: USE ML ALGORITHMS TO PREDICT OUTCOME ---
print("\n********** ML ALGORITHMS **********\n")
print("- Classifier 1: Multinomial Naive Bayes")
# fit the training dataset on the NB classifier
print("|___ Fitting training dataset on the MNB classifier...")
model_MNB = naive_bayes.MultinomialNB()
start = timeit.default_timer()
model_MNB.fit(train_x_tfidf, train_y)
stop = timeit.default_timer()
print("|___ Training is complete. Total time: ", stop - start)
# predict the labels on validation dataset
print("|___ Making predictions...")
start = timeit.default_timer()
predictions_MNB = model_MNB.predict(test_x_tfidf)
stop = timeit.default_timer()
print("|___ Finished making predictions. Total time: ", stop - start)
# Use accuracy_score function to get the accuracy
# ground truth: news_test.target; predicted labels: predicted
print("|___ Naive Bayes Accuracy Score -> ", accuracy_score(test_y, predictions_MNB)*100)
print(classification_report(test_y, predictions_MNB))
print("Confusion Matrix:\n", confusion_matrix(test_y, predictions_MNB))


********** ML ALGORITHMS **********

- Classifier 1: Multinomial Naive Bayes
|___ Fitting training dataset on the MNB classifier...
|___ Training is complete. Total time:  0.05070279999995364
|___ Making predictions...
|___ Finished making predictions. Total time:  0.009250099999917438
|___ Naive Bayes Accuracy Score ->  85.96000000000001
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      5015
           1       0.88      0.83      0.86      4985

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

Confusion Matrix:
 [[4434  581]
 [ 823 4162]]


In [28]:
# Classifier 2 - Algorithm - SVM
# fit the training dataset on the SVM classifier
print("\n- Classifier 2: Support Vector Machines (SVM)")
print("|___ Fitting training dataset on the SVM classifier...")
model_SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
start = timeit.default_timer()
model_SVM.fit(train_x_tfidf, train_y)
stop = timeit.default_timer()
print("|___ Training is complete. Total time: ", stop-start)
# predict the labels on validation dataset
print("|___ Making predictions...")
start = timeit.default_timer()
predictions_SVM = model_SVM.predict(test_x_tfidf)
stop = timeit.default_timer()
print("|___ Finished making predictions. Total time: ", stop-start)
# Use accuracy_score function to get the accuracy
print("|___ SVM Accuracy Score -> ", accuracy_score(test_y, predictions_SVM)*100)
print(classification_report(test_y, predictions_SVM))
print("Confusion Matrix:\n", confusion_matrix(test_y, predictions_SVM))


- Classifier 2: Support Vector Machines (SVM)
|___ Fitting training dataset on the SVM classifier...
|___ Training is complete. Total time:  1515.4763630999998
|___ Making predictions...
|___ Finished making predictions. Total time:  178.54507049999984
|___ SVM Accuracy Score ->  90.62
              precision    recall  f1-score   support

           0       0.92      0.90      0.91      5015
           1       0.90      0.92      0.91      4985

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000

Confusion Matrix:
 [[4491  524]
 [ 414 4571]]


In [29]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    model_LR = LogisticRegression(C=c, max_iter=1000)
    # fit the training dataset on the Logistic Regression classifier
    print("\nFitting training dataset on the Logistic Regression classifier...")
    start = timeit.default_timer()
    model_LR.fit(train_x_tfidf, train_y)
    stop = timeit.default_timer()
    print("Training is complete. Total time: ", stop - start)
    # predict the labels on validation dataset (i.e. test dataset)
    print("Making predictions...")
    start = timeit.default_timer()
    predictions_LR = model_LR.predict(test_x_tfidf)
    stop = timeit.default_timer()
    print("- Finished making predictions! Total Time: ", stop - start)
    print("- LR Accuracy Score for C=%s: %s" % (c, accuracy_score(test_y, predictions_LR)*100))
    print(classification_report(test_y, predictions_LR))
    print("Confusion Matrix:\n", confusion_matrix(test_y, predictions_LR))


Fitting training dataset on the Logistic Regression classifier...
Training is complete. Total time:  0.9530605999998443
Making predictions...
- Finished making predictions! Total Time:  0.006024400000114838
- LR Accuracy Score for C=0.01: 77.88000000000001
              precision    recall  f1-score   support

           0       0.80      0.74      0.77      5015
           1       0.76      0.81      0.79      4985

    accuracy                           0.78     10000
   macro avg       0.78      0.78      0.78     10000
weighted avg       0.78      0.78      0.78     10000

Confusion Matrix:
 [[3726 1289]
 [ 923 4062]]

Fitting training dataset on the Logistic Regression classifier...
Training is complete. Total time:  1.0883217000000514
Making predictions...
- Finished making predictions! Total Time:  0.004285400000298978
- LR Accuracy Score for C=0.05: 83.41
              precision    recall  f1-score   support

           0       0.85      0.81      0.83      5015
           1  