In [1]:
import pandas as pd
import numpy as np
import nltk
import timeit
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv("imdb.csv")

In [5]:
# positive sentiment = 0, negative sentiment = 1
df.head()

Unnamed: 0,review,sentiment
0,China White (1989) was Ronny Yu's first intern...,positive
1,"Alejandro Amenabar, the young and talented Spa...",positive
2,"Yes, I watch this show. Because my girlfriend ...",positive
3,"I've loved all of Cream's work, even as there ...",positive
4,The remake of H.B. Halicki's classic seventies...,negative


In [6]:
df['review'].dropna(inplace=True)

In [10]:
df['review'] = [entry.lower() for entry in df['review']]

In [11]:
df['review'] = [word_tokenize(entry) for entry in df['review']]

In [13]:
print(df['review'])

0       [china, white, (, 1989, ), was, ronny, yu, 's,...
1       [alejandro, amenabar, ,, the, young, and, tale...
2       [yes, ,, i, watch, this, show, ., because, my,...
3       [i, 've, loved, all, of, cream, 's, work, ,, e...
4       [the, remake, of, h.b, ., halicki, 's, classic...
                              ...                        
9995    [i, enjoyed, carax, 's, ``, les, amants, du, p...
9997    [i, am, a, fan, of, his, ..., this, movie, suc...
9998    [loved, today, 's, show, !, !, !, it, was, a, ...
9999    [this, documentary, is, the, most, hypnotizing...
Name: review, Length: 10000, dtype: object


In [14]:
print("- Splitting dataset (80% train, 20% test)...")
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['review'], df['sentiment'], test_size=0.2)
print("- Data splitting is complete.")
print(train_x)
print(train_y)

- Splitting dataset (80% train, 20% test)...
- Data splitting is complete.
3331    [bad, ,, bad, ,, bad, ., how, do, movies, like...
5006    [this, is, so, poor, it, 's, watchable., <, br...
7117    [ok., first, said, ,, i, just, wanted, to, che...
3048    [it, was, a, serious, attempt, to, show, the, ...
8887    [i, tried, to, watch, this, movie, in, a, mili...
                              ...                        
118     [marked, for, death, (, 1990, ), spends, more,...
5579    [a, stupid, young, man, becomes, obsessed, wit...
4754    [night, of, the, living, homeless, is, a, funn...
7760    [i, would, of, given, this, film, a, zero, out...
698     [the, cast, is, excellent, ,, the, acting, goo...
Name: review, Length: 8000, dtype: object
3331    negative
5006    negative
7117    negative
3048    positive
8887    negative
          ...   
118     negative
5579    positive
4754    positive
7760    negative
698     positive
Name: sentiment, Length: 8000, dtype: object


In [16]:
Encoder = LabelEncoder()
print("Original Train_Y: ", train_y)
train_y = Encoder.fit_transform(train_y)
print("New Train_Y:      ", train_y)
print("Original Test_Y:  ", test_y)
test_y = Encoder.fit_transform(test_y)
print("New Test_Y:       ", test_y)
print("\n- Data encoding is complete.")


Original Train_Y:  3331    negative
5006    negative
7117    negative
3048    positive
8887    negative
          ...   
118     negative
5579    positive
4754    positive
7760    negative
698     positive
Name: sentiment, Length: 8000, dtype: object
New Train_Y:       [0 0 0 ... 1 0 1]
Original Test_Y:   3885    negative
6396    positive
497     positive
3825    negative
6058    negative
          ...   
7633    negative
5234    negative
6319    positive
726     positive
6865    positive
Name: sentiment, Length: 2000, dtype: object
New Test_Y:        [0 1 1 ... 1 1 1]

- Data encoding is complete.


In [18]:

from sklearn.feature_extraction.text import TfidfVectorizer

def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None)

tfidf.fit(df['review'])
tfidf.vocabulary_

{'china': 13342,
 'white': 68351,
 '(': 1671,
 '1989': 2660,
 ')': 1672,
 'was': 67611,
 'ronny': 53229,
 'yu': 69803,
 "'s": 1282,
 'first': 24712,
 'international': 33088,
 'film': 24454,
 '.': 1924,
 'this': 62619,
 'u.k': 64781,
 ',': 1679,
 '/holland/hong': 2054,
 'kong': 35468,
 'production': 49187,
 'shot': 56393,
 'in': 32106,
 'english': 21990,
 'and': 5568,
 'slightly': 57395,
 'edited': 21173,
 'for': 25334,
 'the': 62372,
 'western': 68153,
 'audience': 7090,
 'american': 5388,
 'wong': 68988,
 'brothers': 10889,
 'michael': 40115,
 'russell': 53628,
 'were': 68122,
 'supposed': 60854,
 'to': 63168,
 'star': 59221,
 'together': 63215,
 'but': 11365,
 'due': 20635,
 'prior': 49046,
 'commitments': 14751,
 'unavailable': 65042,
 'so': 57853,
 'another': 5814,
 'actor': 4164,
 'steven': 59514,
 'leigh': 36520,
 'took': 63378,
 'his': 30355,
 'spot': 58871,
 'several': 55724,
 'hong': 30728,
 'stars': 59280,
 'such': 60443,
 'as': 6659,
 'tommy': 63301,
 'playing': 47659,
 'a':

In [19]:
# These will now contain for each row a list of unique integer number and
# its associated importance as calculated by TF-IDF.
train_x_tfidf = tfidf.transform(train_x)
test_x_tfidf = tfidf.transform(test_x)
print("Word vectorization is complete.")
print("\n- Tfidf vectorized format of training set: ")
print(train_x_tfidf)
print("\n- Tfidf vectorized format of testing set: ")
print(test_x_tfidf)

Word vectorization is complete.

- Tfidf vectorized format of training set: 
  (0, 69319)	0.1333107841352205
  (0, 68453)	0.09686396946216957
  (0, 63168)	0.035648091266605106
  (0, 62619)	0.07401428821871625
  (0, 62412)	0.0846022639740091
  (0, 62401)	0.07704150198937129
  (0, 62372)	0.13549840253129092
  (0, 62349)	0.0813873561252322
  (0, 58114)	0.11955886922256159
  (0, 55667)	0.12097732285699019
  (0, 53974)	0.10122078772262247
  (0, 53123)	0.11353527279670876
  (0, 48119)	0.16197209406167434
  (0, 47659)	0.13115898019097957
  (0, 44548)	0.053590604667238015
  (0, 44495)	0.09887199604662286
  (0, 43670)	0.15633638221052534
  (0, 43208)	0.07274894613654179
  (0, 42332)	0.3540066319175665
  (0, 41682)	0.08578826352200217
  (0, 38198)	0.08154143452626555
  (0, 36950)	0.05964139561831107
  (0, 36368)	0.2038288863339496
  (0, 33486)	0.07430494156059743
  (0, 33035)	0.15752452831164587
  :	:
  (7999, 29419)	0.10627047336827755
  (7999, 29164)	0.1202551183387072
  (7999, 27592)	0.205930

In [20]:

# --- STEP 8: USE ML ALGORITHMS TO PREDICT OUTCOME ---
print("\n********** ML ALGORITHMS **********\n")
print("- Classifier 1: Multinomial Naive Bayes")
# fit the training dataset on the NB classifier
print("|___ Fitting training dataset on the MNB classifier...")
model_MNB = naive_bayes.MultinomialNB()
start = timeit.default_timer()
model_MNB.fit(train_x_tfidf, train_y)
stop = timeit.default_timer()
print("|___ Training is complete. Total time: ", stop - start)
# predict the labels on validation dataset
print("|___ Making predictions...")
start = timeit.default_timer()
predictions_MNB = model_MNB.predict(test_x_tfidf)
stop = timeit.default_timer()
print("|___ Finished making predictions. Total time: ", stop - start)
# Use accuracy_score function to get the accuracy
# ground truth: news_test.target; predicted labels: predicted
print("|___ Naive Bayes Accuracy Score -> ", accuracy_score(test_y, predictions_MNB)*100)
print(classification_report(test_y, predictions_MNB))
print("Confusion Matrix:\n", confusion_matrix(test_y, predictions_MNB))


********** ML ALGORITHMS **********

- Classifier 1: Multinomial Naive Bayes
|___ Fitting training dataset on the MNB classifier...
|___ Training is complete. Total time:  0.02125669999998081
|___ Making predictions...
|___ Finished making predictions. Total time:  0.003428499999927226
|___ Naive Bayes Accuracy Score ->  84.7
              precision    recall  f1-score   support

           0       0.80      0.93      0.86      1006
           1       0.91      0.77      0.83       994

    accuracy                           0.85      2000
   macro avg       0.86      0.85      0.85      2000
weighted avg       0.86      0.85      0.85      2000

Confusion Matrix:
 [[931  75]
 [231 763]]


In [21]:

# Classifier 2 - Algorithm - SVM
# fit the training dataset on the SVM classifier
print("\n- Classifier 2: Support Vector Machines (SVM)")
print("|___ Fitting training dataset on the SVM classifier...")
model_SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
start = timeit.default_timer()
model_SVM.fit(train_x_tfidf, train_y)
stop = timeit.default_timer()
print("|___ Training is complete. Total time: ", stop-start)
# predict the labels on validation dataset
print("|___ Making predictions...")
start = timeit.default_timer()
predictions_SVM = model_SVM.predict(test_x_tfidf)
stop = timeit.default_timer()
print("|___ Finished making predictions. Total time: ", stop-start)
# Use accuracy_score function to get the accuracy
print("|___ SVM Accuracy Score -> ", accuracy_score(test_y, predictions_SVM)*100)
print(classification_report(test_y, predictions_SVM))
print("Confusion Matrix:\n", confusion_matrix(test_y, predictions_SVM))


- Classifier 2: Support Vector Machines (SVM)
|___ Fitting training dataset on the SVM classifier...
|___ Training is complete. Total time:  42.10728549999999
|___ Making predictions...
|___ Finished making predictions. Total time:  10.662308800000005
|___ SVM Accuracy Score ->  89.55
              precision    recall  f1-score   support

           0       0.89      0.90      0.90      1006
           1       0.90      0.89      0.89       994

    accuracy                           0.90      2000
   macro avg       0.90      0.90      0.90      2000
weighted avg       0.90      0.90      0.90      2000

Confusion Matrix:
 [[905 101]
 [108 886]]


In [22]:

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    model_LR = LogisticRegression(C=c, max_iter=1000)
    # fit the training dataset on the Logistic Regression classifier
    print("\nFitting training dataset on the Logistic Regression classifier...")
    start = timeit.default_timer()
    model_LR.fit(train_x_tfidf, train_y)
    stop = timeit.default_timer()
    print("Training is complete. Total time: ", stop - start)
    # predict the labels on validation dataset (i.e. test dataset)
    print("Making predictions...")
    start = timeit.default_timer()
    predictions_LR = model_LR.predict(test_x_tfidf)
    stop = timeit.default_timer()
    print("- Finished making predictions! Total Time: ", stop - start)
    print("- LR Accuracy Score for C=%s: %s" % (c, accuracy_score(test_y, predictions_LR)*100))
    print(classification_report(test_y, predictions_LR))
    print("Confusion Matrix:\n", confusion_matrix(test_y, predictions_LR))


Fitting training dataset on the Logistic Regression classifier...
Training is complete. Total time:  0.29214919999992617
Making predictions...
- Finished making predictions! Total Time:  0.0013181000000486165
- LR Accuracy Score for C=0.01: 74.6
              precision    recall  f1-score   support

           0       0.77      0.71      0.74      1006
           1       0.73      0.78      0.75       994

    accuracy                           0.75      2000
   macro avg       0.75      0.75      0.75      2000
weighted avg       0.75      0.75      0.75      2000

Confusion Matrix:
 [[712 294]
 [214 780]]

Fitting training dataset on the Logistic Regression classifier...
Training is complete. Total time:  0.231114600000069
Making predictions...
- Finished making predictions! Total Time:  0.001548399999933281
- LR Accuracy Score for C=0.05: 78.55
              precision    recall  f1-score   support

           0       0.80      0.76      0.78      1006
           1       0.77      0