# reload automatic

In [45]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Import necessary libraries and packages

In [46]:

import string

import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt 

from sklearn.pipeline import Pipeline 
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Utility Function

In [47]:
wordnet_lemmatizer = WordNetLemmatizer()

In [48]:
def clean_text(text: str):
    # removes upper cases
    text = text.lower()
    
    # removes punctuation
    for char in string.punctuation:
        text = text.replace(char, "")
    
    #lematize the words and join back into string text
    text = " ".join([wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(text)])
    return text

# Load Data

In [49]:
dt = pd.read_csv('imdb_labelled.txt', delimiter = '\t', names=['review', 'rating'])
dt.info()
dt.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  748 non-null    object
 1   rating  748 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.8+ KB


Unnamed: 0,review,rating
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


# Create labels inform of positive and negative

In [50]:
dt['labels'] = dt["rating"].apply(lambda x: "Positive" if x else "Negative")
dt.head() 

Unnamed: 0,review,rating,labels
0,"A very, very, very slow-moving, aimless movie ...",0,Negative
1,Not sure who was more lost - the flat characte...,0,Negative
2,Attempting artiness with black & white and cle...,0,Negative
3,Very little music or anything to speak of.,0,Negative
4,The best scene in the movie was when Gerardo i...,1,Positive


# Check for missing values

In [51]:
dt.isnull().sum()

review    0
rating    0
labels    0
dtype: int64

# Clean data

In [52]:
dt['clean_review'] = dt["review"].apply(clean_text)
dt.head()

Unnamed: 0,review,rating,labels,clean_review
0,"A very, very, very slow-moving, aimless movie ...",0,Negative,a very very very slowmoving aimless movie abou...
1,Not sure who was more lost - the flat characte...,0,Negative,not sure who wa more lost the flat character o...
2,Attempting artiness with black & white and cle...,0,Negative,attempting artiness with black white and cleve...
3,Very little music or anything to speak of.,0,Negative,very little music or anything to speak of
4,The best scene in the movie was when Gerardo i...,1,Positive,the best scene in the movie wa when gerardo is...


# Create a column length to handle to length of the individual text

In [53]:
dt['length'] = dt['review'].apply(lambda x: len(str(x).split(' ')))
dt

Unnamed: 0,review,rating,labels,clean_review,length
0,"A very, very, very slow-moving, aimless movie ...",0,Negative,a very very very slowmoving aimless movie abou...,15
1,Not sure who was more lost - the flat characte...,0,Negative,not sure who wa more lost the flat character o...,21
2,Attempting artiness with black & white and cle...,0,Negative,attempting artiness with black white and cleve...,33
3,Very little music or anything to speak of.,0,Negative,very little music or anything to speak of,10
4,The best scene in the movie was when Gerardo i...,1,Positive,the best scene in the movie wa when gerardo is...,23
...,...,...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,Negative,i just got bored watching jessice lange take h...,13
744,"Unfortunately, any virtue in this film's produ...",0,Negative,unfortunately any virtue in this film producti...,16
745,"In a word, it is embarrassing.",0,Negative,in a word it is embarrassing,8
746,Exceptionally bad!,0,Negative,exceptionally bad,4


# Value of individual column length

In [54]:
dt['length'].unique()

array([  15,   21,   33,   10,   23,   22,    5,   17,   12,    8,   13,
          6,   18,   25,   19,  868,   14,   16,    9,    7,   26,   35,
         11,    4,   27,   28,    3,   20,   37,   24,   36,  201, 1393,
         49,   30,  306,   31,   32,   44,   56,   45,   34,   38,   64,
         58,   40,   47,   29,   73,   41,   39,   55,   53,   42,  800],
      dtype=int64)

# Data Preprocessing

# observe count vectorizer

In [55]:
count_vectorizer = CountVectorizer(ngram_range=(1,1),lowercase=True, stop_words='english')
count_data = count_vectorizer.fit_transform(dt["clean_review"])
cv_dataframe = pd.DataFrame(count_data.toarray(), columns=count_vectorizer.get_feature_names())

cv_dataframe.head()

Unnamed: 0,010,10,1010,110,12,15,18th,1928,1947,1948,...,younger,youre,youthful,youtube,youve,yun,zillion,zombie,zombiestudents,zombiez
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# observe Tfidf

In [56]:
tf_idf_vec = TfidfVectorizer(use_idf=True, 
                        smooth_idf=True,  
                        ngram_range=(1,1),stop_words='english')

tf_idf_data = tf_idf_vec.fit_transform(dt["clean_review"])
tf_idf_dataframe = pd.DataFrame(tf_idf_data.toarray(), columns=tf_idf_vec.get_feature_names())
tf_idf_dataframe.head()

Unnamed: 0,010,10,1010,110,12,15,18th,1928,1947,1948,...,younger,youre,youthful,youtube,youve,yun,zillion,zombie,zombiestudents,zombiez
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train CountVectorizer with BernoulliNB

# Initialise Pipeline with count vectorizer

In [57]:

X = dt["clean_review"]
y = dt['rating']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [58]:
print(f"train shape input:{x_train.shape}, output:{y_train.shape}")
print(f"test shape  input:{x_test.shape}, output:{y_test.shape}")

train shape input:(598,), output:(598,)
test shape  input:(150,), output:(150,)


In [59]:
cv_NB = Pipeline([
      ('bow', CountVectorizer(analyzer=clean_text, stop_words="english")),
      ('classifier', BernoulliNB())
])

In [60]:
cv_NB.fit(x_train, y_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function clean_text at 0x000001F3E45C5268>,
                                 stop_words='english')),
                ('classifier', BernoulliNB())])

In [61]:
y_pred = cv_NB.predict(x_test) #predict testing data

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.47      0.51        80
           1       0.49      0.57      0.53        70

    accuracy                           0.52       150
   macro avg       0.52      0.52      0.52       150
weighted avg       0.53      0.52      0.52       150



# Running cross validation with count vectorizer and BernoulliNB

In [62]:
accuracy = []


skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=100) 
skf.get_n_splits(X, y) 

for train_index, test_index in skf.split(X, y):
    x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index] 
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    cv_NB.fit(x_train_fold, y_train_fold)
    result = cv_NB.score(x_test_fold, y_test_fold)
    accuracy.append(result)

accuracy = np.array(accuracy)

# Print the output
print('List of first 10 possible accuracy:')
for index, acc in enumerate(accuracy[:10]):
    print(f"{index+1:3d}. {acc:.4f}")

print('\nMetrics that were obtained from this model:')
print(f' Maximum Accuracy:   {accuracy.max()*100:.2f}%') 
print(f' Minimum Accuracy:   {accuracy.min()*100:.2f}%') 
print(f' Mean Accuracy:   {accuracy.mean()*100:.2f}%') 
print(f' Standard Deviation: {accuracy.std():.4f}')

List of first 10 possible accuracy:
  1. 0.5600
  2. 0.4267
  3. 0.6133
  4. 0.5733
  5. 0.6800
  6. 0.4667
  7. 0.5467
  8. 0.6533
  9. 0.5676
 10. 0.6216

Metrics that were obtained from this model:
 Maximum Accuracy:   68.00%
 Minimum Accuracy:   42.67%
 Mean Accuracy:   57.09%
 Standard Deviation: 0.0745


# Train Tfidf with BernoulliNB and Passing it into Pipeline

In [64]:
vectorizer = Pipeline([
   ('Term freq', TfidfVectorizer()),
   ('classifier', BernoulliNB()),
   ])

# Run Cross Validation with Tfidf using BernoulliNB

In [65]:
accuracy = []
stratifiedkf_predict = StratifiedKFold(n_splits=10, shuffle=True, random_state=100)
stratifiedkf_predict.get_n_splits(X, y)

for train_index, test_index in stratifiedkf_predict.split(X, y):
    
    x_train, x_test = X.iloc[train_index], X.iloc[test_index] 
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    vectorizer.fit(x_train, y_train)
    result = vectorizer.score(x_test, y_test)
    accuracy.append(result)
    
accuracy = np.array(accuracy)

# Print the output
print('List of first 10 possible accuracy:')
for index, acc in enumerate(accuracy[:10]):
    print(f"{index+1:3d}. {acc:.4f}")

print('\nMetrics that were obtained from this model:')
print(f' Maximum Accuracy:   {accuracy.max()*100:.2f}%') 
print(f' Minimum Accuracy:   {accuracy.min()*100:.2f}%') 
print(f' Mean Accuracy:   {accuracy.mean()*100:.2f}%') 
print(f' Standard Deviation: {accuracy.std():.4f}')
print(accuracy)

List of first 10 possible accuracy:
  1. 0.6933
  2. 0.8133
  3. 0.7067
  4. 0.6667
  5. 0.7600
  6. 0.8133
  7. 0.8400
  8. 0.7867
  9. 0.7162
 10. 0.7297

Metrics that were obtained from this model:
 Maximum Accuracy:   84.00%
 Minimum Accuracy:   66.67%
 Mean Accuracy:   75.26%
 Standard Deviation: 0.0558
[0.69333333 0.81333333 0.70666667 0.66666667 0.76       0.81333333
 0.84       0.78666667 0.71621622 0.72972973]
