# Autoreload modules and utilities

In [1]:
%load_ext autoreload
%autoreload 2

# Import all necessary libraries/packages

In [2]:
import joblib

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

ModuleNotFoundError: No module named 'machine_algo'

# Utility functions

In [3]:
from utils import clean_text, DenseTransformer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Load Data

In [4]:
data = pd.read_csv('imdb_labelled.txt', sep='\t', names=['review', 'label'])
data.head()

Unnamed: 0,review,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


# Create Label

In [5]:
data['sentiment'] = data["label"].apply(lambda x: "positive" if x else "negative")
data.head()

Unnamed: 0,review,label,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0,negative
1,Not sure who was more lost - the flat characte...,0,negative
2,Attempting artiness with black & white and cle...,0,negative
3,Very little music or anything to speak of.,0,negative
4,The best scene in the movie was when Gerardo i...,1,positive


# Missing values

In [6]:
data.isnull().sum()

review       0
label        0
sentiment    0
dtype: int64

# Label frequency

In [7]:
Index = [1, 0]

print(data["sentiment"].value_counts())
print()

barlist = plt.bar(Index, data["sentiment"].value_counts())

plt.title("Frequency of Sentiments")
plt.xticks(Index, ['positive', 'negative'])
plt.ylabel('Number of Reviews')
plt.xlabel('Sentiment expressed in Reviews')

barlist[Index[1]].set_color('green')
barlist[Index[0]].set_color('red')
plt.show()

positive    386
negative    362
Name: sentiment, dtype: int64



NameError: name 'plt' is not defined

In [8]:

data['clean_review'] = data["review"].apply(clean_text)
data.head()

Unnamed: 0,review,label,sentiment,clean_review
0,"A very, very, very slow-moving, aimless movie ...",0,negative,a very very very slowmoving aimless movie abou...
1,Not sure who was more lost - the flat characte...,0,negative,not sure who wa more lost the flat character o...
2,Attempting artiness with black & white and cle...,0,negative,attempting artiness with black white and cleve...
3,Very little music or anything to speak of.,0,negative,very little music or anything to speak of
4,The best scene in the movie was when Gerardo i...,1,positive,the best scene in the movie wa when gerardo is...


# Text preprocessing

# observe count vectorizer

In [9]:
count_vectorizer = CountVectorizer(stop_words='english', binary=True)
count_data = count_vectorizer.fit_transform(data["clean_review"])
cv_dataframe = pd.DataFrame(count_data.toarray(), columns=count_vectorizer.get_feature_names())

cv_dataframe.head()

NameError: name 'CountVectorizer' is not defined

In [12]:
assert cv_dataframe.max().max()==1, "Maximum value must be One"


In [13]:
X = data["clean_review"]
y = data['label']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

In [14]:
print(f"train shape input:{x_train.shape}, output:{y_train.shape}")
print(f"test shape  input:{x_test.shape}, output:{y_test.shape}")

train shape input:(598,), output:(598,)
test shape  input:(150,), output:(150,)


# Train with Count vectorizer and CategoricalNB

In [15]:

cv_NB = Pipeline([
    ('bow', CountVectorizer(stop_words="english", binary=True)),
    ("dense", DenseTransformer()),
    ('classifier', CategoricalNB())
])

In [16]:
cv_NB.fit(x_train, y_train)


Pipeline(steps=[('bow', CountVectorizer(binary=True, stop_words='english')),
                ('dense', DenseTransformer()),
                ('classifier', CategoricalNB())])

In [17]:
joblib.dump(cv_NB, "models/categorical_naive_bayes_with_count_vectorizer.joblib")


['models/categorical_naive_bayes_with_count_vectorizer.joblib']

# Predict testing data

In [18]:

y_pred = cv_NB.predict(x_test) 

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.81      0.79        80
           1       0.77      0.73      0.75        70

    accuracy                           0.77       150
   macro avg       0.77      0.77      0.77       150
weighted avg       0.77      0.77      0.77       150



# Running cross validation with count vectorizer  and CategoricalNB

In [20]:
accuracy = []

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=100) 
skf.get_n_splits(X, y) 

for train_index, test_index in skf.split(X, y):
    x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index] 
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    cv_NB.fit(x_train_fold, y_train_fold)
    result = cv_NB.score(x_test_fold, y_test_fold)
    accuracy.append(result)

accuracy = np.array(accuracy)

# Print the output
print('List of first 10 possible accuracy:')
for index, acc in enumerate(accuracy[:10]):
    print(f"{index+1:3d}. {acc:.4f}")

print('\nMetrics that were obtained from this model:')
print(f' Maximum Accuracy:   {accuracy.max()*100:.2f}%') 
print(f' Minimum Accuracy:   {accuracy.min()*100:.2f}%') 
print(f' Mean Accuracy:   {accuracy.mean()*100:.2f}%') 
print(f' Standard Deviation: {accuracy.std():.4f}')

List of first 10 possible accuracy:
  1. 0.7333
  2. 0.8267
  3. 0.6933
  4. 0.6933
  5. 0.7600
  6. 0.7200
  7. 0.7333
  8. 0.7200
  9. 0.7568
 10. 0.6892

Metrics that were obtained from this model:
 Maximum Accuracy:   82.67%
 Minimum Accuracy:   68.92%
 Mean Accuracy:   73.26%
 Standard Deviation: 0.0393
