# IMBD reviews model selection

In [8]:
# Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import itertools
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import tensorflow_text as text
import tensorflow_hub as hub
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data preprocessing

In [9]:
# Data imports
with open('reviews.txt', 'r') as f:
  reviews = f.read()
with open('labels.txt', 'r') as f:
  labels = f.read()
  
# Remove punctuation
reviews = "".join([char for char in reviews if char not in string.punctuation])

reviews = reviews.split('\n')
labels = labels.split('\n')

# Tokenization, Lemmatization, Stemming, Stopwords. Label numerical encoding
reviews_tokenized = []
reviews_tokenized_joined = []
for review in reviews:
  splitted_review = nltk.word_tokenize(review)
  splitted_review = [word for word in splitted_review if word not in stop_words]
  splitted_review = [WordNetLemmatizer().lemmatize(w) for w in splitted_review]
  splitted_review = [PorterStemmer().stem(w).strip() for w in splitted_review]
  reviews_tokenized.append(splitted_review)
  joined_review = ' '.join(splitted_review)
  reviews_tokenized_joined.append(joined_review)
  
reviews_unrolled = list(itertools.chain(*reviews_tokenized))
labels = [1 if label == "positive" else 0 for label in labels]

### Part 1. Multinomial Naive Bayes with Count Vectorizer

In [10]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size = 0.2, random_state = 1)

MultinomialNB_CountVectorizer_start_time = time.time()

# Count vectorizer feature transformation
count_vector = CountVectorizer(stop_words = 'english', binary = False)

training_data = count_vector.fit_transform(X_train)
test_data = count_vector.transform(X_test)

# Multinomial Naive Bayes model predictions
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB_CountVectorizer_predictions = naive_bayes.predict(test_data)

MultinomialNB_CountVectorizer_end_time = time.time()
MultinomialNB_CountVectorizer_execution_time = MultinomialNB_CountVectorizer_end_time - MultinomialNB_CountVectorizer_start_time

# Model evaluation
print('MultinomialNB CountVectorizer Training time: {} seconds'. format(round(MultinomialNB_CountVectorizer_execution_time, 2)))
print('MultinomialNB CountVectorizer Accuracy score: ', format(accuracy_score(y_test, MultinomialNB_CountVectorizer_predictions)))
print('MultinomialNB CountVectorizer Precision score: ', format(precision_score(y_test, MultinomialNB_CountVectorizer_predictions)))
print('MultinomialNB CountVectorizer Recall score: ', format(recall_score(y_test, MultinomialNB_CountVectorizer_predictions)))
print('MultinomialNB CountVectorizer F1 score: ', format(f1_score(y_test, MultinomialNB_CountVectorizer_predictions)))

# Model evaluation dictionary
MultinomialNB_CountVectorizer_results = {'Name': "MultinomialNB CountVectorizer", 
                                         "Training Time": round(MultinomialNB_CountVectorizer_execution_time, 2),
                                         "Accuracy score": accuracy_score(y_test, MultinomialNB_CountVectorizer_predictions),
                                         "Precision score": precision_score(y_test, MultinomialNB_CountVectorizer_predictions),
                                         "Recall score": recall_score(y_test, MultinomialNB_CountVectorizer_predictions),
                                         "F1 score": f1_score(y_test, MultinomialNB_CountVectorizer_predictions)}

MultinomialNB CountVectorizer Training time: 2.22 seconds
MultinomialNB CountVectorizer Accuracy score:  0.8604279144171165
MultinomialNB CountVectorizer Precision score:  0.8769359564671411
MultinomialNB CountVectorizer Recall score:  0.8383353341336535
MultinomialNB CountVectorizer F1 score:  0.8572013093289689


### Part 2. Multinomial Naive Bayes with TF-IDF

In [11]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size = 0.2, random_state = 1)

MultinomialNB_tfidf_start_time = time.time()

# TF-IDF feature transformation
tfidf = TfidfVectorizer(stop_words = 'english', binary = False)

training_data = tfidf.fit_transform(X_train)
test_data = tfidf.transform(X_test)

# Multinomial Naive Bayes model predictions
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB_tfidf_predictions = naive_bayes.predict(test_data)

MultinomialNB_tfidf_end_time = time.time()
MultinomialNB_tfidf_execution_time = MultinomialNB_tfidf_end_time - MultinomialNB_tfidf_start_time

# Model evaluation
print('MultinomialNB tfidf Training time: {} seconds'. format(round(MultinomialNB_tfidf_execution_time, 2)))
print('MultinomialNB tfidf Accuracy score: ', format(accuracy_score(y_test, MultinomialNB_tfidf_predictions)))
print('MultinomialNB tfidf Precision score: ', format(precision_score(y_test, MultinomialNB_tfidf_predictions)))
print('MultinomialNB tfidf Recall score: ', format(recall_score(y_test, MultinomialNB_tfidf_predictions)))
print('MultinomialNB tfidf F1 score: ', format(f1_score(y_test, MultinomialNB_tfidf_predictions)))

# Model evaluation dictionary
MultinomialNB_tfidf_results = {'Name': "MultinomialNB tfidf", 
                                         "Training Time": round(MultinomialNB_tfidf_execution_time, 2),
                                         "Accuracy score": accuracy_score(y_test, MultinomialNB_tfidf_predictions),
                                         "Precision score": precision_score(y_test, MultinomialNB_tfidf_predictions),
                                         "Recall score": recall_score(y_test, MultinomialNB_tfidf_predictions),
                                         "F1 score": f1_score(y_test, MultinomialNB_tfidf_predictions)}

MultinomialNB tfidf Training time: 2.19 seconds
MultinomialNB tfidf Accuracy score:  0.8686262747450509
MultinomialNB tfidf Precision score:  0.8734793187347932
MultinomialNB tfidf Recall score:  0.8619447779111644
MultinomialNB tfidf F1 score:  0.8676737160120847


### Part 3. Vaders sentiment classifier

#### Extra preprocessing step

In [14]:
# Create dataframe for convenience

df = pd.DataFrame({'Text': reviews, 'Label': labels})
df = df.reset_index()
df['Id'] = df['index'] + 1
df.drop('index', axis = 1, inplace = True)
df = df[['Id', 'Text', 'Label']]
df.head()

Unnamed: 0,Id,Text,Label
0,1,bromwell high is a cartoon comedy it ran at t...,1
1,2,story of a man who has unnatural feelings for ...,0
2,3,homelessness or houselessness as george carli...,1
3,4,airport starts as a brand new luxury pla...,0
4,5,brilliant over acting by lesley ann warren b...,1


In [29]:
vader_start_time = time.time()

# Define Vader Sentiment analyzer model
sia = SentimentIntensityAnalyzer()

# Get the results
res = {}
for i, row in tqdm(df.iterrows(), total = len(df)):
  text = row['Text']
  myid = row['Id']
  res[myid] = sia.polarity_scores(text)
  
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns = {'index': 'Id'})
vaders = vaders.merge(df, how = 'left')
vaders['Predicted_Label'] = np.where(vaders['compound'] >= 0, 1, 0)

vader_end_time = time.time()
vader_execution_time = vader_end_time - vader_start_time

vaders.head()

  0%|          | 0/25001 [00:00<?, ?it/s]

Unnamed: 0,Id,neg,neu,pos,compound,Text,Label,Predicted_Label
0,1,0.044,0.916,0.041,-0.1027,bromwell high is a cartoon comedy it ran at t...,1,0
1,2,0.108,0.746,0.146,0.7003,story of a man who has unnatural feelings for ...,0,1
2,3,0.12,0.733,0.147,0.9311,homelessness or houselessness as george carli...,1,1
3,4,0.161,0.692,0.147,-0.918,airport starts as a brand new luxury pla...,0,0
4,5,0.077,0.738,0.185,0.9657,brilliant over acting by lesley ann warren b...,1,1


In [33]:
# Model evaluation
print('Vader Training time: {} seconds'. format(round(vader_execution_time, 2)))
print('Vader Accuracy score: ', format(accuracy_score(vaders['Label'], vaders['Predicted_Label'])))
print('Vader Precision score: ', format(precision_score(vaders['Label'], vaders['Predicted_Label'])))
print('Vader Recall score: ', format(recall_score(vaders['Label'], vaders['Predicted_Label'])))
print('Vader F1 score: ', format(f1_score(vaders['Label'], vaders['Predicted_Label'])))

# Model evaluation dictionary
vader_results = {'Name': "Vaders Analyser", 
                                         "Training Time": round(vader_execution_time, 2),
                                         "Accuracy score": accuracy_score(vaders['Label'], vaders['Predicted_Label']),
                                         "Precision score": precision_score(vaders['Label'], vaders['Predicted_Label']),
                                         "Recall score": recall_score(vaders['Label'], vaders['Predicted_Label']),
                                         "F1 score": f1_score(vaders['Label'], vaders['Predicted_Label'])}

Vader Training time: 35.85 seconds
Vader Accuracy score:  0.6908523659053638
Vader Precision score:  0.6434714620797498
Vader Recall score:  0.85592
Vader F1 score:  0.7346448312562228
