In [1]:
# Dependencies

import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import itertools
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import tensorflow_text as text
import tensorflow_hub as hub
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GlobalMaxPool1D, BatchNormalization, Dense, RNN, GRU, LSTM, TimeDistributed, Bidirectional, Activation, Embedding, Input, Conv1D, Dropout
import tensorflow as tf
import keras.backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from transformers import AutoTokenizer, pipeline
from transformers import TFAutoModelForSequenceClassification, TFTrainingArguments, TFTrainer
from scipy.special import softmax
from datasets import DatasetDict, Dataset
from tensorflow.keras import mixed_precision
from tensorflow.keras.utils import to_categorical
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification

# Apply the 'ggplot' matplotlib style to the figures drawn in this notebook
plt.style.use('ggplot')

# Define mixed precision policy
# 'mixed_float16' is installed as the global Keras dtype policy; `policy` is
# kept as a module-level name.
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3070, compute capability 8.6


In [14]:
# Load the raw Yelp reviews and keep only what the models need:
# the review text, a binary sentiment label and a 1-based row id.
unused_columns = ['business_id', 'date', 'review_id', 'type', 'user_id', 'useful', 'funny', 'cool']
df = pd.read_csv('yelp.csv').drop(columns = unused_columns)

# Ratings of 3 stars and above are labelled 1, everything else 0
df['Label'] = np.where(df['stars'] >= 3, 1, 0)
df['Id'] = range(1, len(df) + 1)

# The star rating is no longer needed once the label exists
df = df.drop(columns = 'stars')

# Positional rename of the three remaining columns
df.columns = ['Text', 'Label', 'Id']
df.head()

Unnamed: 0,Text,Label,Id
0,My wife took me here on my birthday for breakf...,1,1
1,I have no idea why some people give bad review...,1,2
2,love the gyro plate. Rice is so good and I als...,1,3
3,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",1,4
4,General Manager Scott Petello is a good egg!!!...,1,5


In [15]:
# Remove punctuation
translator = str.maketrans('', '', string.punctuation)
reviews = [s.translate(translator) for s in list(df['Text'])]

# Tokenization, stop-word removal, lemmatization and stemming.
# The lemmatizer and stemmer are created once and reused for every token
# instead of being re-instantiated per word.
# NOTE(review): only the 'stopwords' NLTK resource is downloaded in the visible
# import cell; word_tokenize / WordNetLemmatizer presumably need 'punkt' and
# 'wordnet' to be installed already - confirm on a fresh environment.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

reviews_tokenized = []          # one list of processed tokens per review
reviews_tokenized_joined = []   # the same tokens re-joined with single spaces
for review in reviews:
  splitted_review = nltk.word_tokenize(review)
  # NOTE(review): this membership test is case-sensitive, so capitalised
  # stop words presumably survive the filter - confirm whether that is intended.
  splitted_review = [word for word in splitted_review if word not in stop_words]
  splitted_review = [lemmatizer.lemmatize(w) for w in splitted_review]
  splitted_review = [stemmer.stem(w).strip() for w in splitted_review]
  reviews_tokenized.append(splitted_review)
  joined_review = ' '.join(splitted_review)
  reviews_tokenized_joined.append(joined_review)

# Remove empty reviews and the corresponding labels
empty_idx = [i for i, review in enumerate(reviews_tokenized) if len(review) == 0]

print(f'Empty indices: {empty_idx}')
labels = list(df['Label'])

# Pop from the highest index down: removing in ascending order would shift
# every later position by one and delete the wrong review/label pairs.
for i in reversed(empty_idx):
  reviews_tokenized.pop(i)
  reviews_tokenized_joined.pop(i)
  reviews.pop(i)
  labels.pop(i)

# Flat list of every remaining token across all reviews
reviews_unrolled = list(itertools.chain(*reviews_tokenized))

Empty indices: []


### Part 1. Multinomial Naive Bayes with Count Vectorizer

In [24]:
# Hold out 20% of the preprocessed, re-joined reviews for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    reviews_tokenized_joined,
    labels,
    test_size = 0.2,
    random_state = 1,
)

# The timed section spans vectorisation, fitting and prediction
MultinomialNB_CountVectorizer_start_time = time.time()

# Bag-of-words counts: vocabulary is learned on the training split only
count_vector = CountVectorizer(stop_words = 'english', binary = False)
training_data = count_vector.fit_transform(X_train)
test_data = count_vector.transform(X_test)

# Fit Multinomial Naive Bayes and predict the held-out reviews
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
MultinomialNB_CountVectorizer_predictions = naive_bayes.predict(test_data)

MultinomialNB_CountVectorizer_end_time = time.time()
MultinomialNB_CountVectorizer_execution_time = (
    MultinomialNB_CountVectorizer_end_time - MultinomialNB_CountVectorizer_start_time
)

# Model evaluation dictionary: every metric is computed once and reused for the report
MultinomialNB_CountVectorizer_results = {
    'Name': "MultinomialNB CountVectorizer",
    "Training Time": round(MultinomialNB_CountVectorizer_execution_time, 2),
    "Accuracy score": accuracy_score(y_test, MultinomialNB_CountVectorizer_predictions),
    "Precision score": precision_score(y_test, MultinomialNB_CountVectorizer_predictions),
    "Recall score": recall_score(y_test, MultinomialNB_CountVectorizer_predictions),
    "F1 score": f1_score(y_test, MultinomialNB_CountVectorizer_predictions),
}

# Model evaluation report
print('MultinomialNB CountVectorizer Training time: {} seconds'.format(
    MultinomialNB_CountVectorizer_results["Training Time"]))
for metric in ("Accuracy score", "Precision score", "Recall score", "F1 score"):
    print(f'MultinomialNB CountVectorizer {metric}: ', format(MultinomialNB_CountVectorizer_results[metric]))

MultinomialNB CountVectorizer Training time: 0.35 seconds
MultinomialNB CountVectorizer Accuracy score:  0.872
MultinomialNB CountVectorizer Precision score:  0.8908587257617728
MultinomialNB CountVectorizer Recall score:  0.9646070785842832
MultinomialNB CountVectorizer F1 score:  0.9262672811059908


In [25]:
# Start the results file: a pickled list holding one metrics dictionary per model
filename = 'results_dict_list_yelp.pickle'
results_dict_list = [MultinomialNB_CountVectorizer_results]

with open(filename, 'wb') as results_file:
    pickle.dump(results_dict_list, results_file)

### Part 2. Multinomial Naive Bayes with TF-IDF

In [26]:
# Hold out 20% of the preprocessed, re-joined reviews for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    reviews_tokenized_joined,
    labels,
    test_size = 0.2,
    random_state = 1,
)

# The timed section spans vectorisation, fitting and prediction
MultinomialNB_tfidf_start_time = time.time()

# TF-IDF features: vocabulary and IDF weights are learned on the training split only
tfidf = TfidfVectorizer(stop_words = 'english', binary = False)
training_data = tfidf.fit_transform(X_train)
test_data = tfidf.transform(X_test)

# Fit Multinomial Naive Bayes and predict the held-out reviews
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)
MultinomialNB_tfidf_predictions = naive_bayes.predict(test_data)

MultinomialNB_tfidf_end_time = time.time()
MultinomialNB_tfidf_execution_time = MultinomialNB_tfidf_end_time - MultinomialNB_tfidf_start_time

# Model evaluation dictionary: every metric is computed once and reused for the report
MultinomialNB_tfidf_results = {
    'Name': "MultinomialNB tfidf",
    "Training Time": round(MultinomialNB_tfidf_execution_time, 2),
    "Accuracy score": accuracy_score(y_test, MultinomialNB_tfidf_predictions),
    "Precision score": precision_score(y_test, MultinomialNB_tfidf_predictions),
    "Recall score": recall_score(y_test, MultinomialNB_tfidf_predictions),
    "F1 score": f1_score(y_test, MultinomialNB_tfidf_predictions),
}

# Model evaluation report
print('MultinomialNB tfidf Training time: {} seconds'.format(MultinomialNB_tfidf_results["Training Time"]))
for metric in ("Accuracy score", "Precision score", "Recall score", "F1 score"):
    print(f'MultinomialNB tfidf {metric}: ', format(MultinomialNB_tfidf_results[metric]))

MultinomialNB tfidf Training time: 0.37 seconds
MultinomialNB tfidf Accuracy score:  0.834
MultinomialNB tfidf Precision score:  0.8339169584792396
MultinomialNB tfidf Recall score:  1.0
MultinomialNB tfidf F1 score:  0.9094380796508457


In [27]:
# Update the results pickle file
def update_the_results_file(model_name, filename=filename):
    """Append one entry to the list pickled in ``filename`` and write it back.

    Args:
        model_name: The object to append. Despite the parameter name, the
            calls in this notebook pass a model's results dictionary.
        filename: Path of the pickle file. The default is the value the
            module-level ``filename`` had when this function was defined.

    The file must already exist and contain a pickled list.
    """
    with open(filename, 'rb') as source:
        stored_results = pickle.load(source)

    stored_results.append(model_name)

    with open(filename, 'wb') as target:
        pickle.dump(stored_results, target)


update_the_results_file(model_name = MultinomialNB_tfidf_results)

### Part 3. VADER sentiment classifier

#### Extra preprocessing step

In [28]:
# Rebuild the frame from the punctuation-free reviews and their labels,
# with a 1-based Id as the first column
df = pd.DataFrame({'Text': reviews, 'Label': labels})
df.insert(0, 'Id', df.index + 1)
df.head()

Unnamed: 0,Id,Text,Label
0,1,My wife took me here on my birthday for breakf...,1
1,2,I have no idea why some people give bad review...,1
2,3,love the gyro plate Rice is so good and I also...,1
3,4,Rosie Dakota and I LOVE Chaparral Dog Park Its...,1
4,5,General Manager Scott Petello is a good egg No...,1


In [29]:
vader_start_time = time.time()

# Define Vader Sentiment analyzer model
sia = SentimentIntensityAnalyzer()

# Get the results: review Id -> polarity score dictionary.
# A comprehension over the two needed columns replaces the iterrows loop; its
# loop variables stay local, so the `text` alias imported for tensorflow_text
# is no longer overwritten by the last review string.
res = {
  review_id: sia.polarity_scores(review_text)
  for review_id, review_text in tqdm(zip(df['Id'], df['Text']), total = len(df))
}

# One row per review, one column per score, then re-attach text and true label
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns = {'index': 'Id'})
# 'Id' is the only column the two frames share; name it explicitly so the join
# key cannot change silently if either frame gains a column.
vaders = vaders.merge(df, how = 'left', on = 'Id')

# A non-negative compound score is predicted as class 1, otherwise 0
vaders['Predicted_Label'] = np.where(vaders['compound'] >= 0, 1, 0)

vader_end_time = time.time()
vader_execution_time = vader_end_time - vader_start_time

vaders.head()

  0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,Id,neg,neu,pos,compound,Text,Label,Predicted_Label
0,1,0.015,0.688,0.297,0.995,My wife took me here on my birthday for breakf...,1,1
1,2,0.048,0.736,0.215,0.9945,I have no idea why some people give bad review...,1,1
2,3,0.0,0.602,0.398,0.8377,love the gyro plate Rice is so good and I also...,1,1
3,4,0.0,0.805,0.195,0.9436,Rosie Dakota and I LOVE Chaparral Dog Park Its...,1,1
4,5,0.031,0.648,0.321,0.9848,General Manager Scott Petello is a good egg No...,1,1


In [30]:
# True and predicted labels, pulled out once for all metrics
vader_true = vaders['Label']
vader_pred = vaders['Predicted_Label']

# Model evaluation dictionary: every metric is computed once and reused for the report
vader_results = {
    'Name': "Vaders Analyser",
    "Training Time": round(vader_execution_time, 2),
    "Accuracy score": accuracy_score(vader_true, vader_pred),
    "Precision score": precision_score(vader_true, vader_pred),
    "Recall score": recall_score(vader_true, vader_pred),
    "F1 score": f1_score(vader_true, vader_pred),
}

# Model evaluation report
print('Vader Training time: {} seconds'.format(vader_results["Training Time"]))
for metric in ("Accuracy score", "Precision score", "Recall score", "F1 score"):
    print(f'Vader {metric}: ', format(vader_results[metric]))

Vader Training time: 8.74 seconds
Vader Accuracy score:  0.8689
Vader Precision score:  0.8883597297596633
Vader Recall score:  0.9635992311388756
Vader F1 score:  0.9244511035555812


In [31]:
# Append the VADER metrics dictionary to the pickled results list
update_the_results_file(model_name = vader_results)

### Part 4. RNN manual implementation

In [34]:
# Create vocabulary, word2index reference and convert the reviews into numerical form
vocab_size = 10000

# Ids 0, 1 and 2 are reserved: 0 is the pad_sequences fill value, 1 marks the
# start of a review and 2 stands for any out-of-vocabulary word.
num_reserved_ids = 3

# Keep only as many words as fit below vocab_size once the reserved ids are
# counted. Every id then lies in [0, vocab_size), which is the range an
# Embedding layer built with input_dim = vocab_size can look up.
word_counter = Counter(reviews_unrolled)
word_counter = dict(word_counter.most_common(vocab_size - num_reserved_ids))
word2index = {k:i for i,k in enumerate(word_counter.keys(), start = num_reserved_ids)}

# Convert reviews to integers: start marker, then the word id or the OOV id
reviews_int = [
  [1] + [word2index.get(word, 2) for word in review]
  for review in reviews_tokenized
]

# Pad sequences
padded_reviews = pad_sequences(reviews_int, maxlen = 500, padding = 'pre', truncating = 'pre')

# Train test split on padded sequences
X_train, X_test, y_train, y_test = train_test_split(padded_reviews, labels, test_size = 0.2, random_state = 1)

# Shapes are derived from the data rather than hard-coded, so the cell keeps
# working when the number of reviews changes (e.g. empty reviews were dropped).
X_train = np.array(X_train)
X_test = np.array(X_test)

# Labels as column vectors of shape (n_samples, 1)
y_train = np.array(y_train).reshape(-1, 1)
y_test = np.array(y_test).reshape(-1, 1)

In [35]:
# Define the model
dropout_rate = 0.5

# Model checkpoint callback: keeps the full model with the lowest val_loss
# NOTE(review): the file name says 'amazon' while this notebook reads yelp.csv;
# left unchanged in case other code loads this path - confirm and rename.
checkpoint = ModelCheckpoint(filepath='best_RNN_manual_model_amazon.h5', 
                             monitor='val_loss', 
                             save_best_only=True,
                             save_weights_only=False,
                             mode='auto')

# Early stopping callback: stop after 3 epochs without val_loss improvement
# and roll back to the best weights seen
early_stop = EarlyStopping(monitor='val_loss', 
                           patience=3, 
                           verbose=1, 
                           restore_best_weights=True)

# Architecture: Embedding -> Conv1D -> two stacked GRUs -> Dense head, with
# BatchNormalization + Dropout after each feature-extraction stage.
# The former Lambda "mask" tensor was never fed to any layer, so it has been
# removed; it did not appear in the model graph.
inputs = Input(shape = (X_train.shape[1:]))
# input_length now follows the real sequence length instead of a stale 200
x = Embedding(input_dim = vocab_size, output_dim = 128, input_length = X_train.shape[1])(inputs)
x = Conv1D(filters = 200, kernel_size = 13, strides = 1, padding = 'same', activation = 'relu')(x)
x = BatchNormalization()(x)
x = Dropout(dropout_rate)(x)
x = GRU(128, return_sequences = True)(x)
x = BatchNormalization()(x)
x = Dropout(dropout_rate)(x)
x = GRU(128, return_sequences = False)(x)
x = BatchNormalization()(x)
x = Dropout(dropout_rate)(x)
x = Dense(512, activation = 'relu')(x)
x = Dropout(dropout_rate)(x)
# The global dtype policy is mixed_float16; the output layer is pinned to
# float32 so the sigmoid probabilities and the loss are computed in full precision.
outputs = Dense(1, activation = 'sigmoid', dtype = 'float32')(x)

model = Model(inputs = inputs, outputs = outputs)

model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001), loss = 'binary_crossentropy', metrics = ['accuracy'])

# Print model summary
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 500)]             0         
                                                                 
 embedding (Embedding)       (None, 500, 128)          1280000   
                                                                 
 conv1d (Conv1D)             (None, 500, 200)          333000    
                                                                 
 batch_normalization (BatchN  (None, 500, 200)         800       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 500, 200)          0         
                                                                 
 gru (GRU)                   (None, 500, 128)          126720    
                                                             

In [None]:
# Training configuration
# NOTE(review): validation_data is the test split, and both callbacks monitor
# val_loss, so model selection sees the test data - confirm this is intended.
fit_options = dict(
    epochs = 10,
    batch_size = 128,
    validation_data = (X_test, y_test),
    callbacks = [checkpoint, early_stop],
)

# Train the model and record the wall-clock duration of the fit call
Manual_RNN_start_time = time.time()
history = model.fit(X_train, y_train, **fit_options)
Manual_RNN_end_time = time.time()

Manual_RNN_execution_time = Manual_RNN_end_time - Manual_RNN_start_time