# Imports and Input

In [None]:
# Mount Google Drive at /content/drive (Colab only). Later cells read and
# write their cache files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Imports

import string

import html
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import os.path
import pandas as pd
from math import inf
import numpy as np
import json
from tqdm import tqdm
import statistics as st
from tabulate import tabulate
from IPython.display import clear_output
import random

import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import KFold,train_test_split

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
#NN
import keras
from keras.models import Sequential
from keras.layers import Dense

# preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import tensorflow as tf

# Single seed shared by every random number generator in this notebook.
seed=22
def reset_seeds(seed=seed):
  """Re-seed the Keras, TensorFlow, Python and NumPy random generators.

  Called once at import time and again before model construction, training
  and prediction so that each of those steps starts from the same RNG state.

  Args:
    seed: Integer seed. Defaults to the value the module-level ``seed`` had
      when this function was defined (22).
  """
  keras.utils.set_random_seed(seed)
  tf.random.set_seed(seed)
  tf.keras.utils.set_random_seed(seed)
  random.seed(seed)
  np.random.seed(seed)
  # Ask TensorFlow to use deterministic op implementations.
  tf.config.experimental.enable_op_determinism()
  # NOTE(review): this only sets the variable for the current process
  # environment; presumably it does not change string hashing of the already
  # running interpreter -- confirm if hash-order determinism is required.
  os.environ['PYTHONHASHSEED']=str(seed)
reset_seeds()

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

!pip install datasets
import datasets

# !pip install sentence-transformers
# from sentence_transformers import SentenceTransformer

clear_output()
# !pip install nltk
# !pip install pandas
# !pip install tabulate
# !pip install matplotlib
# !pip install gensim

In [None]:
# Read input data
# Input Data

# Original
# rt_reviews_data = pd.read_csv('./rotten_tomatoes_movie_reviews.csv')

#Processed
# rt_reviews_data = pd.read_csv('./rt_reviews_data_processed.csv')

#Sampled
# rt_reviews_data = pd.read_csv('./rt_reviews_data_sampled.csv')

# rt_movies_data = pd.read_csv('./rotten_tomatoes_movies.csv')

# IMDB reviews. The frame keeps the `rt_` prefix so the cells below can be
# used unchanged with either the Rotten Tomatoes or the IMDB file.
rt_reviews_data = pd.read_csv('./IMDB Dataset.csv')

In [None]:
# rt_reviews_data.head()

In [None]:
# Work out which dataset is loaded. The IMDB file uses 'review'/'sentiment'
# columns, so mirror them under the Rotten Tomatoes column names that the
# rest of the notebook reads. `keykey` is later embedded in cache file names.
if 'review' not in rt_reviews_data.columns:
    keykey = 'rt'
else:
    keykey = 'imdb'
    rt_reviews_data['reviewText'] = rt_reviews_data['review']
    rt_reviews_data['scoreSentiment'] = rt_reviews_data['sentiment']

In [None]:
# Keep only the text and label columns and drop rows where either is missing.
rt_input_data = (
    rt_reviews_data[['reviewText', 'scoreSentiment']]
    .dropna(subset=['reviewText', 'scoreSentiment'])
)

# Decode HTML entities (e.g. &amp;) back into plain characters.
# NOTE(review): the cells visible below read `rt_reviews_data`, not this
# cleaned frame -- confirm whether `rt_input_data` is meant to be used.
rt_input_data['reviewText'] = rt_input_data['reviewText'].apply(html.unescape)

# Save processed data for future reuse.
# rt_input_data.to_csv('/content/drive/MyDrive/DO NOT DELETE/SHARE/rt_reviews_data_processed.csv', index=False)

# Utility

### GloVe model utility

In [None]:
# Custom vectorizer that embeds a document as the mean of its word vectors

class Word2VecVectorizer:
  """Turn whitespace-tokenised documents into averaged word vectors.

  The wrapped ``model`` must expose ``get_vector(word)`` that raises
  ``KeyError`` for unknown words. If it also exposes ``vector_size`` that
  value is used as the embedding dimension; otherwise the dimension is read
  from the vector of the probe word 'king'.
  """

  def __init__(self, model):
    self.word_vectors = model

  def fit(self, data):
    """No-op: the word vectors are pre-trained, nothing is learned here."""
    pass

  def transform(self, data):
    """Return a ``(len(data), D)`` array with one mean vector per document.

    Tokens are produced with ``str.split()`` and looked up verbatim (no
    lower-casing). Tokens missing from the model are skipped; a document
    with no known token keeps an all-zero row.
    """
    # Prefer the declared dimensionality so the vocabulary does not have to
    # contain any particular word; fall back to probing a vector.
    dim = getattr(self.word_vectors, 'vector_size', None)
    if dim is None:
      dim = self.word_vectors.get_vector('king').shape[0]
    self.D = dim

    X = np.zeros((len(data), self.D))
    for row, sentence in enumerate(data):
      vecs = []
      for word in sentence.split():
        try:
          vecs.append(self.word_vectors.get_vector(word))
        except KeyError:
          # Out-of-vocabulary token: leave it out of the average.
          continue
      if vecs:
        X[row] = np.mean(vecs, axis=0)
    return X

  def fit_transform(self, data):
    """Convenience wrapper: ``fit`` followed by ``transform``."""
    self.fit(data)
    return self.transform(data)

In [None]:
# Build (or load from the Drive cache) a {review text: GloVe document vector} lookup
glove_encodings_fetched=None
glove_cache_path='/content/drive/MyDrive/DO NOT DELETE/SHARE/glove_encodings_project_{}.txt'.format(keykey)

if not os.path.exists(glove_cache_path):
    # Cache miss: embed every review with the Stanford GloVe vectors.
    glove_path='/content/drive/MyDrive/glove.6B.100d.txt' #Colab
    # glove_path='./glove.6B.100d.txt' #Local
    glove_model_data = KeyedVectors.load_word2vec_format(glove_path, binary=False, no_header=True)

    def X_tensor_glove_original(corpus):
        """Embed `corpus` with Word2VecVectorizer and return a float32 tensor."""
        embedded = Word2VecVectorizer(glove_model_data).fit_transform(corpus.copy())
        return tf.convert_to_tensor(embedded, dtype=tf.float32)

    input_converted_glove=X_tensor_glove_original(rt_reviews_data['reviewText'])

    # Key each vector by its review text so later cells can look it up.
    glove_encodings_fetched={
        rt_reviews_data['reviewText'][i]: input_converted_glove[i].numpy().tolist()
        for i in range(rt_reviews_data.shape[0])
    }

    # Persist as JSON so the next run takes the cached branch.
    with open(glove_cache_path,'w') as fp:
        json.dump(glove_encodings_fetched, fp)
else:
    with open(glove_cache_path,'r') as json_file:
        glove_encodings_fetched = json.load(json_file)

### BERT model utility

In [None]:
# Build (or load from the Drive cache) a {review text: sentence embedding}
# lookup using the 'all-distilroberta-v1' sentence-transformers model.
bert_encodings_fetched=None

# Cache hit: reuse the JSON file written by a previous run.
if(os.path.exists('/content/drive/MyDrive/DO NOT DELETE/SHARE/bert_encodings_project_{}.txt'.format(keykey))):
    with open('/content/drive/MyDrive/DO NOT DELETE/SHARE/bert_encodings_project_{}.txt'.format(keykey),'r') as json_file:
        bert_encodings_fetched = json.load(json_file)
else:
    # Load all-distilroberta-v1 model
    # NOTE(review): `SentenceTransformer` is not imported by any active line in
    # this notebook (its import in the imports cell is commented out), so this
    # branch raises NameError unless the cache file above already exists.
    bert_model = SentenceTransformer('all-distilroberta-v1')
    clear_output()
    def X_tensor_bert_original(corpus):
        """Encode `corpus` with `bert_model` and return a float32 tensor."""
        corpus=corpus.copy()
        X_train = bert_model.encode(corpus)
        X_train=tf.convert_to_tensor(X_train, dtype=tf.float32)
        return X_train

    input_converted_bert=X_tensor_bert_original(rt_reviews_data['reviewText'])

    bert_encodings_fetched={}

    # Key each embedding by its review text so later cells can look it up.
    for i in range(rt_reviews_data.shape[0]):
        bert_encodings_fetched[rt_reviews_data['reviewText'][i]]=input_converted_bert[i].numpy().tolist()

    # Persist as JSON so the next run takes the cached branch.
    with open('/content/drive/MyDrive/DO NOT DELETE/SHARE/bert_encodings_project_{}.txt'.format(keykey),'w') as fp:
        fp.write(json.dumps(bert_encodings_fetched))

### MPNet model utility

In [None]:
# Build (or load from the Drive cache) a {review text: sentence embedding}
# lookup using the 'all-mpnet-base-v2' sentence-transformers model.
mpnet_encodings_fetched=None

# Cache hit: reuse the JSON file written by a previous run.
if(os.path.exists('/content/drive/MyDrive/DO NOT DELETE/SHARE/mpnet_encodings_project_{}.txt'.format(keykey))):
    with open('/content/drive/MyDrive/DO NOT DELETE/SHARE/mpnet_encodings_project_{}.txt'.format(keykey),'r') as json_file:
        mpnet_encodings_fetched = json.load(json_file)
else:
    # Load all-mpnet-base-v2 model
    # NOTE(review): `SentenceTransformer` is not imported by any active line in
    # this notebook (its import in the imports cell is commented out), so this
    # branch raises NameError unless the cache file above already exists.
    mpnet_model = SentenceTransformer('all-mpnet-base-v2')
    clear_output()
    def X_tensor_mpnet_original(corpus):
        """Encode `corpus` with `mpnet_model` and return a float32 tensor."""
        corpus=corpus.copy()
        X_train = mpnet_model.encode(corpus)
        X_train=tf.convert_to_tensor(X_train, dtype=tf.float32)
        return X_train

    input_converted_mpnet=X_tensor_mpnet_original(rt_reviews_data['reviewText'])

    mpnet_encodings_fetched={}

    # Key each embedding by its review text so later cells can look it up.
    for i in range(rt_reviews_data.shape[0]):
        mpnet_encodings_fetched[rt_reviews_data['reviewText'][i]]=input_converted_mpnet[i].numpy().tolist()

    # Persist as JSON so the next run takes the cached branch.
    with open('/content/drive/MyDrive/DO NOT DELETE/SHARE/mpnet_encodings_project_{}.txt'.format(keykey),'w') as fp:
        fp.write(json.dumps(mpnet_encodings_fetched))

### Count Vectorizer utility

In [None]:
# Build (or load from the Drive cache) a {review text: bag-of-words counts} lookup
countvec_encodings_fetched=None
countvec_cache_path='/content/drive/MyDrive/DO NOT DELETE/SHARE/countvec_encodings_project_{}.txt'.format(keykey)

if not os.path.exists(countvec_cache_path):
    class LemmaTokenizer:
        """Callable tokenizer: NLTK word tokenisation followed by WordNet lemmatisation."""
        def __init__(self):
            self.wnl = WordNetLemmatizer()

        def __call__(self, doc):
            lemmas = []
            for token in word_tokenize(doc):
                lemmas.append(self.wnl.lemmatize(token))
            return lemmas

    def X_tensor_countvec_original(corpus):
        """Fit a 1000-feature CountVectorizer on `corpus`; return dense float32 counts."""
        vectorizer = CountVectorizer(max_features=1000, stop_words='english', lowercase=True, tokenizer=LemmaTokenizer())
        counts = vectorizer.fit_transform(corpus.copy()).toarray()
        return tf.convert_to_tensor(counts, dtype=tf.float32)

    input_converted_countvec=X_tensor_countvec_original(rt_reviews_data['reviewText'])

    # Key each count vector by its review text so later cells can look it up.
    countvec_encodings_fetched={
        rt_reviews_data['reviewText'][i]: input_converted_countvec[i].numpy().tolist()
        for i in range(rt_reviews_data.shape[0])
    }

    # Persist as JSON so the next run takes the cached branch.
    with open(countvec_cache_path,'w') as fp:
        json.dump(countvec_encodings_fetched, fp)
else:
    with open(countvec_cache_path,'r') as json_file:
        countvec_encodings_fetched = json.load(json_file)

### TFIDF utility

In [None]:
# Build (or load from the Drive cache) a {review text: TF-IDF vector} lookup
tfidf_encodings_fetched=None
tfidf_cache_path='/content/drive/MyDrive/DO NOT DELETE/SHARE/tfidf_encodings_project_{}.txt'.format(keykey)

if not os.path.exists(tfidf_cache_path):
    class LemmaTokenizer:
        """Callable tokenizer: NLTK word tokenisation followed by WordNet lemmatisation."""
        def __init__(self):
            self.wnl = WordNetLemmatizer()

        def __call__(self, doc):
            lemmas = []
            for token in word_tokenize(doc):
                lemmas.append(self.wnl.lemmatize(token))
            return lemmas

    def X_tensor_tfidf_original(corpus):
        """Fit a 1000-feature TfidfVectorizer on `corpus`; return dense float32 weights."""
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', lowercase=True, tokenizer=LemmaTokenizer())
        weights = vectorizer.fit_transform(corpus.copy()).toarray()
        return tf.convert_to_tensor(weights, dtype=tf.float32)

    input_converted_tfidf=X_tensor_tfidf_original(rt_reviews_data['reviewText'])

    # Key each TF-IDF vector by its review text so later cells can look it up.
    tfidf_encodings_fetched={
        rt_reviews_data['reviewText'][i]: input_converted_tfidf[i].numpy().tolist()
        for i in range(rt_reviews_data.shape[0])
    }

    # Persist as JSON so the next run takes the cached branch.
    with open(tfidf_cache_path,'w') as fp:
        json.dump(tfidf_encodings_fetched, fp)
else:
    with open(tfidf_cache_path,'r') as json_file:
        tfidf_encodings_fetched = json.load(json_file)

### General Utility

In [None]:
# Create dictionaries to convert a category into a one-hot vector and vice versa
category_to_vector={}
vector_to_category={}

index=0
# Distinct sentiment labels; kept as a set because generateNNModel only needs
# its length to size the output layer.
set_of_list_categories=set(rt_reviews_data['scoreSentiment'])
# Assign one-hot positions in sorted label order. Iterating the set directly
# would make the label -> column mapping depend on string hash order, which
# can differ between interpreter sessions. `key=str` keeps the sort usable
# even if the column mixes strings with other values.
for i in sorted(set_of_list_categories, key=str):
  newarray=[0]*len(set_of_list_categories)
  newarray[index]=1
  category_to_vector[i]=newarray
  vector_to_category[index]=i
  index+=1

# print(category_to_vector)
# print(vector_to_category)

In [None]:
# Utility functions to convert categories into one-hot vectors and vice versa
def Y_tensor(Y_train):
  """Encode an iterable of category labels as a float32 one-hot tensor.

  Args:
    Y_train: Iterable of labels (list, tuple, pandas Series, ...). Every
      label must be a key of ``category_to_vector``. The input is not
      modified.

  Returns:
    A ``tf.float32`` tensor with one one-hot row per label.

  Raises:
    KeyError: If a label is missing from ``category_to_vector``.
  """
  # Build a new list instead of copying the input and overwriting it by
  # position: this works for any iterable, not only plain lists.
  one_hot_rows=[category_to_vector[label] for label in Y_train]
  return tf.convert_to_tensor(np.array(one_hot_rows), dtype=tf.float32)

def vec2cat(input):
    """Map each score vector to the category of its largest entry.

    Args:
      input: Indexable sequence of score vectors supporting ``len``.

    Returns:
      List of categories, one per vector, looked up in ``vector_to_category``
      by the ``np.argmax`` position.
    """
    return [vector_to_category[np.argmax(input[row])] for row in range(len(input))]

In [None]:
# Neural Network Model
def generateNNModel(learning_rate, input_dim, optimizer_name):
  """Build and compile the dense classifier used for every feature type.

  Architecture: input of ``input_dim`` features, two ReLU layers of 128
  units, and a softmax layer with one unit per entry of
  ``set_of_list_categories``.

  Args:
    learning_rate: Learning rate passed to the optimizer.
    input_dim: Number of input features.
    optimizer_name: One of 'adam', 'sgd' or 'rmsprop'.

  Returns:
    A compiled Keras ``Sequential`` model (categorical cross-entropy loss,
    accuracy metric).

  Raises:
    ValueError: If ``optimizer_name`` is not a supported name.
  """
  # Supported names mapped to the attribute of `keras.optimizers` to use.
  optimizer_classes={'adam':'Adam','sgd':'SGD','rmsprop':'RMSprop'}
  # Fail early with a clear message; an unknown name would otherwise leave
  # `optimizer` unassigned and only blow up at compile time.
  if optimizer_name not in optimizer_classes:
    raise ValueError(
        "Unsupported optimizer_name {!r}; expected one of {}".format(
            optimizer_name, sorted(optimizer_classes)))

  reset_seeds()
  model = Sequential()
  model.add(keras.Input(shape=(input_dim,)))
  model.add(Dense(128, activation='relu'))
  model.add(Dense(128, activation='relu'))
  model.add(Dense(len(set_of_list_categories), activation='softmax'))
  optimizer = getattr(keras.optimizers, optimizer_classes[optimizer_name])(learning_rate=learning_rate)
  model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
  return model

In [None]:
# Module-level accumulators: one value is appended per feature type for each
# statistic (training side first, validation side second).
all_train_acc, all_train_std, all_train_f1, all_train_auc = [], [], [], []
all_val_acc, all_val_std, all_val_f1, all_val_auc = [], [], [], []

def reset_stat_array():
  """Rebind all eight accumulators to fresh empty lists."""
  global all_train_acc,all_train_std,all_val_acc,all_val_std, all_train_f1, all_val_f1, all_train_auc, all_val_auc
  all_train_acc, all_train_std, all_train_f1, all_train_auc = [], [], [], []
  all_val_acc, all_val_std, all_val_f1, all_val_auc = [], [], [], []



In [None]:
#A common function to cross-validate the Neural Network on one feature representation.
def run_nn(X,Y, input_size, nn_epochs, filename_suffix, run_test_data, learning_rate, optimizer_name):
    """Run 5-fold cross-validation of the dense network.

    Args:
      X: Sequence of feature vectors, indexable by integer position.
      Y: Sequence of labels aligned with ``X``; 'positive' is the positive class.
      input_size: Number of features per vector.
      nn_epochs: Training epochs per fold.
      filename_suffix: Unused; kept so existing calls keep working.
      run_test_data: Unused; kept so existing calls keep working.
      learning_rate: Learning rate passed to ``generateNNModel``.
      optimizer_name: Optimizer name passed to ``generateNNModel``.

    Returns:
      Tuple ``(avg_train_acc, train_acc_std, avg_val_acc, val_acc_std,
      avg_train_f1, avg_val_f1, avg_train_auc, avg_val_auc)``.
    """
    #Kfold
    def kfold_analysis(train_val_X,train_val_y):
        kf = KFold(n_splits = 5)
        # Softmax column that holds the score of the 'positive' class. The
        # column order comes from category_to_vector, so look it up instead
        # of assuming a fixed position; roc_auc_score needs the score of the
        # positive class.
        positive_col = category_to_vector['positive'].index(1)

        train_acc = []
        val_acc = []

        train_f1 = []
        val_f1 = []

        train_auc = []
        val_auc = []
        for train_index, val_index in tqdm(kf.split(train_val_X), total=kf.get_n_splits()):
            # Select rows by index. A first:last+1 slice is wrong here: for the
            # middle folds the training indices are not contiguous, so such a
            # slice would also contain the validation rows.
            train_X = [train_val_X[i] for i in train_index]
            train_y = [train_val_y[i] for i in train_index]

            val_X = [train_val_X[i] for i in val_index]
            val_y = [train_val_y[i] for i in val_index]

            #train
            nn_model = generateNNModel(learning_rate, input_size,optimizer_name)
            reset_seeds()
            train_X=tf.convert_to_tensor(train_X, dtype=tf.float32)
            nn_model.fit(train_X, Y_tensor(train_y), epochs=nn_epochs, batch_size=64,verbose=0)

            #metrics on the training folds
            reset_seeds()
            training_data_predicted_values=nn_model.predict(train_X,verbose=0)
            training_data_predicted_categories = vec2cat(training_data_predicted_values)

            train_acc.append(accuracy_score(train_y, training_data_predicted_categories))
            train_f1.append(f1_score(train_y, training_data_predicted_categories, pos_label='positive'))
            train_auc.append(roc_auc_score(train_y, training_data_predicted_values[:, positive_col]))

            #metrics on the validation fold
            reset_seeds()
            val_X=tf.convert_to_tensor(val_X, dtype=tf.float32)
            validation_data_predicted_values=nn_model.predict(val_X,verbose=0)
            validation_data_predicted_categories = vec2cat(validation_data_predicted_values)

            val_acc.append(accuracy_score(val_y, validation_data_predicted_categories))
            val_f1.append(f1_score(val_y, validation_data_predicted_categories, pos_label='positive'))
            val_auc.append(roc_auc_score(val_y, validation_data_predicted_values[:, positive_col]))

        avg_train_acc = sum(train_acc) / len(train_acc)
        avg_val_acc = sum(val_acc) / len(val_acc)

        avg_train_f1 = sum(train_f1) / len(train_f1)
        avg_val_f1 = sum(val_f1) / len(val_f1)

        avg_train_auc = sum(train_auc) / len(train_auc)
        avg_val_auc = sum(val_auc) / len(val_auc)

        print("\nThe average training accuracy is {} with standard deviation of {}".format(avg_train_acc,st.pstdev(train_acc)))
        print("The average validation accuracy is {} with standard deviation of {}".format(avg_val_acc,st.pstdev(val_acc)))

        return avg_train_acc, st.pstdev(train_acc), avg_val_acc, st.pstdev(val_acc), avg_train_f1, avg_val_f1, avg_train_auc, avg_val_auc
    return kfold_analysis(X,Y)

# Feature Extraction and NN

### Utility


In [None]:
def X_tensor_countvec(corpus):
  """Look up the cached count vector of every text in `corpus`.

  Returns a float32 tensor with one row per text, in corpus order. Raises
  KeyError for a text that is not in `countvec_encodings_fetched`.
  """
  rows=[countvec_encodings_fetched[text] for text in corpus]
  return tf.convert_to_tensor(rows, dtype=tf.float32)

# Feature matrix and labels shared by the experiments below.
X_countvec=X_tensor_countvec(rt_reviews_data['reviewText'])
Y=rt_reviews_data['scoreSentiment']



def X_tensor_tfidf(corpus):
  """Look up the cached TF-IDF vector of every text in `corpus`.

  Returns a float32 tensor with one row per text, in corpus order. Raises
  KeyError for a text that is not in `tfidf_encodings_fetched`.
  """
  rows=[tfidf_encodings_fetched[text] for text in corpus]
  return tf.convert_to_tensor(rows, dtype=tf.float32)


X_tfidf=X_tensor_tfidf(rt_reviews_data['reviewText'])

def X_tensor_glove(corpus):
  """Look up the cached GloVe document vector of every text in `corpus`.

  Returns a float32 tensor with one row per text, in corpus order. Raises
  KeyError for a text that is not in `glove_encodings_fetched`.
  """
  rows=[glove_encodings_fetched[text] for text in corpus]
  return tf.convert_to_tensor(rows, dtype=tf.float32)

X_glove=X_tensor_glove(rt_reviews_data['reviewText'])


def X_tensor_bert(corpus):
  """Look up the cached BERT sentence embedding of every text in `corpus`.

  Iterates over a copy of `corpus` and returns a float32 tensor with one row
  per text. Raises KeyError for a text not in `bert_encodings_fetched`.
  """
  rows=[bert_encodings_fetched[text] for text in corpus.copy()]
  return tf.convert_to_tensor(rows, dtype=tf.float32)

X_bert=X_tensor_bert(rt_reviews_data['reviewText'])



def X_tensor_mpnet(corpus):
  """Look up the cached MPNet sentence embedding of every text in `corpus`.

  Iterates over a copy of `corpus` and returns a float32 tensor with one row
  per text. Raises KeyError for a text not in `mpnet_encodings_fetched`.
  """
  rows=[mpnet_encodings_fetched[text] for text in corpus.copy()]
  return tf.convert_to_tensor(rows, dtype=tf.float32)


X_mpnet=X_tensor_mpnet(rt_reviews_data['reviewText'])

### CountVectorizer

In [None]:
# Run Neural Network using Count Vectorizer feature
countvec_nn_stats = run_nn(list(X_countvec),list(Y), 1000, 10, 'countvec', False, 0.001, 'adam')
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = countvec_nn_stats

# Append every statistic to its accumulator; the accumulators are listed in
# the same order as the values returned by run_nn.
for stat_list, stat_value in zip(
    (all_train_acc, all_train_std, all_val_acc, all_val_std,
     all_train_f1, all_val_f1, all_train_auc, all_val_auc),
    countvec_nn_stats,
):
  stat_list.append(stat_value)

### TFIDF

In [None]:
# Run Neural Network using TFIDF Vectorizer feature
tfidf_nn_stats = run_nn(list(X_tfidf),list(Y), 1000, 15, '_tfidf', False, 0.001, 'adam')
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = tfidf_nn_stats

# Append every statistic to its accumulator; the accumulators are listed in
# the same order as the values returned by run_nn.
for stat_list, stat_value in zip(
    (all_train_acc, all_train_std, all_val_acc, all_val_std,
     all_train_f1, all_val_f1, all_train_auc, all_val_auc),
    tfidf_nn_stats,
):
  stat_list.append(stat_value)

### Glove

In [None]:
# Run Neural Network using GloVe feature (100-dimensional document vectors)
glove_nn_stats = run_nn(list(X_glove),list(Y), 100, 50, '_glove', False, 0.001, 'adam')
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = glove_nn_stats

# Append every statistic to its accumulator; the accumulators are listed in
# the same order as the values returned by run_nn.
for stat_list, stat_value in zip(
    (all_train_acc, all_train_std, all_val_acc, all_val_std,
     all_train_f1, all_val_f1, all_train_auc, all_val_auc),
    glove_nn_stats,
):
  stat_list.append(stat_value)

### Bert

In [None]:
# Run Neural Network using BERT sentence-embedding feature
dimension_sbert=768
bert_nn_stats = run_nn(list(X_bert),list(Y), dimension_sbert, 15, '_bert', False, 0.001, 'adam')
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = bert_nn_stats

# Append every statistic to its accumulator; the accumulators are listed in
# the same order as the values returned by run_nn.
for stat_list, stat_value in zip(
    (all_train_acc, all_train_std, all_val_acc, all_val_std,
     all_train_f1, all_val_f1, all_train_auc, all_val_auc),
    bert_nn_stats,
):
  stat_list.append(stat_value)

### MPNET

In [None]:
# Run Neural Network using MPNet sentence-embedding feature
dimension_mpnet=768
mpnet_nn_stats = run_nn(list(X_mpnet),list(Y), dimension_mpnet, 15, '_mpnet', False, 0.001, 'adam')
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = mpnet_nn_stats

# Append every statistic to its accumulator; the accumulators are listed in
# the same order as the values returned by run_nn.
for stat_list, stat_value in zip(
    (all_train_acc, all_train_std, all_val_acc, all_val_std,
     all_train_f1, all_val_f1, all_train_auc, all_val_auc),
    mpnet_nn_stats,
):
  stat_list.append(stat_value)

In [None]:
# Create a table to compare the statistics
def drawTable(all_val_acc,all_val_f1,all_val_auc):
    """Print a grid table of validation accuracy, F1 and AUC per feature type.

    Each argument is a sequence of scores in the order Count Vectorizer,
    TFIDF, GloVe, BERT, MPNET.
    """
    columns={}
    columns["Method"]=['Count Vectorizer','TFIDF','GloVe','BERT','MPNET']
    columns["Accuracy"]=all_val_acc
    columns["F1 Score"]=all_val_f1
    columns["AUC"]=all_val_auc
    rendered=tabulate(columns, headers="keys", tablefmt="grid")
    print(rendered)

drawTable(all_val_acc,all_val_f1,all_val_auc)

In [None]:
# #Bar chart showing the training accuracy and validation accuracy w.r.t. different parameter values
# labels = ("Training Accuracy", "Validation Accuracy")
# pdata_means = {
#     'Count Vectorizer': (all_train_acc[0], all_val_acc[0]),
#     'TFIDF': (all_train_acc[1], all_val_acc[1]),
#     'GloVe': (all_train_acc[2], all_val_acc[2]),
#     'BERT': (all_train_acc[3], all_val_acc[3]),
#     'MPNET': (all_train_acc[4], all_val_acc[4]),
# }

# x = np.arange(len(labels))  # the label locations
# width = 0.10  # the width of the bars
# multiplier = 0

# fig, ax = plt.subplots(layout='constrained')

# for attribute, measurement in pdata_means.items():
#     print(attribute,measurement)
#     offset = width * multiplier
#     rects = ax.bar(x + offset, measurement, width, label=attribute)
#     ax.bar_label(rects, padding=3)
#     multiplier += 1


# ax.set_ylabel('Accuracy')
# ax.set_title('Accuracy w.r.t Features')
# ax.set_xticks(x + width, labels)
# ax.legend(loc='best')

# plt.show()

# Other Classifiers

### RandomForests

In [None]:
reset_stat_array()
def randomForestsRun(train_val_X, train_val_y):
    """Run 5-fold cross-validation of a random forest on one feature matrix.

    Args:
      train_val_X: 2-D array-like of features (anything ``np.asarray`` accepts).
      train_val_y: 1-D array-like of labels aligned with the rows of
        ``train_val_X``; 'positive' is the positive class.

    Returns:
      Tuple ``(avg_train_acc, train_acc_std, avg_val_acc, val_acc_std,
      avg_train_f1, avg_val_f1, avg_train_auc, avg_val_auc)``.
    """
    # Convert once to NumPy so each fold can be selected with KFold's index
    # arrays. A first:last+1 slice is wrong here: for the middle folds the
    # training indices are not contiguous, so such a slice would also contain
    # the validation rows.
    features = np.asarray(train_val_X)
    labels = np.asarray(train_val_y)

    kf = KFold(n_splits = 5)
    train_acc = []
    val_acc = []

    train_f1 = []
    val_f1 = []

    train_auc = []
    val_auc = []

    for train_index, val_index in tqdm(kf.split(features), total=kf.get_n_splits()):
        train_X, train_y = features[train_index], labels[train_index]
        val_X, val_y = features[val_index], labels[val_index]

        dtc = RandomForestClassifier(n_estimators=128, min_samples_leaf=1, max_features=10, random_state=seed)
        dtc.fit(train_X, train_y)
        train_acc.append(dtc.score(train_X, train_y))
        train_f1.append(f1_score(train_y, dtc.predict(train_X), pos_label='positive'))
        # Column 1 of predict_proba is used as the positive-class score.
        train_auc.append(roc_auc_score(train_y, dtc.predict_proba(train_X)[:, 1]))

        val_acc.append(dtc.score(val_X, val_y))
        val_f1.append(f1_score(val_y, dtc.predict(val_X), pos_label='positive'))
        val_auc.append(roc_auc_score(val_y, dtc.predict_proba(val_X)[:, 1]))

    avg_train_acc = sum(train_acc) / len(train_acc)
    avg_val_acc = sum(val_acc) / len(val_acc)

    avg_train_f1 = sum(train_f1) / len(train_f1)
    avg_val_f1 = sum(val_f1) / len(val_f1)

    avg_train_auc = sum(train_auc) / len(train_auc)
    avg_val_auc = sum(val_auc) / len(val_auc)
    print("\nThe average training accuracy is {} with standard deviation of {}".format(avg_train_acc,st.pstdev(train_acc)))
    print("The average validation accuracy is {} with standard deviation of {}".format(avg_val_acc,st.pstdev(val_acc)))
    return avg_train_acc, st.pstdev(train_acc), avg_val_acc, st.pstdev(val_acc), avg_train_f1, avg_val_f1, avg_train_auc, avg_val_auc

In [None]:
# Evaluate the Random Forests model using 5-fold cross-validation
# Feature order: Count Vectorizer, TFIDF, GloVe, BERT, MPNET
for rf_features in (X_countvec, X_tfidf, X_glove, X_bert, X_mpnet):
    train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = randomForestsRun(rf_features,Y)

    # Training-side statistics
    all_train_acc.append(train_acc)
    all_train_std.append(train_std)
    all_train_f1.append(train_f1)
    all_train_auc.append(train_auc)

    # Validation-side statistics
    all_val_acc.append(val_acc)
    all_val_std.append(val_std)
    all_val_f1.append(val_f1)
    all_val_auc.append(val_auc)

In [None]:
# Validation metrics of the Random Forest runs, one row per feature type.
drawTable(all_val_acc,all_val_f1,all_val_auc)

### Naive Bayes

In [None]:
reset_stat_array()
def naiveBayesRun(train_val_X, train_val_y):
    """Run 5-fold cross-validation of Gaussian Naive Bayes on one feature matrix.

    Args:
      train_val_X: 2-D array-like of features (anything ``np.asarray`` accepts).
      train_val_y: 1-D array-like of labels aligned with the rows of
        ``train_val_X``; 'positive' is the positive class.

    Returns:
      Tuple ``(avg_train_acc, train_acc_std, avg_val_acc, val_acc_std,
      avg_train_f1, avg_val_f1, avg_train_auc, avg_val_auc)``.
    """
    # Convert once to NumPy so each fold can be selected with KFold's index
    # arrays. A first:last+1 slice is wrong here: for the middle folds the
    # training indices are not contiguous, so such a slice would also contain
    # the validation rows.
    features = np.asarray(train_val_X)
    labels = np.asarray(train_val_y)

    kf = KFold(n_splits = 5)
    train_acc = []
    val_acc = []

    train_f1 = []
    val_f1 = []

    train_auc = []
    val_auc = []

    for train_index, val_index in tqdm(kf.split(features), total=kf.get_n_splits()):
        train_X, train_y = features[train_index], labels[train_index]
        val_X, val_y = features[val_index], labels[val_index]

        gnb = GaussianNB()
        gnb.fit(train_X, train_y)

        train_acc.append(gnb.score(train_X, train_y))
        train_f1.append(f1_score(train_y, gnb.predict(train_X), pos_label='positive'))
        # Column 1 of predict_proba is used as the positive-class score.
        train_auc.append(roc_auc_score(train_y, gnb.predict_proba(train_X)[:, 1]))

        val_acc.append(gnb.score(val_X, val_y))
        val_f1.append(f1_score(val_y, gnb.predict(val_X), pos_label='positive'))
        val_auc.append(roc_auc_score(val_y, gnb.predict_proba(val_X)[:, 1]))

    avg_train_acc = sum(train_acc) / len(train_acc)
    avg_val_acc = sum(val_acc) / len(val_acc)

    avg_train_f1 = sum(train_f1) / len(train_f1)
    avg_val_f1 = sum(val_f1) / len(val_f1)

    avg_train_auc = sum(train_auc) / len(train_auc)
    avg_val_auc = sum(val_auc) / len(val_auc)
    print("\nThe average training accuracy is {} with standard deviation of {}".format(avg_train_acc,st.pstdev(train_acc)))
    print("The average validation accuracy is {} with standard deviation of {}".format(avg_val_acc,st.pstdev(val_acc)))
    return avg_train_acc, st.pstdev(train_acc), avg_val_acc, st.pstdev(val_acc), avg_train_f1, avg_val_f1, avg_train_auc, avg_val_auc

In [None]:
# Evaluate the Naive Bayes model using 5-fold cross-validation
# Accumulators listed in the same order as the values returned by naiveBayesRun.
nb_stat_lists = (all_train_acc, all_train_std, all_val_acc, all_val_std,
                 all_train_f1, all_val_f1, all_train_auc, all_val_auc)

#Count
nb_stats = naiveBayesRun(X_countvec,Y)
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = nb_stats
for stat_list, stat_value in zip(nb_stat_lists, nb_stats):
    stat_list.append(stat_value)

#TFIDF
X_tfidf=X_tensor_tfidf(rt_reviews_data['reviewText'])
nb_stats = naiveBayesRun(X_tfidf,Y)
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = nb_stats
for stat_list, stat_value in zip(nb_stat_lists, nb_stats):
    stat_list.append(stat_value)

#Glove
X_glove=X_tensor_glove(rt_reviews_data['reviewText'])
nb_stats = naiveBayesRun(X_glove,Y)
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = nb_stats
for stat_list, stat_value in zip(nb_stat_lists, nb_stats):
    stat_list.append(stat_value)

#BERT
X_bert=X_tensor_bert(rt_reviews_data['reviewText'])
nb_stats = naiveBayesRun(X_bert,Y)
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = nb_stats
for stat_list, stat_value in zip(nb_stat_lists, nb_stats):
    stat_list.append(stat_value)

#MPNET
X_mpnet=X_tensor_mpnet(rt_reviews_data['reviewText'])
nb_stats = naiveBayesRun(X_mpnet,Y)
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = nb_stats
for stat_list, stat_value in zip(nb_stat_lists, nb_stats):
    stat_list.append(stat_value)

In [None]:
# Validation metrics of the Naive Bayes runs, one row per feature type.
drawTable(all_val_acc,all_val_f1,all_val_auc)

### SGD

In [None]:
reset_stat_array()
def sgdRun(train_val_X, train_val_y):
    """Run 5-fold cross-validation of a standardised SGD linear classifier.

    Args:
      train_val_X: 2-D array-like of features (anything ``np.asarray`` accepts).
      train_val_y: 1-D array-like of labels aligned with the rows of
        ``train_val_X``; 'positive' is the positive class.

    Returns:
      Tuple ``(avg_train_acc, train_acc_std, avg_val_acc, val_acc_std,
      avg_train_f1, avg_val_f1, avg_train_auc, avg_val_auc)``.
    """
    # Convert once to NumPy so each fold can be selected with KFold's index
    # arrays. A first:last+1 slice is wrong here: for the middle folds the
    # training indices are not contiguous, so such a slice would also contain
    # the validation rows.
    features = np.asarray(train_val_X)
    labels = np.asarray(train_val_y)

    kf = KFold(n_splits = 5)
    train_acc = []
    val_acc = []

    train_f1 = []
    val_f1 = []

    train_auc = []
    val_auc = []

    for train_index, val_index in tqdm(kf.split(features), total=kf.get_n_splits()):
        train_X, train_y = features[train_index], labels[train_index]
        val_X, val_y = features[val_index], labels[val_index]

        # Seeded like the random forest so repeated runs give the same model.
        sgd = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3, random_state=seed))
        sgd.fit(train_X, train_y)

        train_acc.append(sgd.score(train_X, train_y))
        train_f1.append(f1_score(train_y, sgd.predict(train_X), pos_label='positive'))
        # No predict_proba is used here; the signed decision value is the score.
        train_auc.append(roc_auc_score(train_y, sgd.decision_function(train_X)))

        val_acc.append(sgd.score(val_X, val_y))
        val_f1.append(f1_score(val_y, sgd.predict(val_X), pos_label='positive'))
        val_auc.append(roc_auc_score(val_y, sgd.decision_function(val_X)))

    avg_train_acc = sum(train_acc) / len(train_acc)
    avg_val_acc = sum(val_acc) / len(val_acc)

    avg_train_f1 = sum(train_f1) / len(train_f1)
    avg_val_f1 = sum(val_f1) / len(val_f1)

    avg_train_auc = sum(train_auc) / len(train_auc)
    avg_val_auc = sum(val_auc) / len(val_auc)
    print("\nThe average training accuracy is {} with standard deviation of {}".format(avg_train_acc,st.pstdev(train_acc)))
    print("The average validation accuracy is {} with standard deviation of {}".format(avg_val_acc,st.pstdev(val_acc)))
    return avg_train_acc, st.pstdev(train_acc), avg_val_acc, st.pstdev(val_acc), avg_train_f1, avg_val_f1, avg_train_auc, avg_val_auc

In [None]:
# Evaluate the SGD model using 5-fold cross-validation
# Accumulators listed in the same order as the values returned by sgdRun.
sgd_stat_lists = (all_train_acc, all_train_std, all_val_acc, all_val_std,
                  all_train_f1, all_val_f1, all_train_auc, all_val_auc)

#Count
sgd_stats = sgdRun(X_countvec,Y)
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = sgd_stats
for stat_list, stat_value in zip(sgd_stat_lists, sgd_stats):
    stat_list.append(stat_value)

#TFIDF
X_tfidf=X_tensor_tfidf(rt_reviews_data['reviewText'])
sgd_stats = sgdRun(X_tfidf,Y)
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = sgd_stats
for stat_list, stat_value in zip(sgd_stat_lists, sgd_stats):
    stat_list.append(stat_value)

#Glove
X_glove=X_tensor_glove(rt_reviews_data['reviewText'])
sgd_stats = sgdRun(X_glove,Y)
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = sgd_stats
for stat_list, stat_value in zip(sgd_stat_lists, sgd_stats):
    stat_list.append(stat_value)

#BERT
X_bert=X_tensor_bert(rt_reviews_data['reviewText'])
sgd_stats = sgdRun(X_bert,Y)
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = sgd_stats
for stat_list, stat_value in zip(sgd_stat_lists, sgd_stats):
    stat_list.append(stat_value)

#MPNET
X_mpnet=X_tensor_mpnet(rt_reviews_data['reviewText'])
sgd_stats = sgdRun(X_mpnet,Y)
train_acc, train_std, val_acc, val_std, train_f1, val_f1, train_auc, val_auc = sgd_stats
for stat_list, stat_value in zip(sgd_stat_lists, sgd_stats):
    stat_list.append(stat_value)

In [None]:
# Validation metrics of the SGD runs, one row per feature type.
drawTable(all_val_acc,all_val_f1,all_val_auc)

# Transformers

In [None]:
# Load the pretrained tokenizer and a sequence-classification model from the
# 'sentence-transformers/all-distilroberta-v1' checkpoint for fine-tuning.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-distilroberta-v1")
# NOTE(review): num_labels is not passed, so the checkpoint/config default is
# used -- confirm it matches the two labels (0/1) built for the datasets below.
model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-distilroberta-v1")

In [None]:
# !pip install datasets
import datasets
# Hold out 20% of the reviews for evaluation (shuffled, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(rt_reviews_data['reviewText'],rt_reviews_data['scoreSentiment'],test_size=0.2, shuffle=True, random_state=seed)

# Wrap text and integer labels (1 for 'positive', 0 for anything else) in
# Hugging Face datasets.
dataset_train=datasets.Dataset.from_pandas(
    pd.DataFrame(data={'text':X_train, 'labels':[int(x=='positive') for x in y_train]})
)
dataset_test=datasets.Dataset.from_pandas(
    pd.DataFrame(data={'text':X_test, 'labels':[int(x=='positive') for x in y_test]})
)

In [None]:
def preprocess_function(examples):
   """Tokenize the "text" field of a batch, truncating over-long inputs.

   Args:
     examples: Mapping with a "text" entry, as passed by ``Dataset.map``
       with ``batched=True``.

   Returns:
     Whatever ``tokenizer`` returns for that text.
   """
   return tokenizer(examples["text"], truncation=True)

# Tokenize both splits batch by batch.
tokenized_train = dataset_train.map(preprocess_function, batched=True)
tokenized_test = dataset_test.map(preprocess_function, batched=True)

In [None]:
# tokenized_train = tokenized_train.remove_columns('text')
# tokenized_test = tokenized_test.remove_columns('text')

# Collator handed to the Trainer below; built from the same tokenizer that
# produced the tokenized datasets.
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# from transformers import AutoModelForSequenceClassification
# model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/all-mpnet-base-v2", num_labels=2)

In [None]:
import numpy as np

def compute_metrics(eval_pred):
   """Compute accuracy and binary F1 for a Trainer evaluation pass.

   Uses the scikit-learn metrics already imported at the top of the notebook
   instead of ``datasets.load_metric``, which is deprecated and missing from
   recent ``datasets`` releases, and which reloaded both metrics on every
   call.

   Args:
     eval_pred: Pair ``(logits, labels)``; ``logits`` has one score per class
       on its last axis and ``labels`` holds the integer class ids.

   Returns:
     ``{"accuracy": float, "f1": float}``; F1 treats label 1 as positive.
   """
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   return {
      "accuracy": float(accuracy_score(labels, predictions)),
      "f1": float(f1_score(labels, predictions)),
   }

In [None]:
# RUN this first, Comment it and then restart session and run all.
# !pip install accelerate -U

In [None]:
# Log in to the Hugging Face Hub; the training arguments below set
# push_to_hub=True and the last cell pushes the model.
!huggingface-cli login

In [None]:
from transformers import TrainingArguments, Trainer

# Output directory for the fine-tuned model; also passed as output_dir below.
# NOTE(review): the name says "distilbert" but the checkpoint loaded above is
# all-distilroberta-v1 -- confirm the intended repository name.
repo_name = "finetuning-sentiment-model-distilbert-imdb"

# Fine-tuning hyper-parameters: 2 epochs, batch size 16 for training and
# evaluation, a checkpoint saved every epoch, results pushed to the Hub.
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)

# Trainer wiring: the held-out split doubles as the evaluation set and
# compute_metrics supplies accuracy and F1.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [None]:
# Fine-tune the model with the arguments configured above.
trainer.train()

In [None]:
# Evaluate on the held-out split (tokenized_test) using compute_metrics.
trainer.evaluate()

In [None]:
# Upload the trained model to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()