<a href="https://colab.research.google.com/github/AbbyDerton/NEURO140_FinalProject/blob/main/BERT_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ktrain

In [None]:
# Import packages
import pickle # Needed to load data
import time   # Needed to track runtimes
import pandas as pd
import numpy as np
from string import digits
import ktrain #This is sometimes difficult to load in Jupyter Notebook. May need to check that the correct version of scikit-learn is installed (I believe ktrain requires scikit-learn version 0.24)
from ktrain import text as txt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# GPU Check

### If using Colab Pro, run the following block:

In [None]:
# Capture the output lines of the `nvidia-smi` shell command (IPython `!` syntax).
gpu_info = !nvidia-smi
# Collapse the captured lines into a single newline-separated string.
gpu_info = '\n'.join(gpu_info)
# The word 'failed' in the command output is treated as "no GPU attached".
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

### If using regular Colab, run these 3 cells instead:

In [None]:
# `tf` is used below but is not imported anywhere else in this notebook.
import tensorflow as tf

# List the GPUs TensorFlow can see and enable on-demand memory growth on the
# first one. Guard against the no-GPU case, where indexing [0] would raise.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
else:
    print('Not connected to a GPU')

In [None]:
# Display the list of GPU devices found in the previous cell.
physical_devices

In [None]:
# Print the raw `nvidia-smi` report for the attached GPU.
!nvidia-smi

# Download the Amazon and Goodreads datasets



> These datasets have already been cleaned and the outcome has been defined as follows (0: 1-2 stars, 1: 4-5 stars). All digits (including written digits "one", "two", etc.) have been removed from the review text. In addition, the words "rated", "rating", "rate", "star", and "stars" have been removed.



In [None]:
# Load the cleaned Goodreads reviews; the pure-Python CSV parser is requested explicitly.
goodreads_data = pd.read_csv("goodreads_clean_concat.csv", engine='python')

In [None]:
# Load the cleaned Amazon reviews and keep only the first 50,000 rows.
amazon_data = pd.read_csv("amazon_clean_concat.csv").head(50000)

# Take a look at the two datasets:

In [None]:
# Display the Goodreads DataFrame.
goodreads_data

In [None]:
# Display the Amazon DataFrame.
amazon_data

### Look at the distribution of positive and negative reviews in both datasets

In [None]:
# Fraction of Amazon reviews whose Rating equals 1 (the positive class).
amazon_ones = sum(1 for rating in amazon_data["Rating"] if rating == 1)
amazon_ones / len(amazon_data["Rating"])

In [None]:
# Fraction of Goodreads reviews whose Rating equals 1 (the positive class).
goodreads_ones = sum(1 for rating in goodreads_data["Rating"] if rating == 1)
goodreads_ones / len(goodreads_data["Rating"])

# BERT Model



> Use the BERT model included in the ktrain (v0.29.x) library which is a wrapper for TensorFlow Keras



## Train and test BERT Model on Amazon dataset using full reviews (Model 1):

#### Split the dataset into training, validation, and test sets

In [None]:
# Keep 70% of the Amazon full-text reviews for training and hold out 30%.
# The held-out part is divided into validation and test sets in the next cell.
# A fixed random_state makes the split, and the metrics below, reproducible.
(X_train_Amazon_full,
 X_val_and_test_Amazon_full,
 y_train_Amazon_full,
 y_val_and_test_Amazon_full) = train_test_split(
    amazon_data["Review_Clean"],
    amazon_data["Rating"],
    test_size=0.30,
    random_state=42,
)

In [None]:
# Divide the held-out Amazon reviews evenly into validation and test sets.
# A fixed random_state keeps the test set identical across runs.
(X_val_Amazon_full,
 X_test_Amazon_full,
 y_val_Amazon_full,
 y_test_Amazon_full) = train_test_split(
    X_val_and_test_Amazon_full,
    y_val_and_test_Amazon_full,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Run ktrain's BERT preprocessing on the Amazon full-text training and
# validation splits, and print the elapsed time in seconds.
start = time.time()
(x_train_fullA, y_train_fullA), (x_val_fullA, y_val_fullA), preproc_fullA = txt.texts_from_array(
    x_train=list(X_train_Amazon_full),
    y_train=list(y_train_Amazon_full),
    x_test=list(X_val_Amazon_full),   # the validation split is passed as ktrain's "test" data
    y_test=list(y_val_Amazon_full),
    class_names=['0', '1'],
    preprocess_mode='bert',
    ngram_range=1,
    maxlen=200,                       # NOTE(review): an earlier revision used 175
)
end = time.time()
print(end - start)

In [None]:
# Build the BERT classifier for the Amazon full-text data and print the
# elapsed time in seconds. Training itself happens in a later cell.
start = time.time()
model_fullA = txt.text_classifier(
    name='bert',
    train_data=(x_train_fullA, y_train_fullA),
    preproc=preproc_fullA,
)
end = time.time()
print(end - start)

In [None]:
# Wrap the model and its training/validation data in a ktrain learner
# (batch size 16) and print the elapsed time in seconds.
start = time.time()
learner_fullA = ktrain.get_learner(
    model=model_fullA,
    train_data=(x_train_fullA, y_train_fullA),
    val_data=(x_val_fullA, y_val_fullA),
    batch_size=16,
)
end = time.time()
print(end - start)

In [None]:
# Train for 3 epochs with the one-cycle policy at a learning rate of 2e-5
# and print the elapsed time in seconds.
start = time.time()
learner_fullA.fit_onecycle(lr=2e-5, epochs=3)
end = time.time()
print(end - start)

### Test this model on the Amazon reviews

In [None]:
# Build a predictor from the trained Amazon full-text model and its
# preprocessor, and print the elapsed time in seconds.
start = time.time()
predictor_fullA = ktrain.get_predictor(learner_fullA.model, preproc_fullA)
end = time.time()
print(end - start)

In [None]:
# Predict labels for the Amazon full-text test reviews and print the
# elapsed time in seconds.
start = time.time()
predicted_ratings_fullA = predictor_fullA.predict(list(X_test_Amazon_full))
end = time.time()
print(end - start)

In [None]:
# How many reviews are in the Amazon full-text test set?
len(y_test_Amazon_full)

In [None]:
# How many of the reviews in the test set were actually negative (label 0)?
# Counted in one pass instead of rebuilding list(y_test) on every iteration.
num_neg = sum(1 for actual in y_test_Amazon_full if actual == 0)
num_neg

In [None]:
# How many of the test-set predictions were correct?
# Predictions are converted with int() before comparison with the true label.
# zip pairs each prediction with its label in one pass, avoiding the
# per-iteration list(y_test) rebuild that made the original loop quadratic.
num_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_fullA, y_test_Amazon_full)
    if int(predicted) == actual
)
num_correct

In [None]:
# Accuracy: the share of test-set predictions that matched the true label.
num_correct / len(y_test_Amazon_full)

In [None]:
# How many of the positive predictions were correct (predicted 1, actual 1)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_1s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_fullA, y_test_Amazon_full)
    if int(predicted) == 1 and actual == 1
)
num_1s_correct

In [None]:
# How many of the negative predictions were correct (predicted 0, actual 0)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_0s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_fullA, y_test_Amazon_full)
    if int(predicted) == 0 and actual == 0
)
num_0s_correct

## Test this model on the Goodreads reviews (Model 5):



> Use the trained model above



In [None]:
# Prepare the Goodreads full-text data: 70% training, then the remaining 30%
# divided evenly into validation and test sets. A fixed random_state makes
# both splits, and every metric computed on them, reproducible.
(X_train_Goodreads_full,
 X_val_and_test_Goodreads_full,
 y_train_Goodreads_full,
 y_val_and_test_Goodreads_full) = train_test_split(
    goodreads_data["Review_Clean"],
    goodreads_data["Rating"],
    test_size=0.30,
    random_state=42,
)
(X_val_Goodreads_full,
 X_test_Goodreads_full,
 y_val_Goodreads_full,
 y_test_Goodreads_full) = train_test_split(
    X_val_and_test_Goodreads_full,
    y_val_and_test_Goodreads_full,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Rebuild the predictor for the Amazon-trained full-text model and print the
# elapsed time in seconds.
# NOTE(review): predictor_fullA was already created with the same arguments in
# the Amazon test section above, so this cell looks redundant.
start = time.time()
predictor_fullA = ktrain.get_predictor(learner_fullA.model, preproc_fullA)
end = time.time()
print(end - start)

In [None]:
# Apply the Amazon-trained predictor to the Goodreads full-text test reviews
# and print the elapsed time in seconds.
start = time.time()
predicted_ratings_fullGR = predictor_fullA.predict(list(X_test_Goodreads_full))
end = time.time()
print(end - start)

In [None]:
# How many reviews are in the Goodreads full-text test set?
len(y_test_Goodreads_full)

In [None]:
# How many of the reviews in the test set were actually negative (label 0)?
# Counted in one pass instead of rebuilding list(y_test) on every iteration.
num_neg = sum(1 for actual in y_test_Goodreads_full if actual == 0)
num_neg

In [None]:
# How many of the test-set predictions were correct?
# Predictions are converted with int() before comparison with the true label.
# zip pairs each prediction with its label in one pass, avoiding the
# per-iteration list(y_test) rebuild that made the original loop quadratic.
num_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_fullGR, y_test_Goodreads_full)
    if int(predicted) == actual
)
num_correct

In [None]:
# Accuracy: the share of test-set predictions that matched the true label.
num_correct / len(y_test_Goodreads_full)

In [None]:
# How many of the positive predictions were correct (predicted 1, actual 1)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_1s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_fullGR, y_test_Goodreads_full)
    if int(predicted) == 1 and actual == 1
)
num_1s_correct

In [None]:
# How many of the negative predictions were correct (predicted 0, actual 0)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_0s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_fullGR, y_test_Goodreads_full)
    if int(predicted) == 0 and actual == 0
)
num_0s_correct

# Train and Test BERT Model on Goodreads Dataset (Model 3):

### Start by balancing the training data to include more negative reviews

In [None]:
# Put the Goodreads full-text training split into a two-column DataFrame.
data = {"Review": list(X_train_Goodreads_full), "Rating": list(y_train_Goodreads_full)}
training_data = pd.DataFrame(data)

# Separate negative (Rating == 0) rows from all the others.
neg_rows = training_data["Rating"].eq(0)
df_train_neg = training_data[neg_rows]
df_train_pos = training_data[~neg_rows]

# Down-sample the positives to the number of negatives, stack the two groups,
# shuffle the rows with a fixed seed and renumber the index.
df_train = (
    pd.concat(
        [df_train_pos.sample(n=len(df_train_neg), random_state=42), df_train_neg],
        axis=0,
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Report the size of the balanced set and the percentage of positive labels.
print(
    f"Training set prevalence (n = {len(df_train):d}):",
    f"{(df_train['Rating'].sum() / len(df_train)) * 100:.2f}%",
)

In [None]:
# Run ktrain's BERT preprocessing on the balanced Goodreads full-text
# training frame and the Goodreads validation split.
(x_train_fullGR, y_train_fullGR), (x_val_fullGR, y_val_fullGR), preproc_fullGR = txt.texts_from_array(
    x_train=list(df_train["Review"]),
    y_train=list(df_train["Rating"]),
    x_test=list(X_val_Goodreads_full),   # the validation split is passed as ktrain's "test" data
    y_test=list(y_val_Goodreads_full),
    class_names=['0', '1'],
    preprocess_mode='bert',
    ngram_range=1,
    maxlen=200,                          # NOTE(review): an earlier revision used 175
)


In [None]:
# Build the BERT classifier for the Goodreads full-text data and print the
# elapsed time in seconds. Training itself happens in a later cell.
start = time.time()
model_fullGR = txt.text_classifier(
    name='bert',
    train_data=(x_train_fullGR, y_train_fullGR),
    preproc=preproc_fullGR,
)
end = time.time()
print(end - start)


In [None]:
# Wrap the model and its training/validation data in a ktrain learner
# (batch size 16) and print the elapsed time in seconds.
start = time.time()
learner_fullGR = ktrain.get_learner(
    model=model_fullGR,
    train_data=(x_train_fullGR, y_train_fullGR),
    val_data=(x_val_fullGR, y_val_fullGR),
    batch_size=16,
)
end = time.time()
print(end - start)

In [None]:
# Train for 3 epochs with the one-cycle policy at a learning rate of 2e-5
# and print the elapsed time in seconds.
start = time.time()
learner_fullGR.fit_onecycle(lr=2e-5, epochs=3)
end = time.time()
print(end - start)

### Test this model on the Goodreads data:

In [None]:
# Build a predictor from the trained Goodreads full-text model and its
# preprocessor, and print the elapsed time in seconds.
start = time.time()
predictor_fullGR = ktrain.get_predictor(learner_fullGR.model, preproc_fullGR)
end = time.time()
print(end - start)

In [None]:
# Predict labels for the Goodreads full-text test reviews with the
# Goodreads-trained model and print the elapsed time in seconds.
# NOTE(review): this overwrites predicted_ratings_fullGR, which earlier held
# the Amazon-trained model's predictions on the same test set.
start = time.time()
predicted_ratings_fullGR = predictor_fullGR.predict(list(X_test_Goodreads_full))
end = time.time()
print(end - start)

In [None]:
# How many reviews are in the Goodreads full-text test set?
len(y_test_Goodreads_full)

In [None]:
# How many of the reviews in the test set were actually negative (label 0)?
# Counted in one pass instead of rebuilding list(y_test) on every iteration.
num_neg = sum(1 for actual in y_test_Goodreads_full if actual == 0)
num_neg

In [None]:
# How many of the test-set predictions were correct?
# Predictions are converted with int() before comparison with the true label.
# zip pairs each prediction with its label in one pass, avoiding the
# per-iteration list(y_test) rebuild that made the original loop quadratic.
num_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_fullGR, y_test_Goodreads_full)
    if int(predicted) == actual
)
num_correct

In [None]:
# Accuracy: the share of test-set predictions that matched the true label.
num_correct / len(y_test_Goodreads_full)

In [None]:
# How many of the positive predictions were correct (predicted 1, actual 1)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_1s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_fullGR, y_test_Goodreads_full)
    if int(predicted) == 1 and actual == 1
)
num_1s_correct

In [None]:
# How many of the negative predictions were correct (predicted 0, actual 0)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_0s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_fullGR, y_test_Goodreads_full)
    if int(predicted) == 0 and actual == 0
)
num_0s_correct

# Rerun the above models, this time using concatenated reviews

# Train and Test Model on Concatenated Amazon Reviews (Model 2):

In [None]:
# Keep 70% of the Amazon concatenated reviews for training and hold out 30%.
# The held-out part is divided into validation and test sets in the next cell.
# A fixed random_state makes the split, and the metrics below, reproducible.
(X_train_Amazon_concat,
 X_val_and_test_Amazon_concat,
 y_train_Amazon_concat,
 y_val_and_test_Amazon_concat) = train_test_split(
    amazon_data["Review_Concat"],
    amazon_data["Rating"],
    test_size=0.30,
    random_state=42,
)

In [None]:
# Divide the held-out Amazon concatenated reviews evenly into validation and
# test sets. A fixed random_state keeps the test set identical across runs.
(X_val_Amazon_concat,
 X_test_Amazon_concat,
 y_val_Amazon_concat,
 y_test_Amazon_concat) = train_test_split(
    X_val_and_test_Amazon_concat,
    y_val_and_test_Amazon_concat,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Run ktrain's BERT preprocessing on the Amazon concatenated-review training
# and validation splits, and print the elapsed time in seconds.
start = time.time()
(x_train_concatA, y_train_concatA), (x_val_concatA, y_val_concatA), preproc_concatA = txt.texts_from_array(
    x_train=list(X_train_Amazon_concat),
    y_train=list(y_train_Amazon_concat),
    x_test=list(X_val_Amazon_concat),   # the validation split is passed as ktrain's "test" data
    y_test=list(y_val_Amazon_concat),
    class_names=['0', '1'],
    preprocess_mode='bert',
    ngram_range=1,
    maxlen=200,                         # NOTE(review): an earlier revision used 175
)
end = time.time()
print(end - start)

In [None]:
# Build the BERT classifier for the Amazon concatenated-review data and print
# the elapsed time in seconds. Training itself happens in a later cell.
start = time.time()
model_concatA = txt.text_classifier(
    name='bert',
    train_data=(x_train_concatA, y_train_concatA),
    preproc=preproc_concatA,
)
end = time.time()
print(end - start)

In [None]:
# Wrap the model and its training/validation data in a ktrain learner
# (batch size 16) and print the elapsed time in seconds.
start = time.time()
learner_concatA = ktrain.get_learner(
    model=model_concatA,
    train_data=(x_train_concatA, y_train_concatA),
    val_data=(x_val_concatA, y_val_concatA),
    batch_size=16,
)
end = time.time()
print(end - start)

In [None]:
# Train for 3 epochs with the one-cycle policy at a learning rate of 2e-5
# and print the elapsed time in seconds.
start = time.time()
learner_concatA.fit_onecycle(lr=2e-5, epochs=3)
end = time.time()
print(end - start)

### Make predictions on concatenated Amazon reviews

In [None]:
# Build a predictor from the trained Amazon concatenated-review model and its
# preprocessor, and print the elapsed time in seconds.
start = time.time()
predictor_concatA = ktrain.get_predictor(learner_concatA.model, preproc_concatA)
end = time.time()
print(end - start)

In [None]:
# Predict labels for the Amazon concatenated-review test set and print the
# elapsed time in seconds.
start = time.time()
predicted_ratings_concatA = predictor_concatA.predict(list(X_test_Amazon_concat))
end = time.time()
print(end - start)

In [None]:
# How many reviews are in the Amazon concatenated-review test set?
len(y_test_Amazon_concat)

In [None]:
# How many of the reviews in the test set were actually negative (label 0)?
# Counted in one pass instead of rebuilding list(y_test) on every iteration.
num_neg = sum(1 for actual in y_test_Amazon_concat if actual == 0)
num_neg

In [None]:
# How many of the test-set predictions were correct?
# Predictions are converted with int() before comparison with the true label.
# zip pairs each prediction with its label in one pass, avoiding the
# per-iteration list(y_test) rebuild that made the original loop quadratic.
num_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_concatA, y_test_Amazon_concat)
    if int(predicted) == actual
)
num_correct

In [None]:
# Accuracy: the share of test-set predictions that matched the true label.
num_correct / len(y_test_Amazon_concat)

In [None]:
# How many of the positive predictions were correct (predicted 1, actual 1)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_1s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_concatA, y_test_Amazon_concat)
    if int(predicted) == 1 and actual == 1
)
num_1s_correct

In [None]:
# How many of the negative predictions were correct (predicted 0, actual 0)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_0s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_concatA, y_test_Amazon_concat)
    if int(predicted) == 0 and actual == 0
)
num_0s_correct

# Test the model trained on concat amazon data on concat goodreads data (Model 6):

In [None]:
# Prepare the Goodreads concatenated-review data: 70% training, then the
# remaining 30% divided evenly into validation and test sets. A fixed
# random_state makes both splits, and every metric on them, reproducible.
(X_train_Goodreads_concat,
 X_val_and_test_Goodreads_concat,
 y_train_Goodreads_concat,
 y_val_and_test_Goodreads_concat) = train_test_split(
    goodreads_data["Review_Concat"],
    goodreads_data["Rating"],
    test_size=0.30,
    random_state=42,
)
(X_val_Goodreads_concat,
 X_test_Goodreads_concat,
 y_val_Goodreads_concat,
 y_test_Goodreads_concat) = train_test_split(
    X_val_and_test_Goodreads_concat,
    y_val_and_test_Goodreads_concat,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Apply the Amazon-trained concatenated-review predictor to the Goodreads
# concatenated test reviews and print the elapsed time in seconds.
start = time.time()
predicted_ratings_concatGR = predictor_concatA.predict(list(X_test_Goodreads_concat))
end = time.time()
print(end - start)

In [None]:
# How many reviews are in the Goodreads concatenated-review test set?
len(y_test_Goodreads_concat)

In [None]:
# How many of the reviews in the test set were actually negative (label 0)?
# Counted in one pass instead of rebuilding list(y_test) on every iteration.
num_neg = sum(1 for actual in y_test_Goodreads_concat if actual == 0)
num_neg

In [None]:
# How many of the test-set predictions were correct?
# Predictions are converted with int() before comparison with the true label.
# zip pairs each prediction with its label in one pass, avoiding the
# per-iteration list(y_test) rebuild that made the original loop quadratic.
num_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_concatGR, y_test_Goodreads_concat)
    if int(predicted) == actual
)
num_correct

In [None]:
# Accuracy: the share of test-set predictions that matched the true label.
num_correct / len(y_test_Goodreads_concat)

In [None]:
# How many of the positive predictions were correct (predicted 1, actual 1)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_1s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_concatGR, y_test_Goodreads_concat)
    if int(predicted) == 1 and actual == 1
)
num_1s_correct

In [None]:
# How many of the negative predictions were correct (predicted 0, actual 0)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_0s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_concatGR, y_test_Goodreads_concat)
    if int(predicted) == 0 and actual == 0
)
num_0s_correct

# Train and Test a BERT Model on the Concat Goodreads Data (Model 4):

## Start by balancing the training data to include more negative reviews

In [None]:
# Put the Goodreads concatenated-review training split into a two-column DataFrame.
data = {"Review": list(X_train_Goodreads_concat), "Rating": list(y_train_Goodreads_concat)}
training_data = pd.DataFrame(data)

# Separate negative (Rating == 0) rows from all the others.
neg_rows = training_data["Rating"].eq(0)
df_train_neg = training_data[neg_rows]
df_train_pos = training_data[~neg_rows]

# Down-sample the positives to the number of negatives, stack the two groups,
# shuffle the rows with a fixed seed and renumber the index.
df_train = (
    pd.concat(
        [df_train_pos.sample(n=len(df_train_neg), random_state=42), df_train_neg],
        axis=0,
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Report the size of the balanced set and the percentage of positive labels.
print(
    f"Training set prevalence (n = {len(df_train):d}):",
    f"{(df_train['Rating'].sum() / len(df_train)) * 100:.2f}%",
)

In [None]:
# Run ktrain's BERT preprocessing on the balanced Goodreads concatenated-review
# training frame and the validation split, and print the elapsed time in seconds.
start = time.time()
(x_train_concatGR, y_train_concatGR), (x_val_concatGR, y_val_concatGR), preproc_concatGR = txt.texts_from_array(
    x_train=list(df_train["Review"]),
    y_train=list(df_train["Rating"]),
    x_test=list(X_val_Goodreads_concat),   # the validation split is passed as ktrain's "test" data
    y_test=list(y_val_Goodreads_concat),
    class_names=['0', '1'],
    preprocess_mode='bert',
    ngram_range=1,
    maxlen=200,                            # NOTE(review): an earlier revision used 175
)
end = time.time()
print(end - start)

In [None]:
# Build the BERT classifier for the Goodreads concatenated-review data and
# print the elapsed time in seconds. Training itself happens in a later cell.
start = time.time()
model_concatGR = txt.text_classifier(
    name='bert',
    train_data=(x_train_concatGR, y_train_concatGR),
    preproc=preproc_concatGR,
)
end = time.time()
print(end - start)

In [None]:
# Wrap the model and its training/validation data in a ktrain learner
# (batch size 16) and print the elapsed time in seconds.
start = time.time()
learner_concatGR = ktrain.get_learner(
    model=model_concatGR,
    train_data=(x_train_concatGR, y_train_concatGR),
    val_data=(x_val_concatGR, y_val_concatGR),
    batch_size=16,
)
end = time.time()
print(end - start)

In [None]:
# Train for 3 epochs with the one-cycle policy at a learning rate of 2e-5
# and print the elapsed time in seconds.
start = time.time()
learner_concatGR.fit_onecycle(lr=2e-5, epochs=3)
end = time.time()
print(end - start)

### Make predictions on the Goodreads concatenated reviews:

In [None]:
# Build a predictor from the trained Goodreads concatenated-review model and
# its preprocessor, and print the elapsed time in seconds.
start = time.time()
predictor_concatGR = ktrain.get_predictor(learner_concatGR.model, preproc_concatGR)
end = time.time()
print(end - start)

In [None]:
# Predict labels for the Goodreads concatenated test reviews with the
# Goodreads-trained model and print the elapsed time in seconds.
# NOTE(review): this overwrites predicted_ratings_concatGR, which earlier held
# the Amazon-trained model's predictions on the same test set.
start = time.time()
predicted_ratings_concatGR = predictor_concatGR.predict(list(X_test_Goodreads_concat))
end = time.time()
print(end - start)

In [None]:
# How many reviews are in the Goodreads concatenated-review test set?
len(y_test_Goodreads_concat)

In [None]:
# How many of the reviews in the test set were actually negative (label 0)?
# Counted in one pass instead of rebuilding list(y_test) on every iteration.
num_neg = sum(1 for actual in y_test_Goodreads_concat if actual == 0)
num_neg

In [None]:
# How many of the test-set predictions were correct?
# Predictions are converted with int() before comparison with the true label.
# zip pairs each prediction with its label in one pass, avoiding the
# per-iteration list(y_test) rebuild that made the original loop quadratic.
num_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_concatGR, y_test_Goodreads_concat)
    if int(predicted) == actual
)
num_correct

In [None]:
# Accuracy: the share of test-set predictions that matched the true label.
num_correct / len(y_test_Goodreads_concat)

In [None]:
# How many of the positive predictions were correct (predicted 1, actual 1)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_1s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_concatGR, y_test_Goodreads_concat)
    if int(predicted) == 1 and actual == 1
)
num_1s_correct

In [None]:
# How many of the negative predictions were correct (predicted 0, actual 0)?
# Single pass over paired predictions and labels instead of indexing a
# freshly built list(y_test) on every iteration.
num_0s_correct = sum(
    1
    for predicted, actual in zip(predicted_ratings_concatGR, y_test_Goodreads_concat)
    if int(predicted) == 0 and actual == 0
)
num_0s_correct

## All done!