In [None]:
'''
NOTE: Large Movie Review Dataset is used in this file

Large Movie Review Dataset: https://ai.stanford.edu/~amaas/data/sentiment/

@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
'''

'\nNOTE: Large Movie Review Dataset is used in this file\n\nLarge Movie Review Dataset: https://ai.stanford.edu/~amaas/data/sentiment/\n\n@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},\n  title     = {Learning Word Vectors for Sentiment Analysis},\n  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n  month     = {June},\n  year      = {2011},\n  address   = {Portland, Oregon, USA},\n  publisher = {Association for Computational Linguistics},\n  pages     = {142--150},\n  url       = {http://www.aclweb.org/anthology/P11-1015}\n}\n'

In [None]:
# Install the Hugging Face `transformers` package, which provides the
# BertTokenizer and TFBertForSequenceClassification classes imported below.
# Skip this cell if you've installed transformers
%pip install transformers



In [None]:
# Mount Google Drive at /content/drive so the dataset CSVs can be read and the
# trained model can be saved there.
# NOTE(review): later cells use the relative path 'drive/MyDrive/...', which
# presumably relies on the working directory being /content - confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import libraries
import pandas as pd
import string
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

In [None]:
# Read both CSV files from the mounted Drive folder
train, test = (
    pd.read_csv('drive/MyDrive/Sentiment Analysis-BERT/train2.csv'),
    pd.read_csv('drive/MyDrive/Sentiment Analysis-BERT/test2.csv'),
)

# Stack the two frames into one; ignore_index=True discards each frame's own
# index and renumbers the rows of the combined frame from 0
result = pd.concat((train, test), ignore_index=True)
result

Unnamed: 0,REVIEW,LABEL
0,Bromwell High is a cartoon comedy. It ran at ...,positive
1,Homelessness (or Houselessness as George Carl...,positive
2,Brilliant over-acting by Lesley Ann Warren. B...,positive
3,This is easily the most underrated film inn t...,positive
4,This is not the typical Mel Brooks film. It w...,positive
...,...,...
49995,I occasionally let my kids watch this garbage...,negative
49996,When all we have anymore is pretty much reali...,negative
49997,The basic genre is a thriller intercut with a...,negative
49998,Four things intrigued me as to this film - fi...,negative


In [None]:
# Split the dataset into 3 parts, namely: train, cross-validation and test
# train: 60%, cv: 20%, test: 20%
# A fixed random_state makes the shuffle - and therefore the class
# distributions and every metric reported further down - reproducible when the
# notebook is restarted and re-run.
# Note: `test` is rebound here; it no longer refers to the frame read from
# test2.csv but to the 20% hold-out slice of the combined data.
x, w = train_test_split(result, test_size=0.4, shuffle=True, random_state=42)
cv, test = train_test_split(w, test_size=0.5, random_state=42)

In [None]:
# Print the class distribution of each split
dataset = [x, cv, test]
name = ['Train', 'Cross-Validation', 'Test']
for split_name, split_frame in zip(name, dataset):
    # Series.value_counts replaces the top-level pd.value_counts, which is
    # deprecated since pandas 2.1. The label column is selected by name (the
    # same 'LABEL' column `processing` reads) rather than by position.
    col = split_frame['LABEL'].value_counts()
    print('Distibution for {0}:\n{1}\n'.format(split_name, col))

Distibution for Train:
positive    15015
negative    14985
dtype: int64

Distibution for Cross-Validation:
negative    5057
positive    4943
dtype: int64

Distibution for Test:
positive    5042
negative    4958
dtype: int64



In [None]:
# Review preprocessing
def processing(dataframe, tokenizer=None):
    """Convert a dataframe of reviews into BERT inputs and integer labels.

    Args:
        dataframe: pandas DataFrame with a 'REVIEW' column of strings and a
            'LABEL' column.
        tokenizer: optional, already-loaded tokenizer that is callable like
            ``BertTokenizer``. When omitted, the 'bert-base-uncased'
            tokenizer is loaded, which is what this function always did
            before the parameter existed. Passing one in avoids re-loading
            the tokenizer on every call.

    Returns:
        tuple ``(bert_inputs, label_tf)`` where ``bert_inputs`` is the
        tokenizer output converted to a plain dict of TensorFlow tensors and
        ``label_tf`` is a tensor holding 1 for every 'positive' label and 0
        for any other label value.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Build the punctuation-removal table once instead of once per review.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    processed_review = [
        text.translate(strip_punctuation) for text in dataframe['REVIEW']
    ]

    # Tokenize all cleaned reviews in one call, with padding and truncation
    # enabled, and ask for TensorFlow tensors.
    bert_inputs = dict(
        tokenizer(processed_review, padding=True, truncation=True, return_tensors="tf")
    )

    # Assign 'positive' to 1; every other label value becomes 0.
    labels = [1 if value == 'positive' else 0 for value in dataframe['LABEL']]
    label_tf = tf.convert_to_tensor(labels)

    return bert_inputs, label_tf

In [None]:
# Run `processing` on each split. Every call returns a pair:
# (dict of tokenizer outputs, tensor of 0/1 labels).

# Process train text
bert_train_text, train_label = processing(x)

# Process cross-validation text
bert_cv_text, cv_label = processing(cv)

# Process test text
bert_test_text, test_label = processing(test)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Training objects: sparse categorical cross-entropy computed on logits,
# accuracy reported under the name 'accuracy', and Adam with a
# 3*10^(-5) learning rate
loss = SparseCategoricalCrossentropy(from_logits=True)
metric = SparseCategoricalAccuracy('accuracy')
optimizer = Adam(learning_rate=3e-5)

# Load the pretrained BERT layers with a sequence-classification head
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Compile the BERT model with the objects defined above
bert_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

print('models are now ready to use')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


models are now ready to use


In [None]:
def eval(model, test_text, test_label, model_type):
    """Evaluate a trained classifier and print a summary of its performance.

    Prints the loss and accuracy returned by ``model.evaluate``, followed by
    the confusion matrix and the classification report that compare
    ``test_label`` with the argmax of the predicted logits.

    Note: this function shadows Python's built-in ``eval`` in the notebook.

    Args:
        model: object exposing ``evaluate`` and ``predict``; the result of
            ``predict`` must carry a ``logits`` attribute.
        test_text: model inputs, passed to both ``evaluate`` and ``predict``.
        test_label: ground-truth labels for ``test_text``.
        model_type: string prepended to every printed heading.

    Returns:
        tuple ``(score, predicted_classes)``: the value returned by
        ``model.evaluate`` and the array of predicted class indices.
    """
    # Loss and metric values on the held-out data
    score = model.evaluate(test_text, test_label, verbose=1)
    print(model_type + ' Test loss: ', score[0])
    print(model_type + ' Test accuracy: ', score[1])

    # Turn the predicted logits into class indices
    predicted_classes = np.argmax(model.predict(test_text).logits, axis=1)

    print(model_type + ' Confusion Matrix')
    print(confusion_matrix(test_label, predicted_classes))

    print(model_type + ' Classification Report')
    print(classification_report(test_label, predicted_classes))

    return score, predicted_classes

def save_model(model, name):
    """Save ``model`` inside the notebook's Drive project folder.

    Args:
        model: object exposing a ``save(path)`` method.
        name: string appended to 'drive/MyDrive/Sentiment Analysis-BERT/'
            to form the destination path.
    """
    model.save('drive/MyDrive/Sentiment Analysis-BERT/' + name)
    print('Model saved')

In [None]:
# Fine-tune the BERT model on the training split, validating on the
# cross-validation split after the epoch
bert_history = bert_model.fit(
    bert_train_text,
    train_label,
    batch_size=4,
    epochs=1,
    verbose=1,
    validation_data=(bert_cv_text, cv_label),
)



In [None]:
# Score the trained model on the test split; prints loss, accuracy, the
# confusion matrix and the classification report, and keeps the evaluate()
# result and the predicted class indices.
bert_score, bert_pred = eval(bert_model, bert_test_text, test_label, 'BERT')

BERT Test loss:  0.20389963686466217
BERT Test accuracy:  0.9194999933242798
BERT Confusion Matrix
[[4375  583]
 [ 222 4820]]
BERT Classification Report
              precision    recall  f1-score   support

           0       0.95      0.88      0.92      4958
           1       0.89      0.96      0.92      5042

    accuracy                           0.92     10000
   macro avg       0.92      0.92      0.92     10000
weighted avg       0.92      0.92      0.92     10000



In [None]:
# Save the trained model to 'drive/MyDrive/Sentiment Analysis-BERT/BERT Model'
save_model(bert_model, 'BERT Model')

Model saved


Testing the models

In [None]:
# Registry read by `example`: model name -> [tokenizer class, name passed to
# the tokenizer's from_pretrained(), trained model]
model_dict = {'BERT': [BertTokenizer, 'bert-base-uncased', bert_model]}

In [None]:
def example(model_dict, name, text):
    """Classify one piece of text with a registered model and print the result.

    Args:
        model_dict: mapping from model name to a sequence laid out as
            ``[tokenizer class, tokenizer checkpoint name, model]``.
        name: key into ``model_dict``. For 'BERT', punctuation is stripped
            from ``text`` before tokenizing.
        text: the string to classify.

    Prints the array of predicted class indices, then the (possibly
    punctuation-stripped) text together with 'Negative' or 'Positive'.
    Returns nothing.
    """
    entry = model_dict[name]
    tokenizer_cls = entry[0]
    checkpoint = entry[1]
    classifier = entry[2]

    # Instantiate the tokenizer for this model
    tokenizer = tokenizer_cls.from_pretrained(checkpoint)

    if name == 'BERT':
        # Strip punctuation from the text
        text = text.translate(str.maketrans('', '', string.punctuation))

    # Encode the text and run it through the model
    encoded = dict(tokenizer(text, padding=True, truncation=True, return_tensors="tf"))
    logits = classifier.predict(encoded).logits

    # The index of the largest logit is the predicted class
    pred = np.argmax(logits, axis=1)
    print(pred)

    # Class index -> human-readable label
    labels = ['Negative', 'Positive']
    print(text, ": \n", labels[pred[0]])

In [None]:
# Try the trained model on a short sentence
sentence = 'BERT is awesome...'
example(model_dict, 'BERT', sentence)

[1]
BERT is awesome : 
 Positive


In [None]:
# Try the trained model on a second short sentence
sentence = 'That performance was awful!'
example(model_dict, 'BERT', sentence)

[0]
That performance was awful : 
 Negative
