In [None]:
'''
NOTE: Large Movie Review Dataset is used in this file

Large Movie Review Dataset: https://ai.stanford.edu/~amaas/data/sentiment/

@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
'''

'\nNOTE: Large Movie Review Dataset is used in this file\n\nLarge Movie Review Dataset: https://ai.stanford.edu/~amaas/data/sentiment/\n\n@InProceedings{maas-EtAl:2011:ACL-HLT2011,\n  author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},\n  title     = {Learning Word Vectors for Sentiment Analysis},\n  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},\n  month     = {June},\n  year      = {2011},\n  address   = {Portland, Oregon, USA},\n  publisher = {Association for Computational Linguistics},\n  pages     = {142--150},\n  url       = {http://www.aclweb.org/anthology/P11-1015}\n}\n'

In [None]:
# Install Hugging Face transformers, which provides the RobertaTokenizer and
# TFRobertaForSequenceClassification classes imported further down.
# Skip this cell if you've installed transformers
%pip install transformers



In [None]:
# NOTE(review): no cell visible in this notebook imports sentencepiece directly;
# presumably it is kept for tokenizers that need it - confirm before removing.
%pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
# mount google drive
# The dataset CSVs are read from, and the trained model is saved to, paths
# starting with the relative prefix 'drive/MyDrive/...'. Those paths resolve to
# this mount point only when the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Import libraries
import pandas as pd
import string
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy

In [None]:
# Read the two CSV splits (train first, then test) from the mounted Drive folder
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'drive/MyDrive/Sentiment Analysis-BERT/train2.csv',
        'drive/MyDrive/Sentiment Analysis-BERT/test2.csv',
    )
)
# Stack both frames into one; ignore_index=True discards the original row
# indexes and renumbers the combined rows from 0.
result = pd.concat([train, test], ignore_index=True)
result

Unnamed: 0,REVIEW,LABEL
0,Bromwell High is a cartoon comedy. It ran at ...,positive
1,Homelessness (or Houselessness as George Carl...,positive
2,Brilliant over-acting by Lesley Ann Warren. B...,positive
3,This is easily the most underrated film inn t...,positive
4,This is not the typical Mel Brooks film. It w...,positive
...,...,...
49995,I occasionally let my kids watch this garbage...,negative
49996,When all we have anymore is pretty much reali...,negative
49997,The basic genre is a thriller intercut with a...,negative
49998,Four things intrigued me as to this film - fi...,negative


In [None]:
# Split the dataset into 3-parts namely: train, cross-validation and test
# train: 60%, cv: 20%, test: 20%
# A fixed seed makes the shuffle - and therefore every downstream metric -
# reproducible across kernel restarts.
SPLIT_SEED = 42
# First cut: 60% for training (x), 40% held back (w).
x, w = train_test_split(result, test_size=0.4, shuffle=True, random_state=SPLIT_SEED)
# Second cut: halve the held-back 40% into cross-validation and test.
# NOTE: `test` rebinds the name that previously held the raw test2.csv frame.
cv, test = train_test_split(w, test_size=0.5, random_state=SPLIT_SEED)

In [None]:
# print the distribution of each dataset
dataset = [x, cv, test]
name = ['Train', 'Cross-Validation', 'Test']
for split_name, split_frame in zip(name, dataset):
    # Series.value_counts replaces the deprecated top-level pd.value_counts,
    # and selecting 'LABEL' by name avoids depending on column position.
    label_counts = split_frame['LABEL'].value_counts()
    print('Distibution for {0}:\n{1}\n'.format(split_name, label_counts))

Distibution for Train:
negative    15035
positive    14965
dtype: int64

Distibution for Cross-Validation:
positive    5016
negative    4984
dtype: int64

Distibution for Test:
positive    5019
negative    4981
dtype: int64



In [None]:
# Review preprocessing
def processing(dataframe, max_length=None, tokenizer=None):
    """Tokenize reviews with RoBERTa and turn sentiment labels into a tensor.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a 'REVIEW' column (the texts) and a 'LABEL' column.
    max_length : int, optional
        Upper bound on tokens per review, forwarded to the tokenizer. The
        default ``None`` leaves the limit to the tokenizer, exactly as before
        this parameter existed.
    tokenizer : optional
        An already-loaded tokenizer to reuse across calls. When omitted, the
        'roberta-base' ``RobertaTokenizer`` is loaded inside the call.

    Returns
    -------
    tuple
        ``(roberta_inputs, label_tf)``: the tokenizer output converted to a
        plain dict, and a tensor holding one 0/1 label per row, in row order.
    """
    reviews = list(dataframe['REVIEW'])

    # Load the default tokenizer only when the caller did not supply one, so
    # repeated calls can share a single instance.
    if tokenizer is None:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    # Tokenize all reviews in one call; padding/truncation are enabled so the
    # result can be returned as rectangular TensorFlow tensors.
    roberta_inputs = dict(
        tokenizer(
            reviews,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="tf",
        )
    )

    # 'positive' -> 1; every other value (including unexpected ones) -> 0
    labels = [1 if value == 'positive' else 0 for value in dataframe['LABEL']]
    label_tf = tf.convert_to_tensor(labels)
    return roberta_inputs, label_tf

In [None]:
# Each call below returns a pair: the tokenizer outputs as a dict, and the
# label tensor for the same rows.
# Process train text
roberta_train_text, train_label = processing(x)

# Process cross-validation text
roberta_cv_text, cv_label = processing(cv)

# Process test text
roberta_test_text, test_label = processing(test)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Load the pretrained 'roberta-base' checkpoint wrapped for sequence classification
roberta_model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')

# Training configuration:
#   loss      - sparse categorical cross-entropy computed on raw logits
#   metric    - sparse categorical accuracy, reported under the name 'accuracy'
#   optimizer - Adam with a learning rate of 3e-5
loss = SparseCategoricalCrossentropy(from_logits=True)
metric = SparseCategoricalAccuracy('accuracy')
optimizer = Adam(learning_rate=3e-5)

# Attach the training configuration to the model
roberta_model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metric],
)

print('models are now ready to use')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

models are now ready to use


In [None]:
def eval(model, test_text, test_label, model_type):
    """Score ``model`` on a held-out set and print a diagnostic summary.

    Note: this definition shadows Python's built-in ``eval`` in the notebook.

    Args:
        model: object exposing ``evaluate(inputs, labels, verbose=...)`` and
            ``predict(inputs)``; the prediction must carry a ``logits`` attribute.
        test_text: model inputs for the evaluation set.
        test_label: true class ids aligned with ``test_text``.
        model_type: display name prefixed to every printed line.

    Returns:
        ``(score, invert)`` - the value returned by ``model.evaluate`` (indexed
        here as loss at 0 and accuracy at 1) and the array of predicted class
        ids obtained by arg-maxing the logits.
    """
    score = model.evaluate(test_text, test_label, verbose=1)

    # Every report line starts with the model's display name
    prefix = model_type + ' '
    print(prefix + 'Test loss: ', score[0])
    print(prefix + 'Test accuracy: ', score[1])

    # Turn per-class logits into one predicted class id per sample
    predictions = model.predict(test_text)
    predicted_classes = np.argmax(predictions.logits, axis=1)

    # Confusion matrix followed by the per-class precision/recall report
    print(prefix + 'Confusion Matrix')
    print(confusion_matrix(test_label, predicted_classes))
    print(prefix + 'Classification Report')
    print(classification_report(test_label, predicted_classes))

    return score, predicted_classes

def save_model(model, name):
    """Save ``model`` under the project's Drive folder and confirm on stdout.

    The destination is the fixed folder prefix followed by ``name``; the write
    itself is delegated to ``model.save``.
    """
    base_dir = 'drive/MyDrive/Sentiment Analysis-BERT/'
    model.save(base_dir + name)
    print('Model saved')

In [None]:
# Fine-tune RoBERTa for one epoch on the training split, with the
# cross-validation split supplied as validation data.
roberta_history = roberta_model.fit(
    roberta_train_text,
    train_label,
    validation_data=(roberta_cv_text, cv_label),
    batch_size=4,
    epochs=1,
    verbose=1,
)



In [None]:
# Score the fine-tuned model on the held-out test split and print the report
roberta_score, roberta_pred = eval(
    model=roberta_model,
    test_text=roberta_test_text,
    test_label=test_label,
    model_type='RoBERTa',
)

RoBERTa Test loss:  0.19923518598079681
RoBERTa Test accuracy:  0.9271000027656555
RoBERTa Confusion Matrix
[[4618  363]
 [ 366 4653]]
RoBERTa Classification Report
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      4981
           1       0.93      0.93      0.93      5019

    accuracy                           0.93     10000
   macro avg       0.93      0.93      0.93     10000
weighted avg       0.93      0.93      0.93     10000



In [None]:
# Persist the fine-tuned model to Drive under the name 'RoBERTa Model'
save_model(model=roberta_model, name='RoBERTa Model')

Model saved


Testing the models

In [None]:
# Registry consumed by `example`:
# display name -> [tokenizer class, checkpoint id, trained model]
model_dict = {
    'RoBERTa': [
        RobertaTokenizer,
        'roberta-base',
        roberta_model,
    ],
}

In [None]:
def example(model_dict, name, text):
    """Classify ``text`` with a registered model and print the sentiment.

    Parameters
    ----------
    model_dict : dict
        Maps a display name to a 3-item sequence:
        ``[tokenizer class, checkpoint id, model]``.
    name : str
        Key of the entry in ``model_dict`` to use.
    text : str or sequence of str
        One sentence, or several sentences to classify in a single batch.

    Prints the array of predicted class ids, then one
    ``<sentence> : <label>`` block per input sentence. Returns ``None``.
    """
    entry = model_dict[name]
    tokenizer_cls, checkpoint, model = entry[0], entry[1], entry[2]

    # Instantiate the tokenizer that matches the chosen checkpoint
    tokenizer = tokenizer_cls.from_pretrained(checkpoint)

    # Tokenize the input into TensorFlow tensors held in a plain dict
    inputs = dict(tokenizer(text, padding=True, truncation=True, return_tensors="tf"))

    # Predicted class id = index of the largest logit for each sample
    outputs = model.predict(inputs)
    pred = np.argmax(outputs.logits, axis=1)
    print(pred)

    # Class id 0 is 'Negative', class id 1 is 'Positive'
    labels = ['Negative', 'Positive']

    # A bare string is a single sample; anything else is treated as a batch so
    # that every prediction is reported rather than only the first one.
    sentences = [text] if isinstance(text, str) else list(text)
    for sentence, class_idx in zip(sentences, pred):
        print(sentence, ": \n", labels[class_idx])

In [None]:
# Sanity check with a clearly positive sentence
sentence = 'BERT is awesome...'
example(model_dict=model_dict, name='RoBERTa', text=sentence)

[1]
BERT is awesome... : 
 Positive


In [None]:
# Sanity check with a clearly negative sentence
sentence = "I don't like it"
example(model_dict=model_dict, name='RoBERTa', text=sentence)

[0]
I don't like it : 
 Negative
