# Computational Intelligence Project - NLP
## BERT Model Training Script

## Setting up environment
---

In [None]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import unicodedata
import numpy as np
import random
import re
import nltk
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay
import transformers
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import BertTokenizer, TFBertForSequenceClassification
from keras.utils import to_categorical
import warnings
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from datetime import datetime

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Silence every warning category for the rest of the session.
warnings.simplefilter(action='ignore')

## Load Data
---

In [None]:
# Read the raw sentiment CSV; the path is relative to the working directory.
dataset = pd.read_csv(filepath_or_buffer="sentiment.csv")

In [None]:
# Distinct sentiment labels as a plain list; np.unique returns them sorted.
classes = np.unique(
    dataset['sentiment'].to_numpy()
).tolist()
print(f"Categories: {classes}")

Categories: ['Negative', 'Neutral', 'Positive']


In [None]:
# Separating sentences and labels.
# .copy() yields an independent frame, so the column assignments made during
# preprocessing do not write into a slice of `dataset`.
data = dataset[['text', 'sentiment']].copy()

# Converting categorical labels to numerical ids: each label maps to its
# position in `classes` (one dict lookup per row instead of a list scan).
class_to_id = {name: idx for idx, name in enumerate(classes)}
labels = [class_to_id[x] for x in data['sentiment']]

## Preprocessing
---

In [None]:
# Preprocessing
data['text'] = (
    data['text']
    # Lower-case first so the character filter only needs to allow a-z.
    .str.lower()
    # removing special chars. The previous class `A-z` also let `[ \ ] ^ _`
    # and the backtick through, since they sit between 'Z' and 'a' in ASCII.
    .str.replace(r'[^a-z0-9\s]', '', regex=True)
    # Drop the retweet marker only when it is a standalone token; a plain
    # substring replace also mangles words such as "party" or "heart".
    .str.replace(r'\brt\b', '', regex=True)
)

In [None]:
# Preview the first rows of the cleaned frame.
data.head(n=5)

In [None]:
# Removing Stopwords
# English stop-word set from NLTK, with 'not' taken out so it stays in the text.
stop = {word for word in stopwords.words("english")}
stop.remove('not')
def remove_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in `stop`.

    Tokens are compared exactly as they appear, and the remaining ones are
    joined back together with single spaces.
    """
    return " ".join(token for token in text.split() if token not in stop)

In [None]:
# Strip stop words from every row of the text column.
data["text"] = data["text"].apply(remove_stopwords)

In [None]:
# Plain Python lists of texts and integer labels for the split.
x = list(data['text'])
y = list(labels)
# Stratified 80/20 split. The fixed seed makes the split, and every metric
# computed on it, reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42)

## Tokenization
---

In [None]:
# Load the pretrained tokenizer for the bert-base-uncased checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    do_lower_case=True,
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def convert_example_to_feature(text, max_length=50):
  """Tokenize one text into fixed-length BERT inputs.

  Args:
    text: the string to encode.
    max_length: length every example is padded/truncated to. Defaults to 50,
      the value that used to be hard-coded here.

  Returns:
    The output of `bert_tokenizer.encode_plus`; callers in this notebook read
    its `input_ids`, `token_type_ids` and `attention_mask` entries.
  """
  return bert_tokenizer.encode_plus(
      text,
      add_special_tokens=True,     # add [CLS], [SEP]
      max_length=max_length,       # pad/truncate every example to this length
      padding='max_length',
      truncation=True,
      return_attention_mask=True,  # add attention mask to not focus on pad tokens
  )

In [None]:
# map to the expected input to TFBertForSequenceClassification
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
  """Pack the three encoded inputs into a feature dict and pair it with `label`."""
  features = dict(
      input_ids=input_ids,
      token_type_ids=token_type_ids,
      attention_mask=attention_masks,
  )
  return features, label

def encode_examples(ds):
  """Encode (text, label) pairs into a tf.data.Dataset of (feature dict, label).

  `ds` is iterated exactly once, so a one-shot iterator such as `zip(...)` is
  acceptable. Each label is wrapped in a one-element list before slicing.
  """
  encoded_pairs = [(convert_example_to_feature(text), [label]) for text, label in ds]

  # Column-wise lists, so the final dataset can be built from slices.
  input_ids_list = [enc['input_ids'] for enc, _ in encoded_pairs]
  token_type_ids_list = [enc['token_type_ids'] for enc, _ in encoded_pairs]
  attention_mask_list = [enc['attention_mask'] for enc, _ in encoded_pairs]
  label_list = [wrapped_label for _, wrapped_label in encoded_pairs]

  columns = (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
  return tf.data.Dataset.from_tensor_slices(columns).map(map_example_to_dict)

In [None]:
batch_size = 32
# Pair each text with its label; encode_examples consumes each zip once.
ds_train = zip(X_train, y_train)
ds_test = zip(X_val, y_val)
# Training set: shuffle with a buffer covering the whole set, then batch.
ds_train_encoded = (
    encode_examples(ds_train)
    .shuffle(len(X_train))
    .batch(batch_size)
)
# Validation set: batched only, no shuffling.
ds_test_encoded = (
    encode_examples(ds_test)
    .batch(batch_size)
)

In [None]:
# Pretrained bert-base-uncased checkpoint configured for 3 output labels.
bert_model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)
# Mark the `bert` sub-layer as non-trainable before compiling.
bert_model.bert.trainable = False
bert_model.summary()

In [None]:
learning_rate = 1e-3
# multiple epochs might be better as long as we will not overfit the model
number_of_epochs = 20

# Labels are integer class ids rather than one-hot vectors, so the sparse
# categorical loss and accuracy are used; the loss is fed raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]

# choosing Adam optimizer
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    epsilon=1e-08,
)

bert_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric,
)

In [None]:
def exp_decay_scheduler(epoch, lr):
  """Learning-rate schedule: hold for the first 5 epochs, then decay exponentially.

  Args:
    epoch: index of the epoch about to start.
    lr: learning rate currently in use.

  Returns:
    `lr` unchanged while `epoch < 5`; otherwise `lr * exp(-0.1)` as a plain
    Python float.
  """
  import math  # local import keeps this cell self-contained

  if epoch < 5:
    return lr
  # math.exp keeps the result a float. tf.math.exp returned a Tensor, which
  # LearningRateScheduler implementations that require a float reject.
  return float(lr) * math.exp(-0.1)
  
callbacks = [
    # Stop once val_loss has not improved for 3 epochs, and roll the model back
    # to the best epoch's weights instead of keeping the last (worse) ones.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=3, restore_best_weights=True),
    tf.keras.callbacks.LearningRateScheduler(exp_decay_scheduler),
]

In [None]:
# ds_train_encoded is already batched by tf.data, so `batch_size` is not passed
# here; Keras documents that it must not be specified for dataset inputs.
history = bert_model.fit(
    ds_train_encoded,
    epochs=number_of_epochs,
    validation_data=ds_test_encoded,
    callbacks=callbacks)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


# Prediction
----

In [None]:
# Single hand-written sentence used to try the trained model.
text = [
    'i do not like to spend money ,',
]

In [None]:
def convert_example_to_feature(text, max_length=50):
  """Tokenize one text into fixed-length BERT inputs for prediction.

  Args:
    text: the string to encode.
    max_length: length every example is padded/truncated to. Defaults to 50,
      the length the training inputs were encoded with; padding to 512 only
      adds masked pad tokens and makes inference slower.

  Returns:
    The output of `bert_tokenizer.encode_plus`; callers in this notebook read
    its `input_ids`, `token_type_ids` and `attention_mask` entries.
  """
  return bert_tokenizer.encode_plus(
      text,
      add_special_tokens=True,     # add [CLS], [SEP]
      max_length=max_length,       # pad/truncate every example to this length
      padding='max_length',
      truncation=True,
      return_attention_mask=True,  # add attention mask to not focus on pad tokens
  )
# map to the expected input to TFBertForSequenceClassification
def map_example_to_dict(input_ids, attention_masks, token_type_ids):
  """Pack the three encoded inputs into a feature dict, returned inside a 1-tuple (no label)."""
  features = dict(
      input_ids=input_ids,
      token_type_ids=token_type_ids,
      attention_mask=attention_masks,
  )
  return (features,)

def encode_examples(ds):
  """Encode unlabeled texts into a tf.data.Dataset of BERT feature dicts.

  Args:
    ds: an iterable of strings. A single bare string is treated as one text.

  Returns:
    A tf.data.Dataset built from the encoded inputs and mapped through
    `map_example_to_dict`.
  """
  # A bare string would otherwise be iterated character by character.
  if isinstance(ds, str):
    ds = [ds]

  # prepare lists, so that we can build up final TensorFlow dataset from slices.
  input_ids_list = []
  token_type_ids_list = []
  attention_mask_list = []
  for review in ds:
    bert_input = convert_example_to_feature(review)
    input_ids_list.append(bert_input['input_ids'])
    token_type_ids_list.append(bert_input['token_type_ids'])
    attention_mask_list.append(bert_input['attention_mask'])

  return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list)).map(map_example_to_dict)

In [None]:
# Encode the sample text and batch it for prediction.
# Note: this rebinds both `batch_size` and `ds_test_encoded`.
batch_size = 8
ds_test_encoded = (
    encode_examples(text)
    .batch(batch_size)
)

In [None]:
y_pred = bert_model.predict(ds_test_encoded)
# Index of the highest logit for the first (and only) input sentence.
# The original f-string was missing the closing `)]` and did not parse.
predicted_idx = int(np.argmax(y_pred['logits'], axis=-1)[0])
print(f"Prediciont is: {classes[predicted_idx]} - The Real impression is: Negative")

Prediciont is: Negative - The Real impression is: Negative


In [None]:
# ds_train_encoded reshuffles on every pass, so predictions made from it do
# not line up with y_train. cache() freezes the order of the first complete
# pass; labels and predictions are then both read in that same fixed order.
ds_train_eval = ds_train_encoded.cache()
y_true = [
    int(label)
    for _, batch_labels in ds_train_eval
    for label in batch_labels.numpy().ravel()
]
y_pred_train = bert_model.predict(ds_train_eval)

