# Warning :
# Do "File -> Save a copy in Drive" before you start modifying the notebook, otherwise your modifications will not be saved.

# BERT for Sentiment Analysis
# A) Compute BERT embedding for each review => CLS token

In [None]:
#! pip install transformers

In [None]:
import transformers
import tensorflow as tf

# Downloading a large movie review dataset (25000 reviews)

In [None]:
# Shell command: download the review dataset (the next cell reads it from ./json_pol.json)
!wget https://thome.isir.upmc.fr/classes/RITAL/json_pol.json

In [None]:
import json
from collections import Counter

# Read the downloaded dataset; each entry is indexed as entry[0] (review text)
# and entry[1] (polarity label, 0 or 1) in the rest of the notebook.
file = './json_pol.json'
with open(file, encoding="utf-8") as f:
    data = json.load(f)

# Quick check: dataset size, class balance and one raw sample
counter = Counter(entry[1] for entry in data)
print("Number of reviews : ", len(data))
for caption, label in (("----> # of positive : ", 1), ("----> # of negative : ", 0)):
    print(caption, counter[label])
print("")
print(data[0])

# Getting the Tokenizer

In [None]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the classification model loaded later
model_name = "rttl-ai/bert-base-uncased-yelp-reviews"

# Tokenizer matching that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Experiment with the tokenizer on the first training review

In [None]:
maxL = 512 # Max length of the sequence

# Tokenize the first review. The tokenizer is called directly: `encode_plus` is
# deprecated in favour of `tokenizer(...)`, which takes the same arguments here.
string_tokenized = tokenizer(data[0][0],
                             return_tensors="pt",  # PyTorch tensors with a leading batch dimension
                             add_special_tokens=True,  # add '[CLS]' and '[SEP]'
                             max_length=maxL,  # set max length
                             truncation=True,  # truncate longer messages
                             padding='max_length',  # pad shorter messages up to max_length
                             return_attention_mask=True)

The output of the tokenizer string_tokenized (class BatchEncoding) returns two elements:


*   string_tokenized['input_ids']: the index of each token in the dictionary
*   string_tokenized['attention_mask']: a binary mask (0 to ignore the token, 1 to consider it). This is needed because we need tensors of a fixed length, whereas reviews have a variable number of words



In [None]:
# Show the token indices, then the attention mask, of the tokenized review
for field in ('input_ids', 'attention_mask'):
    print(string_tokenized[field])

# Let's tokenize the whole dataset

In [None]:
import numpy as np

maxL = 512  # maximum sequence length (in tokens)
temb = 768  # embedding size, used later to allocate the feature matrix

# One tensor per review; they are stacked into a single tensor in a later cell
inputs_tokens = []
attention_masks = []

for i in range(len(data)):
    if(i%2500==0):
        print(i)  # progress indicator
    # The tokenizer is called directly: `encode_plus` is deprecated in favour of
    # `tokenizer(...)`, which takes the same arguments here.
    string_tokenized = tokenizer(data[i][0],
                                 return_tensors="pt",  # PyTorch tensors with a leading batch dimension
                                 add_special_tokens=True,  # add '[CLS]' and '[SEP]'
                                 max_length=maxL,  # set max length
                                 truncation=True,  # truncate longer messages
                                 padding='max_length',  # pad shorter messages up to max_length
                                 return_attention_mask=True)

    # Keep the token indices and the attention mask of this review
    inputs_tokens.append(string_tokenized['input_ids'])
    attention_masks.append(string_tokenized['attention_mask'])

# Let's create a 'TensorDataSet' FOR THE SAMPLES where each element is a triplet composed of token word index, token mask, and label

In [None]:
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# Stack the per-review tensors along the batch dimension.
# Guarded so that re-running this cell, once the lists have already been
# replaced by stacked tensors, does not raise in torch.cat.
if isinstance(inputs_tokens, (list, tuple)):
    inputs_tokens = torch.cat(inputs_tokens, dim=0)
if isinstance(attention_masks, (list, tuple)):
    attention_masks = torch.cat(attention_masks, dim=0)

# Labels as a float tensor with one entry per review
y = torch.tensor([sample[1] for sample in data], dtype=torch.float)

# Each dataset element is the triplet (token indices, attention mask, label)
dataset = TensorDataset(inputs_tokens, attention_masks, y)

# Let's download a BERT model for word embedding

In [None]:
from transformers import BertForSequenceClassification

# Load the pretrained classifier from the same checkpoint name
# that was used for the tokenizer.
model = BertForSequenceClassification.from_pretrained(model_name)

In [None]:
# Print the textual description of the loaded model (its nested modules)
print(model)

**You can use the BERT model for directly predicting polarity.** Let us apply that on the first review which has been tokenized with string_tokenized.

In [None]:
# Some preliminary test: run the classifier on the first review only
import torch
import numpy as np

# After the tokenization loop `string_tokenized` holds the LAST review, so the
# first review is taken from the stacked tensors instead. Slicing with [:1]
# keeps the batch dimension; `.to(model.device)` keeps the cell working even if
# the model has already been moved to the GPU.
b_input_ids = inputs_tokens[:1].to(model.device)
b_input_mask = attention_masks[:1].to(model.device)

model.eval()  # inference mode (disables dropout)

with torch.no_grad():  # no gradient is needed for this check
    output = model(input_ids=b_input_ids,attention_mask=b_input_mask, output_hidden_states=True)
print(output.logits) # The output of the logit of the two classes (polarity pos/neg)
last_hidden_states = output.hidden_states[-1] # The last layer before the class prediction: tensor of size nBatch (1 here) x MaxL (512) x temb (768)
print(last_hidden_states.shape)
print(last_hidden_states[0,0,:10]) # The first 10 values of the first element (=[CLS] TOKEN)
print(f" norm cls token={np.linalg.norm(last_hidden_states.detach().cpu().numpy()[0,0,:])}")

In [None]:
# If you need to clean GPU memory
#import gc
#gc.collect()
#torch.cuda.empty_cache()

# Most important STEP: we want to extract the [CLS] representation (1st token of the last layer before logits) for each review, and store it.  

In [None]:
# Sequential DataLoader: no shuffling, so that row i of `features` corresponds to review i
tb = int(100) # batch size
dataloader = DataLoader(dataset, batch_size=tb,shuffle=False) # dataloader
nbTrain = len(data)
features = np.zeros((nbTrain, temb))
nbatch = len(dataloader)  # also counts a last, smaller batch if nbTrain is not a multiple of tb
print(f"nb batches={nbatch}")

# Computing CLS features
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # fall back to CPU without a GPU
model.to(device)
model.eval()  # do not rely on an earlier cell: dropout must be off to get deterministic features
for idx,batch in enumerate(dataloader):
    # Unpack this batch from our dataloader:
    # `batch` contains three pytorch tensors:
    #   [0]: input ids
    #   [1]: attention masks
    #   [2]: labels (not needed to extract features)
    if(idx%10==0):
        print(f"batch {idx} / {nbatch}")
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)

    with torch.no_grad():
        # forward propagation (no gradient needed, we only read hidden states)
        output = model(input_ids=b_input_ids,
                       attention_mask=b_input_mask,
                       output_hidden_states=True)
        last_hidden_states = output.hidden_states[-1] # WARNING: it is now a batch of size tbatch x nToken x embsize
        # Keep only the first token ([CLS]) of each review, then move it to NumPy
        cls_batch = last_hidden_states[:, 0, :].detach().cpu().numpy()
        start = idx*tb
        features[start:start+cls_batch.shape[0], :] = cls_batch


# Now save the embedding of each review into disk!

In [None]:
# Persist the extracted features together with the labels
import pickle

# 'wb' opens data.pkl for binary writing; the file is created if it does not exist
with open('data.pkl', 'wb') as pkl_out:
    pickle.dump([features,y], pkl_out)

In [None]:
import pickle

# Read back the [features, labels] pair from data.pkl (binary mode).
# NOTE: only unpickle files you produced yourself; unpickling can execute arbitrary code.
with open('data.pkl', 'rb') as pkl_in:
    features, y = pickle.load(pkl_in)

In [None]:
import numpy as np

# Sanity checks on the reloaded data:
print(len(features))                 # number of feature rows
print(y)                             # labels
print(np.linalg.norm(features[10]))  # norm of one feature vector

# B) Train a logistic regression model on top of extracted embeddings. Conclude on the performances of BERT for the sentiment classification task

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

np.random.seed(0)

rs=10
# scikit-learn works on array-likes: convert the label tensor explicitly to an
# integer NumPy array instead of relying on an implicit conversion.
labels = np.asarray(y).astype(int)

# 50/50 train/test split, reproducible through random_state
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.5, random_state=rs, shuffle=True)

# The default max_iter (100) may stop the solver before convergence on these
# 768-dimensional, unscaled features; allow more iterations.
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train, y_train)
print(lr_clf.score(X_test, y_test))  # mean accuracy on the held-out half

# C) Fine-tuning BERT for sentiment classification

In [None]:
from transformers import AutoTokenizer

# We will fine-tune a smaller model
# (alternative checkpoint: "rttl-ai/bert-base-uncased-yelp-reviews")
model_name = "haisongzhang/roberta-tiny-cased"

# Tokenizer matching the checkpoint selected above
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Let's tokenize the whole dataset

In [None]:
maxL = 512 # Max length of the sequence

# Tokenize the first review with the new tokenizer. The tokenizer is called
# directly: `encode_plus` is deprecated in favour of `tokenizer(...)`, which
# takes the same arguments here.
string_tokenized = tokenizer(data[0][0],
                             return_tensors="pt",  # PyTorch tensors with a leading batch dimension
                             add_special_tokens=True,  # add '[CLS]' and '[SEP]'
                             max_length=maxL,  # set max length
                             truncation=True,  # truncate longer messages
                             padding='max_length',  # pad shorter messages up to max_length
                             return_attention_mask=True)

In [None]:
import numpy as np

maxL = 512  # maximum sequence length (in tokens)


# One tensor per review; they are stacked into a single tensor in the next cell
inputs_tokens = []
attention_masks = []

for i in range(len(data)):
    if(i%2500==0):
        print(i)  # progress indicator
    # The tokenizer is called directly: `encode_plus` is deprecated in favour of
    # `tokenizer(...)`, which takes the same arguments here.
    string_tokenized = tokenizer(data[i][0],
                                 return_tensors="pt",  # PyTorch tensors with a leading batch dimension
                                 add_special_tokens=True,  # add '[CLS]' and '[SEP]'
                                 max_length=maxL,  # set max length
                                 truncation=True,  # truncate longer messages
                                 padding='max_length',  # pad shorter messages up to max_length
                                 return_attention_mask=True)

    # Keep the token indices and the attention mask of this review
    inputs_tokens.append(string_tokenized['input_ids'])
    attention_masks.append(string_tokenized['attention_mask'])

# Let's create 'TensorDataSets' FOR THE TRAIN/TEST SAMPLES where each element is a triplet composed of token word index, token mask, and label

In [None]:
import torch
from sklearn.model_selection import train_test_split

# Stack the per-review tensors along the batch dimension.
# Guarded so that re-running this cell, once the lists have already been
# replaced by stacked tensors, does not raise in torch.cat.
if isinstance(inputs_tokens, (list, tuple)):
    inputs_tokens = torch.cat(inputs_tokens, dim=0)
if isinstance(attention_masks, (list, tuple)):
    attention_masks = torch.cat(attention_masks, dim=0)

# One-hot float labels: column 0 is set for label 0, column 1 for label 1
class_ids = torch.tensor([sample[1] for sample in data], dtype=torch.long)
y = torch.nn.functional.one_hot(class_ids, num_classes=2).to(torch.float)

np.random.seed(0)
rs=10

# Same 50/50 split applied jointly to token indices, attention masks and labels
(inputs_tokens_train, inputs_tokens_test,
 attention_masks_train, attention_masks_test,
 y_train, y_test) = train_test_split(inputs_tokens, attention_masks, y, test_size=0.5, random_state=rs)

# Shapes of each train/test pair, as a sanity check
for train_part, test_part in ((inputs_tokens_train, inputs_tokens_test),
                              (attention_masks_train, attention_masks_test),
                              (y_train, y_test)):
    print(train_part.shape)
    print(test_part.shape)




In [None]:
from torch.utils.data import (
    DataLoader,
    RandomSampler,
    SequentialSampler,
    TensorDataset,
    random_split,
)

# Each dataset element is the triplet (token indices, attention mask, label)
dataset_train, dataset_test = (
    TensorDataset(tokens, masks, targets)
    for tokens, masks, targets in (
        (inputs_tokens_train, attention_masks_train, y_train),
        (inputs_tokens_test, attention_masks_test, y_test),
    )
)

# Let's download a BERT model for word embedding

In [None]:
from transformers import BertForSequenceClassification

# Load the classifier for the checkpoint currently stored in `model_name`
model = BertForSequenceClassification.from_pretrained(model_name)

# Display its structure
print(model)

# FINE-TUNING THE MODEL

In [None]:
#import gc
#gc.collect()
#torch.cuda.empty_cache()

In [None]:
# Function to compute the accuracy on train/test sets
def accuracy(model, dataloader):
  """Return the classification accuracy of `model` over `dataloader`, in percent.

  Each batch is read as a triplet (input ids, attention mask, labels) where the
  labels are one-hot rows: the true class is the argmax of the label row and
  the predicted class is the argmax of the logits.

  Args:
    model: module called as model(input_ids=..., attention_mask=...) whose
      output exposes a `.logits` tensor of shape (batch, classes).
    dataloader: iterable yielding the batches described above.

  Returns:
    The percentage (0-100) of correctly classified samples as a Python float,
    or 0.0 when the dataloader yields no sample.
  """
  # Evaluate on the device the model already lives on instead of assuming CUDA
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device("cpu")

  was_training = model.training  # remembered so the caller's mode can be restored
  model.eval()
  nbgood = 0
  nbtotal = 0
  try:
    with torch.no_grad():
      for batch in dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        pred = model(input_ids=b_input_ids, attention_mask=b_input_mask)
        yhat = pred.logits.argmax(dim=1)
        ytrue = b_labels.argmax(dim=1)
        nbgood += (yhat==ytrue).sum().item()
        nbtotal += b_labels.shape[0]
  finally:
    # Do not silently leave a model that was being trained in eval mode
    model.train(was_training)

  if nbtotal == 0:
    return 0.0
  # Divide by the number of samples actually seen (was a hard-coded 125.0,
  # i.e. only correct for exactly 12500 samples)
  return 100.0 * nbgood / nbtotal


In [None]:
import torch.nn as nn
import torch.optim as optim
tb = int(25) # batch size
# create DataLoaders train/test (only the training set is shuffled)
train_dataloader = DataLoader(dataset_train, batch_size=tb,shuffle=True)
test_dataloader = DataLoader(dataset_test, batch_size=tb,shuffle=False)

nbepochs =2
loss = nn.CrossEntropyLoss() # cross entropy loss
optimizer = optim.Adam(model.parameters(), lr=1e-4)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # fall back to CPU without a GPU
model.to(device)

# TRAINING LOOP
for e in range(nbepochs): # LOOP over epochs
  # Re-enable training mode at EVERY epoch: the evaluation run at the end of the
  # previous epoch may leave the model in eval mode, which would disable dropout
  # for all the following epochs.
  model.train()
  for idx,batch in enumerate(train_dataloader): # LOOP over batches
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # Zero the gradient accumulator
    optimizer.zero_grad()
    # Compute prediction (forward pass)
    pred = model(input_ids=b_input_ids, attention_mask=b_input_mask).logits
    # Compute loss (cross entropy) between predictions and labels
    batch_loss = loss(pred, b_labels)
    # Compute gradients (backward pass)
    batch_loss.backward()
    # Update parameters
    optimizer.step()

  print("epoch",e," acc train=",accuracy(model,train_dataloader)," acc test=",accuracy(model,test_dataloader) ) # Computing performances at the end of each epoch

