<a href="https://colab.research.google.com/github/CharlesAttend/M1-S2-DAC/blob/main/RITAL/TAL/TME/TME4/4b_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Warning : 
# Do "File -> Save a copy in Drive" before you start modifying the notebook, otherwise your modifications will not be saved.

# BERT for Sentiment Analysis 

In [None]:
! pip install transformers

In [None]:
import transformers
import tensorflow as tf

# Downloading the large movie review dataset (50000 reviews in train, 50000 reviews in test)

In [None]:
!wget https://thome.isir.upmc.fr/classes/RITAL/json_pol

In [None]:
import json
from collections import Counter

# Loading json
# The corpus is parsed from the first line of the file only.
with open("./json_pol", encoding="utf-8") as f:
    data = f.readlines()

json_data = json.loads(data[0])
train = json_data["train"]
test = json_data["test"]

# Quick Check
# Each sample is indexed as sample[1] for its label; count the labels per split.
counter_train = Counter([sample[1] for sample in train])
counter_test = Counter([sample[1] for sample in test])

# Train split: size, class balance, then the first sample surrounded by blank lines
print("Number of train reviews : ", len(train))
print("----> # of positive : ", counter_train[1])
print("----> # of negative : ", counter_train[0])
print("", train[0], "", sep="\n")

# Test split: same summary
print("Number of test reviews : ", len(test))
print("----> # of positive : ", counter_test[1])
print("----> # of negative : ", counter_test[0])
print("", test[0], "", sep="\n")


# Getting the Tokenizer

In [None]:
from transformers import AutoTokenizer
# Name of the pretrained checkpoint whose tokenizer is loaded
tokenizer_checkpoint = "textattack/bert-base-uncased-yelp-polarity"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)


# Experiment with the tokenizer on the first training review

In [None]:
maxL = 512 # Max length of the sequence

# Tokenize the text of the first training review.
# The tokenizer is called directly: this is the recommended entry point
# (`encode_plus` is deprecated in favour of `__call__`).
string_tokenized = tokenizer(
    train[0][0],
    return_tensors="pt",         # return PyTorch tensors
    add_special_tokens=True,     # add '[CLS]' and '[SEP]'
    max_length=maxL,             # set max length
    truncation=True,             # truncate longer messages
    padding='max_length',        # add padding
    return_attention_mask=True,
)

The output of the tokenizer string_tokenized (class BatchEncoding) returns two elements:


*   string_tokenized['input_ids']: the index of each token in the dictionary
*   string_tokenized['attention_mask']: a binary mask (0 to ignore the token, 1 to consider it). This is needed because tensors must have a fixed length, whereas reviews have a variable number of words: shorter reviews are padded, and the mask marks the padding positions to ignore.



In [None]:
# Show the token indices first, then the attention mask
for field in ('input_ids', 'attention_mask'):
    print(string_tokenized[field])

# Let's download a BERT model for word embedding

In [None]:
from transformers import BertForSequenceClassification
# Name of the pretrained checkpoint whose weights are loaded
model_checkpoint = "textattack/bert-base-uncased-yelp-polarity"
model = BertForSequenceClassification.from_pretrained(model_checkpoint)

In [None]:
# Display the structure of the loaded model
print(model)

**You can use the BERT model for directly predicting polarity.** Let us apply that on the first review which has been tokenized with string_tokenized.

In [None]:
# Some preliminary test
import torch
import numpy as np

# Put the inputs on the device the model currently lives on, so that this cell
# still runs if it is executed again after the model has been moved to the GPU.
model_device = next(model.parameters()).device
b_input_ids = string_tokenized['input_ids'].to(model_device)
b_input_mask = string_tokenized['attention_mask'].to(model_device)

model.eval()  # inference mode

# A plain forward pass needs no gradient: do not build the autograd graph.
with torch.no_grad():
    output = model(input_ids=b_input_ids, attention_mask=b_input_mask, output_hidden_states=True)

print(output.logits) # The output of the logit of the two classes (polarity pos/neg)
last_hidden_states = output.hidden_states[-1] # The last layer before the class prediction: tensor of size nBatch (1 here) x MaxL (512) x temb (768)
print(last_hidden_states.shape)
print(last_hidden_states[0,0,:10]) # The first 10 values of the first element (= [CLS] token)
print(f" norm cls token={np.linalg.norm(last_hidden_states.detach().cpu().numpy()[0,0,:])}")

# Let's tokenize the whole dataset 

In [None]:
import numpy as np

maxL = 512  # max length (in tokens) of a tokenized review
temb = 768  # size of one token embedding; used to allocate the feature arrays


def tokenize_reviews(reviews, max_length=maxL, log_every=2500):
    """Tokenize the text of every review of a split.

    Args:
        reviews: sequence of samples; the text of a sample is read from sample[0].
        max_length: every review is truncated or padded to this number of tokens.
        log_every: the current index is printed every `log_every` reviews.

    Returns:
        A pair (input_ids, attention_masks) of lists holding one tensor per
        review, in the same order as `reviews`. The lists are meant to be
        stacked afterwards with torch.cat(..., dim=0).
    """
    input_ids, attention_masks = [], []
    for i, review in enumerate(reviews):
        if i % log_every == 0:
            print(i)
        # Calling the tokenizer directly replaces the deprecated `encode_plus`
        encoded = tokenizer(
            review[0],
            return_tensors="pt",         # return PyTorch tensors
            add_special_tokens=True,     # add '[CLS]' and '[SEP]'
            max_length=max_length,       # set max length
            truncation=True,             # truncate longer messages
            padding='max_length',        # add padding
            return_attention_mask=True,
        )
        # Keep both the token indices and the mask telling which ones are padding
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return input_ids, attention_masks


inputs_tokens_train, attention_masks_train = tokenize_reviews(train)
inputs_tokens_test, attention_masks_test = tokenize_reviews(test)
    

# Let's create a `TensorDataset` FOR THE TRAINING SAMPLES where each element is a triplet composed of the token indices, the attention mask, and the label

In [None]:
# Converting input tokens to torch tensors.
# The names are rebound from "list of tensors" to "one stacked tensor", so only
# concatenate while they still are lists: the cell can then safely be re-run.
if isinstance(inputs_tokens_train, list):
    inputs_tokens_train = torch.cat(inputs_tokens_train, dim=0)
if isinstance(attention_masks_train, list):
    attention_masks_train = torch.cat(attention_masks_train, dim=0)

# Converting labels to numpy then torch tensor.
# The label of a sample is read from sample[1]; labels are stored as float64.
y_train = torch.from_numpy(np.array([sample[1] for sample in train], dtype=np.float64))

from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
# Each element of the dataset is a triplet (token indices, attention mask, label)
train_dataset = TensorDataset(inputs_tokens_train, attention_masks_train, y_train)

# Let's do the same FOR THE TEST SAMPLES 

In [None]:
# Converting input tokens to torch tensors.
# The names are rebound from "list of tensors" to "one stacked tensor", so only
# concatenate while they still are lists: the cell can then safely be re-run.
if isinstance(inputs_tokens_test, list):
    inputs_tokens_test = torch.cat(inputs_tokens_test, dim=0)
if isinstance(attention_masks_test, list):
    attention_masks_test = torch.cat(attention_masks_test, dim=0)

# Converting labels to numpy then torch tensor.
# The label of a sample is read from sample[1]; labels are stored as float64.
y_test = torch.from_numpy(np.array([sample[1] for sample in test], dtype=np.float64))

# Each element of the dataset is a triplet (token indices, attention mask, label)
test_dataset = TensorDataset(inputs_tokens_test, attention_masks_test, y_test)

In [None]:
# Optional: if you need to clean GPU memory (e.g. after an out-of-memory error),
# uncomment the three lines below and run this cell.
#import gc
#gc.collect()
#torch.cuda.empty_cache()

# Most important STEP: we want to extract the [CLS] representation (1st token of the last layer before logits) for each review, and store it in train and test.  

In [None]:
# Run the model over the training set, batch by batch, and store for every
# review the [CLS] embedding (first token) of the last hidden layer.
tb = int(100)  # batch size
# shuffle=False keeps the original order, so that row i of f_train matches train[i]
train_dataloader = DataLoader(train_dataset, batch_size=tb, shuffle=False)
nbTrain = len(train)
f_train = np.zeros((nbTrain, temb))
nbtach = (nbTrain + tb - 1) // tb  # ceil division: also counts a last, partial batch
print(f"nb batches={nbtach}")

# Computing CLS features
# Use the GPU when there is one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode
for idx, batch in enumerate(train_dataloader):
    # Unpack this training batch from our dataloader:
    # `batch` contains three pytorch tensors:
    #   [0]: input ids
    #   [1]: attention masks
    #   [2]: labels (not needed to extract features)
    if idx % 10 == 0:
        print(f"batch {idx} / {nbtach}")
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)

    with torch.no_grad():
        # forward propagation (evaluate model on training batch)
        output = model(input_ids=b_input_ids,
                       attention_mask=b_input_mask,
                       output_hidden_states=True)
    last_hidden_states = output.hidden_states[-1]  # batch of size tbatch x nToken x embsize
    # Token 0 of every sequence is [CLS]; copy its embedding back to the CPU array
    start = idx * tb
    f_train[start:start + b_input_ids.shape[0], :] = last_hidden_states[:, 0, :].detach().cpu().numpy()


# Extract [CLS] token in TEST

In [None]:
# Run the model over the test set, batch by batch, and store for every
# review the [CLS] embedding (first token) of the last hidden layer.
tb = int(100)  # batch size
# shuffle=False keeps the original order, so that row i of f_test matches test[i]
test_dataloader = DataLoader(test_dataset, batch_size=tb, shuffle=False)
nbTest = len(test)
f_test = np.zeros((nbTest, temb))
nbtach = (nbTest + tb - 1) // tb  # ceil division: also counts a last, partial batch
print(f"nb batches={nbtach}")

# Computing CLS features
# Use the GPU when there is one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference mode
for idx, batch in enumerate(test_dataloader):
    # Unpack this test batch from our dataloader:
    # `batch` contains three pytorch tensors:
    #   [0]: input ids
    #   [1]: attention masks
    #   [2]: labels (not needed to extract features)
    if idx % 10 == 0:
        print(f"batch {idx} / {nbtach}")
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)

    with torch.no_grad():
        # forward propagation (evaluate model on test batch)
        output = model(input_ids=b_input_ids,
                       attention_mask=b_input_mask,
                       output_hidden_states=True)
    last_hidden_states = output.hidden_states[-1]  # batch of size tbatch x nToken x embsize
    # Token 0 of every sequence is [CLS]; copy its embedding back to the CPU array
    start = idx * tb
    f_test[start:start + b_input_ids.shape[0], :] = last_hidden_states[:, 0, :].detach().cpu().numpy()
        

# Now save the embedding of each review to disk!

In [None]:
# Saving the features and labels
import pickle

# One pickle file per split, each holding the pair [features, labels].
# Opening in 'wb' mode creates the file (or overwrites an existing one).
splits_to_save = (
    ('train-data.pkl', [f_train, y_train]),
    ('test-data.pkl', [f_test, y_test]),
)
for path, payload in splits_to_save:
    with open(path, 'wb') as file:
        pickle.dump(payload, file)

In [None]:
import pickle

# Read back the [features, labels] pair of each split from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('train-data.pkl', 'rb') as file:  # binary mode
    feature_train, ytrain = pickle.load(file)

with open('test-data.pkl', 'rb') as file:  # binary mode
    feature_test, ytest = pickle.load(file)
    

In [None]:
import numpy as np
# Sanity check of the reloaded data: number of train rows and shape of the test features
n_train_rows = feature_train.shape[0]
print(n_train_rows)
print(feature_test.shape)

# Labels of both splits, then the norm of one train feature vector
for labels in (ytrain, ytest):
    print(labels)
print(np.linalg.norm(feature_train[10]))

# Finally: train a logistic regression model on top of the extracted embeddings. Conclude on the performance of BERT for the sentiment classification task