This jupyter notebook is a direct copy and adaptation of the code presented [here](https://ledatascientist.com/analyse-de-sentiments-avec-camembert/)

In [None]:
import torch
import seaborn
import pandas as pd
import numpy as np
from sklearn import metrics
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from transformers import CamembertForSequenceClassification, CamembertTokenizer

One also needs the `SentencePiece` library for `CamembertTokenizer`.

In [None]:
#! pip install sentencepiece
! pip install datasets

# Text encoding

## Loading the dataset
One needs to encode the text from the dataset (here it will be the French movie-review dataset `allocine`) into a vector space; this is called embedding.

In [None]:
# Loader function from the `datasets` package installed above.
from datasets import load_dataset

# Load the "allocine" dataset; the returned object is inspected in the next cells.
dataset = load_dataset("allocine")

In [None]:
# Show the type of the object returned by load_dataset, then its summary.
print(type(dataset))
print(dataset)

In [None]:
# List the column names of the loaded dataset.
dataset.column_names

In [None]:
# Keep a handle on each of the three splits ("train", "validation", "test").
dataset_train = dataset["train"]
dataset_validation = dataset["validation"]
dataset_test = dataset["test"]

We can request specific attributes of the dataset, like `description`, `citation`, and `homepage`, by calling them directly.

In [None]:
# Description attached to the training split.
dataset_train.description

In [None]:
# Citation attached to the training split.
dataset_train.citation

In [None]:
# Homepage attached to the training split.
dataset_train.homepage

In [None]:
# Metadata record exposed through the split's `info` attribute.
dataset_train.info

In [None]:
# Shape of the training split (presumably (rows, columns) -- compare with the two cells below).
dataset_train.shape

In [None]:
# Number of columns of the training split.
dataset_train.num_columns

In [None]:
# Number of rows of the training split.
dataset_train.num_rows

In [None]:
# len() of the split; expected to agree with num_rows above.
len(dataset_train)

In [None]:
# Column names of the training split.
dataset_train.column_names

## Tokenizer / encoder
* We will use the tokenizer of camemBERT to perform the embedding
* We can change the pre-trained model (the first argument of `from_pretrained`):

|             Model                    | #params | Arch. |      Training data                |
| :----------------------------------- | :-----  | :---  | :-------------------------------  |
| camembert-base                         | 110M    | Base  | OSCAR (138 GB of text)            |
| camembert/camembert-large              | 335M    | Large | CCNet (135 GB of text)            |
| camembert/camembert-base-ccnet         | 110M    | Base  | CCNet (135 GB of text)            |
| camembert/camembert-base-wikipedia-4gb | 110M    | Base  | Wikipedia (4 GB of text)          |
| camembert/camembert-base-oscar-4gb     | 110M    | Base  | Subsample of OSCAR (4 GB of text) |
| camembert/camembert-base-ccnet-4gb     | 110M    | Base  | Subsample of CCNet (4 GB of text) |

* `do_lower_case=True` lowercases all the characters (if there are any uppercase characters).

In [None]:
# Show the features (column schema) of the training split.
dataset_train.features

In [None]:
# Extract the raw review texts of the train and test splits as plain lists.
# NOTE(review): "review" is the text column of the allocine dataset -- confirm
# against the output of `dataset_train.column_names` above.
train_text = list(dataset_train["review"])
test_text = list(dataset_test["review"])

# All the review texts (train followed by test) in a single list.
full_text = train_text + test_text

In [None]:
# Load the tokenizer that matches the pre-trained 'camembert-base' model.
# `do_lower_case=True` asks for the text to be lowercased.
Tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)

In [None]:
# batch_encode_plus encodes a whole batch of texts at once.
# This cell can take a long time (~ 1 min for me)

# Maximum number of tokens kept per review (special tokens included); longer
# reviews are truncated.
# NOTE(review): 512 is assumed to be the longest input camembert-base accepts;
# lower it (e.g. 128 or 256) to reduce memory use and run time.
MAX_LENGTH = 512

# Encoding options shared by the train and test splits.
encoding_options = dict(add_special_tokens=True,
                        max_length=MAX_LENGTH,
                        padding=True,
                        truncation=True,
                        return_attention_mask=True,
                        return_tensors='pt')

train_encoded_batch = Tokenizer.batch_encode_plus(train_text, **encoding_options)

test_encoded_batch = Tokenizer.batch_encode_plus(test_text, **encoding_options)

In [None]:
# We transform the sentiment labels of each split into a torch tensor.
# NOTE(review): "label" is the sentiment column of the allocine dataset --
# confirm against the output of `dataset_train.column_names` above.
train_sentiment = torch.tensor(list(dataset_train["label"]))
test_sentiment = torch.tensor(list(dataset_test["label"]))

The split of the dataset into train and test sets is already performed.

In [None]:
# For each split, bundle the token ids, the attention masks and the labels
# into one dataset of aligned tensors.
train_dataset = TensorDataset(
    train_encoded_batch['input_ids'],
    train_encoded_batch['attention_mask'],
    train_sentiment,
)
test_dataset = TensorDataset(
    test_encoded_batch['input_ids'],
    test_encoded_batch['attention_mask'],
    test_sentiment,
)

In [None]:
# Number of samples per batch, used by both DataLoaders below.
batch_size = 16

In [None]:
# Wrap the train and test datasets in DataLoaders (iterables yielding batches).
# Training batches are drawn in random order; test batches follow the dataset order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(test_dataset),
)

## Model loading:
Thanks to the `transformers` module, we only need one line of code to retrieve the pre-trained CamemBERT model.

In [None]:
# Load the pre-trained 'camembert-base' model for sequence classification,
# configured for 2 labels.
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels = 2)

## Fine tuning / Hyperparametrization

In [None]:
# AdamW optimizer over all the model parameters (learning rate 2e-5, epsilon 1e-8).
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
# Number of passes over the training DataLoader.
epochs = 3

In [None]:
# torch.device is an object representing the device on which a torch.Tensor is or will be allocated.
# Here every batch is sent to the CPU.
device = torch.device("cpu")

In [None]:
# Collects the statistics of each epoch (one dict per epoch, appended by the training loop).
training_stats = []

## Training

In [None]:
# Training loop

# The model must live on the same device as the batches that are sent to it.
model.to(device)

for epoch in range(epochs):
    print("")
    print(f'########## Epoch {epoch+1} / {epochs} ##########')
    print('Training...')

    # Running sum of the batch losses for the current epoch
    total_train_loss = 0

    # Switch the model to training mode:
    # in this mode some layers of the model behave differently
    model.train()

    # For each batch
    for step, batch in enumerate(train_dataloader):
        # Print a progress message every 40 batches
        if step % 40 == 0 and step != 0:
            print(f'Batch {step} of {len(train_dataloader)}.')

        # Retrieve the data of the batch and send it to the device
        input_id = batch[0].to(device)
        attention_mask = batch[1].to(device)
        sentiment = batch[2].to(device)

        # Reset the gradients to 0
        model.zero_grad()

        # Forward pass. Because labels are given, the model also computes the loss.
        outputs = model(input_id,
                        token_type_ids=None,
                        attention_mask=attention_mask,
                        labels=sentiment)

        # Do not tuple-unpack `outputs`: recent transformers versions return a
        # dict-like ModelOutput, whose unpacking yields its keys. Integer
        # indexing works for both a plain tuple and a ModelOutput, and the loss
        # comes first when labels are provided.
        loss = outputs[0]

        # Accumulate the total loss
        # .item() gives the numerical value of the loss
        total_train_loss += loss.item()

        # Backpropagation
        loss.backward()

        # Update the parameters with the optimizer
        optimizer.step()

    # Average loss over the whole epoch
    avg_train_loss = total_train_loss / len(train_dataloader)

    print("")
    print(f"  Average training loss: {avg_train_loss:.2f}")

    # Record the statistics of the epoch
    training_stats.append({'epoch': epoch + 1,'Training Loss': avg_train_loss})

# Save first, so that the message is only printed once the file has been written.
torch.save(model.state_dict(), "./sentiments.pt")
print("Model saved!")

In [None]:
def preprocess(raw_reviews, sentiments=None):
    """Tokenize a batch of raw reviews and optionally tensorize their labels.

    Args:
        raw_reviews: the review texts to encode, passed as-is to
            ``Tokenizer.batch_encode_plus``.
        sentiments: optional labels matching ``raw_reviews``. When given
            (anything other than ``None``), they are converted to a tensor.

    Returns:
        ``(input_ids, attention_mask)``, or
        ``(input_ids, attention_mask, sentiments)`` when labels are given.
    """
    # `Tokenizer` is the tokenizer instance created earlier in this notebook.
    # `padding=True` replaces the deprecated `pad_to_max_length=True` and
    # matches the options used to encode the training data.
    encoded_batch = Tokenizer.batch_encode_plus(raw_reviews,
                                                truncation=True,
                                                padding=True,
                                                return_attention_mask=True,
                                                return_tensors='pt')
    # Explicit None check: the truth value of a multi-element tensor or array
    # is ambiguous and raises an error.
    if sentiments is not None:
        sentiments = torch.as_tensor(sentiments)
        return encoded_batch['input_ids'], encoded_batch['attention_mask'], sentiments
    return encoded_batch['input_ids'], encoded_batch['attention_mask']
 
def predict(reviews, model=model):
    """Predict the sentiment class of each review.

    Args:
        reviews: the review texts, forwarded to ``preprocess``.
        model: the classifier to use. The default is the ``model`` object that
            exists when this function is defined.

    Returns:
        A CPU tensor with one predicted class index per review (the argmax of
        the model's first output along dimension 1).
    """
    # Evaluation mode: some layers behave differently than during training.
    model.eval()
    input_ids, attention_mask = preprocess(reviews)

    # Send the inputs to whichever device holds the model weights.
    model_device = next(model.parameters()).device
    input_ids = input_ids.to(model_device)
    attention_mask = attention_mask.to(model_device)

    # No gradient is needed for inference.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Bring the predictions back to the CPU so that they can be handed to
    # CPU-only consumers (e.g. the sklearn metrics used in `evaluate`).
    return torch.argmax(outputs[0], dim=1).cpu()
 
 
def evaluate(reviews, sentiments):
    """Print the weighted F1 score of the predictions and plot the confusion matrix.

    Args:
        reviews: the review texts, forwarded to ``predict``.
        sentiments: the expected labels, compared against the predictions.
    """
    predicted = predict(reviews)

    weighted_f1 = metrics.f1_score(sentiments,
                                   predicted,
                                   average='weighted',
                                   zero_division=0)
    print(weighted_f1)

    confusion = metrics.confusion_matrix(sentiments, predicted)
    seaborn.heatmap(confusion)