In [None]:
import torch
import seaborn
import pandas as pd
from sklearn import metrics
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertForSequenceClassification, CamembertTokenizer, AdamW

The `SentencePiece` library is also required by `CamembertTokenizer`.

In [None]:
# Uncomment the line below to install SentencePiece if it is not already available.
#! pip install sentencepiece

# Text encoding

## Loading the dataset
We need to encode the text of the dataset (here, aclImdb) into a vector space; this is called embedding.

In [None]:
# Directory and file names of the aclImdb CSV splits.
dataset_path = "../data/processed/aclImdb/"
train_set_file, test_set_file = "aclImdb_train.csv", "aclImdb_test.csv"

# Read the train and test splits into DataFrames.
data_train = pd.read_csv(f"{dataset_path}{train_set_file}")
data_test = pd.read_csv(f"{dataset_path}{test_set_file}")

In [None]:
# Extract the 'sentiment' and 'text' columns of the training frame as plain Python lists.
sentiment = data_train["sentiment"].tolist()
text = data_train["text"].tolist()

## Tokenizer / encoder
* We will use the CamemBERT tokenizer to perform the embedding.
* We can change the pre-trained model (the first argument of `from_pretrained`):

|             Model                    | #params | Arch. |      Training data                |
| :----------------------------------- | :-----  | :---  | :-------------------------------  |
| camembert-base                         | 110M    | Base  | OSCAR (138 GB of text)            |
| camembert/camembert-large              | 335M    | Large | CCNet (135 GB of text)            |
| camembert/camembert-base-ccnet         | 110M    | Base  | CCNet (135 GB of text)            |
| camembert/camembert-base-wikipedia-4gb | 110M    | Base  | Wikipedia (4 GB of text)          |
| camembert/camembert-base-oscar-4gb     | 110M    | Base  | Subsample of OSCAR (4 GB of text) |
| camembert/camembert-base-ccnet-4gb     | 110M    | Base  | Subsample of CCNet (4 GB of text) |

* `do_lower_case=True` lowercases all characters (if there are any uppercase characters).

In [None]:
# Load the pre-trained tokenizer for the 'camembert-base' checkpoint.
# NOTE(review): confirm that this tokenizer actually honours `do_lower_case`.
Tokenizer = CamembertTokenizer.from_pretrained(
    'camembert-base',
    do_lower_case=True,
)

In [None]:
# CamemBERT accepts at most 512 tokens per sequence (special tokens included).
CAMEMBERT_MAX_TOKENS = 512

# Length of every review, measured in characters.
l_text = list(map(len, text))

# The longest review in characters is only an upper bound on its token count,
# so using it directly as `max_length` would never trigger truncation and could
# produce sequences longer than the model supports. Cap it at the model limit.
MAX_LENGTH = min(max(l_text), CAMEMBERT_MAX_TOKENS)

In [None]:
# `batch_encode_plus` encodes a whole batch of texts in a single call.
# Note: this cell can take a long time to run (about one minute).
encoded_batch = Tokenizer.batch_encode_plus(
    text,
    # Length handling: cut sequences at MAX_LENGTH and pad the others.
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH,
    # Insert the model's special tokens into each sequence.
    add_special_tokens=True,
    # Output format: PyTorch tensors, attention mask included.
    return_tensors='pt',
    return_attention_mask=True,
)

In [None]:
# Convert the list of sentiment labels into a torch tensor (rebinding the same name).
sentiment = torch.tensor(data=sentiment)