# NLP(DEEP) Lab02
## Authors
- Eliot LECLAIR
- Alex POIRON
- Tom THIL
- Aurélien VISENTIN

In [None]:
%pip install transformers[sentencepiece] datasets torch sklearn evaluate

## Imports

In [95]:
#import to get IMDB dataset
from datasets import load_dataset
#imports to get the models and their helper utilities
from transformers import BertModel, AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, Trainer
#Split method to get the validation set
from sklearn.model_selection import train_test_split
#Evaluate our model
import evaluate

#Basics
import torch
import random
import numpy as np

## Load Data

In [16]:
# Load the IMDB dataset by name.
# NOTE(review): despite its name, `df` is not a DataFrame -- the cell output
# shows a DatasetDict with the splits train / test / unsupervised.
df = load_dataset("imdb")
# Display the loaded object (splits, features and row counts)
df



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

## Tokenize the Data

In [17]:
# Create a tokenizer from the bert-base-uncased pre-trained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Create a data collator (DataCollatorWithPadding) built on that tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Both objects are passed to the Trainer constructor further down.

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/vocab.txt
loading file to

### Tokenization function

In [18]:
import datasets
from collections.abc import Mapping  # stdlib; used only for the annotations below

def tokenize_function(example: Mapping) -> Mapping:
    """
    Tokenize the "text" entry of the given dataset element(s).

    Meant to be passed to the dataset `map` method. The annotations use the
    generic `Mapping` type rather than `datasets.arrow_dataset.Example`: that
    class is an internal detail which is not present in every `datasets`
    release, and annotations are evaluated when the `def` runs, so naming it
    in the signature makes the definition itself fail when it is missing.
    Args:
        - example : mapping with a "text" entry -- a single string, or a list
          of strings when the function is mapped with batched=True
    Returns:
        the tokenizer output for that text, with truncation enabled
    """
    return tokenizer(example["text"], truncation=True)

# Apply the tokenization to the loaded dataset; batched=True passes the
# examples to tokenize_function in batches rather than one at a time.
tokenized_datasets = df.map(tokenize_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [45]:
# Hold out 20% of the training reviews as a validation set.
# random_state makes the split reproducible across kernel restarts, and
# stratify keeps the label proportions the same in both subsets.
train_texts = list(tokenized_datasets['train']["text"])
train_labels = list(tokenized_datasets['train']["label"])
X_train, X_valid, y_train, y_valid = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels,
)

As we can see, the split returns texts and labels as separate lists. To obtain complete training and validation sets, we need to pair X_train with y_train, and X_valid with y_valid.
To do so, we will write a helper function.

In [65]:
import datasets
def create_dataset(texts: list, labels: list) -> datasets.Dataset:
    """
    Build a Dataset whose rows pair each text with its label.

    The rows are assembled as [{"text": ..., "label": ...}, ...] so that the
    Dataset can be created from a plain list of dictionaries.
    Args:
        - texts (list): list of texts
        - labels (list): list of labels, aligned with `texts`
    Returns:
        A dataset with one row per (text, label) pair and the columns
        "text" and "label".
    """
    # zip stops at the shorter input, so surplus items in the longer one are ignored.
    rows = [{"text": text, "label": label} for text, label in zip(texts, labels)]

    return datasets.Dataset.from_list(rows)

# Rebuild Dataset objects from the split texts and their labels
train_splited = create_dataset(X_train, y_train)
validation_splited = create_dataset(X_valid, y_valid)

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

We need to tokenize these two new datasets again.

In [None]:
# Tokenize both splits in batches.
# `batched` expects a boolean; the string "True" was only accepted because any
# non-empty string is truthy.
tokenized_train_split = train_splited.map(tokenize_function, batched=True)
tokenized_valid_split = validation_splited.map(tokenize_function, batched=True)

## Creation of the Model and Training

In [None]:
# Training configuration: output goes to the "test-trainer" directory,
# and the model makes a single pass over the training data.
training_args = TrainingArguments(output_dir="test-trainer", num_train_epochs=1)

In [60]:
# Build the sequence-classification model on top of the
# bert-base-uncased pre-trained checkpoint, with two output labels.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/5546055f03398095e385d7dc625e636cc8910bf2/pytorch_m

In [67]:
trainer = Trainer(
    model=model,  # Sequence-classification model to fine-tune
    args=training_args,  # Training arguments created earlier
    data_collator=data_collator,  # Data collator built from the tokenizer
    train_dataset=tokenized_train_split,  # Tokenized training split
    eval_dataset=tokenized_valid_split,  # Tokenized validation split
    tokenizer=tokenizer,  # Tokenizer (its files are saved with each checkpoint)
)

In [68]:
# Launch fine-tuning with the model, arguments and datasets given to the Trainer
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2500
  Number of trainable parameters = 109483778


Step,Training Loss
500,0.7044
1000,0.6995
1500,0.6961
2000,0.6958
2500,0.5768


Saving model checkpoint to test-trainer/checkpoint-500
Configuration saved in test-trainer/checkpoint-500/config.json
Model weights saved in test-trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-1000
Configuration saved in test-trainer/checkpoint-1000/config.json
Model weights saved in test-trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-1500
Configuration saved in test-trainer/checkpoint-1500/config.json
Model weights saved in test-trainer/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1500/tokenizer_config.json
Special tokens file saved

TrainOutput(global_step=2500, training_loss=0.6745090209960938, metrics={'train_runtime': 1833.1745, 'train_samples_per_second': 10.91, 'train_steps_per_second': 1.364, 'total_flos': 4949924728928640.0, 'train_loss': 0.6745090209960938, 'epoch': 1.0})

## Evaluation
We will evaluate the model on the test dataset and report its accuracy.

In [70]:
# First, run the trained model on the tokenized test split
predictions = trainer.predict(tokenized_datasets['test'])
# Show the shapes of the raw model outputs and of the reference labels
print(predictions.predictions.shape, predictions.label_ids.shape)

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 25000
  Batch size = 8


(25000, 2) (25000,)


In [79]:
# The model outputs one logit per class; the predicted label is the index of
# the largest logit along the last axis.
preds = predictions.predictions.argmax(axis=-1)

# Compare the predicted labels with the reference labels to get the accuracy
metric = evaluate.load('accuracy')
metric.compute(references=predictions.label_ids, predictions=preds)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.78368}

## Wrongly classified

In [98]:
random.seed(667)

# True labels of the test split, as an array so they can be compared element-wise
test_split = tokenized_datasets['test']
y_true = np.array(test_split['label'])

# Positions (in ascending order) of the test examples whose predicted label
# differs from the true label; converted to a plain list for random.sample
wrong_classified_index = np.flatnonzero(y_true != preds).tolist()

# Pick two misclassified examples at random and fetch only those rows, instead
# of materialising the whole "text" / "label" columns for every single lookup
sampled_rows = [test_split[index] for index in random.sample(wrong_classified_index, 2)]
wrong_classified = [(row['text'], row['label']) for row in sampled_rows]

In [99]:
# Display the two sampled misclassified (text, true label) pairs
wrong_classified

[('I suppose that in 1997 Hollywood wasn\'t quite at the point of openly celebrating homosexuality, so one might want to give some credit to those who put this movie together for having shown a little bit of courage. One simply wishes that credit could be given them for having put together a really good movie, and in my opinion "In & Out" doesn\'t qualify on that count. It\'s the story of Howard Brackett (Kevin Kline) - a small town high school English teacher who on the eve of his wedding is outed by a former student who happens to win an Oscar and who then has to go through what can only be described as a period of self-discovery as he comes to terms with being homosexual. To me, that was the first problem with this movie. Howard didn\'t really have to turn out to be gay. The movie would have been funnier (and perhaps even more thought provoking) had Brackett remained defiantly straight in spite of the stereotypically gay aspects to his life and the town\'s belief after the Oscar spe