## Installing dependencies

In [1]:
from IPython.display import clear_output

!pip install transformers==4.28.0
!pip install datasets
!pip install accelerate
!pip install evaluate

clear_output()

## Importing libraries

In [2]:
import math
import datasets as ds
import pandas as pd
import numpy as np
import torch
import evaluate
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline, DataCollatorWithPadding
from accelerate import Accelerator
from tqdm.auto import tqdm

# Sentence classification


Load dataset, and tokenize its content

In [3]:
def load_datasets(splits: list[str]) -> list[ds.Dataset]:
    """
    Loads the IMDB dataset from the datasets library.
    Returns:
        datasets: list[ds.Dataset] - List of datasets
    """
    datasets: list[ds.Dataset] = []
    for split in splits:
        dataset: ds.Dataset = ds.load_dataset('imdb', split=split)
        datasets.append(dataset)
    
    return datasets

datasets: list[ds.Dataset] = load_datasets(splits=['train', 'test'])

# Load the model
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the datasets
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True)

tokenized_datasets = []
for i in range(len(datasets)):
    tokenized_datasets.append(datasets[i].map(tokenize, batched=True))

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.




Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Define training args

In [4]:
EPOCHS = 1
BATCH_SIZE = 32

# Define the training arguments
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/NLP/Deep/lab03/results', # output directory
    num_train_epochs=EPOCHS,              # total number of training epochs
    per_device_train_batch_size=BATCH_SIZE,  # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,   # batch size for evaluation
    evaluation_strategy='epoch',
)

Define a trainer with the previously defined training args

In [5]:
# Define the trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformer model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_datasets[0], # training dataset
    eval_dataset=tokenized_datasets[1],  # evaluation dataset
    data_collator=data_collator          # Dat collator
)

Train the model

In [6]:
# Checking the device used to train the model
print(f'Device: {model.device}')

# Train the model
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Device: cuda:0


Epoch,Training Loss,Validation Loss
1,0.2719,0.180539


TrainOutput(global_step=782, training_loss=0.2508654533444768, metrics={'train_runtime': 375.9601, 'train_samples_per_second': 66.496, 'train_steps_per_second': 2.08, 'total_flos': 3311684966400000.0, 'train_loss': 0.2508654533444768, 'epoch': 1.0})

Evaluate model and display its loss on the 'evaluation set'

In [7]:
# Evaluate the model loss on testing data
loss_evaluation = trainer.evaluate()['eval_loss']
print(f'Testing loss: {loss_evaluation:.2f}%')

Testing loss: 0.18%


2. **\[Bonus\]** Fine-tune your model using the accuracy as evaluation instead of the loss (default).

Define trainer with acccuracy metric

In [10]:
def compute_metrics(eval_preds: tuple[np.ndarray, np.ndarray]) -> float:
    """
    Computes the accuracy of the model.
    Args:
        eval_preds: tuple[np.ndarray, np.ndarray] - Tuple of predictions and labels
    Returns:
        accuracy: float - Accuracy of the model
    """
    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

accuracy_trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_datasets[0],           # training dataset
    eval_dataset=tokenized_datasets[1],            # evaluation dataset
    compute_metrics=compute_metrics,
)

Train the model with the new trainer

In [12]:
accuracy_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1645,0.194507,0.93152


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TrainOutput(global_step=782, training_loss=0.15378097378079544, metrics={'train_runtime': 391.6848, 'train_samples_per_second': 63.827, 'train_steps_per_second': 1.997, 'total_flos': 3625367766417408.0, 'train_loss': 0.15378097378079544, 'epoch': 1.0})

Evaluate model, and diplay accuracy

In [13]:
accuracy = accuracy_trainer.evaluate()['eval_accuracy']
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 93.15%


Importing pre-trained model from hugging face

In [14]:
fine_tuned_model_name = "mvonwyl/distilbert-base-uncased-imdb"
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_name)
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(fine_tuned_model_name, num_labels=2)

Downloading (…)okenizer_config.json:   0%|          | 0.00/360 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

3. (2 points) Evaluate the pre-trained model in term of accuracy on the test data.

In [15]:
trainer = Trainer(
    model=fine_tuned_model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_datasets[0],           # training dataset
    eval_dataset=tokenized_datasets[1],            # evaluation dataset
    compute_metrics=compute_metrics,
)

accuracy = trainer.evaluate()['eval_accuracy']
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 92.95%


4. (2 points) For at least 2 samples which have been wrongly classified in the test set, try explaining why the model could have been wrong.

In [16]:
predictions = trainer.predict(tokenized_datasets[1])
predictions = np.argmax(predictions.predictions, axis=-1)

wrong_predictions = []

true_labels = np.array(tokenized_datasets[1]['label'])
diff_indices = np.where(predictions != true_labels)[0][:2]

wrong_predictions = [predictions[index] for index in diff_indices]
accurate_labels = [true_labels[index] for index in diff_indices]

print('First example')
sample_1 = datasets[1][int(diff_indices[0])]['text']
print(f'Sample: {sample_1}')
print(f'Model prediction: {"Positive" if wrong_predictions[0] == 1 else "Negative"}')
print(f'Ground truth: {"Positive" if accurate_labels[0] == 1 else "Negative"}\n')

print('Second example')
sample_2 = datasets[1][int(diff_indices[1])]['text']
print(f'Sample: {sample_2}')
print(f'Model prediction: {"Positive" if wrong_predictions[1] == 1 else "Negative"}')
print(f'Ground truth: {"Positive" if accurate_labels[1] == 1 else "Negative"}\n')

First example
Sample: First off let me say, If you haven't enjoyed a Van Damme movie since bloodsport, you probably will not like this movie. Most of these movies may not have the best plots or best actors but I enjoy these kinds of movies for what they are. This movie is much better than any of the movies the other action guys (Segal and Dolph) have thought about putting out the past few years. Van Damme is good in the movie, the movie is only worth watching to Van Damme fans. It is not as good as Wake of Death (which i highly recommend to anyone of likes Van Damme) or In hell but, in my opinion it's worth watching. It has the same type of feel to it as Nowhere to Run. Good fun stuff!
Model prediction: Positive
Ground truth: Negative

Second example
Sample: I'm the type of guy who loves hood movies from New Jack City to Baby Boy to Killa Season, from the b grade to the Hollywood. but this movie was something different. i am no hater and this movie was kinda enjoyable. but some bits we

* First example\
The review is describing several good aspects of the movie, but that are in more at the end of the review. We deduce that the beginning has not been taken enough into account, as the review says that the movie is not be liked if the spectator does not like Van Damme.

* Second example\
In this review, there are a lot of negative descriptions, which are mostly located at center of the review. However, terms that can be semantically described as 'positive' at located at the beginning and end of the review.

From these two reviews, we could make the hypothesis that the model makes its decisions mostly based on the inputs edges, and that the center part of the input does not influence the decision as much. That can be explained by the fact that the model takes only into account 512 tokens from a review, as the input is truncated.

5. (3 points) What are the advantages and inconvenient of using this model in production compared to the naive Bayes we implemented in the first part of the course? And compared to a recurrent model like an RNN or an LSTM?

Here is a table representing the advantages and downsides of the different models we are comparing.
</br>
</br>

|                            | Naive Bayes Classifier | RNN/LSTM Models        |
|----------------------------|------------------------|------------------------|
| **Advantages**             |                        |                        |
| Language Understanding    | Yes                    | Yes                    |
| Transfer Learning          | No                     | No                     |
| Flexibility                | No                     | No                     |
| **Inconveniences**         |                        |                        |
| Model Complexity           | Low                    | High                   |
| Inference Time             | Low                    | Moderate               |
| Data Dependency            | Low                    | High                   |


7. **\[Bonus\]** The model only accepts inputs of maximum 512 tokens. Propose and implement a solution that goes around that. For example:
    * Compare using only the beginning (what you already implemented) with only the ending of the review (as review tends to end with a conclusion summarizing the sentiment).
        * You can train a model on the last 512 token of every training input, or use the same model but only predict on the end of each review.
        * Find review longer than 512 tokens, and evaluate on them separately.
        * The [truncation_side](https://huggingface.co/docs/transformers/v4.20.1/en/main_classes/tokenizer#transformers.PreTrainedTokenizer) parameter might help.
    * Or use a sliding window and average the predictions of every part.
        * For example, use a window with a stride of 64 tokens.
        * You can either make the model loop on the tokenized input, or [create your own pipeline](https://huggingface.co/docs/transformers/add_new_pipeline).

In [17]:
def process_long_review(model: AutoModelForSequenceClassification, review_text: str, model_name: str, tokenizer: AutoTokenizer, window_size: int=512, stride: int=64) -> float:
    """
    Processes a long review with a sliding window by splitting it into segments and averaging the scores.
    Args:
        model: AutoModelForSequenceClassification - Model to use for scoring
        review_text: str - Review text to process
        model_name: str - Model name
        tokenizer: AutoTokenizer - Tokenizer to use for tokenization
        window_size: int - Window size for splitting the review
        stride: int - Stride for splitting the review
    Returns:
        scores: float - Average score of the review
    """
    tokens = tokenizer(review_text, padding='max_length', truncation=True, return_tensors="pt")
    input_ids = tokens.input_ids.squeeze(0)
    attention_mask = tokens.attention_mask.squeeze(0)
    num_tokens = len(tokens.input_ids)

    if num_tokens <= window_size:
      outputs = model(input_ids.cuda())
      logits = outputs.logits
      probabilities = torch.softmax(logits, dim=1)
      scores = probabilities[:, 1].item()
      return scores

    num_segments = math.ceil((num_tokens - window_size) / stride) + 1

    segment_predictions = [0]

    for i in range(num_segments):
        start = i * stride
        end = start + window_size
        segment_input_ids = input_ids[start:end]
        segment_attention_mask = attention_mask[start:end]

        outputs = model(input_ids=segment_input_ids.unsqueeze(0), attention_mask=segment_attention_mask.unsqueeze(0))
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        sentiment_scores = probabilities[:, 1].item()  # Assuming binary sentiment classification
        segment_predictions.append(sentiment_scores)

    return sum(segment_predictions) / len(segment_predictions)

def predict_long_review(model: AutoModelForSequenceClassification, dataset: list, model_name: str) -> np.ndarray:
  """
  Predicts the sentiment of a long review by splitting it into segments and averaging the scores.
  Args:
    model: AutoModelForSequenceClassification - Model to use for scoring
    dataset: list - Dataset to predict
    model_name: str - Model name
  Returns:
    predictions: np.ndarray - Predictions of the dataset
  """
  predictions = []
  for i in tqdm(range(len(dataset))):
    review_text = dataset[i]
    prediction: float = process_long_review(model=model, review_text=review_text, model_name=model_name, tokenizer=fine_tuned_tokenizer)
    predictions.append(prediction)
  return np.round(predictions).astype('int')

In [18]:
# Retrieve predictions
predictions = predict_long_review(fine_tuned_model, tokenized_datasets[1]['text'], fine_tuned_model_name)

# Evaluate predictions
testing_labels = np.array(tokenized_datasets[1]['label'])
accuracy = accuracy_score(testing_labels, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')

  0%|          | 0/25000 [00:00<?, ?it/s]

Accuracy: 81.78%


Surprisingly, we obtain a significantly worse accuracy with a sliding window than with the truncated inputs. However, we could not find the source of this issue. Maybe the score was not correctly computed.