# **NLP05**

In [None]:
!pip install transformers[sentencepiece] datasets torch sklearn evaluate

In [2]:
from transformers import BertModel, AutoModelForSequenceClassification


import evaluate


# Import the dataset

In [3]:
import datasets

# Load the "imdb" dataset; later cells read its 'train' and 'test' splits.
# NOTE(review): despite the name, `df` is presumably a datasets.DatasetDict
# rather than a pandas DataFrame — confirm.
df = datasets.load_dataset("imdb")



  0%|          | 0/3 [00:00<?, ?it/s]

# Tokenize

In [4]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Name of the pretrained checkpoint whose tokenizer is loaded.
MODEL_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Collator built around the same tokenizer; it is handed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [5]:
def tokenize_function(element: dict) -> dict:
    """
    Tokenize the "text" entry of a dataset element with the notebook's tokenizer.

    Args:
        - element: dict-like row, or batch of rows when called through
          `map(..., batched=True)`, exposing a "text" entry
    Returns:
        The dict-like tokenizer output for `element["text"]`, produced with
        truncation enabled.
    """
    return tokenizer(element["text"], truncation=True)

# Tokenize every split of `df`; with batched=True, `tokenize_function` is
# called on batches of rows rather than on one row at a time.
tokenized_datasets = df.map(tokenize_function, batched=True)


  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

# Split the training set into training and validation sets

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training split for validation.
# `random_state` is fixed so the same partition is produced on every
# "Restart & Run All"; without it the split changes from run to run.
train_split = tokenized_datasets['train']
X_train, X_valid, y_train, y_valid = train_test_split(
    train_split["text"],
    train_split["label"],
    test_size=0.2,
    random_state=42,
)

# Build the training and validation datasets

In [7]:
def create_dataset(features: list, labels: list) -> datasets.Dataset:
    """
    Pair each feature with its label and wrap the pairs in a dataset.

    Args:
        - features (list): values stored under the "text" key of each record
        - labels (list): values stored under the "label" key of each record
    Returns:
        The dataset built by `datasets.Dataset.from_list` from the records.
        Pairing follows `zip`, so it stops at the shorter of the two inputs.
    """
    records = [
        {"text": text, "label": target}
        for text, target in zip(features, labels)
    ]
    return datasets.Dataset.from_list(records)

# Wrap the split text/label lists into dataset objects so they can be tokenized.
dataset_train = create_dataset(X_train, y_train)
dataset_validation = create_dataset(X_valid, y_valid)

In [8]:
# `batched` is a boolean flag: pass True, not the string "True" (which only
# worked because any non-empty string is truthy).
token_train = dataset_train.map(tokenize_function, batched=True)
token_valid = dataset_validation.map(tokenize_function, batched=True)

  0%|          | 0/20 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

# Prepare the training and the model

In [9]:
from transformers import TrainingArguments

# Training configuration: outputs go under "test-trainer"; train for a single epoch.
training_args = TrainingArguments(
    output_dir="test-trainer",
    num_train_epochs=1,
)

In [10]:
from transformers import AutoModelForSequenceClassification

# Create the model: load the "bert-base-uncased" checkpoint into a
# sequence-classification model configured with two labels.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
from transformers import Trainer

# Instantiate the trainer with the model, its training configuration,
# the tokenized train/validation datasets and the padding collator.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=token_train,
    eval_dataset=token_valid,
    tokenizer=tokenizer,
)

In [12]:
# Fine-tune the model on `token_train`; as the last expression of the cell,
# the returned training summary is displayed as the cell output.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 20000
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2500
  Number of trainable parameters = 109483778
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.4007
1000,0.3397
1500,0.2945
2000,0.2751
2500,0.2549


Saving model checkpoint to test-trainer/checkpoint-500
Configuration saved in test-trainer/checkpoint-500/config.json
Model weights saved in test-trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-500/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-1000
Configuration saved in test-trainer/checkpoint-1000/config.json
Model weights saved in test-trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in test-trainer/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to test-trainer/checkpoint-1500
Configuration saved in test-trainer/checkpoint-1500/config.json
Model weights saved in test-trainer/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in test-trainer/checkpoint-1500/tokenizer_config.json
Special tokens file saved

TrainOutput(global_step=2500, training_loss=0.31297106018066406, metrics={'train_runtime': 1825.9971, 'train_samples_per_second': 10.953, 'train_steps_per_second': 1.369, 'total_flos': 4960893171048960.0, 'train_loss': 0.31297106018066406, 'epoch': 1.0})

# Evaluation

## Prediction

In [13]:
# Run the fine-tuned model on the tokenized test split. The result exposes
# `.predictions` and `.label_ids`, which the accuracy cell reads.
predictions = trainer.predict(tokenized_datasets['test'])

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 25000
  Batch size = 8


## Get the accuracy

In [14]:
import numpy as np

# Predicted class for each example = index of the highest score along the last axis.
preds = np.argmax(predictions.predictions, axis=-1)

# NOTE(review): `evaluate` is already imported near the top of the notebook.
import evaluate

# Accuracy of the predicted classes against the reference labels returned by
# `trainer.predict`; the resulting dict is displayed as the cell output.
metric = evaluate.load('accuracy')
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.93344}

# Wrongly classified examples in the test set

In [15]:
import random

random.seed(42)

# Read each column from the dataset once. Indexing
# `tokenized_datasets['test']['label']` inside the loop would re-read the
# whole column on every iteration.
test_split = tokenized_datasets['test']
test_texts = test_split['text']
test_labels = test_split['label']

# Collect the first two test examples whose predicted class differs from the
# true label, stored as (text, true_label) pairs.
wrong = []
for text, label, pred in zip(test_texts, test_labels, preds):
    if label != pred:
        wrong.append((text, label))
        if len(wrong) == 2:
            break
wrong

[("First off let me say, If you haven't enjoyed a Van Damme movie since bloodsport, you probably will not like this movie. Most of these movies may not have the best plots or best actors but I enjoy these kinds of movies for what they are. This movie is much better than any of the movies the other action guys (Segal and Dolph) have thought about putting out the past few years. Van Damme is good in the movie, the movie is only worth watching to Van Damme fans. It is not as good as Wake of Death (which i highly recommend to anyone of likes Van Damme) or In hell but, in my opinion it's worth watching. It has the same type of feel to it as Nowhere to Run. Good fun stuff!",
  0),
 ("Ben, (Rupert Grint), is a deeply unhappy adolescent, the son of his unhappily married parents. His father, (Nicholas Farrell), is a vicar and his mother, (Laura Linney), is ... well, let's just say she's a somewhat hypocritical soldier in Jesus' army. It's only when he takes a summer job as an assistant to a fou

# Question 4

**Question 4:** The advantage of using the BERT NLP model is that it works well for task-specific models.