# Assignment 2
In this second assignment, you are challenged to employ Hugging Face transformers for the same classification task as in the first assignment.

You should explore Hugging Face models to find a pre-trained model that is suitable and promising for fine-tuning to your task. It should make sense to pick one that has been pre-trained for the same language and/or text genre.

As a bonus, you can also employ a domain adaptation approach.

You should compare the performance of your model(s) with the ones developed for the first assignment. For the final delivery, prepare a short presentation (max 10 slides) documenting your approach.

## Imports

In [None]:
! pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m11

In [None]:
import pandas as pd
from datasets import load_dataset
import json
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import load_metric
import numpy as np
from transformers import AutoModelForMaskedLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import pipeline
from datasets import load_metric
import numpy as np
from transformers import TextClassificationPipeline
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import math
from huggingface_hub import notebook_login

## Loading dataset

In [None]:
# When True, the out-of-scope queries are appended to every split and treated
# as one additional intent class (see get_df_hf and num_labels).
oos = False

In [None]:
# Importing the dataset

def get_df_hf(oos=False):
    """Load ``data_full.json`` and return the splits as Hugging Face datasets.

    Args:
        oos: When True, the ``oos_train`` / ``oos_val`` / ``oos_test`` entries
            are appended to the matching split so that out-of-scope queries
            become one additional intent class.

    Returns:
        tuple: ``(train, validation, test, labels_dict)`` where the first
        three items are ``datasets.Dataset`` objects with a ``query`` column
        and an integer ``label`` column, and ``labels_dict`` maps each integer
        id to its label name.

    Raises:
        ValueError: If the validation or test split contains a label that
            never occurs in the training split.
    """
    with open('data_full.json', encoding='utf-8') as json_file:
        data_dict = json.load(json_file)

    columns = ['query', 'label']
    frames = {}
    for split in ('train', 'val', 'test'):
        parts = [pd.DataFrame(data_dict[split], columns=columns)]
        if oos:
            # Consider oos as a specific intent of the same split
            parts.append(pd.DataFrame(data_dict[f'oos_{split}'], columns=columns))
        # ignore_index gives a clean RangeIndex; a duplicated index would be
        # stored as an extra column by Dataset.from_pandas.
        frames[split] = pd.concat(parts, ignore_index=True)

    # The training split defines the id <-> name mapping for every split.
    unique_labels = frames['train']['label'].unique()
    labels_dict = {i: v for i, v in enumerate(unique_labels)}
    label_to_id = {v: i for i, v in labels_dict.items()}

    for split, frame in frames.items():
        # Encode with the shared mapping: factorizing each split on its own
        # only yields matching ids when labels appear in the same order.
        encoded = frame['label'].map(label_to_id)
        missing = encoded.isna()
        if missing.any():
            unknown = sorted(set(frame.loc[missing, 'label']))
            raise ValueError(
                f"Labels in the '{split}' split are absent from the training split: {unknown}"
            )
        frame['label'] = encoded.astype('int64')

    return (
        Dataset.from_pandas(frames['train']),
        Dataset.from_pandas(frames['val']),
        Dataset.from_pandas(frames['test']),
        labels_dict,
    )

# Build the three splits and the id -> label-name mapping, honouring `oos`.
# NOTE: despite the `_df` suffix these are `datasets.Dataset` objects, because
# get_df_hf returns Dataset.from_pandas(...) for each split.
train_df, val_df, test_df, label_mapping = get_df_hf(oos)
# Group the splits under the keys used by the rest of the notebook.
train_valid_test_dataset = DatasetDict({
    'train': train_df,
    'validation': val_df,
    'test': test_df
})

train_valid_test_dataset


DatasetDict({
    train: Dataset({
        features: ['query', 'label'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['query', 'label'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['query', 'label'],
        num_rows: 4500
    })
})

## Fine tune a language model 
Models used :
- *Distilled BERT*
- *qanastek/XLMRoberta-Alexa-Intents-Classification* 

### Distilled BERT

#### Tokenizer

In [None]:
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(sample):
    """Tokenize the ``query`` column of a batch of examples.

    Args:
        sample: Batch passed by ``Dataset.map(batched=True)``; ``sample["query"]``
            holds the raw query strings.

    Returns:
        The tokenizer output for the batch (one entry per query).
    """
    # No padding and no tensor conversion here: DataCollatorWithPadding pads
    # each training batch to its own longest sequence, which avoids padding
    # every query to the longest one in a whole map batch.
    return tokenizer(sample["query"], truncation=True)


tokenized_dataset = train_valid_test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'label', 'input_ids', 'attention_mask'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['query', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['query', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4500
    })
})

#### Load the pretrained model

In [None]:
# Derive the class count from the label mapping built from the training split
# instead of hard-coding it; this already includes the extra out-of-scope
# class when `oos` is enabled.
num_labels = len(label_mapping)

In [None]:
# Load the pre-trained encoder with a fresh classification head. Both
# directions of the label mapping are stored in the model config so that the
# saved model reports intent names and its label2id agrees with id2label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=label_mapping,
    label2id={name: idx for idx, name in label_mapping.items()},
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

#### Train the model using a Trainer

In [None]:
# Load the accuracy metric object from the `datasets` metric registry.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases — confirm before upgrading the dependency.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for a ``Trainer`` evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds one score per
            class along the last axis; ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        examples whose highest-scoring class equals the reference label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with NumPy so the function is self-contained and does
    # not rely on a metric object loaded through the deprecated load_metric.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

# Hyper-parameters for fine-tuning the classifier. Checkpoints are written to
# ./results once per epoch, matching the per-epoch evaluation.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    # NOTE(review): no metric_for_best_model is set, so "best" is presumably
    # judged by validation loss — confirm against the Trainer documentation.
    load_best_model_at_end=True,
)

# Pads every batch using the tokenizer when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Train on the training split and evaluate on the validation split; the test
# split is only used later through trainer.predict.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [None]:
# On a Google Colab GPU, training takes about 3 minutes.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,4.2671,1.913937,0.842667
2,1.4392,0.745073,0.912333
3,0.6204,0.538833,0.935333


TrainOutput(global_step=2814, training_loss=1.8017695683829735, metrics={'train_runtime': 192.2026, 'train_samples_per_second': 234.128, 'train_steps_per_second': 14.641, 'total_flos': 379476929590560.0, 'train_loss': 1.8017695683829735, 'epoch': 3.0})

In [None]:
# Evaluate on the trainer's eval_dataset (the validation split).
trainer.evaluate()

{'eval_loss': 0.5487826466560364,
 'eval_accuracy': 0.9353333333333333,
 'eval_runtime': 2.648,
 'eval_samples_per_second': 1132.914,
 'eval_steps_per_second': 70.996,
 'epoch': 3.0}

In [None]:
# Run the model on the test split; the result carries the logits, the
# reference label ids and the test metrics.
trainer.predict(test_dataset=tokenized_dataset["test"])

PredictionOutput(predictions=array([[ 3.8244395, -3.0208955, -3.0893118, ..., -2.7512243, -2.4678206,
        -2.5011845],
       [ 3.208853 , -2.8454797, -2.725686 , ..., -2.7165296, -2.693016 ,
        -2.8848505],
       [ 1.9757181, -3.3284805, -2.559218 , ..., -3.2669663, -2.845639 ,
        -2.5467784],
       ...,
       [-2.2990177, -2.0223658, -3.4403837, ..., -2.8560865, -2.564628 ,
         3.9957016],
       [-3.1681132, -2.0801158, -3.0570734, ..., -2.8467793, -2.5618637,
         2.2983565],
       [-2.4028819, -2.3529863, -3.533458 , ..., -2.8846614, -2.5357406,
         3.211192 ]], dtype=float32), label_ids=array([  0,   0,   0, ..., 149, 149, 149]), metrics={'test_loss': 0.556429922580719, 'test_accuracy': 0.9297777777777778, 'test_runtime': 4.6743, 'test_samples_per_second': 962.718, 'test_steps_per_second': 60.33})

#### Saving the model and loading an existing model

In [None]:
# Persist the trained model; the next cell reloads it from "./results".
trainer.save_model()

In [None]:
# Reload the tokenizer and classifier from the local directory written by the
# trainer, as a check that the saved artefacts are usable on their own.
tokenizer2 = AutoTokenizer.from_pretrained("./results")
model2 = AutoModelForSequenceClassification.from_pretrained("./results", num_labels=num_labels)

In [None]:
# Wrap the reloaded model and tokenizer in an inference pipeline.
pipe = TextClassificationPipeline(model=model2, tokenizer=tokenizer2)  # optionally add return_all_scores=True

In [None]:
pipe('set the alarm at 5 o clock')

[{'label': 'alarm', 'score': 0.8262593746185303}]

## Domain Adaptation

### Distilled BERT

In [None]:
# Domain adaptation starts from the same base checkpoint, loaded this time
# with a masked-language-modelling head.
# NOTE: this rebinds `model`, replacing the classifier created earlier.
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)


Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

 #### Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


def tokenize_function(examples):
    """Tokenize a batch of queries for masked-language-model training.

    When the tokenizer is a fast tokenizer, a ``word_ids`` entry is added with
    the word ids of every row in the batch; otherwise the tokenizer output is
    returned unchanged.
    """
    encoded = tokenizer(examples["query"])
    if not tokenizer.is_fast:
        return encoded
    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = [encoded.word_ids(row) for row in range(row_count)]
    return encoded


# The raw text and the intent labels are not needed for language modelling.
tokenized_datasets = train_valid_test_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["query", "label"],
)
tokenized_datasets

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 4500
    })
})

#### Pre-processing dataset

In [None]:
chunk_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized rows and cut it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-row lists (for
            example ``input_ids``, ``attention_mask``, ``word_ids``).

    Returns:
        dict: The same columns, each re-split into chunks of ``chunk_size``
        items, plus a ``labels`` column that is a copy of the ``input_ids``
        chunk list. The trailing remainder shorter than ``chunk_size`` is
        dropped.
    """
    # Flatten every column in a single pass. `sum(lists, [])` copies the
    # accumulated list on each addition and is quadratic in the batch size.
    concatenated_examples = {
        k: [item for row in examples[k] for item in row] for k in examples.keys()
    }
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # The MLM collator masks input_ids, so the targets start as a copy of them.
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Re-chunk every split into fixed-length blocks of chunk_size tokens for MLM.
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1289
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 256
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 380
    })
})

In [None]:
# Collator for masked language modelling, with a masking probability of 0.15.
# NOTE: this rebinds `data_collator`, replacing the padding collator used for
# the classifier above.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Disabled demo showing how masking works. It is kept as real comments: a bare
# triple-quoted string is an expression, and the notebook echoes it as the
# cell's output.
#
# samples = [lm_datasets["train"][i] for i in range(2)]
# for sample in samples:
#     _ = sample.pop("word_ids")
#
# for chunk in data_collator(samples)["input_ids"]:
#     print(f"\n'>>> {tokenizer.decode(chunk)}'")

#### Hugging Face login

In [None]:
# Authenticate with the Hugging Face Hub; the training arguments below set
# push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#### Fine-tune

In [None]:
batch_size = 64
# Show the training loss with every epoch. Clamp to at least one step so that
# a dataset smaller than one batch does not produce logging_steps=0.
logging_steps = max(1, len(lm_datasets["train"]) // batch_size)
model_name = model_checkpoint.split("/")[-1]

# Hyper-parameters for the masked-language-model adaptation run. The output
# directory doubles as the local clone of the Hub repository being pushed to.
training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-clinc150",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

# Monitor the adaptation on the validation split. The test split stays unseen
# so it can still give an unbiased estimate for the final classifier.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/FilippoComastri/distilbert-base-uncased-finetuned-clinc150 into local empty directory.


In [None]:
# Perplexity is exp(eval loss) on the trainer's eval_dataset. In this
# notebook's cell order this runs before trainer.train(), so it is a baseline.
# NOTE(review): the MLM collator presumably re-samples the masked positions on
# every call, so repeated evaluations give slightly different values — confirm.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
math.exp(eval_results['eval_loss'])

4.589508031882145

In [None]:
# Run the masked-language-model adaptation on the chunked queries.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,2.4709,1.83876
2,1.815,1.762933
3,1.7434,1.77147


TrainOutput(global_step=63, training_loss=1.9908672363039046, metrics={'train_runtime': 22.8647, 'train_samples_per_second': 169.125, 'train_steps_per_second': 2.755, 'total_flos': 128153497655808.0, 'train_loss': 1.9908672363039046, 'epoch': 3.0})

In [None]:
# Perplexity (exp of the eval loss) after the adaptation run.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 5.31


In [None]:
math.exp(eval_results['eval_loss'])

4.788957706403931

In [None]:
# Upload the adapted model to the Hugging Face Hub repository associated with
# the trainer's output directory.
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/256M [00:00<?, ?B/s]

Upload file runs/May01_22-40-08_29ed72ba3bb4/1682980827.0536504/events.out.tfevents.1682980827.29ed72ba3bb4.10…

Upload file training_args.bin:   0%|          | 1.00/3.56k [00:00<?, ?B/s]

Upload file runs/May01_22-40-08_29ed72ba3bb4/events.out.tfevents.1682980827.29ed72ba3bb4.1039.0:   0%|        …

To https://huggingface.co/FilippoComastri/distilbert-base-uncased-finetuned-clinc150
   d8706e5..a52e342  main -> main

   d8706e5..a52e342  main -> main

To https://huggingface.co/FilippoComastri/distilbert-base-uncased-finetuned-clinc150
   a52e342..dcecd62  main -> main

   a52e342..dcecd62  main -> main



'https://huggingface.co/FilippoComastri/distilbert-base-uncased-finetuned-clinc150/commit/a52e34258cc46796a0c7237521f4eccfd3be2649'

#### Testing the fine-tuned LM

In [None]:
text = "Set the [MASK] at 5 am"
#text = input()
# Load the adapted model from the trainer's output directory rather than from
# a hard-coded absolute Colab path, so the cell also works when the notebook
# runs from another working directory.
mask_filler = pipeline(
    "fill-mask", model=training_args.output_dir
)
preds = mask_filler(text)

# Disabled listing of every candidate. It is kept as real comments because a
# bare string literal would be echoed as the cell's output.
# for pred in preds:
#     print(f">>> {pred['sequence']}")

'for pred in preds:\n    print(f">>> {pred[\'sequence\']}")'

In [None]:
preds[0]['sequence']

'set the clock at 5 am'

#### Fine tune the classification head

In [None]:
# Domain-adapted checkpoint pushed to the Hub by the previous section.
checkpoint = 'FilippoComastri/distilbert-base-uncased-finetuned-clinc150'
# Derive the class count from the label mapping instead of hard-coding it; it
# already includes the extra out-of-scope class when `oos` is enabled.
num_labels = len(label_mapping)

def preprocess_function(sample):
    """Tokenize the ``query`` column of a batch of examples.

    Args:
        sample: Batch passed by ``Dataset.map(batched=True)``; ``sample["query"]``
            holds the raw query strings.

    Returns:
        The tokenizer output for the batch (one entry per query).
    """
    # No padding and no tensor conversion here: DataCollatorWithPadding pads
    # each training batch to its own longest sequence, which avoids padding
    # every query to the longest one in a whole map batch.
    return tokenizer(sample["query"], truncation=True)

def compute_metrics(eval_pred):
    """Compute classification accuracy for a ``Trainer`` evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds one score per
            class along the last axis; ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of
        examples whose highest-scoring class equals the reference label.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with NumPy so the function is self-contained and does
    # not rely on a metric object loaded through the deprecated load_metric.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Store the label mapping in the model config. Without it the saved model only
# knows generic names, and the inference pipeline reports labels such as
# "LABEL_145" instead of the intent name.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    id2label=label_mapping,
    label2id={name: idx for idx, name in label_mapping.items()},
)

tokenized_dataset = train_valid_test_dataset.map(preprocess_function, batched=True)

metric = load_metric("accuracy")

# Same hyper-parameters as the non-adapted classifier, so that the two runs
# are directly comparable.
# NOTE: output_dir is shared with the earlier run, whose files are overwritten.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Pads every batch using the tokenizer when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Train on the training split and evaluate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at FilippoComastri/distilbert-base-uncased-finetuned-clinc150 were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at FilippoComastri/distilbert-base-uncased-finetuned-clinc150 and are newly

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

In [None]:
# Fine-tune the classification head on top of the domain-adapted checkpoint.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,4.2507,1.877415,0.850667
2,1.3984,0.712438,0.923
3,0.5896,0.511156,0.943333


TrainOutput(global_step=2814, training_loss=1.7692787316816447, metrics={'train_runtime': 197.4812, 'train_samples_per_second': 227.87, 'train_steps_per_second': 14.249, 'total_flos': 379476929590560.0, 'train_loss': 1.7692787316816447, 'epoch': 3.0})

In [None]:
# Evaluate on the trainer's eval_dataset (the validation split).
trainer.evaluate()

{'eval_loss': 0.5111560225486755,
 'eval_accuracy': 0.9433333333333334,
 'eval_runtime': 2.7345,
 'eval_samples_per_second': 1097.08,
 'eval_steps_per_second': 68.75,
 'epoch': 3.0}

In [None]:
# Run the model on the test split; the result carries the logits, the
# reference label ids and the test metrics.
trainer.predict(test_dataset=tokenized_dataset["test"])

PredictionOutput(predictions=array([[ 4.0404058, -2.3363764, -3.3984244, ..., -3.6140785, -4.9866443,
        -2.687481 ],
       [ 3.2618072, -2.408392 , -3.6718779, ..., -4.0112453, -4.892006 ,
        -3.0774589],
       [ 1.5863073, -2.8651512, -3.9078455, ..., -4.481331 , -5.5019245,
        -3.7027879],
       ...,
       [-2.888769 , -1.6661367, -4.8878207, ..., -3.4183831, -1.473064 ,
         3.6726696],
       [-2.7909164, -1.7764937, -4.6082263, ..., -2.9048455, -1.4416375,
         2.591952 ],
       [-2.603976 , -1.9942297, -4.6437488, ..., -3.1800025, -1.8952566,
         3.680227 ]], dtype=float32), label_ids=array([  0,   0,   0, ..., 149, 149, 149]), metrics={'test_loss': 0.5213555693626404, 'test_accuracy': 0.9344444444444444, 'test_runtime': 4.528, 'test_samples_per_second': 993.816, 'test_steps_per_second': 62.279})

In [None]:
# Persist the trained model; the next cell reloads it from "./results".
trainer.save_model()

In [None]:
# Reload the saved tokenizer and classifier. The label mapping is supplied
# explicitly so that predictions are reported with intent names even when the
# saved config only holds generic "LABEL_<n>" names.
tokenizer2 = AutoTokenizer.from_pretrained("./results")
model2 = AutoModelForSequenceClassification.from_pretrained(
    "./results",
    num_labels=num_labels,
    id2label=label_mapping,
    label2id={name: idx for idx, name in label_mapping.items()},
)

In [None]:
# Wrap the reloaded model and tokenizer in an inference pipeline.
pipe = TextClassificationPipeline(model=model2, tokenizer=tokenizer2)  # optionally add return_all_scores=True

In [None]:
pipe('tell how much money i have')

[{'label': 'LABEL_145', 'score': 0.3113534450531006}]