# Assignment 2
In this second assignment, you are challenged to employ Hugging Face transformers for the same classification task as in the first assignment.

You should explore Hugging Face models to find a pre-trained model that is suitable and promising for fine-tuning to your task. It should make sense to pick one that has been pre-trained for the same language and/or text genre.

As a bonus, you can also employ a domain adaptation approach.

You should compare the performance of your model(s) with the ones developed for the first assignment. For the final delivery, prepare a short presentation (max 10 slides) documenting your approach.

## Imports

In [1]:
! pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m109.8 MB/s[0m eta [36m0:00:00[0m


In [2]:
import pandas as pd
from datasets import load_dataset
import json
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import load_metric
import numpy as np
from transformers import AutoModelForMaskedLM
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from transformers import pipeline
from datasets import load_metric
import numpy as np
from transformers import TextClassificationPipeline
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import math
from huggingface_hub import notebook_login

## Loading dataset

In [3]:
oos = False

In [5]:
# Importing the dataset

def get_df_hf(oos=False) :
    with open('/content/drive/MyDrive/colab_files/data_full.json') as json_file: 
        data_dict = json.load(json_file) 

    train_data = data_dict['train']
    val_data = data_dict['val']
    test_data = data_dict['test']

    oos_train = data_dict['oos_train']
    oos_val = data_dict['oos_val']
    oos_test = data_dict['oos_test']


    train_df = pd.DataFrame(train_data, columns =['query', 'label'])
    val_df = pd.DataFrame(val_data, columns =['query', 'label'])
    test_df = pd.DataFrame(test_data, columns =['query', 'label'])

    train_oos_df = pd.DataFrame(oos_train,columns=['query','label'])
    val_oos_df = pd.DataFrame(oos_val,columns=['query','label'])
    test_oos_df = pd.DataFrame(oos_test,columns=['query','label'])

    if oos :
        # Concatenate dataframes to consider oos as a specific intent
        train_df = pd.concat([train_df,train_oos_df])
        val_df = pd.concat([val_df,val_oos_df])
        test_df = pd.concat([test_df,test_oos_df])

    unique_labels = train_df['label'].unique()
    labels_dict = {i: v for i, v in enumerate(unique_labels)}

    train_df['label'], _ = pd.factorize(train_df['label'])
    val_df['label'], _ = pd.factorize(val_df['label'])
    test_df['label'], _ = pd.factorize(test_df['label'])
    
    return Dataset.from_pandas(train_df), Dataset.from_pandas(val_df), Dataset.from_pandas(test_df), labels_dict

train_df, val_df, test_df, label_mapping = get_df_hf(oos)
train_valid_test_dataset = DatasetDict({
    'train': train_df,
    'validation': val_df,
    'test': test_df
})

train_valid_test_dataset


DatasetDict({
    train: Dataset({
        features: ['query', 'label'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['query', 'label'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['query', 'label'],
        num_rows: 4500
    })
})

## Fine tune a classifier
Models used :
- *Distilled BERT*
- *BERT-large*

### Distilled BERT

#### Tokenizer

In [None]:
model_name = "distilbert-base-uncased"

def preprocess_function(sample):
    return tokenizer(sample["query"], truncation=True, padding=True, return_tensors='pt')

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_dataset = train_valid_test_dataset.map(preprocess_function, batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'label', 'input_ids', 'attention_mask'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['query', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['query', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4500
    })
})

#### Load the pretrained model

In [None]:
num_labels = 150 if not oos else 151

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, id2label = label_mapping)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

#### Train the model using a Trainer

In [None]:
metric = load_metric("accuracy")

# TODO compute different metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# TODO hyperparameters tuning
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# Using a GPU on GoogleColab it will takes 3 min
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,4.2829,1.911801,0.839667
2,1.4293,0.743815,0.912333
3,0.6107,0.536188,0.936


TrainOutput(global_step=2814, training_loss=1.7990695055952797, metrics={'train_runtime': 210.7746, 'train_samples_per_second': 213.498, 'train_steps_per_second': 13.351, 'total_flos': 379476929590560.0, 'train_loss': 1.7990695055952797, 'epoch': 3.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.5361877679824829,
 'eval_accuracy': 0.936,
 'eval_runtime': 2.7737,
 'eval_samples_per_second': 1081.58,
 'eval_steps_per_second': 67.779,
 'epoch': 3.0}

In [None]:
trainer.predict(test_dataset=tokenized_dataset["test"])

PredictionOutput(predictions=array([[ 4.2729006 , -2.0023146 , -3.4231374 , ..., -2.0798864 ,
        -3.127964  , -3.3100307 ],
       [ 3.725225  , -2.3960316 , -3.3146207 , ..., -2.632892  ,
        -3.3387148 , -3.0518765 ],
       [ 2.8965309 , -2.7931068 , -3.3313944 , ..., -2.855633  ,
        -3.3649404 , -3.3334956 ],
       ...,
       [-3.6159642 , -1.9574509 , -3.0849562 , ..., -4.263528  ,
        -0.78766465,  3.835419  ],
       [-3.5660586 , -1.4420491 , -3.5709834 , ..., -4.598283  ,
        -1.6596339 ,  2.1731055 ],
       [-3.6546237 , -1.9909067 , -3.030335  , ..., -4.2192974 ,
        -1.1644701 ,  3.6256542 ]], dtype=float32), label_ids=array([  0,   0,   0, ..., 149, 149, 149]), metrics={'test_loss': 0.5418161153793335, 'test_accuracy': 0.9366666666666666, 'test_runtime': 4.6764, 'test_samples_per_second': 962.284, 'test_steps_per_second': 60.303})

#### Saving the model and load an existing model

In [None]:
trainer.save_model()

In [None]:
tokenizer2 = AutoTokenizer.from_pretrained("./results")
model2 = AutoModelForSequenceClassification.from_pretrained("./results", num_labels=num_labels)

In [None]:
pipe = TextClassificationPipeline(model=model2, tokenizer=tokenizer2) #, return_all_scores=True)

In [None]:
pipe('set the alarm at 5 o clock')

[{'label': 'alarm', 'score': 0.8262593746185303}]

### BERT-large

#### Tokenizer

In [6]:
model_name = "bert-large-uncased"

def preprocess_function(sample):
    return tokenizer(sample["query"], truncation=True, padding=True, return_tensors='pt')

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_dataset = train_valid_test_dataset.map(preprocess_function, batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

In [7]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['query', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['query', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4500
    })
})

#### Load the pretrained model

In [8]:
num_labels = 150 if not oos else 151

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, id2label = label_mapping)

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

#### Train the model using a Trainer

In [10]:
metric = load_metric("accuracy")

# TODO compute different metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# TODO hyperparameters tuning
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [11]:
# Using a GPU on GoogleColab it will takes 22 min
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,3.6996,0.515172,0.949333
2,0.2999,0.173931,0.966
3,0.0657,0.147582,0.972333


TrainOutput(global_step=2814, training_loss=0.9424602438053004, metrics={'train_runtime': 1329.1652, 'train_samples_per_second': 33.856, 'train_steps_per_second': 2.117, 'total_flos': 2663990449407264.0, 'train_loss': 0.9424602438053004, 'epoch': 3.0})

In [12]:
trainer.evaluate()

{'eval_loss': 0.14758244156837463,
 'eval_accuracy': 0.9723333333333334,
 'eval_runtime': 19.2384,
 'eval_samples_per_second': 155.938,
 'eval_steps_per_second': 9.772,
 'epoch': 3.0}

In [13]:
trainer.predict(test_dataset=tokenized_dataset["test"])

PredictionOutput(predictions=array([[ 9.44114876e+00, -5.43887496e-01, -4.13280636e-01, ...,
        -1.10862166e-01,  4.21343774e-01,  3.00723314e-01],
       [ 8.77754688e+00, -5.88021338e-01, -7.18815267e-01, ...,
        -1.74289986e-01,  3.14560920e-01,  3.42070520e-01],
       [ 9.30512333e+00, -5.32194912e-01, -5.29553175e-01, ...,
        -6.22266391e-03,  5.32308340e-01,  3.63225043e-01],
       ...,
       [-1.39880940e-01, -1.10047626e+00, -5.26050150e-01, ...,
         2.92968541e-01,  6.95031166e-01,  9.45474529e+00],
       [-4.52642292e-01, -6.40452981e-01, -6.12972021e-01, ...,
         2.61470497e-01,  6.03887379e-01,  7.21585798e+00],
       [ 9.60774496e-02, -1.34596360e+00,  1.96969375e-01, ...,
         3.16473871e-01,  6.54422820e-01,  9.05060482e+00]], dtype=float32), label_ids=array([  0,   0,   0, ..., 149, 149, 149]), metrics={'test_loss': 0.16450832784175873, 'test_accuracy': 0.9677777777777777, 'test_runtime': 29.5815, 'test_samples_per_second': 152.122, 'te

#### Saving the model and load an existing model

In [14]:
trainer.save_model()

In [15]:
tokenizer2 = AutoTokenizer.from_pretrained("./results")
model2 = AutoModelForSequenceClassification.from_pretrained("./results", num_labels=num_labels)

In [16]:
pipe = TextClassificationPipeline(model=model2, tokenizer=tokenizer2) #, return_all_scores=True)

In [17]:
pipe('set the alarm at 5 o clock')

[{'label': 'alarm', 'score': 0.9780707955360413}]

### RoBERTa-large

#### Tokenizer

In [42]:
model_name = "roberta-large"

def preprocess_function(sample):
    return tokenizer(sample["query"], truncation=True, padding=True, return_tensors='pt')

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_dataset = train_valid_test_dataset.map(preprocess_function, batched=True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

In [43]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'label', 'input_ids', 'attention_mask'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['query', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['query', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4500
    })
})

#### Load the pretrained model

In [44]:
num_labels = 150 if not oos else 151

In [45]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, id2label = label_mapping)

Downloading pytorch_model.bin:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classi

#### Train the model using a Trainer

In [46]:
metric = load_metric("accuracy")

# TODO compute different metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# TODO hyperparameters tuning
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [47]:
# Using a GPU on GoogleColab it will takes ?? min

trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,5.0507,5.014689,0.006667
2,5.0253,5.011508,0.006667
3,5.0214,5.011306,0.006667


TrainOutput(global_step=2814, training_loss=5.030576487678793, metrics={'train_runtime': 1375.0371, 'train_samples_per_second': 32.726, 'train_steps_per_second': 2.046, 'total_flos': 2929301211774336.0, 'train_loss': 5.030576487678793, 'epoch': 3.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.14758244156837463,
 'eval_accuracy': 0.9723333333333334,
 'eval_runtime': 19.2384,
 'eval_samples_per_second': 155.938,
 'eval_steps_per_second': 9.772,
 'epoch': 3.0}

In [None]:
trainer.predict(test_dataset=tokenized_dataset["test"])

PredictionOutput(predictions=array([[ 9.44114876e+00, -5.43887496e-01, -4.13280636e-01, ...,
        -1.10862166e-01,  4.21343774e-01,  3.00723314e-01],
       [ 8.77754688e+00, -5.88021338e-01, -7.18815267e-01, ...,
        -1.74289986e-01,  3.14560920e-01,  3.42070520e-01],
       [ 9.30512333e+00, -5.32194912e-01, -5.29553175e-01, ...,
        -6.22266391e-03,  5.32308340e-01,  3.63225043e-01],
       ...,
       [-1.39880940e-01, -1.10047626e+00, -5.26050150e-01, ...,
         2.92968541e-01,  6.95031166e-01,  9.45474529e+00],
       [-4.52642292e-01, -6.40452981e-01, -6.12972021e-01, ...,
         2.61470497e-01,  6.03887379e-01,  7.21585798e+00],
       [ 9.60774496e-02, -1.34596360e+00,  1.96969375e-01, ...,
         3.16473871e-01,  6.54422820e-01,  9.05060482e+00]], dtype=float32), label_ids=array([  0,   0,   0, ..., 149, 149, 149]), metrics={'test_loss': 0.16450832784175873, 'test_accuracy': 0.9677777777777777, 'test_runtime': 29.5815, 'test_samples_per_second': 152.122, 'te

#### Saving the model and load an existing model

In [None]:
trainer.save_model()

In [None]:
tokenizer2 = AutoTokenizer.from_pretrained("./results")
model2 = AutoModelForSequenceClassification.from_pretrained("./results", num_labels=num_labels)

In [None]:
pipe = TextClassificationPipeline(model=model2, tokenizer=tokenizer2) #, return_all_scores=True)

In [None]:
pipe('set the alarm at 5 o clock')

[{'label': 'alarm', 'score': 0.9780707955360413}]

## Domain Adaptation
Fine tuning the language model on our specific dataset. Models used:
- *Distilled BERT*
- *BERT-large*

### Distilled BERT

In [None]:
model_checkpoint = "distilbert-base-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

 #### Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def tokenize_function(examples):
    result = tokenizer(examples["query"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

tokenized_datasets = train_valid_test_dataset.map(
    tokenize_function, batched=True, remove_columns=["query", "label"]
)
tokenized_datasets

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 4500
    })
})

#### Pre-processing dataset

In [None]:
chunk_size = 8

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 20691
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 4112
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 6116
    })
})

In [None]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
'''
# JUST TO SEE HOW MASKING WORKS
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")'''

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] what [MASK] [MASK] i use to [MASK]'

'>>> i love you if i were [MASK] italian'


#### HugginFace login

In [None]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#### Fine-tune

In [None]:
batch_size = 64
# Show the training loss with every epoch
logging_steps = len(lm_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-clinc150",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/FilippoComastri/distilbert-base-uncased-finetuned-clinc150 into local empty directory.


Download file pytorch_model.bin:   0%|          | 16.5k/256M [00:00<?, ?B/s]

Download file runs/May01_22-40-08_29ed72ba3bb4/events.out.tfevents.1682980827.29ed72ba3bb4.1039.0: 100%|######…

Download file training_args.bin: 100%|##########| 3.56k/3.56k [00:00<?, ?B/s]

Download file runs/May01_22-40-08_29ed72ba3bb4/1682980827.0536504/events.out.tfevents.1682980827.29ed72ba3bb4.…

Clean file runs/May01_22-40-08_29ed72ba3bb4/events.out.tfevents.1682980827.29ed72ba3bb4.1039.0:  18%|#8       …

Clean file training_args.bin:  28%|##8       | 1.00k/3.56k [00:00<?, ?B/s]

Clean file runs/May01_22-40-08_29ed72ba3bb4/1682980827.0536504/events.out.tfevents.1682980827.29ed72ba3bb4.103…

Clean file pytorch_model.bin:   0%|          | 1.00k/256M [00:00<?, ?B/s]

In [None]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 319.56


In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,3.7299,3.183496
2,3.0921,2.944874
3,2.8778,2.864294


TrainOutput(global_step=972, training_loss=3.2325474980436724, metrics={'train_runtime': 90.4486, 'train_samples_per_second': 686.279, 'train_steps_per_second': 10.746, 'total_flos': 128569822536672.0, 'train_loss': 3.2325474980436724, 'epoch': 3.0})

In [None]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 16.99


In [None]:
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/256M [00:00<?, ?B/s]

Upload file runs/May02_10-20-51_a74ed6bdf753/events.out.tfevents.1683023084.a74ed6bdf753.247.3:   0%|         …

Upload file runs/May02_10-20-51_a74ed6bdf753/events.out.tfevents.1683023390.a74ed6bdf753.247.5:   0%|         …

To https://huggingface.co/FilippoComastri/distilbert-base-uncased-finetuned-clinc150
   4d98e40..6af0e14  main -> main

   4d98e40..6af0e14  main -> main

To https://huggingface.co/FilippoComastri/distilbert-base-uncased-finetuned-clinc150
   6af0e14..438128b  main -> main

   6af0e14..438128b  main -> main



'https://huggingface.co/FilippoComastri/distilbert-base-uncased-finetuned-clinc150/commit/6af0e140452b54dd0afd81be2078db6669423aed'

#### Testing the fine-tuned LM

In [None]:
text = "Set the [MASK] at 5 am"
#text = input()
mask_filler = pipeline(
    "fill-mask", model="FilippoComastri/distilbert-base-uncased-finetuned-clinc150"
)
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

>>> set the timer at 5 am
>>> set the alarm at 5 am
>>> set the clock at 5 am
>>> set the temperature at 5 am
>>> set the calendar at 5 am


#### Fine tune the classification head

In [None]:
checkpoint = 'FilippoComastri/distilbert-base-uncased-finetuned-clinc150'
num_labels = 150 if not oos else 151

def preprocess_function(sample):
    return tokenizer(sample["query"], truncation=True, padding=True, return_tensors='pt')

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels, id2label = label_mapping)

tokenized_dataset = train_valid_test_dataset.map(preprocess_function, batched=True)

metric = load_metric("accuracy")

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at FilippoComastri/distilbert-base-uncased-finetuned-clinc150 were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at FilippoComastri/distilbert-base-uncased-finetuned-clinc150 and are newly

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

In [None]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,4.2023,1.822378,0.865667
2,1.359,0.683004,0.931
3,0.5732,0.489556,0.943333


TrainOutput(global_step=2814, training_loss=1.7364569354243709, metrics={'train_runtime': 204.1685, 'train_samples_per_second': 220.406, 'train_steps_per_second': 13.783, 'total_flos': 379476929590560.0, 'train_loss': 1.7364569354243709, 'epoch': 3.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.4895559847354889,
 'eval_accuracy': 0.9433333333333334,
 'eval_runtime': 2.7752,
 'eval_samples_per_second': 1081.005,
 'eval_steps_per_second': 67.743,
 'epoch': 3.0}

In [None]:
trainer.predict(test_dataset=tokenized_dataset["test"])

PredictionOutput(predictions=array([[ 4.0404058, -2.3363764, -3.3984244, ..., -3.6140785, -4.9866443,
        -2.687481 ],
       [ 3.2618072, -2.408392 , -3.6718779, ..., -4.0112453, -4.892006 ,
        -3.0774589],
       [ 1.5863073, -2.8651512, -3.9078455, ..., -4.481331 , -5.5019245,
        -3.7027879],
       ...,
       [-2.888769 , -1.6661367, -4.8878207, ..., -3.4183831, -1.473064 ,
         3.6726696],
       [-2.7909164, -1.7764937, -4.6082263, ..., -2.9048455, -1.4416375,
         2.591952 ],
       [-2.603976 , -1.9942297, -4.6437488, ..., -3.1800025, -1.8952566,
         3.680227 ]], dtype=float32), label_ids=array([  0,   0,   0, ..., 149, 149, 149]), metrics={'test_loss': 0.5213555693626404, 'test_accuracy': 0.9344444444444444, 'test_runtime': 4.528, 'test_samples_per_second': 993.816, 'test_steps_per_second': 62.279})

In [None]:
trainer.save_model()

In [None]:
tokenizer2 = AutoTokenizer.from_pretrained("./results")
model2 = AutoModelForSequenceClassification.from_pretrained("./results", num_labels=num_labels)

In [None]:
pipe = TextClassificationPipeline(model=model2, tokenizer=tokenizer2) #, return_all_scores=True)

In [None]:
pipe('tell how much money i have')

[{'label': 'LABEL_145', 'score': 0.3113534450531006}]

### BERT - large

In [18]:
model_checkpoint = "bert-large-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 #### Tokenizer

In [19]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def tokenize_function(examples):
    result = tokenizer(examples["query"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result

tokenized_datasets = train_valid_test_dataset.map(
    tokenize_function, batched=True, remove_columns=["query", "label"]
)
tokenized_datasets

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 15000
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 4500
    })
})

#### Pre-processing dataset

In [20]:
chunk_size = 8

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [21]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 20691
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 4112
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 6116
    })
})

In [22]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
'''
# JUST TO SEE HOW MASKING WORKS
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")'''

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> [CLS] what [MASK] [MASK] i use to [MASK]'

'>>> i love you if i were [MASK] italian'


#### HugginFace login

In [23]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#### Fine-tune

In [24]:
batch_size = 64
# Show the training loss with every epoch
logging_steps = len(lm_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-clinc150",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/FilippoComastri/bert-large-uncased-finetuned-clinc150 into local empty directory.


In [25]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


>>> Perplexity: 2459.14


In [26]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,3.6666,2.95891
2,2.8829,2.75073
3,2.6216,2.625599


TrainOutput(global_step=972, training_loss=3.05598468937501, metrics={'train_runtime': 319.5209, 'train_samples_per_second': 194.269, 'train_steps_per_second': 3.042, 'total_flos': 903962426897376.0, 'train_loss': 3.05598468937501, 'epoch': 3.0})

In [27]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 14.23


In [28]:
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/1.25G [00:00<?, ?B/s]

Upload file runs/May04_11-34-09_72030e5b1bea/events.out.tfevents.1683200954.72030e5b1bea.163.5:   0%|         …

Upload file runs/May04_11-34-09_72030e5b1bea/events.out.tfevents.1683200077.72030e5b1bea.163.3:   0%|         …

To https://huggingface.co/FilippoComastri/bert-large-uncased-finetuned-clinc150
   a6cd4fa..426be88  main -> main

   a6cd4fa..426be88  main -> main

To https://huggingface.co/FilippoComastri/bert-large-uncased-finetuned-clinc150
   426be88..2cc5e25  main -> main

   426be88..2cc5e25  main -> main



'https://huggingface.co/FilippoComastri/bert-large-uncased-finetuned-clinc150/commit/426be8802e959e4feeb4028cf3d4bb69dc629873'

#### Testing the fine-tuned LM

In [30]:
text = "Set the [MASK] at 5 am"
#text = input()
mask_filler = pipeline(
    "fill-mask", model="FilippoComastri/bert-large-uncased-finetuned-clinc150"
)
preds = mask_filler(text)

for pred in preds:
    print(f">>> {pred['sequence']}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

>>> set the alarm at 5 am
>>> set the timer at 5 am
>>> set the reminder at 5 am
>>> set the time at 5 am
>>> set the meeting at 5 am


#### Fine tune the classification head

In [34]:
checkpoint = 'FilippoComastri/bert-large-uncased-finetuned-clinc150'
num_labels = 150 if not oos else 151

def preprocess_function(sample):
    return tokenizer(sample["query"], truncation=True, padding=True, return_tensors='pt')

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels, id2label = label_mapping)

tokenized_dataset = train_valid_test_dataset.map(preprocess_function, batched=True)

metric = load_metric("accuracy")

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at FilippoComastri/bert-large-uncased-finetuned-clinc150 were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model check

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

In [35]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,2.7724,0.267532,0.960667
2,0.1393,0.150557,0.971333
3,0.0399,0.147491,0.972


TrainOutput(global_step=2814, training_loss=0.626510178187085, metrics={'train_runtime': 1343.0063, 'train_samples_per_second': 33.507, 'train_steps_per_second': 2.095, 'total_flos': 2663990449407264.0, 'train_loss': 0.626510178187085, 'epoch': 3.0})

In [36]:
trainer.evaluate()

{'eval_loss': 0.14749076962471008,
 'eval_accuracy': 0.972,
 'eval_runtime': 16.8884,
 'eval_samples_per_second': 177.637,
 'eval_steps_per_second': 11.132,
 'epoch': 3.0}

In [37]:
trainer.predict(test_dataset=tokenized_dataset["test"])

PredictionOutput(predictions=array([[ 9.903889  , -0.278777  ,  0.08616155, ..., -0.11709055,
        -0.77504003, -0.09273793],
       [ 8.398007  , -0.26689127, -0.351371  , ..., -0.5314776 ,
        -0.65870255, -0.01120482],
       [ 9.599154  , -0.27195373,  0.12788029, ..., -0.27271104,
        -0.7123789 , -0.17122951],
       ...,
       [ 0.05530655, -0.4118124 , -0.8861427 , ...,  0.03686897,
         0.34357184, 10.077849  ],
       [-0.15063727, -0.46834394, -0.8095364 , ..., -0.21227354,
         0.64669514,  8.818914  ],
       [-0.07972962, -0.45698896, -0.83049625, ..., -0.15059598,
         0.57302547,  9.726615  ]], dtype=float32), label_ids=array([  0,   0,   0, ..., 149, 149, 149]), metrics={'test_loss': 0.1551254540681839, 'test_accuracy': 0.9682222222222222, 'test_runtime': 26.0719, 'test_samples_per_second': 172.6, 'test_steps_per_second': 10.816})

In [38]:
trainer.save_model()

In [39]:
tokenizer2 = AutoTokenizer.from_pretrained("./results")
model2 = AutoModelForSequenceClassification.from_pretrained("./results", num_labels=num_labels)

In [40]:
pipe = TextClassificationPipeline(model=model2, tokenizer=tokenizer2) #, return_all_scores=True)

In [41]:
pipe('tell how much money i have')

[{'label': 'balance', 'score': 0.9844793677330017}]