# Fine-Tuning a BERT-based model for sentence classification

## Please refer to the respective sections in the book for further details.


## Step 1. Installing libraries and Data loading.


In [1]:
# !pip install datasets
# !pip install accelerate -U
# !pip install evaluate

## Load dataset

In [2]:
from datasets import load_dataset

ade = load_dataset("ade_corpus_v2", "Ade_corpus_v2_classification")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
ade['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 23516
})

## Step 2. Data pre-processing

### Step 2.1 Split train test dataset

In [4]:
from datasets import Dataset

ade['train'] = ade['train'].train_test_split(test_size=0.2)['train']
ade['test'] = ade['train'].train_test_split(test_size=0.2)['test']

print(f"Number of training examples: {ade['train'].num_rows}")
print(f"Number of test examples: {ade['test'].num_rows}")


Number of training examples: 18812
Number of test examples: 3763


### Step 2.2 Tokenize dataset

In [5]:
from transformers import AutoTokenizer

text_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [6]:
def tokenize_texts(examples):
    return text_tokenizer(examples["text"], truncation=True)

In [7]:
tokenized_ade = ade.map(tokenize_texts, batched=True)

Map:   0%|          | 0/18812 [00:00<?, ? examples/s]

Map:   0%|          | 0/3763 [00:00<?, ? examples/s]

In [8]:
tokenized_ade

tokenized_ade['train'] = tokenized_ade['train'].train_test_split(test_size=0.2)['train']
tokenized_ade['test'] = tokenized_ade['train'].train_test_split(test_size=0.2)['test']

print(f"Number of training examples: {tokenized_ade['train'].num_rows}")
print(f"Number of test examples: {tokenized_ade['test'].num_rows}")

Number of training examples: 15049
Number of test examples: 3010


In [9]:
from transformers import DataCollatorWithPadding

padding_collator = DataCollatorWithPadding(tokenizer=text_tokenizer)

## Step 3. Model training

### Step 3.1 Load Model

In [10]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

classification_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Step 3.2 Setup evaluation utilities

In [11]:
import evaluate

accuracy = evaluate.load("accuracy")

In [12]:
import numpy as np


def calculate_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

### Step 3.3 Fine-tuning

In [22]:
train_parameters = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

trainer = Trainer(
    model=classification_model,
    args=train_parameters,
    train_dataset=tokenized_ade["train"],
    eval_dataset=tokenized_ade["test"],
    tokenizer=text_tokenizer,
    data_collator=padding_collator,
    compute_metrics=calculate_metrics,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.297
1000,0.2048
1500,0.1478
2000,0.1112
2500,0.077
3000,0.0609
3500,0.0395
4000,0.0345
4500,0.0241


TrainOutput(global_step=4705, training_loss=0.10687520293702989, metrics={'train_runtime': 160.435, 'train_samples_per_second': 469.006, 'train_steps_per_second': 29.327, 'total_flos': 1289114510507892.0, 'train_loss': 0.10687520293702989, 'epoch': 5.0})

### Step 3.4 Save model locally

In [24]:
trainer.save_model("fine_tuned_classification_model")

## Step 4. Model Evaluation

### Step 4.1 Test inference

In [27]:
text = tokenized_ade['test'][0]['text']

from transformers import pipeline

text_classifier = pipeline("text-classification", model="fine_tuned_classification_model")
text_classifier(text)

[{'label': 'LABEL_0', 'score': 0.9995096921920776}]

In [28]:
ade["test"][0]

{'text': 'Severe acute encephalopathy following inadvertent intrathecal doxorubicin administration.',
 'label': 1}

In [29]:
text_classifier('A severe lidocaine intoxication by cutaneous absorption is described.')

[{'label': 'LABEL_0', 'score': 0.9998511075973511}]

### Step 4.2 Inference on the entire dataset

In [31]:
from transformers import pipeline
import pandas as pd

text_classifier = pipeline("text-classification", model="fine_tuned_classification_model")

test_data = ade["test"]

texts = []
true_labels = []
predicted_labels = []

for record in test_data:
    text = record["text"]
    true_label = record["label"]

    prediction = text_classifier(text)
    predicted_label = prediction[0]["label"]

    texts.append(text)
    true_labels.append(true_label)
    predicted_labels.append(predicted_label)

evaluation_results = pd.DataFrame({
    "Text": texts,
    "True Label": true_labels,
    "Predicted Label": predicted_labels
})

print(evaluation_results)


                                                   Text  True Label  \
0     Severe acute encephalopathy following inadvert...           1   
1     Despite close monitoring of therapy, he experi...           0   
2     Increased vigilance is always advised when adm...           0   
3     Myoclonus seen in the abdominal wall was segme...           0   
4     Most patients are treated conservatively becau...           0   
...                                                 ...         ...   
3758  Based on a Naranjo score of 7, this episode wa...           0   
3759  Acute abdomen due to endometriosis in a premen...           1   
3760  We report a patient with recurrent, increasing...           1   
3761  The mean follow-up was 33 months (range, 26-48...           0   
3762  We suspect that a similar immunological pathwa...           0   

     Predicted Label  
0            LABEL_1  
1            LABEL_0  
2            LABEL_0  
3            LABEL_0  
4            LABEL_0  
...      

In [32]:
evaluation_results.head()

Unnamed: 0,Text,True Label,Predicted Label
0,Severe acute encephalopathy following inadvert...,1,LABEL_1
1,"Despite close monitoring of therapy, he experi...",0,LABEL_0
2,Increased vigilance is always advised when adm...,0,LABEL_0
3,Myoclonus seen in the abdominal wall was segme...,0,LABEL_0
4,Most patients are treated conservatively becau...,0,LABEL_0


### Step 4.3 Evaluate the predictions

In [33]:
import pandas as pd


evaluation_results['Predicted Label'] = evaluation_results['Predicted Label'].map({"LABEL_0": 0, "LABEL_1": 1})

evaluation_results['True Label'] = evaluation_results['True Label'].astype(int)

correct_predictions = (evaluation_results['True Label'] == evaluation_results['Predicted Label']).sum()
total_samples = len(evaluation_results)
accuracy = correct_predictions / total_samples

print(f"Accuracy: {accuracy:.2%}")

evaluation_results.to_csv("accuracy_results.csv", index=False)


Accuracy: 98.64%


### Optional: Alternate inferencing using HuggingFace Pipeline

In [None]:
import datasets
from transformers.pipelines.pt_utils import KeyDataset
from tqdm.auto import tqdm

dataset = ade["test"]

for out in tqdm(text_classifier(KeyDataset(dataset, "text"))):
    print(out['label'])

### Optional: Alternate accuracy calculation using HuggingFace library

In [38]:
from transformers import pipeline
from datasets import load_dataset
from evaluate import evaluator
import evaluate

pipe = pipeline("text-classification", model="fine_tuned_classification_model", device=0)
data = ade["test"]
metric = evaluate.load("accuracy")

In [39]:
task_evaluator = evaluator("text-classification")

results = task_evaluator.compute(model_or_pipeline=pipe, data=data, metric=metric,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},)

print(results)

{'accuracy': 0.9864469837895297, 'total_time_in_seconds': 22.908854326999972, 'samples_per_second': 164.25963281651298, 'latency_in_seconds': 0.006087923020728135}
