# Notebook Overview

In this notebook, I explore a a fine tuning task for a token classification problem using the BERT-base-cased model.

Use Case:
- The goal is to classify words in a text line as either part of a term or a definition.
- All training samples were collected from a Star Wars encyclopedia and are used solely for educational purposes.

# Module Import

In [None]:
import pandas as pd

#Utility classes from the Hugging Face transformers library
from transformers import DataCollatorForTokenClassification #Prepares input data for the model
from transformers import AutoModelForTokenClassification #Loads a pretrained model suited for token classification
from transformers import TrainingArguments #Class to define all the hyperparameters and configuration for training the model
from transformers import Trainer #API for training Pytorch models
from transformers import AutoTokenizer #

from datasets import Dataset


# Data Load and Preprocessing

In [None]:
term_def_df = pd.read_csv('./sample_train.csv',index_col='idx')
test_df = pd.read_csv('./test.csv',index_col='idx')

term_def_df.drop(columns=['block','page'],inplace=True)
term_def_df['text'] = term_def_df['term'].values + ' ' + term_def_df['definition'].values

test_df.drop(columns=['block','term','page'],inplace=True)
test_df.rename(columns={'definition':'text'},inplace=True)


# Tokenizer Initialization

In [None]:

#Bert model works with a maximum of 512 characters, for text longer than that there are 3 options:
#1) In a posterior function it can be included the option to truncate
#2) Drop the samples with more then 512 characters
#3) Split into chunks
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

# Label Mapping

In [5]:

id2label = {
    0: "O",        # Outside of a term/definition
    1: "B-TERM",   # Beginning of a term
    2: "I-TERM",   # Inside a term
    3: "B-DEF",    # Beginning of a definition
    4: "I-DEF",    # Inside a definition
}
label2id = {label: id for id, label in id2label.items()}
num_labels = len(id2label)

# Tokenize and Aling Labels

In [6]:

def tokenize_and_align_labels(row):

    full_text = row['text']
    term_text = row['term']
    def_text = row['definition']

    tokenized_inputs = tokenizer(
        full_text, 
        truncation=True, 
        max_length=512, #Truncate the text to 512 characters
        return_offsets_mapping=True
    )
    
    labels = [-100] * len(tokenized_inputs['input_ids'])  # -100 is used to ignore tokens in loss calculation
    
    # Find the start and end character positions of the term and definition in the full text
    term_start_char = full_text.find(term_text)
    term_end_char = term_start_char + len(term_text)
    def_start_char = full_text.find(def_text, term_end_char)
    def_end_char = def_start_char + len(def_text)
    
    for i,offset in enumerate(tokenized_inputs['offset_mapping']):
        #Retrieve the start and end character positions of the token respective to the full text
        token_start_char, token_end_char = offset
        
        if token_start_char == token_end_char and token_start_char == 0 and i > 0:
            # If the token is empty (e.g., a space or punctuation), skip it
            continue
        
        #Compare the token's character positions with the term and definition positions to assign corresponding labels
        if  term_start_char <= token_start_char < term_end_char:
            if token_start_char == term_start_char:
                labels[i] = label2id['B-TERM']
            else:
                labels[i] = label2id['I-TERM']
        elif def_start_char <= token_start_char < def_end_char:
            if token_start_char == def_start_char:
                labels[i] = label2id['B-DEF']
            else:
                labels[i] = label2id['I-DEF']
        else:
            labels[i] = label2id['O']
    
    tokenized_inputs['labels'] = labels
    tokenized_inputs.pop('offset_mapping')  # Remove offset_mapping as it's not needed for training
    tokenized_inputs.pop('token_type_ids')  # Remove offset_mapping as it's not needed for training
    
    return tokenized_inputs
    



In [None]:
processed = term_def_df.apply(tokenize_and_align_labels,axis=1).tolist()
dataset_dict = {key: [d[key] for d in processed] for key in processed[0].keys()}

In [None]:

term_def_dataset = Dataset.from_dict(dataset_dict)
term_def_dataset = term_def_dataset.train_test_split(test_size=0.2,seed=42)
term_def_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

# Data Collation

In [None]:

data_collator = DataCollatorForTokenClassification(tokenizer)

# Model Definition

In [10]:


model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-cased', 
    num_labels= num_labels,
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import TrainerCallback

class PrintCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        print(f"[Step {state.global_step}] {logs}")

In [12]:
args = TrainingArguments(
    output_dir='./term_def_model',
    eval_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_steps=10,
    load_best_model_at_end=True,
    #push_to_hub=False,
    disable_tqdm=False,
    fp16=True,
    no_cuda=False # Set to True if you don't have a GPU
)

trainer = Trainer(
    model = model,
    args = args,
    train_dataset = term_def_dataset['train'],
    eval_dataset = term_def_dataset['test'],
    #processing_class = tokenizer,
    data_collator=data_collator,
    callbacks=[PrintCallback()]
)

# Model Training

In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0125,0.014588
2,0.0093,0.009608
3,0.0007,0.011521


[Step 10] {'loss': 0.9207, 'grad_norm': 1.6174964904785156, 'learning_rate': 4.9e-05, 'epoch': 0.1}
[Step 20] {'loss': 0.1327, 'grad_norm': 0.4402577579021454, 'learning_rate': 4.7333333333333336e-05, 'epoch': 0.2}
[Step 30] {'loss': 0.0793, 'grad_norm': 0.40364736318588257, 'learning_rate': 4.566666666666667e-05, 'epoch': 0.3}
[Step 40] {'loss': 0.0233, 'grad_norm': 0.9799392223358154, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.4}
[Step 50] {'loss': 0.0077, 'grad_norm': 0.03879417106509209, 'learning_rate': 4.233333333333334e-05, 'epoch': 0.5}
[Step 60] {'loss': 0.0063, 'grad_norm': 0.587995171546936, 'learning_rate': 4.066666666666667e-05, 'epoch': 0.6}
[Step 70] {'loss': 0.0084, 'grad_norm': 0.5140938758850098, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.7}
[Step 80] {'loss': 0.0064, 'grad_norm': 0.8027336597442627, 'learning_rate': 3.733333333333334e-05, 'epoch': 0.8}
[Step 90] {'loss': 0.0132, 'grad_norm': 0.6628711223602295, 'learning_rate': 3.566666666666667e-05

TrainOutput(global_step=300, training_loss=0.04265157299737136, metrics={'train_runtime': 87.5969, 'train_samples_per_second': 27.398, 'train_steps_per_second': 3.425, 'total_flos': 293913746976720.0, 'train_loss': 0.04265157299737136, 'epoch': 3.0})

# Model Testing

In [None]:
from transformers import pipeline
best_ckpt = trainer.state.best_model_checkpoint
print('Best checkpoint: {}'.format(best_ckpt))

if best_ckpt:
    model = AutoModelForTokenClassification.from_pretrained(best_ckpt)
    tokenizer = AutoTokenizer.from_pretrained(best_ckpt)

pipe = pipeline(
    'token-classification',
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=0 #0 for GPU and -1 for CPU
)

Device set to use cuda:0


Best checkpoint: ./term_def_model/checkpoint-200


In [15]:
raw_texts = list(test_df['text'])
predictions = pipe(raw_texts)

In [16]:
records = []

for i, (text,pred) in enumerate(zip(raw_texts,predictions)):
    terms_entities = [p for p in pred if p['entity_group'] == 'TERM']
    defs_entities = [p for p in pred if p['entity_group'] == 'DEF']
    
    extrated_term = ""
    extrated_def = ""
    
    if terms_entities:
        extrated_term = terms_entities[0]['word']
    
    if defs_entities:
        extrated_def = defs_entities[0]['word']
        
    records.append({
        'original_text': text,
        'predicted_term': extrated_term,
        'predicted_definition': extrated_def
    })
    


In [17]:
output_df = pd.DataFrame(records)
output_df.head(30)

Unnamed: 0,original_text,predicted_term,predicted_definition
0,AS assassin droid A repurposed assassin droid ...,AS assassin droid,A repurposed assassin droid programmed to act ...
1,A99 Aquata Breather A compact breathing appara...,A99 Aquata B,##reath
2,"AA-23, Detention Block The block that held Pri...","AA - 23, Detention Block",The block that held Princess Leia Organa aboar...
3,Aarrba the Hutt A kindhearted Hutt who owned a...,Aarrba the Hutt,A kindhearted Hutt who owned a starship dock a...
4,AAT (armored assault tank) The front line of T...,AAT ( armored assault tank ),The front line of Trade Federation armored inf...
5,Aavman Extravagance The manufacturer of such l...,Aavman Extravagance,The manufacturer of such luxury vessels as the...
6,Abran system Site of the Abran Belt and the B’...,Abran system,Site of the Abran Belt and the B ’ Knos mining...
7,Abrion sector A bread basket of sorts for the ...,Abrion sector,"A bread basket of sorts for the galaxy, the Ab..."
8,Abyssin grafting patch A medical supply from K...,Abyssin grafting patch,A medical supply from Kirgalis Pharmaceutical ...
9,"acceleration chair, acceleration couch A gener...","acceleration chair, acceleration couch",A generic term for g - force - absorbing seats...
