In [1]:
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model
import joblib


In [2]:
train_df = pd.read_csv("data/processed/train.csv")
test_df = pd.read_csv("data/processed/test.csv")

train_df.head()


Unnamed: 0,text,label
0,"race context the 2025 spanish grand prix, offi...",5
1,the pandemic-era austrian gp marked f1's retur...,8
2,race context the 2023 saudi arabian grand prix...,7
3,"race context the 2025 miami grand prix, offici...",5
4,"race context the 2025 austrian grand prix, off...",2


In [3]:
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(train_df['label'].unique())
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_lin", "v_lin"],   # DistilBERT attention layers
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 892,425 || all params: 67,852,818 || trainable%: 1.3152


In [5]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256
    )


In [6]:
# Convert pandas DataFrames to Hugging Face Datasets and tokenize
train_data = Dataset.from_pandas(train_df.reset_index(drop=True))
test_data = Dataset.from_pandas(test_df.reset_index(drop=True))

# Tokenize datasets
train_data_tokenized = train_data.map(tokenize, batched=True)
test_data_tokenized = test_data.map(tokenize, batched=True)


Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Build training args as a dict, then filter to supported kwargs
training_kwargs = {
    "output_dir": "distilbert_lora_finetuned",
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "learning_rate": 2e-4,
    "num_train_epochs": 4,
    "logging_steps": 10,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "skip_memory_metrics": True,
    "push_to_hub": False,
}

# Filter out unsupported kwargs for compatibility with different transformers versions
from inspect import signature
try:
    sig = signature(TrainingArguments.__init__)
    supported = set(sig.parameters.keys()) - {"self", "args", "kwargs"}
    filtered_kwargs = {k: v for k, v in training_kwargs.items() if k in supported}
except Exception:
    # Fallback: if inspection fails, pass the full dict and let TrainingArguments raise a clear error
    filtered_kwargs = training_kwargs

training_args = TrainingArguments(**filtered_kwargs)



In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_tokenized,
    eval_dataset=test_data_tokenized,
    data_collator=data_collator
)
trainer.train()




Step,Training Loss
10,2.1886
20,2.1164




TrainOutput(global_step=20, training_loss=2.1524888038635255, metrics={'train_runtime': 34.2001, 'train_samples_per_second': 4.211, 'train_steps_per_second': 0.585, 'total_flos': 9736233467904.0, 'train_loss': 2.1524888038635255, 'epoch': 4.0})

In [9]:
trainer.save_model("models/distilbert_lora")
tokenizer.save_pretrained("models/distilbert_lora")


('models/distilbert_lora\\tokenizer_config.json',
 'models/distilbert_lora\\special_tokens_map.json',
 'models/distilbert_lora\\vocab.txt',
 'models/distilbert_lora\\added_tokens.json',
 'models/distilbert_lora\\tokenizer.json')

In [10]:
def predict_label(text):
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=256
    )

    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()
    return pred


In [11]:
le = joblib.load("models/label_encoder.pkl")

def predict_driver(text):
    label = predict_label(text)
    return le.inverse_transform([label])[0]


In [12]:
sample = """
McLaren delivered by far the strongest performance of the entire weekend, with Lando Norris 
completely dominating every practice session. Norris set the fastest laps in FP1, FP2, and 
FP3 by a large margin and showed unmatched consistency during long-run simulations. His 
average race-pace was over four tenths quicker than any other driver on track.

Oscar Piastri could not match Norris's speed, and the McLaren engineers confirmed that 
Norris's setup was the best they have produced all season. Ferrari struggled severely 
with tyre degradation, and both Leclerc and Sainz were more than half a second slower in 
their high-fuel stints.

Mercedes were completely off the pace the entire weekend. George Russell reported major 
balance issues, while Lewis Hamilton lost over six tenths per lap in race trim and never 
came close to Norris's pace at any point. Hamilton was nowhere near the front of the field 
in any session.

All analysts, team strategists, and tyre engineers unanimously agree that **Lando Norris 
is the clear favourite** and most likely winner of this Grand Prix based on every metric.


"""

print("Predicted Winner:", predict_driver(sample))


Predicted Winner: Lewis Hamilton
