# DistilBERT Regression Fine-Tuning (LLM Imputation Study)

This notebook fine-tunes `distilbert-base-uncased` to predict the human toxicity score (`toxicity_human`) on any imputed dataset.

> **GPU required**: When running on Kaggle/Colab, enable GPU in the runtime settings.

## 0. Environment Setup
- Enable GPU (Kaggle: *Settings → Accelerators → GPU*).
- Install required packages (Transformers ≥ 4.40).

In [None]:
!pip install -q transformers datasets evaluate accelerate scikit-learn

In [None]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from scipy import stats
from sklearn.metrics import f1_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
_backend = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_backend)
print(f'Using device: {device}')

## 1. Configuration
Update these paths to match your runtime environment.

- `DATA_ROOT`: directory containing `train.parquet`, `train_text_imputed_*.parquet`, etc.
- `IMPUTED_FILE`: choose which imputed training set to use.
- `TEST_FILE`: optional test parquet; set to `None` if unavailable.

In [None]:
# Directory holding the parquet files; joined with IMPUTED_FILE / TEST_FILE when loading.
DATA_ROOT = Path('/kaggle/input/llmimputation/data/parquet')  # TODO: update path
# File name (relative to DATA_ROOT) of the imputed training set to fine-tune on.
IMPUTED_FILE = 'train_text_imputed_mar_knn_30.parquet'        # TODO: set desired dataset
TEST_FILE = 'test.parquet'                                   # set to None if unavailable
# Column names read from the parquet frames: model input text and regression target.
TEXT_COLUMN = 'text'
LABEL_COLUMN = 'toxicity_human'
# Bin edges handed to np.digitize (four edges -> five bins). They are used both to
# stratify the train/validation split and to discretise scores for macro F1.
LABEL_THRESHOLDS = [1.5, 2.5, 3.5, 4.5]  # for macro F1 calculation
RANDOM_STATE = 42  # seed for the train/validation split
VALID_SIZE = 0.1  # fraction of the training rows held out for validation

## 2. Load Data

In [None]:
# Locate the selected imputed training set and fail fast, with a clear error,
# if the configured path is wrong.
train_path = DATA_ROOT / IMPUTED_FILE
if not train_path.exists():
    # Raise explicitly rather than `assert`: assertions are stripped when
    # Python runs with -O, which would defer the failure to read_parquet.
    raise FileNotFoundError(f'Missing train file: {train_path}')

df_train = pd.read_parquet(train_path)
print(f'Train rows: {len(df_train):,}')

# The test split is optional: `df_test` stays None when TEST_FILE is unset
# (None / empty) or when the file does not exist under DATA_ROOT.
df_test = None
if TEST_FILE:
    test_path = DATA_ROOT / TEST_FILE
    if test_path.exists():
        df_test = pd.read_parquet(test_path)
        print(f'Test rows: {len(df_test):,}')
    else:
        print(f"Warning: TEST_FILE '{TEST_FILE}' not found; skipping test split.")

In [None]:
# Model input (text, coerced to str) and regression target (float32) taken
# from the training frame.
X = df_train[TEXT_COLUMN].astype(str)
y = df_train[LABEL_COLUMN].astype(np.float32)

# Stratify on the binned target so both splits keep a similar score
# distribution; the bins come from LABEL_THRESHOLDS.
_strata = np.digitize(y, LABEL_THRESHOLDS)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    stratify=_strata,
    test_size=VALID_SIZE,
    random_state=RANDOM_STATE,
)

# Wrap each split in a Hugging Face Dataset with the original column names.
train_dataset = Dataset.from_dict({
    TEXT_COLUMN: X_train,
    LABEL_COLUMN: y_train,
})
valid_dataset = Dataset.from_dict({
    TEXT_COLUMN: X_valid,
    LABEL_COLUMN: y_valid,
})

datasets_dict = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
})

# The test split is only added when a test frame was actually loaded.
if df_test is not None:
    test_dataset = Dataset.from_dict({
        TEXT_COLUMN: df_test[TEXT_COLUMN].astype(str),
        LABEL_COLUMN: df_test[LABEL_COLUMN].astype(np.float32),
    })
    datasets_dict['test'] = test_dataset

# Bare expression: displayed as the cell output.
datasets_dict

## 3. Tokenization

In [None]:
# Checkpoint identifier; the tokenizer is loaded from it here, and the same
# name is reused when the model is loaded in the Trainer setup cell.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(example):
    """Tokenize the text column and attach the regression targets.

    Used with ``datasets_dict.map(..., batched=True)``, so despite the
    parameter name ``example`` holds a batch: each column maps to a list of
    values.

    Args:
        example: Mapping with the ``TEXT_COLUMN`` and ``LABEL_COLUMN`` entries.

    Returns:
        The tokenizer output with an added ``'labels'`` entry holding the
        values of ``LABEL_COLUMN``.
    """
    # No padding here: sequences are only truncated to 256 tokens. Padding is
    # left to the data collator built alongside the tokenized datasets.
    tokenizer_options = {
        'padding': False,
        'truncation': True,
        'max_length': 256,
    }
    encoded = tokenizer(example[TEXT_COLUMN], **tokenizer_options)
    encoded['labels'] = example[LABEL_COLUMN]
    return encoded

# Pads each batch at collation time, since `preprocess` does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Tokenize every split in batches and drop the raw text/label columns, which
# `preprocess` has replaced with tokenizer outputs plus 'labels'.
_raw_columns = [TEXT_COLUMN, LABEL_COLUMN]
tokenized_datasets = datasets_dict.map(
    preprocess,
    batched=True,
    remove_columns=_raw_columns,
)

# Bare expression: displayed as the cell output.
tokenized_datasets

## 4. Metrics
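The model is trained with an MSE loss. In addition we report MAE, RMSE, and Pearson/Spearman correlations, plus a macro F1 obtained by discretising both predictions and labels with `LABEL_THRESHOLDS` (the same thresholds used to stratify the train/validation split).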

In [None]:
def compute_metrics(eval_pred):
    """Compute regression and binned-classification metrics for the Trainer.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of arrays. Both are
            flattened to 1-D before any metric is computed.

    Returns:
        dict mapping metric name to a plain Python float:
        ``mae``, ``rmse``, ``pearson``, ``spearman`` and
        ``macro_f1_from_regression`` (macro F1 after binning both arrays
        with ``LABEL_THRESHOLDS``).
    """
    predictions, labels = eval_pred
    predictions = predictions.reshape(-1)
    labels = labels.reshape(-1)

    mae = mean_absolute_error(labels, predictions)
    # `mean_squared_error(..., squared=False)` was deprecated in scikit-learn
    # 1.4 and removed in 1.6; taking the square root works on every version.
    rmse = math.sqrt(mean_squared_error(labels, predictions))
    pearson = stats.pearsonr(labels, predictions).statistic
    spearman = stats.spearmanr(labels, predictions).statistic

    # Discretise continuous scores into bins so a classification-style
    # macro F1 can be reported alongside the regression metrics.
    true_bins = np.digitize(labels, LABEL_THRESHOLDS)
    pred_bins = np.digitize(predictions, LABEL_THRESHOLDS)
    macro_f1 = f1_score(true_bins, pred_bins, average='macro')

    # Cast NumPy scalars to built-in floats so the result is JSON-friendly
    # regardless of the dtype the metric functions return.
    return {
        'mae': float(mae),
        'rmse': float(rmse),
        'pearson': float(pearson),
        'spearman': float(spearman),
        'macro_f1_from_regression': float(macro_f1),
    }

## 5. Trainer Setup

In [None]:
import inspect

# Sequence-classification head with a single output, used here as a
# regression head for the continuous toxicity score.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)
model.to(device)

# Optimisation hyper-parameters.
batch_size = 16  # larger batch for DistilBERT
gradient_accumulation_steps = 1
num_train_epochs = 3
warmup_ratio = 0.1
learning_rate = 3e-5

# Transformers 4.41 renamed `evaluation_strategy` to `eval_strategy` and newer
# releases reject the old keyword. Pick whichever name the installed version
# accepts so the cell runs on 4.40 as well as on current releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./distilbert_regression_outputs',
    save_strategy='epoch',
    # Evaluate and checkpoint every epoch, then restore the checkpoint with
    # the lowest validation RMSE once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model='eval_rmse',
    greater_is_better=False,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    warmup_ratio=warmup_ratio,
    weight_decay=0.01,
    logging_steps=100,
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    report_to='none',
    **{_eval_strategy_key: 'epoch'},
)

# Likewise, newer Trainer versions take the tokenizer as `processing_class`
# and deprecate the `tokenizer` keyword.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = 'processing_class' if 'processing_class' in _trainer_arg_names else 'tokenizer'

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)

## 6. Training

In [None]:
# Run fine-tuning, then write the resulting model to a separate directory
# from the per-epoch checkpoints in the Trainer's output_dir.
train_result = trainer.train()
trainer.save_model('distilbert_regression_best')  # saves tokenizer + model
# Bare expression: the training metrics are displayed as the cell output.
train_result.metrics

## 7. Evaluation

In [None]:
# Score the validation split with `compute_metrics`; `eval_metrics` is
# reused later when the metrics are written to JSON.
eval_metrics = trainer.evaluate(tokenized_datasets['validation'])
# Bare expression: displayed as the cell output.
eval_metrics

In [None]:
# Evaluate on the optional test split. `test_metrics` is always defined
# (None when no test split was loaded) so later cells can check it safely.
test_metrics = None
if 'test' in tokenized_datasets:
    test_metrics = trainer.evaluate(tokenized_datasets['test'], metric_key_prefix='test')

# Notebooks only display a bare expression when it is the last top-level
# statement of the cell; inside the `if` body it was silently discarded.
test_metrics

## 8. Save Metrics & Predictions

In [None]:
def save_dict_to_json(data, path):
    """Write ``data`` to ``path`` as JSON indented by two spaces.

    Args:
        data: JSON-serialisable object (here, a dict of metrics).
        path: Destination file; any existing content is overwritten.
    """
    import json

    serialized = json.dumps(data, indent=2)
    with open(path, 'w') as handle:
        handle.write(serialized)

# Store the validation metrics in the same directory the Trainer uses for
# its checkpoints, creating it if necessary.
save_dir = Path('./distilbert_regression_outputs')
save_dir.mkdir(parents=True, exist_ok=True)
metrics_path = save_dir / 'eval_metrics.json'
save_dict_to_json(eval_metrics, metrics_path)

# Run inference on the validation split and export predictions next to the
# reference labels as a two-column CSV (one row per example).
predictions = trainer.predict(tokenized_datasets['validation'])
pred_column = predictions.predictions.reshape(-1)
label_column = predictions.label_ids
pred_label_table = np.vstack([pred_column, label_column]).T
np.savetxt(
    save_dir / 'validation_predictions.csv',
    pred_label_table,
    delimiter=',',
    header='pred,label',
    comments='',  # write the header line without the default '# ' prefix
)
print('Saved metrics and validation predictions to', save_dir)