# DistilBERT Regression (Multi-GPU)

This notebook fine-tunes `distilbert-base-uncased` to predict the human toxicity score (`toxicity_human`) using **multiple GPUs** (e.g. Kaggle dual T4).

> 将 `NUM_PROCESSES` 设置为 2 可在 Kaggle 上启用双 GPU；若仅有单卡，请改回 1。

## 0. Environment Setup
- 在运行环境中启用 GPU（Kaggle: *Settings → Accelerator → GPU (T4 x2)*）。
- 安装所需依赖库。

In [None]:
!pip install -q transformers datasets accelerate evaluate scikit-learn

In [None]:
import math
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from scipy import stats
from sklearn.metrics import f1_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

print(f"Available GPUs: {torch.cuda.device_count()}")

## 1. Configuration
更新下面的路径与超参数以匹配当前环境。

In [None]:
# --- Data locations -------------------------------------------------------
DATA_ROOT = Path("/kaggle/input/llmimputation/data/parquet")  # TODO: adjust for your environment
IMPUTED_FILE = "train_text_imputed_mar_knn_30.parquet"  # TODO: choose the dataset to train on
TEST_FILE = "test.parquet"  # set to None when there is no separate test set

# --- Columns and split settings -------------------------------------------
TEXT_COLUMN = "text"
LABEL_COLUMN = "toxicity_human"
# Bin edges used to discretise the continuous score (stratified split and
# the macro-F1 metric both pass these to np.digitize).
LABEL_THRESHOLDS = [1.5, 2.5, 3.5, 4.5]
RANDOM_STATE = 42
VALID_SIZE = 0.1

# --- Training hyper-parameters --------------------------------------------
BATCH_SIZE = 16  # used as the per-device batch size for train and eval
GRADIENT_ACCUMULATION_STEPS = 1
NUM_TRAIN_EPOCHS = 3
LEARNING_RATE = 3e-5
WARMUP_RATIO = 0.1
WEIGHT_DECAY = 0.01
MAX_LENGTH = 256  # tokenizer truncation length

# --- Multi-GPU settings ----------------------------------------------------
# Use 2 for Kaggle's dual T4 setup; keep 1 on a single-GPU machine.
NUM_PROCESSES = 2
# Mixed precision is enabled whenever CUDA is available.
# NOTE(review): accelerate's notebook_launcher guidance warns against touching
# torch.cuda before launching multi-GPU workers; confirm this call is safe
# when NUM_PROCESSES > 1.
USE_FP16 = torch.cuda.is_available()

## 2. Load Data

In [None]:
# Locate the training parquet and fail early when it is missing.
train_path = DATA_ROOT.joinpath(IMPUTED_FILE)
assert train_path.exists(), f"Missing train file: {train_path}"

df_train = pd.read_parquet(train_path)
print(f"Train rows: {len(df_train):,}")

# The test split is optional: it stays None when TEST_FILE is unset or the
# file is not present under DATA_ROOT.
df_test = None
if TEST_FILE:
    test_path = DATA_ROOT.joinpath(TEST_FILE)
    if not test_path.exists():
        print(f"Warning: TEST_FILE '{TEST_FILE}' not found; skipping test split.")
    else:
        df_test = pd.read_parquet(test_path)
        print(f"Test rows: {len(df_test):,}")

In [None]:
def _to_dataset(texts, labels):
    """Wrap a text column and a label column into a `datasets.Dataset`."""
    return Dataset.from_dict({TEXT_COLUMN: texts, LABEL_COLUMN: labels})


# Inputs are coerced to str, targets to float32.
X = df_train[TEXT_COLUMN].astype(str)
y = df_train[LABEL_COLUMN].astype(np.float32)

# Stratify on the binned score so the split keeps the bin proportions of y.
strata = np.digitize(y, LABEL_THRESHOLDS)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=VALID_SIZE,
    random_state=RANDOM_STATE,
    stratify=strata,
)

train_dataset = _to_dataset(X_train, y_train)
valid_dataset = _to_dataset(X_valid, y_valid)

datasets_dict = DatasetDict({'train': train_dataset, 'validation': valid_dataset})

# The test split is only added when a test frame was loaded.
if df_test is not None:
    test_dataset = _to_dataset(
        df_test[TEXT_COLUMN].astype(str),
        df_test[LABEL_COLUMN].astype(np.float32),
    )
    datasets_dict['test'] = test_dataset

datasets_dict

## 3. Tokenization

In [None]:
# Checkpoint name used for both the tokenizer and the model.
model_name = "distilbert-base-uncased"

# Tokenizer for that checkpoint; reused by preprocessing and the padding collator.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(example):
    """Tokenize the text column and attach the target column as ``labels``.

    Used with ``DatasetDict.map(..., batched=True)``, so ``example`` is a
    batch: a mapping from column name to a list of values.

    Args:
        example: Mapping containing ``TEXT_COLUMN`` and ``LABEL_COLUMN``.

    Returns:
        The tokenizer output with an extra ``labels`` entry holding the
        values of ``LABEL_COLUMN``.
    """
    # No padding here: sequences are only truncated to MAX_LENGTH, and the
    # data collator pads each batch later.
    tokenize_kwargs = {
        'padding': False,
        'truncation': True,
        'max_length': MAX_LENGTH,
    }
    encoded = tokenizer(example[TEXT_COLUMN], **tokenize_kwargs)
    encoded['labels'] = example[LABEL_COLUMN]
    return encoded

# Tokenize every split. The raw text/label columns are removed afterwards,
# leaving the tokenizer outputs plus `labels`.
raw_columns = [TEXT_COLUMN, LABEL_COLUMN]
tokenized_datasets = datasets_dict.map(
    preprocess,
    batched=True,
    remove_columns=raw_columns,
)

# Pads each batch at collation time (preprocess() does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets

## 4. Metrics
计算 MAE、RMSE 以及 Pearson/Spearman 相关系数，并将回归预测按 `LABEL_THRESHOLDS` 离散化后计算宏平均 F1。

In [None]:
def compute_metrics(eval_pred):
    """Compute regression metrics plus a macro-F1 on the binned scores.

    Args:
        eval_pred: Pair ``(predictions, labels)`` as passed by ``Trainer``;
            both are flattened to 1-D before scoring.

    Returns:
        Dict with ``mae``, ``rmse``, ``pearson``, ``spearman`` and
        ``macro_f1_from_regression`` as plain Python floats.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    mae = mean_absolute_error(labels, predictions)
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in later
    # releases; taking the square root of the MSE works on every version.
    rmse = np.sqrt(mean_squared_error(labels, predictions))
    # Index the result instead of reading `.statistic`, which only exists on
    # newer SciPy result objects.
    pearson = stats.pearsonr(labels, predictions)[0]
    spearman = stats.spearmanr(labels, predictions)[0]

    # Discretise both the targets and the predictions with the same bin edges
    # so the regression output can be scored as a classification.
    true_bins = np.digitize(labels, LABEL_THRESHOLDS)
    pred_bins = np.digitize(predictions, LABEL_THRESHOLDS)
    macro_f1 = f1_score(true_bins, pred_bins, average='macro')

    # Cast to built-in floats so the metrics stay JSON-serialisable.
    return {
        'mae': float(mae),
        'rmse': float(rmse),
        'pearson': float(pearson),
        'spearman': float(spearman),
        'macro_f1_from_regression': float(macro_f1),
    }

## 5. Training & Evaluation

In [None]:
def run_training():
    """Fine-tune the checkpoint as a single-output regressor and evaluate it.

    Reads the notebook-level configuration constants together with
    ``model_name``, ``tokenizer``, ``tokenized_datasets``, ``data_collator``
    and ``compute_metrics``. It can be called directly or handed to
    accelerate's ``notebook_launcher``.

    Side effects:
        * checkpoints are written under ``./distilbert_regression_outputs``;
        * the best model (lowest ``eval_rmse``) is saved to
          ``distilbert_regression_best``;
        * validation metrics are written to ``eval_metrics.pt`` inside the
          output directory, by the rank-zero process only.
    """
    import inspect  # local import: only needed for the version checks below

    output_dir = './distilbert_regression_outputs'

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=1,  # one output -> regression head
    )

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; pass whichever name the installed version accepts.
    args_params = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_key = (
        'eval_strategy' if 'eval_strategy' in args_params else 'evaluation_strategy'
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='eval_rmse',
        greater_is_better=False,  # lower RMSE is better
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        learning_rate=LEARNING_RATE,
        warmup_ratio=WARMUP_RATIO,
        weight_decay=WEIGHT_DECAY,
        logging_steps=100,
        fp16=USE_FP16,
        report_to='none',
        ddp_find_unused_parameters=False,
        **{eval_strategy_key: 'epoch'},
    )

    # Likewise, `Trainer(tokenizer=...)` was superseded by `processing_class`.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = (
        'processing_class' if 'processing_class' in trainer_params else 'tokenizer'
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_key: tokenizer},
    )

    trainer.train()
    # With load_best_model_at_end=True the trainer holds the best checkpoint
    # at this point, so this saves the best model rather than the last one.
    trainer.save_model('distilbert_regression_best')

    eval_metrics = trainer.evaluate(tokenized_datasets['validation'])
    print('Validation metrics:', eval_metrics)

    if 'test' in tokenized_datasets:
        test_metrics = trainer.evaluate(tokenized_datasets['test'], metric_key_prefix='test')
        print('Test metrics:', test_metrics)

    # Only one process may write the metrics file; otherwise every worker
    # would write the same path at the same time.
    if trainer.is_world_process_zero():
        metrics_dir = Path(output_dir)
        metrics_dir.mkdir(parents=True, exist_ok=True)
        torch.save(eval_metrics, metrics_dir / 'eval_metrics.pt')

In [None]:
# Single-process runs call run_training() in the notebook process; otherwise
# the function is handed to accelerate's notebook_launcher with
# NUM_PROCESSES processes.
if NUM_PROCESSES <= 1:
    run_training()
else:
    from accelerate import notebook_launcher

    notebook_launcher(run_training, num_processes=NUM_PROCESSES)

## 6. Save Metrics & Predictions

In [None]:
from pathlib import Path
import torch
import numpy as np
from transformers import Trainer

import inspect
import json

# Same directory the training step uses as its output directory.
save_dir = Path('./distilbert_regression_outputs')
save_dir.mkdir(parents=True, exist_ok=True)

# A missing metrics file is tolerated: fall back to an empty dict and skip
# the JSON export.
try:
    eval_metrics = torch.load(save_dir / 'eval_metrics.pt')
except FileNotFoundError:
    eval_metrics = {}

if eval_metrics:
    with open(save_dir / 'eval_metrics.json', 'w') as f:
        json.dump(eval_metrics, f, indent=2)

if 'validation' in tokenized_datasets:
    # Reload the saved best model so predictions come from the persisted
    # weights rather than from in-memory training state.
    model = AutoModelForSequenceClassification.from_pretrained('distilbert_regression_best', num_labels=1)

    # `Trainer(tokenizer=...)` was superseded by `processing_class` in newer
    # transformers releases; pass whichever name this version accepts.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_key = (
        'processing_class' if 'processing_class' in trainer_params else 'tokenizer'
    )
    trainer = Trainer(
        model=model,
        data_collator=data_collator,
        **{tokenizer_key: tokenizer},
    )
    predictions = trainer.predict(tokenized_datasets['validation'])

    # Two columns, one row per example: flattened prediction, then label.
    pred_and_label = np.stack(
        [predictions.predictions.reshape(-1), predictions.label_ids],
        axis=1,
    )
    np.savetxt(
        save_dir / 'validation_predictions.csv',
        pred_and_label,
        delimiter=',',
        header='pred,label',
        comments='',  # keep the header free of the default '# ' prefix
    )
    print('Saved validation predictions.')