In [38]:
# =========  SETUP  (import + tokenizzazione + dataset)  =========
import random, numpy as np, pandas as pd, torch
from torch.utils.data import Dataset as TorchDataset
from datasets import Dataset as HfDataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.model_selection import KFold

# ── Config globali
DEVICE        = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME    = "distilbert-base-uncased"
MAX_LEN       = 256
LR            = 1e-5
EPOCHS        = 12
BATCH_SIZE    = 16
WARMUP_RATIO  = 0.10
DROPOUT       = 0.30         # hidden + classifier dropout
N_SPLITS      = 5
SEED          = 42
print("Using device:", DEVICE)

# ── Reproducibilità
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

# ── Carica train.csv (colonne: Plot, MyRating)
data = pd.read_csv("train.csv")
print("Train shape:", data.shape)

# ── Tokenizzazione unica con 🤗 datasets
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(
        batch["Plot"],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN
    )

hf_ds = HfDataset.from_pandas(
    data[["Plot", "MyRating"]].rename(columns={"MyRating": "labels"}),
    preserve_index=False
).map(tokenize_fn, batched=True).remove_columns("Plot")

# ── Tensori torch
input_ids      = torch.tensor(hf_ds["input_ids"])
attention_mask = torch.tensor(hf_ds["attention_mask"])
labels_raw     = torch.tensor(hf_ds["labels"], dtype=torch.float32)

# ── Centra i target (toglie la media personale)
MU       = labels_raw.mean()          # ≈ 3.43
labels   = labels_raw - MU            # y' = y − μ

# ── Dataset PyTorch minimale
class PlotRegDataset(TorchDataset):
    def __init__(self, ids, mask, y):  self.ids, self.mask, self.y = ids, mask, y
    def __len__(self):                 return len(self.y)
    def __getitem__(self, i):
        return {"input_ids": self.ids[i],
                "attention_mask": self.mask[i],
                "labels": self.y[i]}

# ── KFold ready (useremo solo il primo split)
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
print("Setup OK — pronto per il training")

Using device: cuda
Train shape: (1404, 12)


Map:   0%|          | 0/1404 [00:00<?, ? examples/s]

Setup OK — pronto per il training


In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# ---- split primo fold
train_idx, val_idx = next(iter(kf.split(input_ids)))
train_ds = PlotRegDataset(input_ids[train_idx], attention_mask[train_idx], labels[train_idx])
val_ds   = PlotRegDataset(input_ids[val_idx],   attention_mask[val_idx],   labels[val_idx])

# ---- modello + unfreeze ultimi 3 layer
config = AutoConfig.from_pretrained(
    MODEL_NAME, num_labels=1, problem_type="regression",
    hidden_dropout_prob=DROPOUT, classifier_dropout=DROPOUT
)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config=config).to(DEVICE)
unfreeze_last_n(model, 3)

# ---- metrics
def compute_metrics(eval_pred):
    preds, labs = eval_pred
    preds = preds.flatten()
    return {
        "rmse": np.sqrt(mean_squared_error(labs, preds)),
        "r2":   r2_score(labs, preds)
    }

# ---- TrainingArguments usa r2 come best-metric
args = TrainingArguments(
    output_dir="tmp_fold1",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR, weight_decay=0.01,
    num_train_epochs=EPOCHS, warmup_ratio=WARMUP_RATIO,
    eval_strategy="epoch", save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="r2",      # <─ cambio
    greater_is_better=True,          # <─ perché r2 va massimizzato
    save_total_limit=1,
    logging_strategy="steps", logging_steps=50,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# ---- predizioni
pred_std   = trainer.predict(val_ds).predictions.flatten()
pred_final = np.clip(pred_std + MU.item(), 0.5, 5.0)
y_true     = data.iloc[val_idx]["MyRating"].values

rmse = np.sqrt(mean_squared_error(y_true, pred_final))
r2   = r2_score(y_true, pred_final)
print(f"\nFold-1 RMSE={rmse:.4f}   |   R²={r2:.3f}")

# ---- TOP-5 e BOTTOM-5
val_df = pd.DataFrame({
    "Title": data.iloc[val_idx]["Title"].values,
    "True":  y_true,
    "Pred":  pred_final
})

print("\n🎬  TOP-5 (Pred highest)")
display(val_df.sort_values("Pred", ascending=False).head(5))

print("\n🎬  BOTTOM-5 (Pred lowest)")
display(val_df.sort_values("Pred").head(5))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Rmse,R2
1,0.5852,0.541162,0.735637,0.028803
2,0.5862,0.519324,0.720642,0.067995
3,0.5471,0.50356,0.709619,0.096287
4,0.4557,0.492755,0.701965,0.115678
