In [None]:
# Cella 1: import, embedding statico + PCA
import numpy as np
import pandas as pd
import random
import torch
import joblib
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# Riproducibilità
SEED = 99
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# 1) Carica train/test
train_df = pd.read_csv("ml_data/train.csv")   # colonne: 'Plot','MyRating'
test_df  = pd.read_csv("ml_data/test.csv")    # colonna: 'Plot'
y_train  = train_df["MyRating"].values.astype(np.float32)

# 2) Embed con all-MiniLM-L6-v2 (384-d)
device  = "cuda" if torch.cuda.is_available() else "cpu"
st      = SentenceTransformer("all-MiniLM-L6-v2", device=device)
all_plots = train_df["Plot"].tolist() + test_df["Plot"].tolist()
emb_all   = st.encode(all_plots, batch_size=64, show_progress_bar=True)  # (1604,384)

emb_train = emb_all[: len(train_df)]
emb_test  = emb_all[len(train_df):]

np.save("ml_data/X_train_emb.npy", emb_train)   # (1404,384)
np.save("ml_data/X_test_emb.npy",  emb_test)    # ( 200,384)
np.save("ml_data/y_train.npy",     y_train)

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

In [20]:
# Cella 2: OOF CV con XGBoost puro (xgb.train su DMatrix)
import random
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
import warnings

# Riproducibilità
SEED = 99
random.seed(SEED)
np.random.seed(SEED)
warnings.filterwarnings("ignore", message=".*gpu_hist.*")

# 1) Carica raw embeddings e target
X = np.load("ml_data/X_train_emb.npy")   # (1404,384)
y = np.load("ml_data/y_train.npy")       # {1.0,1.5,…,5.0}

print(f"DEBUG: X.shape = {X.shape}, y.range = [{y.min()}, {y.max()}]")

# 2) Parametri XGBoost per xgb.train
params = {
    "objective":        "reg:squarederror",
    "eval_metric":      "rmse",
    "learning_rate":    0.05,
    "max_depth":        10,
    "subsample":        0.8,
    "colsample_bytree": 0.8,
    "reg_lambda":       1.0,
    "seed":             SEED,
    "tree_method":      "hist"
}

# 3) OOF 5-fold CV
kf   = KFold(n_splits=5, shuffle=True, random_state=SEED)
oof  = np.zeros_like(y, dtype=np.float32)

for fold, (tr_idx, val_idx) in enumerate(kf.split(X), 1):
    # split
    X_tr, X_val = X[tr_idx], X[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    # crea DMatrix
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval   = xgb.DMatrix(X_val, label=y_val)

    # training con early stopping
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dval, "val")],
        early_stopping_rounds=40,
        verbose_eval=False
    )

    # predizione e metrica
    pred_val    = bst.predict(dval)
    oof[val_idx] = pred_val

    rmse = np.sqrt(mean_squared_error(y_val, pred_val))
    r2   = r2_score(y_val, pred_val)
    print(f"Fold {fold}: RMSE = {rmse:.4f}, R2 = {r2:.4f}")

# 4) Metriche complessive
rmse_all = np.sqrt(mean_squared_error(y, oof))
r2_all   = r2_score(y, oof)
print(f"\nOOF complessivo: RMSE = {rmse_all:.4f}, R2 = {r2_all:.4f}")

# 5) Salva OOF scores
np.save("ml_data/plot_score_train.npy", oof)
print("✔ Salvato ml_data/plot_score_train.npy")

DEBUG: X.shape = (1404, 384), y.range = [1.0, 5.0]
Fold 1: RMSE = 0.7449, R2 = 0.0527
Fold 2: RMSE = 0.7177, R2 = 0.0568
Fold 3: RMSE = 0.7378, R2 = 0.0248
Fold 4: RMSE = 0.7046, R2 = 0.0846
Fold 5: RMSE = 0.7837, R2 = 0.0431

OOF complessivo: RMSE = 0.7382, R2 = 0.0547
✔ Salvato ml_data/plot_score_train.npy


In [26]:
# Cella 3: TOP-10 e FLOP-10 sul train set (OOF)
import numpy as np
import pandas as pd

# Carica i dati
train_df          = pd.read_csv("ml_data/train.csv")
plot_score_train  = np.load("ml_data/plot_score_train.npy")

# Aggiungi PlotScore
train_df["PlotScore"] = plot_score_train

# Scegli colonne da mostrare
cols = []
if "Title" in train_df.columns:
    cols.append("Title")
cols += ["MyRating", "PlotScore"]

In [39]:
train_df.sort_values(by='PlotScore', ascending=False).head(20)[cols]

Unnamed: 0,Title,MyRating,PlotScore
1033,Death on the Nile,3.0,4.076135
307,Falling Down,3.5,4.038161
132,The Village,4.0,4.003412
590,Loro 1,3.5,3.972655
77,Sleepers,3.0,3.969744
645,The Prince of Egypt,3.5,3.952287
339,Gosford Park,3.0,3.934301
266,The Brutalist,4.5,3.902888
594,Schindler's List,4.5,3.889819
563,Closer,3.5,3.883369


In [40]:
train_df.sort_values(by='PlotScore', ascending=True).head(20)[cols]

Unnamed: 0,Title,MyRating,PlotScore
65,After We Fell,1.0,2.186786
1266,After,1.0,2.191551
341,After We Collided,1.0,2.439617
495,Fantastic Four,3.0,2.679593
887,Sense8: Amor Vincit Omnia,3.5,2.730164
1136,Blade II,2.5,2.731323
363,The Matrix Reloaded,3.0,2.744051
442,After Ever Happy,1.0,2.766704
424,Rogue One: A Star Wars Story,4.0,2.769648
117,The Old Guard,2.5,2.777059
