# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

# --- Load data
# The 'date' column is parsed while reading; the split below filters on it.
matches_with_features = pd.read_csv(
    filepath_or_buffer='tennis_matches_enriched.csv',
    parse_dates=['date'],
)

# --- Predictors
# Column names fed to the model, in the exact order used to build X.
# The order matters: categorical column indices are derived from it.
predictors = [
    # player identifiers
    "playerCode",
    "opponentCode",
    # rank and points columns
    "Rank_1",
    "Rank_2",
    "Pts_1",
    "Pts_2",
    # head-to-head columns
    "h2h_matches",
    "h2h_win_pct",
    "h2h_surface_matches",
    "h2h_surface_win_pct",
    # recent-form columns, player 1
    "recent_matches_p1",
    "recent_win_pct_p1",
    "recent_avg_opp_rank_p1",
    # recent-form columns, player 2
    "recent_matches_p2",
    "recent_win_pct_p2",
    "recent_avg_opp_rank_p2",
    # per-surface win percentage columns
    "win_pct_surface_p1",
    "win_pct_surface_p2",
    # match context columns
    "series_level",
    "round_num",
    "best_of",
    # implied probability columns
    "implied_prob_p1",
    "implied_prob_p2",
]

# --- Split data by match date
# Series.between is inclusive on both ends, i.e. (date >= left) & (date <= right).
# The slices are copied so later column assignments do not touch the source frame.
train = matches_with_features.loc[
    matches_with_features['date'].between('2000-01-03', '2022-12-31')
].copy()

test = matches_with_features.loc[
    matches_with_features['date'].between('2023-01-01', '2024-06-29')
].copy()

# --- Categorical variables
cat_features = [
    "playerCode",
    "opponentCode",
    "series_level",
    "round_num",
    "best_of",
]

# --- Label encoding
# Each encoder is fitted on the train and test values together (as strings),
# so every value present in either split has an integer code.
for col in cat_features:
    le = LabelEncoder().fit(pd.concat([train[col], test[col]]).astype(str))
    for frame in (train, test):
        # Replace the raw column with its integer codes, in place.
        frame[col] = le.transform(frame[col].astype(str))

# --- Convert to numpy arrays
# Feature matrix (columns in `predictors` order) and 'target' vector per split.
X_train, y_train = train[predictors].values, train['target'].values
X_test, y_test = test[predictors].values, test['target'].values

# --- Categorical column indices and cardinalities
# Position of every categorical feature inside the predictor list.
cat_idxs = list(map(predictors.index, cat_features))

# Size per categorical feature: largest code found in either split, plus one.
cat_dims = [
    int(max(train[name].max(), test[name].max())) + 1
    for name in cat_features
]

# --- Build the TabNet model
clf = TabNetClassifier(
    # categorical columns: positions, cardinalities, embedding size
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=8,
    # optimizer: Adam with the learning rate given below
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 2e-2},
    # learning-rate schedule: StepLR with the step size and gamma given below
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size":10, "gamma":0.9},
    # logging verbosity and random seed
    verbose=10,
    seed=42,
)

# --- Training
# pytorch-tabnet's early stopping (patience) monitors the last eval_set, so
# that set must not be the test data: otherwise the stopping epoch is chosen
# on the very rows the final accuracy is reported on, and that accuracy is
# optimistically biased. Hold out the most recent slice of the training
# period as a validation set instead.
valid_fraction = 0.1  # share of training rows (latest by date) used for validation

# Rows of X_train / y_train are positionally aligned with `train`, so an
# argsort over its dates gives chronological positions into both arrays.
chrono_order = np.argsort(train['date'].to_numpy(), kind='stable')
n_valid = max(1, int(len(chrono_order) * valid_fraction))
fit_idx, valid_idx = chrono_order[:-n_valid], chrono_order[-n_valid:]

clf.fit(
    X_train[fit_idx], y_train[fit_idx],
    eval_set=[(X_train[valid_idx], y_train[valid_idx])],
    eval_name=['valid'],
    eval_metric=['accuracy'],
    patience=20,
    max_epochs=200,
    batch_size=1024,
    virtual_batch_size=128
)

# --- Prediction and results
# Predict the test rows and report plain accuracy against the true targets.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\n TabNet Accuracy: {acc:.4f}")
