In [None]:
from catboost import CatBoostClassifier
import pandas as pd
from sklearn.metrics import classification_report

# Load the enriched match table. `date` is parsed up front so the date
# filters below operate on the parsed column.
matches_with_features = pd.read_csv('tennis_matches_enriched.csv', parse_dates=['date'])

# Feature columns passed to the model
predictors = [
    "playerCode", "opponentCode",
    "Rank_1", "Rank_2", "Pts_1", "Pts_2",
    "h2h_matches", "h2h_win_pct", "h2h_surface_matches", "h2h_surface_win_pct",
    "recent_matches_p1", "recent_win_pct_p1", "recent_avg_opp_rank_p1",
    "recent_matches_p2", "recent_win_pct_p2", "recent_avg_opp_rank_p2",
    "win_pct_surface_p1", "win_pct_surface_p2",
    "series_level", "round_num", "best_of",
    "implied_prob_p1", "implied_prob_p2",
]

# Chronological split: every test match is later than every training match.
# .copy() gives each subset its own data, so the column assignments further
# down write to independent frames rather than to a selection of
# matches_with_features (which raises pandas' SettingWithCopyWarning).
train = matches_with_features[
    (matches_with_features['date'] >= '2000-01-03') &
    (matches_with_features['date'] <= '2022-12-31')
].copy()

test = matches_with_features[
    (matches_with_features['date'] >= '2023-01-01') &
    (matches_with_features['date'] <= '2024-06-29')
].copy()

# Columns CatBoost should treat as categorical rather than numeric
cat_features = ["playerCode", "opponentCode", "series_level", "round_num", "best_of"]

# Cast categorical columns to str in both subsets so they share one
# representation; CatBoost does not accept float values in categorical columns.
for col in cat_features:
    train[col] = train[col].astype(str)
    test[col] = test[col].astype(str)

# Hold out the most recent year of the training period as a validation set.
# Early stopping and CatBoost's best-iteration selection are both driven by
# eval_set, so passing the test set there would tune the model on the very
# rows it is scored on and inflate the reported accuracy.
valid_start = train["date"].max() - pd.DateOffset(years=1)
in_valid = train["date"] > valid_start
train_fit = train[~in_valid]
valid = train[in_valid]

if train_fit.empty or valid.empty:
    raise ValueError(
        "Training data must span more than one year to carve out a validation set"
    )

# Define and train CatBoost model with early stopping on the validation set
model_cat = CatBoostClassifier(
    iterations=8000,          # Upper bound; early stopping usually ends sooner
    depth=10,                 # Deep trees (CatBoost's default is 6)
    learning_rate=0.02,       # Low learning rate, paired with many iterations
    l2_leaf_reg=10,           # Stronger L2 regularization than the default of 3
    border_count=254,         # Number of split candidates for numeric features
    random_strength=1,        # Amount of randomness when scoring splits
    bagging_temperature=0.5,  # Bayesian bootstrap intensity (0 = no reweighting)
    loss_function='Logloss',  # The objective that is actually optimized
    eval_metric='Accuracy',   # Used only for early stopping / best iteration
    cat_features=cat_features,
    early_stopping_rounds=50, # Stop after 50 rounds without eval-metric gain
    verbose=200,
    random_state=42
)

model_cat.fit(
    train_fit[predictors], train_fit["target"],
    eval_set=(valid[predictors], valid["target"])
)

# Evaluate on the untouched test set
acc = model_cat.score(test[predictors], test['target'])
print(f"\nCatBoost Accuracy: {acc:.3f}")
