# ML experimentation Notebook

In [2]:
from pathlib import Path

import mlflow
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [3]:
# Show every row when displaying frames/series (no truncation of long outputs).
pd.options.display.max_rows = None

In [4]:
# Location of the pickled modelling table, relative to the notebook's working directory.
file_path = "../data/ml_dataset.pkl"
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle(filepath_or_buffer=file_path)
# Quick sanity check on the size of what was loaded.
print(f"DataFrame loaded with {len(df)} rows and {len(df.columns)} columns.")

DataFrame loaded with 98355 rows and 152 columns.


In [5]:
# Display the dtype of each column to see which ones are numeric, object or datetime.
df.dtypes

tourney_id                                    object
tourney_date                          datetime64[ns]
match_num                                      int64
player_1                                       int64
player_2                                       int64
winner                                         int64
tourney_name                                  object
surface                                       object
draw_size                                    float64
tourney_level                                 object
tourney_year                                   int32
player_id                                      int64
player_seed                                  float64
player_entry                                  object
player_name                                   object
player_hand                                   object
player_ht                                    float64
player_ioc                                    object
player_age                                   f

In [6]:
# Names of every column stored with the generic 'object' dtype; these are
# treated as categorical/text columns further down.
cat_columns = list(df.select_dtypes(include="object").columns)
print(f"Categorical columns: {cat_columns}")

Categorical columns: ['tourney_id', 'tourney_name', 'surface', 'tourney_level', 'player_entry', 'player_name', 'player_hand', 'player_ioc', 'opponent_entry', 'opponent_name', 'opponent_hand', 'opponent_ioc', 'score', 'round', 'tourney_name_p2', 'surface_p2', 'tourney_level_p2', 'player_entry_p2', 'player_name_p2', 'player_hand_p2', 'player_ioc_p2', 'opponent_entry_p2', 'opponent_name_p2', 'opponent_hand_p2', 'opponent_ioc_p2', 'score_p2', 'round_p2', 'tourney_name_t', 'surface_t', 'tourney_level_t']


In [7]:
# Names of every column stored as 'datetime64[ns]'.
date_columns = list(df.select_dtypes(include="datetime64[ns]").columns)
print(f"Datetime columns: {date_columns}")

Datetime columns: ['tourney_date', 'tourney_date_t']


In [8]:
# Build the model's feature list by exclusion.
# A column is dropped when its exact name is a categorical column, a datetime
# column or the target ("winner"), or when its name contains one of the
# substrings below.
# NOTE(review): 'minutes' passes this filter and ends up in the feature list;
# confirm it is information available before the match being predicted.
_excluded_names = {*cat_columns, *date_columns, "winner"}
_excluded_fragments = ("_id", "p_", "o_", "results")

features = [
    col
    for col in df.columns
    if col not in _excluded_names
    and not any(fragment in col for fragment in _excluded_fragments)
]
len(features)

39

In [9]:
# Display the selected feature names for a visual check of the filter result.
features

['match_num',
 'player_1',
 'player_2',
 'draw_size',
 'tourney_year',
 'player_seed',
 'player_ht',
 'player_age',
 'player_rank',
 'player_rank_points',
 'opponent_seed',
 'opponent_ht',
 'opponent_age',
 'opponent_rank',
 'opponent_rank_points',
 'best_of',
 'minutes',
 'opponent_rank_points_mean_last5',
 'minutes_mean_last5',
 'minutes_sum_last5',
 'draw_size_p2',
 'tourney_year_p2',
 'player_seed_p2',
 'player_ht_p2',
 'player_age_p2',
 'player_rank_p2',
 'player_rank_points_p2',
 'opponent_seed_p2',
 'opponent_ht_p2',
 'opponent_age_p2',
 'opponent_rank_p2',
 'opponent_rank_points_p2',
 'best_of_p2',
 'minutes_p2',
 'opponent_rank_points_mean_last5_p2',
 'minutes_mean_last5_p2',
 'minutes_sum_last5_p2',
 'draw_size_t',
 'tourney_year_t']

In [10]:
# Label column to predict, and the random seed shared by the split and the models.
target_column, seed = "winner", 42

In [11]:
# Random 80/20 split of the selected feature columns and the target;
# the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, features],
    df.loc[:, target_column],
    test_size=0.2,
    random_state=seed,
)

In [12]:
# Percentage of each target class, first in the training split and then in the test split.
for _split_target in (y_train, y_test):
    print(_split_target.value_counts().div(len(_split_target)).mul(100))

winner
0    50.247827
1    49.752173
Name: count, dtype: float64
winner
0    50.170301
1    49.829699
Name: count, dtype: float64


In [13]:
# Hyper-parameters passed to the XGBoost classifier.
params = dict(
    max_depth=5,
    learning_rate=0.01,
    n_estimators=1000,
    colsample_bytree=0.5,
)

In [14]:
# Binary XGBoost classifier scored with log-loss, configured from `params`.
# The seed is passed as `random_state`, the documented scikit-learn-API
# argument; `seed=` is not a declared estimator parameter and is only
# forwarded to the booster through **kwargs.
xgb_clf = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    enable_categorical=True,  # kept from the original configuration
    random_state=seed,
    **params,
)

In [15]:
# Fit on the training split, then predict labels for both splits so the
# train and test accuracy can be compared.
xgb_clf_model = xgb_clf.fit(X_train, y_train)
y_pred_train, y_pred = (
    xgb_clf_model.predict(split_features) for split_features in (X_train, X_test)
)

print(f"Train Accuracy: {accuracy_score(y_train, y_pred_train):.4f}")
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")


Train Accuracy: 0.6700
Test Accuracy: 0.6547


In [None]:
# Point MLflow at the local `mlruns` file store without hardcoding a
# machine-specific absolute path. The location is resolved relative to the
# notebook's working directory, like the `../data` path used to load the dataset.
# NOTE(review): assumes `mlruns/` sits next to `data/`, one level above the
# notebook; adjust if the repository layout differs.
MLRUNS_DIR = Path("../mlruns").resolve()
mlflow.set_tracking_uri(MLRUNS_DIR.as_uri())
tracking_uri = mlflow.get_tracking_uri()
print(f"Current tracking uri: {tracking_uri}")
# Select the experiment that subsequent runs are logged under.
mlflow.set_experiment("test")

2025/08/02 17:11:47 INFO mlflow.tracking.fluent: Experiment with name 'test' does not exist. Creating a new experiment.


Current tracking uri: /Users/fraimondi/Desktop/repos/tennis-match-predictor/mlruns/


<Experiment: artifact_location='/Users/fraimondi/Desktop/repos/tennis-match-predictor/mlruns/174896892551870229', creation_time=1754147507477, experiment_id='174896892551870229', last_update_time=1754147507477, lifecycle_stage='active', name='test', tags={}>

In [35]:
# Train a RandomForest baseline inside an MLflow run and log its
# hyper-parameters, accuracies and fitted model.
with mlflow.start_run() as run:
    # Dedicated `rf_` names: the original cell rebound the global `params`
    # (the XGBoost hyper-parameters) and the XGBoost predictions, so re-running
    # the XGBoost cell afterwards silently used the wrong settings.
    rf_params = {"max_depth": 5, "n_estimators": 1000}
    rf_clf = RandomForestClassifier(**rf_params, random_state=seed)
    rf_clf_model = rf_clf.fit(X_train, y_train)
    rf_pred_train = rf_clf_model.predict(X_train)
    rf_pred_test = rf_clf_model.predict(X_test)

    # Compute each metric once and reuse it for both printing and logging.
    rf_train_accuracy = accuracy_score(y_train, rf_pred_train)
    rf_test_accuracy = accuracy_score(y_test, rf_pred_test)

    print(f"Train Accuracy: {rf_train_accuracy:.4f}")
    print(f"Test Accuracy: {rf_test_accuracy:.4f}")

    mlflow.log_params(rf_params)
    mlflow.log_metrics(
        {
            "train_accuracy": rf_train_accuracy,
            "test_accuracy": rf_test_accuracy,
        }
    )
    mlflow.sklearn.log_model(name="rf_model", sk_model=rf_clf_model)

Train Accuracy: 0.6499
Test Accuracy: 0.6484




To see the results, start the MLflow tracking server with `uv run mlflow server --host 127.0.0.1 --port 8080` and open http://127.0.0.1:8080 in a browser.