# ML experimentation Notebook

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import mlflow
from mlflow.models import infer_signature

In [2]:
pd.set_option("display.max_rows", None)

In [3]:
file_path = "../data/ml_dataset.pkl"
df = pd.read_pickle(file_path)
print(f"DataFrame loaded with {len(df)} rows and {len(df.columns)} columns.")

DataFrame loaded with 98355 rows and 152 columns.


In [4]:
df.dtypes

tourney_id                                    object
tourney_date                          datetime64[ns]
match_num                                      int64
player_1                                       int64
player_2                                       int64
winner                                         int64
tourney_name                                  object
surface                                       object
draw_size                                    float64
tourney_level                                 object
tourney_year                                   int32
player_id                                      int64
player_seed                                  float64
player_entry                                  object
player_name                                   object
player_hand                                   object
player_ht                                    float64
player_ioc                                    object
player_age                                   f

In [5]:
# Get all columns with dtype 'object'
cat_columns = df.select_dtypes(include="object").columns.tolist()
print(f"Categorical columns: {cat_columns}")

Categorical columns: ['tourney_id', 'tourney_name', 'surface', 'tourney_level', 'player_entry', 'player_name', 'player_hand', 'player_ioc', 'opponent_entry', 'opponent_name', 'opponent_hand', 'opponent_ioc', 'score', 'round', 'tourney_name_p2', 'surface_p2', 'tourney_level_p2', 'player_entry_p2', 'player_name_p2', 'player_hand_p2', 'player_ioc_p2', 'opponent_entry_p2', 'opponent_name_p2', 'opponent_hand_p2', 'opponent_ioc_p2', 'score_p2', 'round_p2', 'tourney_name_t', 'surface_t', 'tourney_level_t']


In [6]:
# Get all columns with dtype 'datetime64[ns]'
date_columns = df.select_dtypes(include="datetime64[ns]").columns.tolist()
print(f"Datetime columns: {date_columns}")

Datetime columns: ['tourney_date', 'tourney_date_t']


In [7]:
features = [
    col
    for col in df.columns
    if col not in cat_columns
    and col not in date_columns
    and "_id" not in col
    and "p_" not in col
    and "o_" not in col
    and col != "winner"
    and "results" not in col
]
len(features)

39

In [8]:
features

['match_num',
 'player_1',
 'player_2',
 'draw_size',
 'tourney_year',
 'player_seed',
 'player_ht',
 'player_age',
 'player_rank',
 'player_rank_points',
 'opponent_seed',
 'opponent_ht',
 'opponent_age',
 'opponent_rank',
 'opponent_rank_points',
 'best_of',
 'minutes',
 'opponent_rank_points_mean_last5',
 'minutes_mean_last5',
 'minutes_sum_last5',
 'draw_size_p2',
 'tourney_year_p2',
 'player_seed_p2',
 'player_ht_p2',
 'player_age_p2',
 'player_rank_p2',
 'player_rank_points_p2',
 'opponent_seed_p2',
 'opponent_ht_p2',
 'opponent_age_p2',
 'opponent_rank_p2',
 'opponent_rank_points_p2',
 'best_of_p2',
 'minutes_p2',
 'opponent_rank_points_mean_last5_p2',
 'minutes_mean_last5_p2',
 'minutes_sum_last5_p2',
 'draw_size_t',
 'tourney_year_t']

In [9]:
target_column = "winner"
seed = 42

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target_column], test_size=0.2, random_state=seed
)

In [11]:
print((y_train.value_counts() / len(y_train)) * 100)
print((y_test.value_counts() / len(y_test)) * 100)

winner
0    50.247827
1    49.752173
Name: count, dtype: float64
winner
0    50.170301
1    49.829699
Name: count, dtype: float64


In [12]:
mlflow.set_tracking_uri("/Users/fraimondi/Desktop/repos/tennis-match-predictor/mlruns/")
tracking_uri = mlflow.get_tracking_uri()
print(f"Current tracking uri: {tracking_uri}")
mlflow.set_experiment("ml_experimentation")

2025/08/10 11:41:51 INFO mlflow.tracking.fluent: Experiment with name 'ml_experimentation' does not exist. Creating a new experiment.


Current tracking uri: /Users/fraimondi/Desktop/repos/tennis-match-predictor/mlruns/


<Experiment: artifact_location='/Users/fraimondi/Desktop/repos/tennis-match-predictor/mlruns/511814337024080562', creation_time=1754818911959, experiment_id='511814337024080562', last_update_time=1754818911959, lifecycle_stage='active', name='ml_experimentation', tags={}>

In [13]:
params = {
    "max_depth": 5,
    "learning_rate": 0.01,
    "n_estimators": 1000,
    "colsample_bytree": 0.5,
}

xgb_clf = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    enable_categorical=True,
    seed=seed,
    **params,
)

In [20]:
with mlflow.start_run() as run:
    xgb_clf_model = xgb_clf.fit(X_train, y_train)
    y_pred_train = xgb_clf_model.predict(X_train)
    y_pred = xgb_clf_model.predict(X_test)

    print(f"Train Accuracy: {accuracy_score(y_train, y_pred_train):.4f}")
    print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")

    mlflow.log_params(params)
    mlflow.log_metrics(
        {
            "train_accuracy": accuracy_score(y_train, y_pred_train),
            "test_accuracy": accuracy_score(y_test, y_pred),
        }
    )
    # Infer the signature from training data and predictions
    signature = infer_signature(X_train.head(5), xgb_clf_model.predict(X_train.head(5)))
    # Log the model with the signature
    mlflow.xgboost.log_model(xgb_clf_model, name="model", signature=signature)

    print(f"Run ID: {run.info.run_id}")

Train Accuracy: 0.6700
Test Accuracy: 0.6547


  self.get_booster().save_model(fname)


Run ID: ec99daef8ba448db896c87d75b42ce11


to see the results, activate the server: `uv run mlflow server --host 127.0.0.1 --port 8080`

In [22]:
mv = mlflow.register_model(
    f"runs:/{run.info.run_id}/model",
    "tennis_predictor_model",
)

Registered model 'tennis_predictor_model' already exists. Creating a new version of this model...
Created version '2' of model 'tennis_predictor_model'.


In [24]:
client = mlflow.MlflowClient()

# create "champion" alias for version 1 of model "tennis_predictor_model"
client.set_registered_model_alias("tennis_predictor_model", "champion", mv.version)

In [None]:
xgb_loaded_model = mlflow.xgboost.load_model("models:/tennis_predictor_model@champion")

In [26]:
xgb_loaded_model.predict(X_test)

array([0, 0, 1, ..., 1, 0, 1], shape=(19671,))

In [29]:
# X_test.head(1).to_dict(orient="split")

serve the model:
```bash
mlflow models serve -m models:/tennis_predictor_model@champion -p 5000 --env-manager local
```

example of inference request
```bash
curl http://127.0.0.1:5000/invocations -H "Content-Type:application/json" --data '{"dataframe_split":{"index": [27711],"columns": ["match_num","player_1","player_2","draw_size","tourney_year","player_seed","player_ht","player_age","player_rank","player_rank_points","opponent_seed","opponent_ht","opponent_age","opponent_rank","opponent_rank_points","best_of","minutes","opponent_rank_points_mean_last5","minutes_mean_last5","minutes_sum_last5","draw_size_p2","tourney_year_p2","player_seed_p2","player_ht_p2","player_age_p2","player_rank_p2","player_rank_points_p2","opponent_seed_p2","opponent_ht_p2","opponent_age_p2","opponent_rank_p2","opponent_rank_points_p2","best_of_p2","minutes_p2","opponent_rank_points_mean_last5_p2","minutes_mean_last5_p2","minutes_sum_last5_p2","draw_size_t","tourney_year_t"],"data": [[46,102358,102998,128.0,1999,null,190.0,24.8,21.0,1608.0,null,190.0,21.6,38.0,983.0,5,120.0,718.2,71.0,355.0,128.0,1999,null,190.0,21.6,38.0,983.0,null,190.0,24.8,21.0,1608.0,5,120.0,2862.75,84.4,422.0,128.0,1999]]}}'
```