In [None]:
# Exploratory Analysis: Tennis Match Length Prediction
#
# The analysis explores:
# - Pre-match features such as ranking differentials, surface, and match format
# - Model selection and validation for classification and regression tasks
# - Error distributions and stability across different match contexts
#
# This notebook is intended for analytical exploration and model validation rather than
# deployment or optimization against external benchmarks.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os, re
import pandas as pd
import numpy as np

# Root folder of the match database inside the Kaggle input mount.
BASE = "/kaggle/input/tml-database-master/TML-Database-master"

# Seasons to load: 2015–2024 inclusive (adjust the bounds to widen or narrow the window).
YEARS = [*range(2015, 2025)]


In [None]:
# Keep only the per-season CSVs that actually exist under BASE, in YEARS order.
candidate_paths = (f"{BASE}/{y}.csv" for y in YEARS)
files = [path for path in candidate_paths if os.path.exists(path)]
print(f"{len(files)} year files found")
files[:5]


In [None]:
# Load every season file and tag its rows with the season parsed from the file name.
# Fail early with a clear message instead of pandas' "No objects to concatenate"
# when the dataset path is wrong or no season file matched.
if not files:
    raise FileNotFoundError(
        "No season CSV files were found (`files` is empty); "
        "check the dataset path and the configured years."
    )

df_list = []
for fp in files:
    season_match = re.search(r"(\d{4})\.csv$", fp)
    if season_match is None:
        raise ValueError(f"Cannot parse a season year from file name: {fp!r}")
    season_df = pd.read_csv(fp)
    season_df["season"] = int(season_match.group(1))
    df_list.append(season_df)

# One frame for all seasons, in the order the files were listed.
raw = pd.concat(df_list, ignore_index=True)
raw.shape


In [None]:
# Accepted source column spelling -> canonical column name used by the rest of the notebook.
# "date" is an alias for "tourney_date".
colmap = {
    "winner_name":"winner_name","loser_name":"loser_name",
    "winner_rank":"winner_rank","loser_rank":"loser_rank",
    "winner_ht":"winner_ht","loser_ht":"loser_ht",
    "winner_hand":"winner_hand","loser_hand":"loser_hand",
    "surface":"surface","round":"round","best_of":"best_of",
    "tourney_name":"tourney_name","tourney_date":"tourney_date","date":"tourney_date",
    "score":"score"
}

# Resolve each alias against the columns actually present in `raw`:
# exact match first, then a case-insensitive match.
_lower_to_actual = {}
for c in raw.columns:
    _lower_to_actual.setdefault(str(c).lower(), c)

rename_map = {}  # actual column in `raw` -> canonical name
for alias, canonical in colmap.items():
    if canonical in rename_map.values():
        continue  # already resolved by an earlier alias; avoids duplicate columns
    actual = alias if alias in raw.columns else _lower_to_actual.get(alias.lower())
    if actual is not None and actual not in rename_map:
        rename_map[actual] = canonical

# Select each source column once and rename it to its canonical name so that
# later cells can rely on e.g. df["winner_name"] regardless of source casing.
use_cols = list(rename_map)
df = raw[use_cols + ["season"]].rename(columns=rename_map).copy()

# Drop walkovers/retirements if score exists
if "score" in df.columns:
    bad = ["W/O","WO","RET","DEF","ABD"]
    df = df[~df["score"].astype(str).str.contains("|".join(bad), case=False, na=False)]

# Ensure rank columns exist and are numeric; missing or unparseable ranks
# are replaced by the placeholder rank 1000.
for c in ["winner_rank","loser_rank"]:
    if c not in df.columns: df[c] = np.nan
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(1000)

df.shape


In [None]:
def build_training_frame(df):
    """Stack every match twice, once from each player's point of view.

    The first half of the result lists the winner as ``p1`` with ``y = 1``;
    the second half lists the loser as ``p1`` with ``y = 0``. Match order is
    the same in both halves and the result has a fresh 0..N-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``winner_name``, ``loser_name``, ``winner_rank``,
        ``loser_rank`` and ``season``. ``surface``, ``best_of`` and ``round``
        are optional and default to "Hard", 3 and "R32".

    Returns
    -------
    pandas.DataFrame
        Columns ``p1, p2, p1_rank, p2_rank, surface, best_of, round, season,
        y, rank_diff`` where ``rank_diff = p1_rank - p2_rank``.
    """
    # Context columns are identical for both orientations of a match.
    context = {
        "surface": df.get("surface","Hard"),
        "best_of": df.get("best_of",3),
        "round": df.get("round","R32"),
        "season": df["season"],
    }

    def _oriented(name_1, name_2, rank_1, rank_2, label):
        # One row per match with the given player listed first.
        return pd.DataFrame({
            "p1": df[name_1], "p2": df[name_2],
            "p1_rank": df[rank_1], "p2_rank": df[rank_2],
            **context,
            "y": label,
        })

    stacked = pd.concat(
        [
            _oriented("winner_name", "loser_name", "winner_rank", "loser_rank", 1),
            _oriented("loser_name", "winner_name", "loser_rank", "winner_rank", 0),
        ],
        ignore_index=True,
    )

    # Derive the rank differential and fill gaps in the context columns.
    return stacked.assign(
        rank_diff=lambda f: f["p1_rank"] - f["p2_rank"],
        best_of=lambda f: pd.to_numeric(f["best_of"], errors="coerce").fillna(3),
        surface=lambda f: f["surface"].fillna("Hard").replace("", "Hard"),
        round=lambda f: f["round"].fillna("R32"),
    )

# Two rows per match (one per player perspective), then the model matrix:
# numeric features followed by one indicator column per surface.
train_df = build_training_frame(df)

surface_dummies = pd.get_dummies(train_df["surface"], prefix="surf")
numeric_features = train_df[["rank_diff","best_of"]].copy()
X = pd.concat([numeric_features, surface_dummies], axis=1)
y = train_df["y"].astype(int)

train_df.shape, X.shape


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

# Temporal split: fit on seasons before `cut`, validate on `cut` and later.
cut = 2023  # train <= 2022, validate on 2023–2024
season = train_df["season"]
idx_train = season.le(cut - 1)
idx_valid = season.ge(cut)

X_train, X_valid = X[idx_train], X[idx_valid]
y_train, y_valid = y[idx_train], y[idx_valid]

clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of class 1 (p1 is the match winner).
pred_valid = clf.predict_proba(X_valid)[:, 1]
hard_calls = (pred_valid >= 0.5).astype(int)

acc = accuracy_score(y_valid, hard_calls)
ll = log_loss(y_valid, pred_valid)
auc = roc_auc_score(y_valid, pred_valid)
acc, ll, auc


In [None]:
import joblib
# Persist the fitted classifier and the feature-column order it was trained on.
joblib.dump(clf, "/kaggle/working/model_logreg.joblib")
X.columns.to_series().to_csv("/kaggle/working/feature_columns.csv", index=False)

# Latest known rank per player.
# train_df holds all winner-perspective rows first and all loser-perspective
# rows second (same match order in both halves), so taking "last" on the raw
# row order would return the rank at a player's last *loss*, not their last
# match. Restore match order first: row i and row i + n_matches are the same match.
# NOTE(review): assumes matches inside each season file are listed chronologically — confirm.
_n_matches = len(train_df) // 2
_match_pos = np.arange(len(train_df)) % max(_n_matches, 1)
latest = (
    train_df.assign(_match_pos=_match_pos)
    .sort_values("_match_pos", kind="stable")
    .groupby("p1", as_index=False)
    .agg({"p1_rank":"last"})
)
latest.columns = ["player","rank"]

# Feature layout expected by the model, reused when building single prediction rows.
SURF_COLS = [c for c in X.columns if c.startswith("surf_")]
FEAT_COLS = X.columns.tolist()

def make_feature_row(p1, p2, surface="Hard", best_of=3):
    """Build a one-row feature frame for a hypothetical p1-vs-p2 match.

    Ranks are looked up in the module-level ``latest`` table; a player who is
    not listed there is given rank 1000. The surface indicator matching
    ``surf_<surface>`` is set to 1 and every other surface indicator to 0.
    Columns are returned in ``FEAT_COLS`` order.
    """
    def _rank_of(name):
        # First stored rank for this player, or the placeholder when unknown.
        hits = latest.loc[latest["player"] == name, "rank"]
        return hits.iloc[0] if len(hits) else 1000

    first_rank = _rank_of(p1)
    second_rank = _rank_of(p2)
    wanted_surface = f"surf_{surface}"

    features = {"rank_diff": first_rank - second_rank, "best_of": best_of}
    features.update({col: int(col == wanted_surface) for col in SURF_COLS})
    return pd.DataFrame([features])[FEAT_COLS]

# Example:
# x = make_feature_row("Novak Djokovic","Carlos Alcaraz","Hard",3)
# float(clf.predict_proba(x)[0, 1])  # probability that the first player wins


In [None]:
# 🧠 Quick in-notebook predictor (no Streamlit yet)

# Example: change players/surface to test
p1 = "Novak Djokovic"
p2 = "Carlos Alcaraz"
surface = "Hard"

x = make_feature_row(p1, p2, surface)
# Pick the single row's class-1 probability explicitly: calling float() on a
# 1-element array is deprecated since NumPy 1.25.
prob = float(clf.predict_proba(x)[0, 1])
print(f"{p1} win probability vs {p2} on {surface}: {prob:.1%}")


In [None]:
import pandas as pd

# Create a unique list of players with their latest known rank.
# train_df holds all winner-perspective rows first and all loser-perspective
# rows second (same match order in both halves), so keep="last" on the raw row
# order would pick the rank at a player's last *loss*. Sort by match position
# first: row i and row i + n_matches describe the same match.
# NOTE(review): assumes matches inside each season file are listed chronologically — confirm.
_n_matches = len(train_df) // 2
_match_pos = np.arange(len(train_df)) % max(_n_matches, 1)

players = (
    train_df[["p1", "p1_rank"]]
      .assign(_match_pos=_match_pos)
      .dropna()
      .sort_values("_match_pos", kind="stable")
      .drop_duplicates("p1", keep="last")
      .drop(columns="_match_pos")
      .rename(columns={"p1": "name", "p1_rank": "rank"})
      .sort_values("name")
      .reset_index(drop=True)
)

# Save it as players.csv in your output folder
players.to_csv("/kaggle/working/players.csv", index=False)

players.head()
