In [None]:
import pandas as pd 
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

In [None]:
import os
from pathlib import Path
import glob
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Folder holding the match CSVs. The original machine-specific location is
# kept as the default; set the TENNIS_DATASET_DIR environment variable to run
# the notebook on another machine without editing this cell.
path = os.environ.get(
    "TENNIS_DATASET_DIR",
    r"C:\Users\viaud\OneDrive\Desktop\Datasets\tennis_dataset",
)

# Find csv files. glob.glob() returns matches in arbitrary, file-system
# dependent order, so sort them: the row order of the combined frame (and
# therefore every seeded split made from it) is then reproducible.
files = sorted(glob.glob(os.path.join(path, "*.csv")))

print("Found", len(files), "CSV files in tennis_dataset")

# Read and combine CSVs safely
def read_and_concat(file_list):
    """Read every CSV in ``file_list`` and stack the results into one frame.

    Parameters
    ----------
    file_list : iterable of path-like, or None
        CSV files to load. A falsy value (``None``, ``[]``) means "no files".

    Returns
    -------
    pandas.DataFrame
        All successfully read files concatenated with a fresh 0..n-1 index,
        or an empty DataFrame when there was nothing to read or every read
        failed.

    Notes
    -----
    Loading is best-effort: a file that cannot be read is reported with a
    printed warning and skipped instead of aborting the whole load.
    """
    frames = []
    for csv_path in file_list or ():
        try:
            frame = pd.read_csv(csv_path, low_memory=False)
        except Exception as err:
            print(f"Warning: failed to read {csv_path}: {err}")
            continue
        frames.append(frame)

    # pd.concat raises on an empty list, so fall back to an empty frame.
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Load every discovered CSV into a single frame. read_and_concat returns an
# empty DataFrame when no file was found or none could be read.
df = read_and_concat(files)

# Only claim success when rows were actually loaded; an empty frame means the
# folder was wrong/empty or every read failed. Files that failed to read were
# already reported individually by read_and_concat, so len(files) is the
# number of files discovered, not necessarily the number loaded.
if df.empty:
    print("Warning: no rows were loaded from", len(files), "files - check the dataset path")
else:
    print("✅ Successfully combined", len(files), "files into df")
print("Total rows in df:", len(df))

# Quick peek (only if not empty)
if not df.empty:
    display(df.head())

In [None]:
# Keep only matches where both players have a known rank and age; the
# rank/age features below are meaningless without them.
df = df.dropna(subset=['winner_rank', 'loser_rank', 'winner_age', 'loser_age'])

# Feature engineering (every column here is computed from the winner's side
# of the match row).
#
# The serve ratios divide by serve-point counts. A denominator of 0 is mapped
# to NaN first, so the ratio becomes NaN instead of +/-inf: XGBoost treats NaN
# as a missing value but raises on inf in its input.
df = df.assign(
    rank_diff=df['winner_rank'] - df['loser_rank'],
    age_diff=df['winner_age'] - df['loser_age'],
    height_diff=df['winner_ht'] - df['loser_ht'],
    ace_rate=df['w_ace'] / df['w_svpt'].replace(0, np.nan),
    df_rate=df['w_df'] / df['w_svpt'].replace(0, np.nan),
    first_serve_win_pct=df['w_1stWon'] / df['w_1stIn'].replace(0, np.nan),
)


In [None]:
def _player_view(matches, player, opponent, outcome):
    """Return a copy of ``matches`` labelled from one side's perspective.

    Parameters
    ----------
    matches : pandas.DataFrame
        One row per match, with ``winner_*`` / ``loser_*`` columns for
        ``id``, ``rank``, ``age`` and ``ht``.
    player, opponent : str
        Column prefixes (``'winner'`` or ``'loser'``) copied into the
        ``player_*`` and ``opponent_*`` columns respectively.
    outcome : int
        Target label stored in the ``outcome`` column (1 = player won).

    Returns
    -------
    pandas.DataFrame
        A copy of ``matches`` with the added ``player_*``, ``opponent_*`` and
        ``outcome`` columns; ``matches`` itself is not modified.
    """
    view = matches.copy()
    for attr in ('id', 'rank', 'age', 'ht'):
        view[f'player_{attr}'] = view[f'{player}_{attr}']
        view[f'opponent_{attr}'] = view[f'{opponent}_{attr}']
    view['outcome'] = outcome
    return view


# Original winner perspective: the "player" is the winner, label 1.
df_win = _player_view(df, 'winner', 'loser', 1)

# Flipped loser perspective: the "player" is the loser, label 0.
df_lose = _player_view(df, 'loser', 'winner', 0)

# The engineered columns inherited from df were computed from the winner's
# side (w_* serve stats, winner-minus-loser differences). Left as-is, the
# loser rows would carry the opponent's serve numbers as "player" features and
# be indistinguishable from their winner-row twin on those columns. Recompute
# them from the loser's side instead.
# Assumes the CSVs carry l_ace / l_df / l_svpt / l_1stWon / l_1stIn columns
# mirroring the w_* ones used earlier - TODO confirm against the dataset schema.
# Zero denominators become NaN so the ratios never turn into inf.
df_lose = df_lose.assign(
    rank_diff=-df_lose['rank_diff'],
    age_diff=-df_lose['age_diff'],
    height_diff=-df_lose['height_diff'],
    ace_rate=df_lose['l_ace'] / df_lose['l_svpt'].replace(0, np.nan),
    df_rate=df_lose['l_df'] / df_lose['l_svpt'].replace(0, np.nan),
    first_serve_win_pct=df_lose['l_1stWon'] / df_lose['l_1stIn'].replace(0, np.nan),
)

# Combine both perspectives. Winner rows come first, loser rows second, and
# the index is rebuilt, so rows i and i + len(df) describe the same match.
df_model = pd.concat([df_win, df_lose], ignore_index=True)

features = ['player_rank', 'opponent_rank', 'player_age', 'opponent_age',
            'player_ht', 'opponent_ht', 'ace_rate', 'df_rate', 'first_serve_win_pct']
X = df_model[features]
y = df_model['outcome']



In [None]:
from sklearn.model_selection import GroupShuffleSplit

# df_model stacks the winner-perspective rows on top of the loser-perspective
# rows (with a rebuilt 0..2n-1 index), so rows i and i + n_matches are the two
# views of the same match. A plain row-wise split would put one view in train
# and its mirror image in test, leaking the answer. Split by match instead so
# both views always land on the same side.
n_matches = len(X) // 2
assert len(X) == 2 * n_matches, "expected two rows (winner/loser view) per match"
match_ids = np.arange(len(X)) % n_matches

splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=match_ids))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Train model (seeded so a re-run reproduces the same booster)
model = XGBClassifier(n_estimators=300, max_depth=6, learning_rate=0.1,
                      use_label_encoder=False, eval_metric='logloss',
                      random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate on the held-out matches
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.model_selection import GridSearchCV, GroupKFold, cross_val_score

# Define the model (unfitted template; GridSearchCV fits clones of it)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# 2 * 3 * 3 * 2 * 2 = 72 candidate settings, each fitted once per CV fold.
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Every match contributes two mirrored rows to X (row i and row i + n_matches,
# see how df_model is concatenated). Folds must keep both rows together,
# otherwise each validation fold contains mirror images of training rows and
# the CV score is inflated. X_train keeps X's original 0..2n-1 index labels,
# so the match id of a row is its label modulo n_matches.
n_matches = len(X) // 2
train_match_ids = X_train.index.to_numpy() % n_matches

# Run grid search with match-grouped folds
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid,
                           cv=GroupKFold(n_splits=5), scoring='accuracy',
                           verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train, groups=train_match_ids)

# Best model (refitted on all of X_train with the winning parameters)
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Evaluate
y_pred = best_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Soft-voting ensemble: average the win probabilities of a random forest and
# the tuned XGBoost model. The forest is fitted here because no earlier cell
# defines `rf`, and the fitted `best_model` is used because `xgb` itself is
# never fitted (GridSearchCV only fits clones).
# Missing values are filled with the training medians for the forest only;
# XGBoost handles NaN natively, so it sees the unmodified features.
fill_values = X_train.median()
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf.fit(X_train.fillna(fill_values), y_train)

final_pred = (rf.predict_proba(X_test.fillna(fill_values))[:, 1]
              + best_model.predict_proba(X_test)[:, 1]) / 2
final_labels = (final_pred > 0.5).astype(int)
print("Ensemble accuracy:", accuracy_score(y_test, final_labels))

# Cross-validate on the full data set, again with match-grouped folds.
# NOTE(review): this scores the untuned template `xgb` (default
# hyper-parameters); pass `best_model` instead to score the tuned settings.
all_match_ids = np.arange(len(X)) % n_matches
scores = cross_val_score(xgb, X, y, cv=GroupKFold(n_splits=5),
                         groups=all_match_ids, scoring='accuracy')
print("CV Accuracy:", scores.mean())


In [None]:
from sklearn.model_selection import StratifiedKFold

# Seeded, shuffled 5-fold splitter that preserves the class ratio per fold.
# NOTE(review): no visible cell passes `cv` to a search or scoring call.
cv = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=5,
)


In [None]:
import joblib

# Load model
# NOTE(review): no visible cell writes 'xgb_tennis_model.pkl'; it must already
# exist in the working directory, and this rebinds `model` (the classifier
# trained earlier in the notebook) to whatever that file contains.
# joblib.load unpickles the file, which can execute arbitrary code - only load
# model files from a trusted source.
model = joblib.load('xgb_tennis_model.pkl')

# Example new match data
new_match = pd.DataFrame([{
    'player_rank': 5,
    'opponent_rank': 25,
    'player_age': 27,
    'opponent_age': 30,
    'player_ht': 188,
    'opponent_ht': 183,
    'ace_rate': 0.12,
    'df_rate': 0.03,
    'first_serve_win_pct': 0.72,
    'surface_hard': 1,
    'surface_clay': 0,
    'surface_grass': 0,
    'round_QF': 0,
    'round_SF': 1,
    'tourney_level_G': 1,
    'tourney_level_A': 0
}])

# XGBoost validates feature names (and their order) at predict time. This
# example row has more columns than the nine-feature list the notebook trains
# on, so align it to exactly the columns the loaded model was fitted with:
# extra columns are dropped, order is fixed, and a genuinely missing feature
# fails with a clear message instead of a feature-name mismatch error.
expected_features = model.get_booster().feature_names
if expected_features is not None:
    missing = [col for col in expected_features if col not in new_match.columns]
    if missing:
        raise ValueError(f"new_match is missing features the model was trained on: {missing}")
    new_match = new_match[expected_features]

# Predict outcome (class label and probability of class 1 = "player wins")
prediction = model.predict(new_match)
probability = model.predict_proba(new_match)[0][1]

print("Predicted outcome:", "Win" if prediction[0] == 1 else "Loss")
print("Win probability:", round(probability * 100, 2), "%")