In [377]:
import pandas as pd
import os
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from statsmodels.discrete.conditional_models import ConditionalLogit
from getpass import getuser
# =============================
# CONFIG
# =============================
# Directory holding the ATP Grand Slam workbooks.
# The default is the original per-user Windows checkout location. Set the
# TENNIS_HOMOPHILY_DATA_DIR environment variable to point the notebook at
# another folder (different machine, OS or checkout) without editing this cell.
USER = getuser()
INPUT_DIR = os.environ.get("TENNIS_HOMOPHILY_DATA_DIR") or (
    f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/"
)
# Raw match-level workbook read by the loading cell.
INPUT_FILE = os.path.join(INPUT_DIR, "grand_slam_matches_with_ranks_2018_2023.xlsx")
# Team-level (one row per team per match) workbook written at the end.
OUTPUT_FILE = os.path.join(INPUT_DIR, "win_lose_df.xlsx")
# Match-level workbook with all derived columns, written at the end.
CLEANED_FILE = os.path.join(INPUT_DIR, "cleaned_grand_slam_matches_2018_2023.xlsx")

# =============================
# Load Dataset
# =============================
# Fail fast with an explicit message if the workbook is missing.
print("Loading Grand Slam matches…")
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"Grand Slam file not found: {INPUT_FILE}")

# Read the match-level workbook, then show its size, its first rows and the
# full list of column names (the last expression is the cell's output).
df = pd.read_excel(INPUT_FILE)
print(df.shape)
display(df.head())
df.columns.tolist()


Loading Grand Slam matches…
(1405, 112)


Unnamed: 0,match_id,tournament,location,date,year,tournament_code,stage,match_duration,winners_p1,winners_p2,...,losers_p2_Titles Career,losers_p2_DOB,losers_p2_Turned pro,losers_p2_Unnamed: 15,losers_p2_Country,losers_p2_Birthplace,losers_p2_Plays,losers_p2_Coach,losers_p2_WeightKg,losers_p2_HeightCm
0,0,Australian Open,"Melbourne,Australia","15-28 Jan, 2018",2018,580,Finals,01:33:00,Oliver Marach(7),Mate Pavic(7),...,19.0,1987/01/20,2010.0,x,Colombia,"Montreal, Canada","Right-Handed, Two-Handed Backhand",Mariano Hood,89.0,193.0
1,1,Australian Open,"Melbourne,Australia","15-28 Jan, 2018",2018,580,Semi-Finals,02:26:00,Oliver Marach(7),Mate Pavic(7),...,1.0,1990-04-25,2009.0,x,Germany,"Warstein, Germany","Right-Handed, Two-Handed Backhand",Markus Wislsperger,92.0,193.0
2,2,Australian Open,"Melbourne,Australia","15-28 Jan, 2018",2018,580,Semi-Finals,01:29:00,Juan Sebastian Cabal(11),Robert Farah(11),...,124.0,1978/04/29,1998.0,x,United States,"Camarillo, CA, USA","Right-Handed, One-Handed Backhand","David Macpherson, Dave Marshall",84.0,191.0
3,3,Australian Open,"Melbourne,Australia","15-28 Jan, 2018",2018,580,Quarter-Finals,01:06:00,Bob Bryan(6),Mike Bryan(6),...,18.0,1980-03-17,1998.0,x,Pakistan,"Lahore, Pakistan","Right-Handed, One-Handed Backhand",Yasir Khan,83.0,183.0
4,4,Australian Open,"Melbourne,Australia","15-28 Jan, 2018",2018,580,Quarter-Finals,02:40:00,Oliver Marach(7),Mate Pavic(7),...,14.0,1986/03/06,,x,Great Britain,"London, England","Right-Handed, One-Handed Backhand",Louis Cayer,94.0,198.0


['match_id',
 'tournament',
 'location',
 'date',
 'year',
 'tournament_code',
 'stage',
 'match_duration',
 'winners_p1',
 'winners_p2',
 'losers_p1',
 'losers_p2',
 'winners_set1',
 'winners_set2',
 'winners_set3',
 'winners_set1_tiebreak',
 'winners_set2_tiebreak',
 'winners_set3_tiebreak',
 'losers_set1',
 'losers_set2',
 'losers_set3',
 'losers_set1_tiebreak',
 'losers_set2_tiebreak',
 'losers_set3_tiebreak',
 'winners_set4',
 'winners_set5',
 'losers_set4',
 'losers_set5',
 'winners_set4_tiebreak',
 'losers_set4_tiebreak',
 'losers_set5_tiebreak',
 'tourn_key',
 'winners_p1_Rank',
 'winners_p1_Player',
 'winners_p1_Player Profile Link',
 'winners_p1_Tourns',
 'winners_p1_Tournament',
 'winners_p1_Year',
 'winners_p1_DateWeek',
 'winners_p1_W-L YTD',
 'winners_p1_W-L Career',
 'winners_p1_Titles YTD',
 'winners_p1_Titles Career',
 'winners_p1_DOB',
 'winners_p1_Turned pro',
 'winners_p1_Unnamed: 15',
 'winners_p1_Country',
 'winners_p1_Birthplace',
 'winners_p1_Plays',
 'winners_p

# Create new variables

In [378]:
# Drop the 2020 season from the dataset (COVID year: Wimbledon was not played).
df = df.loc[df['year'].ne(2020)].copy()

# Quick check: number of matches left per year
print(df['year'].value_counts())

2018    262
2019    250
2021    249
2022    249
2023    238
Name: year, dtype: int64


In [379]:
# Create surface variable: each Grand Slam is played on one fixed surface.
tournaments_by_surface = {
    "Hard": ["Australian Open", "US Open"],
    "Clay": ["Roland Garros"],
    "Grass": ["Wimbledon"],
}
surface_map = {
    tournament: surface
    for surface, tournaments in tournaments_by_surface.items()
    for tournament in tournaments
}

# Tournament names missing from surface_map end up as NaN.
df['surface'] = df['tournament'].map(surface_map)


In [380]:
# 2. Normalize stage names: lower-case, then trim surrounding whitespace
stage_lower = df['stage'].str.lower()
df['stage_clean'] = stage_lower.str.strip()

# Ordinal code per round, from 7 (finals) down to 0 (1st round qualifying).
stage_labels_desc = [
    "finals",
    "semi-finals",
    "quarter-finals",
    "round of 16",
    "round of 32",
    "round of 64",
    "2nd round qualifying",
    "1st round qualifying",
]
stage_order = dict(zip(stage_labels_desc, range(len(stage_labels_desc) - 1, -1, -1)))

# Labels missing from stage_order end up as NaN.
df['stage_code'] = df['stage_clean'].map(stage_order)

In [381]:
# Sanity check: a NaN in this list would mean a stage label missing from stage_order.
df['stage_code'].unique()

array([7, 6, 5, 4, 3, 2, 1, 0], dtype=int64)

In [382]:
# --- 1. Helper to convert a "W-L" record into a win ratio --- #

def wl_to_ratio(x):
    """
    Convert a win-loss record such as '25-10' into the win ratio 25 / (25 + 10).

    Parameters
    ----------
    x : str or scalar
        Record formatted as '<wins>-<losses>'; surrounding whitespace is ignored.

    Returns
    -------
    float
        wins / (wins + losses), always within [0, 1].
        NaN when x is missing, has no '-' separator, has counts that are not
        integers, has a negative count, or records no matches at all (0-0).
    """
    if pd.isna(x):
        return np.nan

    # Split on the first '-' only; an empty separator means there was none.
    wins_str, sep, losses_str = str(x).strip().partition('-')
    if not sep:
        return np.nan

    try:
        wins = int(wins_str)
        losses = int(losses_str)
    except ValueError:
        # Only the integer parsing can fail here; anything else should surface.
        return np.nan

    # A record like '10--5' parses to a negative count and would otherwise
    # produce a "ratio" outside [0, 1].
    if wins < 0 or losses < 0:
        return np.nan

    total = wins + losses
    if total == 0:
        return np.nan
    return wins / total


In [383]:
def extract_hand(play_string):
    """Return only the handedness (Right-Handed / Left-Handed).

    This is the text before the first comma of the 'Plays' description, with
    surrounding whitespace removed. Missing values yield None.
    """
    if not pd.isna(play_string):
        hand, _, _ = play_string.partition(",")
        return hand.strip()
    return None

def extract_backhand(play_string):
    """Return only the backhand type (One-/Two-Handed Backhand).

    This is the text between the first and second comma of the 'Plays'
    description, with surrounding whitespace removed. Missing values, and
    descriptions without any comma, yield None.
    """
    if pd.isna(play_string):
        return None
    _, comma, remainder = play_string.partition(",")
    if not comma:
        return None
    return remainder.partition(",")[0].strip()

# --- 1. Create the Hand and Backhand columns for every player ---

for side, p in [(s, q) for s in ("winners", "losers") for q in ("p1", "p2")]:
    plays = df[f"{side}_{p}_Plays"]
    df[f"{side}_{p}_Hand"] = plays.apply(extract_hand)
    df[f"{side}_{p}_Backhand"] = plays.apply(extract_backhand)

In [384]:
# --- Physical Homophily --- #

# --- 2./3. Same handedness and same backhand type within each team ---
# NOTE(review): the flag is 1 only when both partners' values are equal; a
# missing value presumably compares unequal and therefore yields 0 — confirm
# this is the intended treatment of missing profiles.
for trait in ('Hand', 'Backhand'):
    for side in ('winners', 'losers'):
        partners_match = df[f'{side}_p1_{trait}'] == df[f'{side}_p2_{trait}']
        df[f'same_{trait.lower()}_{side}'] = partners_match.astype(int)

# Height: absolute difference in cm; weight: absolute difference in kg
for label, column in (('height', 'HeightCm'), ('weight', 'WeightKg')):
    for side in ('winners', 'losers'):
        gap = df[f'{side}_p1_{column}'] - df[f'{side}_p2_{column}']
        df[f'{label}_diff_{side}'] = gap.abs()

# Experience: approximated as (year - Turned pro); unparseable values become NaN
for side in ('winners', 'losers'):
    for p in ('p1', 'p2'):
        df[f'{side}_{p}_Turned pro_num'] = pd.to_numeric(
            df[f'{side}_{p}_Turned pro'], errors='coerce'
        )
    for p in ('p1', 'p2'):
        df[f'{side}_{p}_experience'] = df['year'] - df[f'{side}_{p}_Turned pro_num']

for side in ('winners', 'losers'):
    experience_gap = df[f'{side}_p1_experience'] - df[f'{side}_p2_experience']
    df[f'experience_diff_{side}'] = experience_gap.abs()


In [385]:

# --- Social Homophily --- #

# Same nationality
for side in ('winners', 'losers'):
    same_country = df[f'{side}_p1_Country'] == df[f'{side}_p2_Country']
    df[f'same_country_{side}'] = same_country.astype(int)

# Same coach: requires both coaches to be known and identical
for side in ('winners', 'losers'):
    coach_p1 = df[f'{side}_p1_Coach']
    coach_p2 = df[f'{side}_p2_Coach']
    both_known = coach_p1.notna() & coach_p2.notna()
    df[f'same_coach_{side}'] = (both_known & (coach_p1 == coach_p2)).astype(int)

# (Optionally, same_birthplace_* could be added as well if it makes sense)


In [386]:

# --- Ability Homophily --- #

# First convert the W-L records of p1 and p2 (YTD and Career) into win ratios
for side in ('winners', 'losers'):
    for stat in ('W-L YTD', 'W-L Career'):
        for p in ('p1', 'p2'):
            df[f'{side}_{p}_{stat}_ratio'] = df[f'{side}_{p}_{stat}'].apply(wl_to_ratio)

# Absolute within-team gaps: win ratio (YTD, Career) and titles (YTD, Career)
for prefix, column in (
    ('wl_ytd', 'W-L YTD_ratio'),
    ('wl_career', 'W-L Career_ratio'),
    ('titles_ytd', 'Titles YTD'),
    ('titles_career', 'Titles Career'),
):
    for side in ('winners', 'losers'):
        partner_gap = df[f'{side}_p1_{column}'] - df[f'{side}_p2_{column}']
        df[f'{prefix}_diff_{side}'] = partner_gap.abs()

## Standardize variables

In [387]:
# 1. Continuous within-team gap variables for which we want a z-score
#    (the same six measures for the winning and the losing team)
gap_measures = [
    'height_diff',
    'weight_diff',
    'experience_diff',
    'wl_ytd_diff',
    'wl_career_diff',
    'titles_career_diff',
]

cont_vars_winners = [f'{measure}_winners' for measure in gap_measures]

cont_vars_losers = [f'{measure}_losers' for measure in gap_measures]

# 2. Function that adds z-scores ONLY for the columns that exist

def add_z_scores(df, cols, suffix='_z'):
    """Add a population z-score column (`<col><suffix>`) for each listed column.

    The frame is modified in place and also returned. Columns absent from the
    frame are reported on stdout and skipped. A column whose standard
    deviation (ddof=0) is zero or NaN gets a z-score column filled with 0.
    """
    for col in cols:
        if col not in df.columns:
            # Missing column: report it and do NOT create the z column
            print(f"[add_z_scores] Column {col} not found, skipping z-score.")
            continue

        values = df[col]
        center = values.mean()
        spread = values.std(ddof=0)
        if np.isnan(spread) or spread == 0:
            # Constant or all-NaN variable: z-score = 0
            df[col + suffix] = 0
            continue

        df[col + suffix] = (values - center) / spread
    return df

for var_list in (cont_vars_winners, cont_vars_losers):
    df = add_z_scores(df, var_list, suffix='_z')

# 3. Collect the *_z columns that are actually present

z_cols_winners = [name for name in df.columns if name.endswith('_winners_z')]
z_cols_losers = [name for name in df.columns if name.endswith('_losers_z')]

# 4. Replace NaN with 0 ONLY in these columns
#    (a missing gap is treated as the sample-average gap)

for z_cols in (z_cols_winners, z_cols_losers):
    df[z_cols] = df[z_cols].fillna(0)

# 5. Compute the homophily indices
# (if a z column does not exist, 0 is used as fallback)

def get_col(df, name):
    """Return the column `name` of `df`, or the scalar 0 when it is absent."""
    if name not in df.columns:
        return 0
    return df[name]

# Homophily index per team: standardized gaps enter negatively (a smaller gap
# means more similar partners); the 0/1 similarity flags enter positively.
z_gap_terms = (
    'height_diff',
    'weight_diff',
    'experience_diff',
    'wl_ytd_diff',
    'wl_career_diff',
    'titles_career_diff',
)
similarity_flags = ('same_hand', 'same_backhand', 'same_country', 'same_coach')

for side in ('winners', 'losers'):
    # Terms are accumulated in a fixed order so the floating-point result
    # does not depend on how the sum is written.
    score = -get_col(df, f'{z_gap_terms[0]}_{side}_z')
    for term in z_gap_terms[1:]:
        score = score - get_col(df, f'{term}_{side}_z')
    for flag in similarity_flags:
        score = score + df[f'{flag}_{side}'].fillna(0)
    df[f'homophily_index_{side}'] = score

# 6. Quick check: similarity flags, height gaps and the resulting indices
#    for the first 10 matches

display(
    df[[
        'same_hand_winners',
        'same_hand_losers',
        'same_backhand_winners',
        'same_backhand_losers',
        'height_diff_winners',
        'height_diff_losers',
        'homophily_index_winners',
        'homophily_index_losers'
    ]].head(10)
)


Unnamed: 0,same_hand_winners,same_hand_losers,same_backhand_winners,same_backhand_losers,height_diff_winners,height_diff_losers,homophily_index_winners,homophily_index_losers
0,0,1,0,1,6.0,8.0,-2.527917,5.187147
1,0,1,0,0,6.0,8.0,-2.527917,0.725384
2,1,0,1,1,8.0,2.0,4.844931,6.798715
3,0,1,1,0,2.0,2.0,6.792128,3.108388
4,0,1,0,0,6.0,10.0,-2.527917,-0.297577
5,1,0,1,0,8.0,,4.844931,0.0
6,1,1,0,1,8.0,10.0,0.433839,1.442973
7,1,0,1,0,10.0,10.0,1.420138,-2.833316
8,0,1,1,0,2.0,10.0,6.792128,-0.125362
9,0,1,0,0,6.0,8.0,-2.527917,1.986453


In [388]:
# Per-player statistics coerced to numeric dtype (unparseable values -> NaN).
# NOTE(review): several of these columns were already used in arithmetic by
# the homophily cells above, so this conversion presumably belongs before
# them — confirm.
numeric_stats = [
    'Rank',
    'Titles Career',
    'Titles YTD',
    'WeightKg',
    'HeightCm',
    'W-L Career_ratio',
    'W-L YTD_ratio',
]
cols_to_numeric = [
    f'{team}_{player}_{stat_name}'
    for stat_name in numeric_stats
    for team in ('winners', 'losers')
    for player in ('p1', 'p2')
]

for col in cols_to_numeric:
    if col not in df.columns:
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')


## Difference variables (winners minus losers)

In [389]:
def _team_total(side, stat):
    """Sum of the two partners' values of `stat` for the given side of df."""
    return df[f'{side}_p1_{stat}'] + df[f'{side}_p2_{stat}']


# Create homophily difference (winners minus losers)
df['homophily_diff'] = df['homophily_index_winners'] - df['homophily_index_losers']

# Match-level ability gaps: winners' team total minus losers' team total
for target, stat in (
    ('wl_career_diff', 'W-L Career_ratio'),
    ('titles_career_diff', 'Titles Career'),
    ('rank_diff', 'Rank'),
):
    df[target] = _team_total('winners', stat) - _team_total('losers', stat)





# Reshape to team level: two rows per match (winners with won == 1, losers with won == 0)

In [390]:
def _team_record(row, side, won):
    """Build the team-level record of one side ('winners' or 'losers') of a match.

    Both sides go through this single helper so that they always carry the
    same keys. The original hand-written dictionaries had drifted apart:
    'wl_ratio_p1' / 'wl_ratio_p2' existed only on the losers' rows, which left
    them (and any team-level value derived from them) NaN for every winning team.

    Parameters
    ----------
    row : pandas.Series
        One match-level row of df.
    side : str
        'winners' or 'losers'; selects which side's columns are read.
    won : int
        Outcome flag stored in the record (1 for winners, 0 for losers).

    Returns
    -------
    dict
        One record for the team-level frame. The misspelled keys
        'hompohily_diff' and 'wl_carreer_diff' are kept as they are because
        later cells and the exported workbook refer to these exact names.
    """
    return {
        'match_id': row['match_id'],
        'team': side,
        'won': won,
        'homophily': row[f'homophily_index_{side}'],
        # NOTE(review): 'hompohily_diff' and 'rank_diff' are match-level values
        # (winners minus losers) copied unchanged onto both rows of a match;
        # presumably the losers' row should see the opposite sign — confirm
        # before using them as team-level predictors of 'won'.
        'hompohily_diff': row['homophily_diff'],
        # Height / weight / experience gaps are the z-scored versions ...
        'height_diff': row[f'height_diff_{side}_z'],
        'weight_diff': row[f'weight_diff_{side}_z'],
        'experience_diff': row[f'experience_diff_{side}_z'],
        'experience_p1': row[f'{side}_p1_experience'],
        'experience_p2': row[f'{side}_p2_experience'],
        'rank_p1': row[f'{side}_p1_Rank'],
        'rank_p2': row[f'{side}_p2_Rank'],
        'rank_diff': row['rank_diff'],
        # ... while win-ratio and title gaps are the raw (unstandardized) ones.
        'wl_carreer_diff': row[f'wl_career_diff_{side}'],
        'wl_career_p1': row[f'{side}_p1_W-L Career_ratio'],
        'wl_career_p2': row[f'{side}_p2_W-L Career_ratio'],
        'wl_ytd_diff': row[f'wl_ytd_diff_{side}'],
        'wl_ytd_p1': row[f'{side}_p1_W-L YTD_ratio'],
        'wl_ytd_p2': row[f'{side}_p2_W-L YTD_ratio'],
        'titles_p1': row[f'{side}_p1_Titles Career'],
        'titles_p2': row[f'{side}_p2_Titles Career'],
        'titles_career_diff': row[f'titles_career_diff_{side}'],
        'same_country': row[f'same_country_{side}'],
        'same_hand': row[f'same_hand_{side}'],
        'same_backhand': row[f'same_backhand_{side}'],
        'same_coach': row[f'same_coach_{side}'],
        'year': row['year'],
        'tournament': row['tournament'],
        'stage': row['stage_code'],
        'surface': row['surface'],
        # Kept last so the column order of team_df is the same as before.
        'wl_ratio_p1': row[f'{side}_p1_W-L Career_ratio'],
        'wl_ratio_p2': row[f'{side}_p2_W-L Career_ratio'],
    }


# Two consecutive records per match: the winning team first, then the losing team.
team_rows = []

for _idx, row in df.iterrows():
    team_rows.append(_team_record(row, 'winners', 1))
    team_rows.append(_team_record(row, 'losers', 0))

team_df = pd.DataFrame(team_rows)


In [391]:
# Expect two rows (winners + losers) for every match in df.
team_df.shape

(2496, 32)

In [392]:
# Team-level ability: the two partners' values are summed, except experience,
# which is averaged.
for stem, as_mean in (
    ('rank', False),
    ('experience', True),
    ('wl_career', False),
    ('wl_ytd', False),
    ('titles', False),
    ('wl_ratio', False),
):
    combined = team_df[f'{stem}_p1'] + team_df[f'{stem}_p2']
    team_df[f'{stem}_team'] = combined / 2 if as_mean else combined


In [393]:
# Variables required by the model: outcome, team-level ability,
# within-team gaps, and similarity flags
cols_needed = (
    ['won']
    + ['rank_team', 'wl_career_team', 'wl_ytd_team', 'titles_team', 'experience_team']
    + ['height_diff', 'weight_diff', 'experience_diff']
    + ['wl_carreer_diff', 'wl_ytd_diff']
    + ['same_country', 'same_hand', 'same_coach']
)

# Work on a copy of the team-level data and drop only the rows that have a
# NaN in one of these columns
dfc = team_df.copy().dropna(subset=cols_needed)


In [394]:
# Rows left after dropping incomplete observations (compare with team_df.shape).
dfc.shape

(832, 38)

# Logistic regressions

In [395]:

# Continuous predictors to standardize on the estimation sample.
# 'height_diff', 'weight_diff' and 'experience_diff' were already taken from
# z-scored columns when the team rows were built; they are re-standardized here.
# 'rank_diff' is not in this list and therefore stays on its raw scale.
continuous_vars = [
    'rank_team', 'wl_career_team', 'wl_ytd_team', 'titles_team', 'experience_team',
    'height_diff', 'weight_diff', 'experience_diff',
    'wl_carreer_diff', 'wl_ytd_diff'
]

# Overwrites the listed columns of dfc in place with the scaler's output.
scaler = StandardScaler()
dfc[continuous_vars] = scaler.fit_transform(dfc[continuous_vars])


In [396]:
# Logit specification: match outcome on the rank gap, within-team gaps,
# similarity flags, and surface / stage fixed effects (stage as its 0-7 code).
# NOTE(review): 'rank_diff' carries the same value on the won=1 and the won=0
# row of a match — confirm it can identify anything in this specification.
formula = """
won ~
    rank_diff + 
    height_diff + weight_diff + wl_ytd_diff + 
    same_country + same_hand  + same_backhand +
      C(surface) + C(stage)
"""


In [397]:

# L1-regularized logit on the specification above.
# NOTE(review): the recorded output of this cell contains a numerical warning
# and a hint to increase solver accuracy or decrease alpha — check convergence
# before interpreting the coefficients.
model_reg = smf.logit(formula, data=dfc).fit_regularized(
    method='l1',
    alpha=1.0   # try 0.1, 0.5, 2.0 as a robustness check
)

print(model_reg.summary())


Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.6797611870050381
            Iterations: 73
            Function evaluations: 77
            Gradient evaluations: 73
                           Logit Regression Results                           
Dep. Variable:                    won   No. Observations:                  690
Model:                          Logit   Df Residuals:                      673
Method:                           MLE   Df Model:                           16
Date:                Mon, 08 Dec 2025   Pseudo R-squ.:                 0.01471
Time:                        14:27:43   Log-Likelihood:                -467.32
converged:                       True   LL-Null:                       -474.30
Covariance Type:            nonrobust   LLR p-value:                    0.6022
                          coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))
Try increasing solver accuracy or number of iterations, decreasing alpha, or switch solvers


## Logit with standard errors clustered by match

In [398]:
# 1. Build a clean dataset for the model
# Only the variables used by the clustered model, plus match_id for clustering.
vars_for_model = [
    'won', 'match_id',
    'rank_diff', 'height_diff', 'weight_diff', 'wl_ytd_diff',
    'same_country', 'same_hand', 'same_backhand',
    'surface', 'stage'
]

# Treat +/-inf as missing, then keep complete rows only.
dfc_clean = dfc[vars_for_model].replace([np.inf, -np.inf], np.nan).dropna().copy()

print(dfc_clean.shape)        # quick check
print(dfc_clean.isna().sum()) # should all be 0


(690, 11)
won              0
match_id         0
rank_diff        0
height_diff      0
weight_diff      0
wl_ytd_diff      0
same_country     0
same_hand        0
same_backhand    0
surface          0
stage            0
dtype: int64


In [399]:
def map_stage(s):
    """Collapse the 0-7 stage code into four phases.

    Codes 0-3 -> 'early', 4 -> 'middle', 5-6 -> 'late', 7 -> 'final';
    any other value -> None.
    """
    phases = (
        ('early', (0, 1, 2, 3)),
        ('middle', (4,)),
        ('late', (5, 6)),
        ('final', (7,)),
    )
    for phase, codes in phases:
        if s in codes:
            return phase
    return None

# Coarser stage variable: fewer categories than the 0-7 stage code.
dfc_clean['stage_simple'] = dfc_clean['stage'].apply(map_stage)


In [400]:
# Number of team rows in each collapsed stage category.
dfc_clean['stage_simple'].value_counts()


early     489
middle     99
late       87
final      15
Name: stage_simple, dtype: int64

In [401]:
# Same specification as the earlier formula, with the collapsed stage
# categories (stage_simple) in place of the 0-7 stage code.
formula = """
won ~ rank_diff + height_diff + weight_diff +
       wl_ytd_diff + same_country + same_hand + same_backhand +
       C(surface) + C(stage_simple)
"""


In [402]:
# 3. Build model
logit_mod = smf.logit(formula, data=dfc_clean)

# 4. Fit with CLUSTERED standard errors by match_id
# (the two team rows of a match share the same match_id)
res_cl = logit_mod.fit(
    maxiter=200,             # to be safe
    cov_type='cluster',
    cov_kwds={'groups': dfc_clean['match_id']}
)

print(res_cl.summary())

Optimization terminated successfully.
         Current function value: 0.676726
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:                    won   No. Observations:                  690
Model:                          Logit   Df Residuals:                      677
Method:                           MLE   Df Model:                           12
Date:                Mon, 08 Dec 2025   Pseudo R-squ.:                 0.01551
Time:                        14:27:43   Log-Likelihood:                -466.94
converged:                       True   LL-Null:                       -474.30
Covariance Type:              cluster   LLR p-value:                    0.2577
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept                     0.7410      0.228      3.247      0.001       0.

In [403]:
# =============================
# Save the team-level frame (team_df) to OUTPUT_FILE
# =============================
print(f"Saving to {OUTPUT_FILE} …")
team_df.to_excel(
    OUTPUT_FILE,
    index=False,
    sheet_name="players_list",
    engine="xlsxwriter",
)

print("Done.")


Saving to C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/win_lose_df.xlsx …
Done.


In [404]:
# =============================
# Save the match-level frame (df, with all derived columns) to CLEANED_FILE
# =============================
print(f"Saving to {CLEANED_FILE} …")
df.to_excel(
    CLEANED_FILE,
    index=False,
    sheet_name="players_list",
    engine="xlsxwriter",
)

print("Done.")


Saving to C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/cleaned_grand_slam_matches_2018_2023.xlsx …
Done.
