In [239]:
import pandas as pd
import os
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from statsmodels.discrete.conditional_models import ConditionalLogit
from getpass import getuser
import seaborn as sns
import matplotlib.pyplot as plt

In [240]:
# -----------------------------
# Configuration: input/output locations
# -----------------------------
USER = getuser()
INPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/"

# All three workbooks live in the same data directory.
INPUT_FILE, OUTPUT_FILE, CLEANED_FILE = (
    os.path.join(INPUT_DIR, workbook_name)
    for workbook_name in (
        "men_matches_with_ranks.xlsx",
        "win_lose_df.xlsx",
        "men_matches_with_ranks_cleaned.xlsx",
    )
)

# -----------------------------
# Load the match-level dataset
# -----------------------------
print("Loading Grand Slam matches…")
if os.path.exists(INPUT_FILE):
    df = pd.read_excel(INPUT_FILE)
else:
    # Fail early with the resolved path so a wrong user/directory is obvious.
    raise FileNotFoundError(f"Grand Slam file not found: {INPUT_FILE}")

print(df.shape)
display(df.head())
list(df.columns)


Loading Grand Slam matches…
(1405, 272)


Unnamed: 0,match_id,tournament,location,date,year,tournament_code,stage,match_duration,winners_set1,winners_set2,...,losers_p2_abs_gap_10y,losers_p2_single_specialist,losers_p2_birthplace-city,losers_p2_birthplace-country,losers_p2_hand,losers_p2_backhand,losers_p2_colonial_legacy,losers_p2_federal_legacy,losers_p2_country_std,losers_p2_iso3
0,0,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Finals,01:33:00,6,6.0,...,,0.0,Montreal,Canada,Right-Handed,Two-Handed Backhand,Spanish,,Colombia,COL
1,1,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Semi-Finals,02:26:00,4,7.0,...,,,,,,,,,,
2,2,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Semi-Finals,01:29:00,7,7.0,...,,0.0,"Camarillo, CA, USA",,Right-Handed,One-Handed Backhand,British,,United States,USA
3,3,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Quarter-Finals,01:06:00,6,6.0,...,,0.0,Lahore,Pakistan,Right-Handed,One-Handed Backhand,,,Pakistan,PAK
4,4,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Quarter-Finals,02:40:00,6,6.0,...,,0.0,London,England,Right-Handed,One-Handed Backhand,British,,Great Britain,GBR


['match_id',
 'tournament',
 'location',
 'date',
 'year',
 'tournament_code',
 'stage',
 'match_duration',
 'winners_set1',
 'winners_set2',
 'winners_set3',
 'winners_set1_tiebreak',
 'winners_set2_tiebreak',
 'winners_set3_tiebreak',
 'losers_set1',
 'losers_set2',
 'losers_set3',
 'losers_set1_tiebreak',
 'losers_set2_tiebreak',
 'losers_set3_tiebreak',
 'winners_set4',
 'winners_set5',
 'losers_set4',
 'losers_set5',
 'winners_set4_tiebreak',
 'losers_set4_tiebreak',
 'losers_set5_tiebreak',
 'winners_p1_name',
 'winners_p1_surname',
 'winners_p1_ranking',
 'winners_p1_status',
 'winners_p2_name',
 'winners_p2_surname',
 'winners_p2_ranking',
 'winners_p2_status',
 'losers_p1_name',
 'losers_p1_surname',
 'losers_p1_ranking',
 'losers_p1_status',
 'losers_p2_name',
 'losers_p2_surname',
 'losers_p2_ranking',
 'losers_p2_status',
 'tourn_key',
 'winners_p1_initial',
 'winners_p2_initial',
 'losers_p1_initial',
 'losers_p2_initial',
 'winners_p1_rank',
 'winners_p1_tourns',
 'winner

In [241]:
# Collect every (partner 1, partner 2) country-code pair that occurs in a team,
# looking at both the winning and the losing side of each match.
team_iso_columns = [
    ("winners_p1_iso3", "winners_p2_iso3"),
    ("losers_p1_iso3", "losers_p2_iso3"),
]

pairs_needed = set()
for first_col, second_col in team_iso_columns:
    pairs_needed.update(zip(df[first_col], df[second_col]))

# The CEPII lookup is directional, so also request each pair in reversed order.
pairs_needed |= {(dest, orig) for (orig, dest) in pairs_needed}

len(pairs_needed)

462

In [242]:
# The gravity file sits in the same data directory as the other inputs, so reuse
# INPUT_DIR instead of repeating the absolute path.
path = os.path.join(INPUT_DIR, "Gravity_V202211.csv")
if not os.path.exists(path):
    raise FileNotFoundError(f"Gravity file not found: {path}")

# Only the pair identifiers and the two common-language indicators are read.
usecols = ["iso3_o", "iso3_d", "comlang_off", "comlang_ethno"]

# Nullable dtypes so that missing indicators stay missing instead of forcing floats.
dtypes = {
    "iso3_o": "string",
    "iso3_d": "string",
    "comlang_off": "Int8",
    "comlang_ethno": "Int8",
}

chunks = []

# Stream the file in chunks and keep only rows whose (origin, destination) pair
# is one of the teammate country pairs collected in `pairs_needed`.
for chunk in pd.read_csv(path, usecols=usecols, dtype=dtypes, chunksize=300_000):
    wanted = [pair in pairs_needed for pair in zip(chunk["iso3_o"], chunk["iso3_d"])]
    filtered = chunk[wanted]
    if not filtered.empty:
        chunks.append(filtered)

# pd.concat([]) raises an opaque "No objects to concatenate"; say what actually went wrong.
if not chunks:
    raise ValueError(
        f"No rows in {path} matched any of the {len(pairs_needed)} country pairs in pairs_needed."
    )

gravity_lang = pd.concat(chunks, ignore_index=True)

gravity_lang.shape


(32560, 4)

In [243]:
# Cache the filtered language lookup next to the other data files.
# INPUT_DIR already points at the data directory, so the path is not repeated here.
gravity_lang.to_parquet(
    os.path.join(INPUT_DIR, "gravity_lang_lookup.parquet"),
    index=False
)


# Filter dataset

In [244]:
# Drop the 2020 season from the dataset (COVID year; Wimbledon was not played).
is_2020 = df['year'] == 2020
df = df[~is_2020].copy()

# Quick check: number of matches left per year.
print(df['year'].value_counts())

2018    262
2019    250
2021    249
2022    249
2023    238
Name: year, dtype: int64


# Create new variables

In [245]:
# Playing surface for each tournament name; names not listed here map to NaN.
surface_map = dict([
    ("Australian Open", "Hard"),
    ("US Open", "Hard"),
    ("Roland Garros", "Clay"),
    ("Wimbledon", "Grass"),
])

tournament_names = df['tournament']
df['surface'] = tournament_names.map(surface_map)


In [246]:
# Normalize stage labels (lower-case, surrounding whitespace removed) before mapping.
lowercase_stage = df['stage'].str.lower()
df['stage_clean'] = lowercase_stage.str.strip()

# Stage labels ordered from the earliest round to the final; the position in
# this list is the numeric stage code (0 = earliest, 7 = final).
stages_earliest_first = [
    "1st round qualifying",
    "2nd round qualifying",
    "round of 64",
    "round of 32",
    "round of 16",
    "quarter-finals",
    "semi-finals",
    "finals",
]
stage_order = {label: code for code, label in enumerate(stages_earliest_first)}

df['stage_code'] = df['stage_clean'].map(stage_order)

In [247]:
# Sanity check: distinct stage codes after mapping (a NaN here would mean an unmapped stage label).
df['stage_code'].unique()

array([7, 6, 5, 4, 3, 2, 1, 0], dtype=int64)

In [248]:
# --- Physical homophily --- #
# Every feature is computed for both teams; winners first, then losers, so the
# resulting column order is feature by feature.
sides = ('winners', 'losers')

# Same playing hand / same backhand type (1 = identical, 0 otherwise).
# A missing value on either partner compares as "not equal", so the flag is 0.
for trait in ('hand', 'backhand'):
    for side in sides:
        df[f'same_{trait}_{side}'] = (
            df[f'{side}_p1_{trait}'] == df[f'{side}_p2_{trait}']
        ).astype(int)

# Absolute within-team gaps: height (cm), weight (kg), doubles experience,
# ranking, and singles career-high ranking.
abs_gap_specs = [
    ('height_diff', 'height-cm'),
    ('weight_diff', 'weight-kg'),
    ('experience_diff', 'experience_double'),
    ('rank_diff', 'rank'),
    ('single_rank_diff', 'singles_career_high_rank_num'),
]
for out_prefix, attr in abs_gap_specs:
    for side in sides:
        partner_gap = df[f'{side}_p1_{attr}'] - df[f'{side}_p2_{attr}']
        df[f'{out_prefix}_{side}'] = partner_gap.abs()

In [249]:

# --- Social homophily --- #

# Same nationality (1 = both partners list the same country).
for side in ('winners', 'losers'):
    df[f'same_country_{side}'] = (
        df[f'{side}_p1_country'] == df[f'{side}_p2_country']
    ).astype(int)

# Same coach: only counted when both partners have a coach recorded.
for side in ('winners', 'losers'):
    coach_p1 = df[f'{side}_p1_coach']
    coach_p2 = df[f'{side}_p2_coach']
    both_known = coach_p1.notna() & coach_p2.notna()
    df[f'same_coach_{side}'] = (both_known & (coach_p1 == coach_p2)).astype(int)

# (A same_birthplace_* flag could be added here as well, if it makes sense.)


In [250]:

# Absolute within-team gaps in doubles win ratio and doubles titles,
# each for the current year (YTD) and for the whole career.
performance_gap_specs = [
    ('wl_ytd_diff', 'doubles_win_ratio-ytd'),
    ('wl_career_diff', 'doubles_win_ratio-career'),
    ('titles_ytd_diff', 'doubles_titles-ytd'),
    ('titles_career_diff', 'doubles_titles-career'),
]
for out_prefix, attr in performance_gap_specs:
    for side in ('winners', 'losers'):
        partner_gap = df[f'{side}_p1_{attr}'] - df[f'{side}_p2_{attr}']
        df[f'{out_prefix}_{side}'] = partner_gap.abs()

# Singles specialists: the team flag is 1 only when BOTH partners are flagged.
for side in ('winners', 'losers'):
    p1_is_specialist = df[f'{side}_p1_single_specialist'] == 1
    p2_is_specialist = df[f'{side}_p2_single_specialist'] == 1
    df[f'single_specialist_{side}'] = (p1_is_specialist & p2_is_specialist).astype(int)

In [251]:
# Build a symmetric lookup with exactly one row per ordered country pair.
#
# The filtered gravity extract holds many rows per pair (32,560 rows for 462
# requested pairs in the recorded run). Keeping "whichever row comes first"
# is arbitrary: if that row has a missing indicator, the value is lost even
# though other rows for the same pair carry it. Aggregating with max() skips
# missing values and marks the pair as sharing a language if any row says so.
lang_indicator_cols = ["comlang_off", "comlang_ethno"]

both_directions = pd.concat(
    [
        gravity_lang,
        # Same rows with origin and destination swapped.
        gravity_lang.rename(columns={"iso3_o": "iso3_d", "iso3_d": "iso3_o"}),
    ],
    ignore_index=True,
)

gravity_lang_sym = (
    both_directions
    .groupby(["iso3_o", "iso3_d"], as_index=False)[lang_indicator_cols]
    .max()
)


In [252]:
# Partners from the same country are treated as sharing a language:
# force both indicators to 1 on rows where origin and destination coincide.
is_same_country = gravity_lang_sym["iso3_o"] == gravity_lang_sym["iso3_d"]
for indicator in ("comlang_off", "comlang_ethno"):
    gravity_lang_sym.loc[is_same_country, indicator] = 1


In [253]:
# Attach the language indicators for the winning team's country pair.
# validate="many_to_one" makes pandas raise if the lookup ever contains more
# than one row for a pair, which would otherwise silently duplicate matches.
df = (
    df.merge(
        gravity_lang_sym[["iso3_o", "iso3_d", "comlang_off", "comlang_ethno"]],
        left_on=["winners_p1_iso3", "winners_p2_iso3"],
        right_on=["iso3_o", "iso3_d"],
        how="left",
        validate="many_to_one",
    )
    .rename(columns={
        "comlang_off": "winners_same_language",
        "comlang_ethno": "winners_linguistic_proximity",
    })
    # The lookup keys are no longer needed once the indicators are attached.
    .drop(columns=["iso3_o", "iso3_d"])
)


In [254]:
# Attach the language indicators for the losing team's country pair.
# validate="many_to_one" makes pandas raise if the lookup ever contains more
# than one row for a pair, which would otherwise silently duplicate matches.
df = (
    df.merge(
        gravity_lang_sym[["iso3_o", "iso3_d", "comlang_off", "comlang_ethno"]],
        left_on=["losers_p1_iso3", "losers_p2_iso3"],
        right_on=["iso3_o", "iso3_d"],
        how="left",
        validate="many_to_one",
    )
    .rename(columns={
        "comlang_off": "losers_same_language",
        "comlang_ethno": "losers_linguistic_proximity",
    })
    # The lookup keys are no longer needed once the indicators are attached.
    .drop(columns=["iso3_o", "iso3_d"])
)


In [255]:
# Language indicators for both teams. Pairs without a match in the lookup are
# treated as "no shared language" (0), and the columns become plain integers.
language_flags = [
    f"{side}_{measure}"
    for side in ("winners", "losers")
    for measure in ("same_language", "linguistic_proximity")
]

df[language_flags] = df[language_flags].fillna(0).astype(int)


## Standardize variables and build the homophily index

In [256]:
# 1. Continuous within-team gap variables that get a z-score.
#    The same seven features are standardized for winners and for losers.
gap_features = [
    'height_diff',
    'weight_diff',
    'experience_diff',
    'rank_diff',
    'wl_ytd_diff',
    'wl_career_diff',
    'titles_career_diff',
]

cont_vars_winners = [f'{feature}_winners' for feature in gap_features]

cont_vars_losers = [f'{feature}_losers' for feature in gap_features]

# 2. Helper that adds z-scores ONLY for the columns that actually exist.

def add_z_scores(df, cols, suffix='_z'):
    """Add a population z-score column (ddof=0) for each listed column.

    For every name in ``cols`` that exists in ``df``, a new column named
    ``name + suffix`` is written into ``df`` itself. Constant or all-NaN
    columns get a z-score of 0. Names that are not in ``df`` are reported
    with a printed message and no column is created for them.

    Returns the same ``df`` object that was passed in.
    """
    for col in cols:
        if col not in df.columns:
            # Missing input: tell the user and move on without creating a column.
            print(f"[add_z_scores] Column {col} not found, skipping z-score.")
            continue

        values = df[col]
        center = values.mean()
        spread = values.std(ddof=0)

        if np.isnan(spread) or spread == 0:
            # No variation (or no data at all): every row sits at the "mean".
            df[col + suffix] = 0
            continue

        df[col + suffix] = (values - center) / spread
    return df

for side_vars in (cont_vars_winners, cont_vars_losers):
    df = add_z_scores(df, side_vars, suffix='_z')

# 3. Find the *_z columns that are actually present.

z_cols_winners = [name for name in df.columns if name.endswith('_winners_z')]
z_cols_losers = [name for name in df.columns if name.endswith('_losers_z')]

# 4. Replace NaN with 0 ONLY in those columns.
#    A z-score of 0 is the column mean, so missing gaps are treated as average.

for z_cols in (z_cols_winners, z_cols_losers):
    df[z_cols] = df[z_cols].fillna(0)

# 5. Compute the homophily indices
# (if a z-score column does not exist, 0 is used as a fallback)

def get_col(df, name):
    """Return column ``name`` of ``df``, or the scalar 0 when it is absent."""
    if name in df.columns:
        return df[name]
    return 0

# Homophily index per team: smaller standardized gaps and more shared traits
# both push the index up. Terms are combined in a fixed order: all z-scored
# gaps are subtracted first, then the shared-trait flags are added.
z_gap_features = [
    'height_diff',
    'weight_diff',
    'experience_diff',
    'rank_diff',
    'wl_ytd_diff',
    'wl_career_diff',
    'titles_career_diff',
]
shared_trait_flags = ['same_hand', 'same_backhand', 'same_country', 'same_coach']

for side in ('winners', 'losers'):
    first_feature, *other_features = z_gap_features

    index_value = -get_col(df, f'{first_feature}_{side}_z')
    for feature in other_features:
        index_value = index_value - get_col(df, f'{feature}_{side}_z')
    for flag in shared_trait_flags:
        index_value = index_value + df[f'{flag}_{side}'].fillna(0)

    df[f'homophily_index_{side}'] = index_value

# 6. Quick visual check of a few inputs and the resulting indices.

preview_cols = [
    f'{feature}_{side}'
    for feature in ('same_hand', 'same_backhand', 'height_diff', 'homophily_index')
    for side in ('winners', 'losers')
]

display(df[preview_cols].head(10))


Unnamed: 0,same_hand_winners,same_hand_losers,same_backhand_winners,same_backhand_losers,height_diff_winners,height_diff_losers,homophily_index_winners,homophily_index_losers
0,0,1,0,1,6.0,8.0,-2.741117,5.808806
1,0,0,0,0,6.0,,-2.741117,0.0
2,1,0,1,1,8.0,2.0,5.251481,7.899373
3,0,1,1,0,2.0,2.0,7.709187,2.87445
4,0,1,0,0,6.0,10.0,-2.741117,-0.793731
5,1,0,1,0,8.0,,5.251481,0.0
6,0,1,0,1,,10.0,0.0,2.360998
7,1,0,1,0,10.0,10.0,2.142555,-4.579667
8,0,0,1,0,2.0,,7.709187,0.0
9,0,1,0,0,6.0,8.0,-2.741117,2.490751


In [257]:
# Per-player attributes that must be numeric.
# The first seven are the names this cell has always listed; the last four are
# the `doubles_*` names that the difference cells in this notebook actually read.
# Both sets are kept so that whichever naming the workbook uses gets converted.
numeric_player_attrs = [
    'rank',                       # --- Rank ---
    'titles-career',              # --- Titles career ---
    'titles-ytd',                 # --- Titles YTD ---
    'weight-kg',                  # --- Weight (kg) ---
    'height-cm',                  # --- Height (cm) ---
    'win-ratio-career',           # --- Win ratio (career) ---
    'win-ratio-ytd',              # --- Win ratio (YTD) ---
    'doubles_titles-career',
    'doubles_titles-ytd',
    'doubles_win_ratio-career',
    'doubles_win_ratio-ytd',
]

# Expand to winners/losers x p1/p2, attribute by attribute.
cols_to_numeric = [
    f'{side}_{player}_{attr}'
    for attr in numeric_player_attrs
    for side in ('winners', 'losers')
    for player in ('p1', 'p2')
]

missing_numeric_cols = [col for col in cols_to_numeric if col not in df.columns]

for col in cols_to_numeric:
    if col in df.columns:
        # Unparseable entries become NaN instead of raising.
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Report skipped names instead of ignoring them silently, so a renamed or
# misspelled column is noticed. Note that this conversion only affects cells
# that run after it; columns derived earlier are not recomputed.
if missing_numeric_cols:
    print(
        f"[to_numeric] {len(missing_numeric_cols)} column(s) not found and skipped: "
        f"{missing_numeric_cols}"
    )


## Winner–loser difference variables (currently disabled)

In [258]:
# #--- Homophily difference ---
# df['homophily_diff'] = df['homophily_index_winners'] - df['homophily_index_losers']


# # --- W-L Career win ratio difference ---
# df['wl_career_diff'] = (
#     df['winners_p1_win-ratio-career'] + df['winners_p2_win-ratio-career']
#     - (df['losers_p1_win-ratio-career'] + df['losers_p2_win-ratio-career'])
# )


# # --- Titles career difference ---
# df['titles_career_diff'] = (
#     (df['winners_p1_titles-career'] + df['winners_p2_titles-career'])
#     - (df['losers_p1_titles-career'] + df['losers_p2_titles-career'])
# )


# # --- Rank difference ---
# df['rank_diff'] = (
#     (df['winners_p1_rank'] + df['winners_p2_rank'])
#     - (df['losers_p1_rank'] + df['losers_p2_rank'])
# )


# Reshape to one row per team: won == 1 for winners, won == 0 for losers

In [259]:
# Output column -> source column template. "{side}" is replaced by "winners"
# or "losers", so both teams of a match are read through the same recipe.
# Note that `rank_diff` and `rank_diff_single` are raw gaps, while the other
# *_diff outputs come from the standardized (_z) columns.
team_field_templates = [
    ('homophily', 'homophily_index_{side}'),

    # Z-differences
    ('height_diff', 'height_diff_{side}_z'),
    ('weight_diff', 'weight_diff_{side}_z'),
    ('experience_diff', 'experience_diff_{side}_z'),

    # Player experience
    ('experience_p1', '{side}_p1_experience_double'),
    ('experience_p2', '{side}_p2_experience_double'),

    # Singles specialists
    ('single_specialist', 'single_specialist_{side}'),
    ('single_specialist_p1', '{side}_p1_single_specialist'),
    ('single_specialist_p2', '{side}_p2_single_specialist'),

    # Rank
    ('rank_p1', '{side}_p1_rank'),
    ('rank_p2', '{side}_p2_rank'),
    ('rank_diff', 'rank_diff_{side}'),
    ('rank_single_p1', '{side}_p1_singles_career_high_rank_num'),
    ('rank_single_p2', '{side}_p2_singles_career_high_rank_num'),
    ('rank_diff_single', 'single_rank_diff_{side}'),

    # W/L career ratio
    ('wl_career_diff', 'wl_career_diff_{side}_z'),
    ('wl_career_p1', '{side}_p1_doubles_win_ratio-career'),
    ('wl_career_p2', '{side}_p2_doubles_win_ratio-career'),

    # W/L YTD ratio
    ('wl_ytd_diff', 'wl_ytd_diff_{side}_z'),
    ('wl_ytd_p1', '{side}_p1_doubles_win_ratio-ytd'),
    ('wl_ytd_p2', '{side}_p2_doubles_win_ratio-ytd'),

    # Titles (career)
    ('titles_p1', '{side}_p1_doubles_titles-career'),
    ('titles_p2', '{side}_p2_doubles_titles-career'),
    ('titles_career_diff', 'titles_career_diff_{side}_z'),

    # Homophily components
    ('same_country', 'same_country_{side}'),
    ('same_hand', 'same_hand_{side}'),
    ('same_backhand', 'same_backhand_{side}'),
    ('same_coach', 'same_coach_{side}'),
    ('same_language', '{side}_same_language'),
    ('linguistic_proximity', '{side}_linguistic_proximity'),
]

# Match-level fields copied unchanged onto both team rows.
match_field_sources = [
    ('year', 'year'),
    ('tournament', 'tournament'),
    ('stage', 'stage_code'),
    ('surface', 'surface'),
]


def _team_record(match_row, side, won):
    """Build the team-level record for one side ('winners' or 'losers') of a match row."""
    record = {'match_id': match_row['match_id'], 'team': side, 'won': won}
    for out_name, template in team_field_templates:
        record[out_name] = match_row[template.format(side=side)]
    for out_name, source in match_field_sources:
        record[out_name] = match_row[source]
    return record


team_rows = []

# Each match contributes two rows: the winning team first, then the losing team.
for row_label, match_row in df.iterrows():
    for side, won in (('winners', 1), ('losers', 0)):
        team_rows.append(_team_record(match_row, side, won))

# Build the final team-level dataframe
team_df = pd.DataFrame(team_rows)


In [260]:
# Shape of the team-level frame (two rows are appended for every match row).
team_df.shape

(2496, 37)

In [261]:
# Team-level ability measures (means, best-of-two, and sums of the two partners).

# Ranking: team average and the better (numerically lower) of the two.
rank_pair = team_df[['rank_p1', 'rank_p2']]
team_df['rank_team'] = rank_pair.mean(axis=1)
team_df['rank_team_best'] = rank_pair.min(axis=1)

# Singles career-high ranking: same two summaries.
single_rank_pair = team_df[['rank_single_p1', 'rank_single_p2']]
team_df['rank_single_team'] = single_rank_pair.mean(axis=1)
team_df['rank_single_best'] = single_rank_pair.min(axis=1)

# Experience: plain average, so it is missing if either partner is missing.
team_df['experience_team'] = team_df['experience_p1'].add(team_df['experience_p2']).div(2)

# Sums of the partners' win ratios and titles.
team_df['wl_career_team'] = team_df['wl_career_p1'].add(team_df['wl_career_p2'])
team_df['wl_ytd_team'] = team_df['wl_ytd_p1'].add(team_df['wl_ytd_p2'])
team_df['titles_team'] = team_df['titles_p1'].add(team_df['titles_p2'])

# Same sums again under the wl_ratio_* names.
team_df['wl_ratio_team_career'] = team_df['wl_career_p1'].add(team_df['wl_career_p2'])
team_df['wl_ratio_team_ytd'] = team_df['wl_ytd_p1'].add(team_df['wl_ytd_p2'])


# Singles ranking and singles specialists

In [262]:
needed_cols = ["match_id","won","rank_single_team","rank_single_best","single_specialist"]

# Use a dedicated name for this subset. Rebinding `df` here would replace the
# match-level frame, and the final save cell writes `df` out as the cleaned
# matches workbook.
singles_df = team_df.dropna(subset=needed_cols).copy()

# Keep only matches that still have both a winning and a losing team row.
valid = singles_df.groupby("match_id")["won"].apply(lambda s: set(s) == {0,1})
singles_df = singles_df[singles_df["match_id"].isin(valid[valid].index)].copy()

# One row per match, with winner and loser values side by side.
wide = singles_df.pivot(index="match_id", columns="won",
                        values=["rank_single_team","rank_single_best","single_specialist"])

# Flatten the (variable, won) column pairs into "<variable>_winner" / "<variable>_loser".
wide.columns = [f"{v}_{'winner' if w==1 else 'loser'}" for v,w in wide.columns]
wide = wide.reset_index()


In [264]:
# Ranking gaps are loser minus winner, so a positive value means the winning
# team had the better (numerically lower) singles ranking.
advantage_specs = [
    # (gap column, ranking metric, indicator column)
    ("rank_advantage", "rank_single_team", "winner_better_rank"),
    ("best_player_advantage", "rank_single_best", "winner_better_best_player"),
]

for gap_col, metric, flag_col in advantage_specs:
    wide[gap_col] = wide[f"{metric}_loser"].sub(wide[f"{metric}_winner"])

# Easy-to-read indicators: True when the winner had the strictly better ranking.
for gap_col, metric, flag_col in advantage_specs:
    wide[flag_col] = wide[gap_col].gt(0)


In [265]:
# Matches in which the two teams differ on the specialist flag.
specialist_differs = wide["single_specialist_winner"].ne(wide["single_specialist_loser"])
spec_matches = wide.loc[specialist_differs].copy()

# True when the specialist team is the one that won.
spec_matches["specialist_wins"] = spec_matches["single_specialist_winner"].eq(1)


In [266]:
# Share of matches won by the team with the better singles ranking,
# measured on the team average and on the best partner.
rank_row_labels = [
    "Team with better average singles ranking wins",
    "Team with best singles player wins",
]
rank_shares = [
    wide[flag_col].mean()
    for flag_col in ("winner_better_rank", "winner_better_best_player")
]

rank_table = pd.DataFrame({"Share of matches won": rank_shares}, index=rank_row_labels)


In [267]:
# Share of the specialist-vs-non-specialist matches won by the specialist team.
specialist_share = spec_matches["specialist_wins"].mean()

spec_table = pd.DataFrame(
    {"Share of matches won": [specialist_share]},
    index=["Specialist team wins vs non-specialist"],
)

In [268]:
# Stack the ranking rows on top of the specialist row and display the result.
summary_parts = [rank_table, spec_table]
final_table = pd.concat(summary_parts, axis=0)
final_table


Unnamed: 0,Share of matches won
Team with better average singles ranking wins,0.516927
Team with best singles player wins,0.532552
Specialist team wins vs non-specialist,0.44


# language

In [269]:
needed_cols = ["match_id", "won", "same_language", "linguistic_proximity"]

df_lang = team_df.dropna(subset=needed_cols).copy()


def _has_winner_and_loser(outcomes):
    """True when a match's `won` values are exactly {0, 1}."""
    return set(outcomes) == {0, 1}


# Keep only matches that still have both a winning and a losing team row.
valid = df_lang.groupby("match_id")["won"].apply(_has_winner_and_loser)
complete_match_ids = valid[valid].index
df_lang = df_lang.loc[df_lang["match_id"].isin(complete_match_ids)].copy()


In [282]:
# One row per match, with the winner's and loser's language flags side by side.
language_measures = ["same_language", "linguistic_proximity"]
wide_lang = df_lang.pivot(index="match_id", columns="won", values=language_measures)

# Flatten the (measure, won) column pairs: won == 1 -> "_winner", anything else -> "_loser".
outcome_suffix = {1: "winner"}
wide_lang.columns = [
    f"{measure}_{outcome_suffix.get(won_value, 'loser')}"
    for measure, won_value in wide_lang.columns
]
wide_lang = wide_lang.reset_index()


In [283]:
# Matches in which the two teams differ on the same-language flag.
same_language_differs = wide_lang["same_language_winner"].ne(wide_lang["same_language_loser"])
sl_matches = wide_lang.loc[same_language_differs].copy()

# True when the same-language team is the one that won.
sl_matches["same_language_wins"] = sl_matches["same_language_winner"].eq(1)


In [284]:
# Matches in which the two teams differ on the linguistic-proximity flag.
proximity_differs = wide_lang["linguistic_proximity_winner"].ne(
    wide_lang["linguistic_proximity_loser"]
)
lp_matches = wide_lang.loc[proximity_differs].copy()

# True when the linguistically proximate team is the one that won.
lp_matches["linguistic_proximity_wins"] = lp_matches["linguistic_proximity_winner"].eq(1)


In [285]:
# Win shares of the team that has the language trait, among matches where
# exactly one of the two teams has it.
language_row_labels = [
    "Same official language team wins vs different language",
    "Linguistically proximate team wins vs non-proximate",
]
language_shares = [
    sl_matches["same_language_wins"].mean(),
    lp_matches["linguistic_proximity_wins"].mean(),
]

language_table = pd.DataFrame(
    {"Share of matches won": language_shares},
    index=language_row_labels,
)

language_table


Unnamed: 0,Share of matches won
Same official language team wins vs different language,0.646536
Linguistically proximate team wins vs non-proximate,0.643739


# Logistic regressions

In [286]:
# Variables required by the models, grouped by role.
team_performance_vars = ['rank_team', 'wl_career_team', 'wl_ytd_team', 'titles_team', 'experience_team']
singles_ability_vars = ['rank_single_team', 'rank_single_best', 'single_specialist']
within_team_gap_vars = ['height_diff', 'weight_diff', 'experience_diff', 'wl_career_diff', 'wl_ytd_diff']
homophily_vars = ['same_country', 'same_hand', 'same_coach', 'same_language', 'linguistic_proximity']

cols_needed = (
    ['won']
    + team_performance_vars
    + singles_ability_vars
    + within_team_gap_vars
    + homophily_vars
)

# Regression sample: team rows with no missing value in any required column.
# dropna returns a new frame, so team_df itself is left untouched.
dfc = team_df.dropna(subset=cols_needed)


In [287]:
# Size of the regression sample after dropping incomplete rows.
dfc.shape

(388, 47)

In [288]:

# Continuous regressors to standardize (zero mean, unit variance) within the
# regression sample: team performance, singles ability, within-team gaps.
continuous_vars = (
    ['rank_team', 'wl_career_team', 'wl_ytd_team', 'titles_team']
    + ['rank_single_team', 'rank_single_best']
    + ['height_diff', 'weight_diff', 'experience_diff', 'wl_career_diff', 'wl_ytd_diff']
)

scaler = StandardScaler()
scaled_values = scaler.fit_transform(dfc[continuous_vars])
dfc[continuous_vars] = scaled_values


In [305]:
# Model formula for the regularized logit: team ability, singles ability,
# physical gaps, two homophily flags, and surface as a categorical term.
formula = """
won ~
    rank_team + wl_career_team +

    rank_single_team + single_specialist +

    height_diff + weight_diff +

    same_backhand +
    same_language +

    C(surface)
"""


In [306]:

# L1-regularized logit on the standardized regression sample.
# alpha is the penalty weight; 0.1, 0.5 or 2.0 can be tried as robustness checks.
# NOTE(review): a scalar alpha presumably applies to every parameter, the
# intercept included - confirm against the statsmodels documentation.
l1_penalty_weight = 1.0

logit_l1 = smf.logit(formula, data=dfc)
model_reg = logit_l1.fit_regularized(method='l1', alpha=l1_penalty_weight)

print(model_reg.summary())


Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.6469148174241566
            Iterations: 63
            Function evaluations: 63
            Gradient evaluations: 63
                           Logit Regression Results                           
Dep. Variable:                    won   No. Observations:                  388
Model:                          Logit   Df Residuals:                      377
Method:                           MLE   Df Model:                           10
Date:                Tue, 17 Feb 2026   Pseudo R-squ.:                 0.05235
Time:                        16:25:19   Log-Likelihood:                -247.76
converged:                       True   LL-Null:                       -261.45
Covariance Type:            nonrobust   LLR p-value:                  0.002271
                          coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------

## Logit with standard errors clustered by match

In [310]:
# 1. Assemble the modelling frame: outcome, cluster id, regressors, controls.
vars_for_model = (
    ['won', 'match_id']
    + ['rank_diff', 'height_diff', 'weight_diff', 'wl_ytd_diff', 'wl_career_diff']
    + ['same_country', 'same_hand', 'same_backhand']
    + ['surface', 'stage', 'same_language', 'linguistic_proximity']
)

# Infinite values are treated as missing, then incomplete rows are dropped.
model_frame = dfc[vars_for_model]
finite_only = model_frame.replace([np.inf, -np.inf], np.nan)
dfc_clean = finite_only.dropna().copy()

print(dfc_clean.shape)        # quick check
print(dfc_clean.isna().sum()) # should all be 0


(388, 14)
won                     0
match_id                0
rank_diff               0
height_diff             0
weight_diff             0
wl_ytd_diff             0
wl_career_diff          0
same_country            0
same_hand               0
same_backhand           0
surface                 0
stage                   0
same_language           0
linguistic_proximity    0
dtype: int64


In [311]:
# Stage codes grouped into coarse tournament phases, checked in this order.
_STAGE_BUCKETS = (
    ((0, 1, 2, 3), 'early'),
    ((4,), 'middle'),
    ((5, 6), 'late'),
    ((7,), 'final'),
)


def map_stage(s):
    """Collapse a numeric stage code into 'early', 'middle', 'late' or 'final'.

    Codes 0-3 are 'early', 4 is 'middle', 5-6 are 'late' and 7 is 'final'.
    Any other value returns None.
    """
    for codes, phase in _STAGE_BUCKETS:
        if s in codes:
            return phase
    return None

# Collapse the numeric stage code of every row into its coarse phase label.
dfc_clean['stage_simple'] = dfc_clean['stage'].apply(map_stage)


In [277]:
# Number of observations in each collapsed stage bucket.
dfc_clean['stage_simple'].value_counts()


early     273
middle     61
late       45
final       9
Name: stage_simple, dtype: int64

In [312]:
# Model formula for the clustered logit: within-team gaps, shared-trait flags,
# and surface / collapsed stage as categorical controls.
formula = """
won ~
    rank_diff + wl_career_diff +

    height_diff + weight_diff +
    same_hand + same_backhand +

    same_language +

    C(surface) + C(stage_simple)
"""


In [313]:
# 3. Specify the logit model on the cleaned modelling frame.
logit_mod = smf.logit(formula, data=dfc_clean)

# 4. Fit with standard errors clustered by match: the two team rows of a match
#    share the same match_id and are therefore placed in the same cluster.
cluster_ids = dfc_clean['match_id']
res_cl = logit_mod.fit(
    cov_type='cluster',
    cov_kwds={'groups': cluster_ids},
    maxiter=200,             # generous iteration cap, to be safe
)

print(res_cl.summary())

Optimization terminated successfully.
         Current function value: 0.643072
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                    won   No. Observations:                  388
Model:                          Logit   Df Residuals:                      375
Method:                           MLE   Df Model:                           12
Date:                Tue, 17 Feb 2026   Pseudo R-squ.:                 0.04566
Time:                        16:29:30   Log-Likelihood:                -249.51
converged:                       True   LL-Null:                       -261.45
Covariance Type:              cluster   LLR p-value:                   0.02115
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept                     1.3995      0.404      3.461      0.001       0.

In [280]:
# -----------------------------
# Save the team-level dataset
# -----------------------------
print(f"Saving to {OUTPUT_FILE} …")
with pd.ExcelWriter(OUTPUT_FILE, engine="xlsxwriter") as writer:
    team_df.to_excel(writer, sheet_name="players_list", index=False)

print("Done.")


Saving to C:/Users/ALESSANDRO/Documents/GitHub/tennis-homophily/data/atp/win_lose_df.xlsx …
Done.


In [281]:
# -----------------------------
# Save the cleaned dataset
# -----------------------------
# NOTE(review): this writes whatever `df` is bound to when the cell runs.
# Confirm that `df` still holds the match-level frame at this point; any
# earlier cell that rebinds `df` changes what ends up in this workbook.
print(f"Saving to {CLEANED_FILE} …")
with pd.ExcelWriter(CLEANED_FILE, engine="xlsxwriter") as writer:
    df.to_excel(writer, sheet_name="players_list", index=False)

print("Done.")


Saving to C:/Users/ALESSANDRO/Documents/GitHub/tennis-homophily/data/atp/men_matches_with_ranks_cleaned.xlsx …
Done.
