In [81]:
import pandas as pd
from getpass import getuser
import os

In [82]:
# =============================
# CONFIG
# =============================
USER = getuser()
INPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/"
INPUT_FILE = os.path.join(INPUT_DIR, "cleaned_grand_slam_matches_2018_2023.xlsx")  # the dataset is find in logit.ipynb
OUTPUT_FILE = os.path.join(INPUT_DIR, "pairs_df.xlsx")

# =============================
# Load Dataset
# =============================
print("Loading Grand Slam matches…")
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"Grand Slam file not found: {INPUT_FILE}")

df = pd.read_excel(INPUT_FILE)
print(df.shape)
display(df.head())
df.columns.tolist()


Loading Grand Slam matches…
(1248, 179)


Unnamed: 0,match_id,tournament,location,date,year,tournament_code,stage,match_duration,winners_p1,winners_p2,...,experience_diff_losers_z,wl_ytd_diff_losers_z,wl_career_diff_losers_z,titles_career_diff_losers_z,homophily_index_winners,homophily_index_losers,homophily_diff,wl_career_diff,titles_career_diff,rank_diff
0,0,Australian Open,"Melbourne,Australia","15-28 Jan, 2018",2018,580,Finals,01:33:00,Oliver Marach(7),Mate Pavic(7),...,0.056412,0.0,-0.783193,-0.651529,-2.527917,5.187147,-7.715064,0.015528,26.0,-17.0
1,1,Australian Open,"Melbourne,Australia","15-28 Jan, 2018",2018,580,Semi-Finals,02:26:00,Oliver Marach(7),Mate Pavic(7),...,0.056412,0.0,-0.734542,-0.131266,-2.527917,0.725384,-3.253301,0.296899,57.0,-197.0
2,2,Australian Open,"Melbourne,Australia","15-28 Jan, 2018",2018,580,Semi-Finals,01:29:00,Juan Sebastian Cabal(11),Robert Farah(11),...,-1.217088,0.0,-0.852937,-0.235319,4.844931,6.798715,-1.953784,-0.289932,-204.0,
3,3,Australian Open,"Melbourne,Australia","15-28 Jan, 2018",2018,580,Quarter-Finals,01:06:00,Bob Bryan(6),Mike Bryan(6),...,0.056412,0.0,-0.55561,-0.651529,6.792128,3.108388,3.683739,0.437822,208.0,
4,4,Australian Open,"Melbourne,Australia","15-28 Jan, 2018",2018,580,Quarter-Finals,02:40:00,Oliver Marach(7),Mate Pavic(7),...,0.0,0.0,-0.528839,0.180891,-2.527917,-0.297577,-2.23034,0.182091,46.0,-59.0


['match_id',
 'tournament',
 'location',
 'date',
 'year',
 'tournament_code',
 'stage',
 'match_duration',
 'winners_p1',
 'winners_p2',
 'losers_p1',
 'losers_p2',
 'winners_set1',
 'winners_set2',
 'winners_set3',
 'winners_set1_tiebreak',
 'winners_set2_tiebreak',
 'winners_set3_tiebreak',
 'losers_set1',
 'losers_set2',
 'losers_set3',
 'losers_set1_tiebreak',
 'losers_set2_tiebreak',
 'losers_set3_tiebreak',
 'winners_set4',
 'winners_set5',
 'losers_set4',
 'losers_set5',
 'winners_set4_tiebreak',
 'losers_set4_tiebreak',
 'losers_set5_tiebreak',
 'tourn_key',
 'winners_p1_Rank',
 'winners_p1_Player',
 'winners_p1_Player Profile Link',
 'winners_p1_Tourns',
 'winners_p1_Tournament',
 'winners_p1_Year',
 'winners_p1_DateWeek',
 'winners_p1_W-L YTD',
 'winners_p1_W-L Career',
 'winners_p1_Titles YTD',
 'winners_p1_Titles Career',
 'winners_p1_DOB',
 'winners_p1_Turned pro',
 'winners_p1_Unnamed: 15',
 'winners_p1_Country',
 'winners_p1_Birthplace',
 'winners_p1_Plays',
 'winners_p

In [83]:
def ordered_pair_with_attrs(name1, name2, attrs1: dict, attrs2: dict):
    """
    Restituisce (player1_name, player2_name, player1_attrs, player2_attrs)
    dove player1/player2 sono ordinati alfabeticamente,
    e gli attributi sono riordinati di conseguenza.
    """
    if name1 <= name2:
        return name1, name2, attrs1, attrs2
    else:
        return name2, name1, attrs2, attrs1


In [84]:
pairs_rows = []

for _, row in df.iterrows():

    # =====================
    # COPPIA VINCENTE
    # =====================
    w1_name = row["winners_p1"]
    w2_name = row["winners_p2"]

    # Attributi di w1 nel match
    w1_attrs = {
        "p1_rank_raw": row["winners_p1_Rank"],
        "p1_country_raw": row["winners_p1_Country"],
        "p1_hand_raw": row["winners_p1_Hand"],
        "p1_backhand_raw": row["winners_p1_Backhand"],
        "p1_height_raw": row["winners_p1_HeightCm"],
        "p1_weight_raw": row["winners_p1_WeightKg"],
        "p1_experience_raw": row["winners_p1_experience"],
        "p1_wl_ytd_raw": row["winners_p1_W-L YTD_ratio"],
        "p1_wl_career_raw": row["winners_p1_W-L Career_ratio"],
        "p1_titles_career": row["winners_p1_Titles Career"],
        "p1_titles_ytd": row["winners_p1_Titles YTD"],
        "p1_tournaments_played": row["winners_p1_Tourns"],
        "p1_coach_raw": row["winners_p1_Coach"],
    }

    # Attributi di w2 nel match
    w2_attrs = {
        "p2_rank_raw": row["winners_p2_Rank"],
        "p2_country_raw": row["winners_p2_Country"],
        "p2_hand_raw": row["winners_p2_Hand"],
        "p2_backhand_raw": row["winners_p2_Backhand"],
        "p2_height_raw": row["winners_p2_HeightCm"],
        "p2_weight_raw": row["winners_p2_WeightKg"],
        "p2_experience_raw": row["winners_p2_experience"],
        "p2_wl_ytd_raw": row["winners_p2_W-L YTD_ratio"],
        "p2_wl_career_raw": row["winners_p2_W-L Career_ratio"],
        "p2_titles_career": row["winners_p2_Titles Career"],
        "p2_titles_ytd": row["winners_p2_Titles YTD"],
        "p2_tournaments_played": row["winners_p2_Tourns"],
        "p2_coach_raw": row["winners_p2_Coach"],
    }

    # Ordiniamo la coppia + attributi in base ai nomi
    p1_name, p2_name, p1_attrs, p2_attrs = ordered_pair_with_attrs(
        w1_name, w2_name, w1_attrs, w2_attrs
    )

    row_dict_win = {
        "player1": p1_name,
        "player2": p2_name,
        "year": row["year"],
        "tournament": row["tournament"],
        "surface": row["surface"],
        "stage": row["stage_code"],
        "match_id": row["match_id"],
        "result": "win",
    }

    # Aggiungi attributi di player1 e player2 (rinominati in modo pulito)
    for k, v in p1_attrs.items():
        row_dict_win[k.replace("_raw", "")] = v
    for k, v in p2_attrs.items():
        row_dict_win[k.replace("_raw", "")] = v

    pairs_rows.append(row_dict_win)

    # =====================
    # COPPIA PERDENTE
    # =====================
    l1_name = row["losers_p1"]
    l2_name = row["losers_p2"]

    l1_attrs = {
        "p1_rank_raw": row["losers_p1_Rank"],
        "p1_country_raw": row["losers_p1_Country"],
        "p1_hand_raw": row["losers_p1_Hand"],
        "p1_backhand_raw": row["losers_p1_Backhand"],
        "p1_height_raw": row["losers_p1_HeightCm"],
        "p1_weight_raw": row["losers_p1_WeightKg"],
        "p1_experience_raw": row["losers_p1_experience"],
        "p1_wl_ytd_raw": row["losers_p1_W-L YTD_ratio"],
        "p1_wl_career_raw": row["losers_p1_W-L Career_ratio"],
        "p1_titles_career": row["losers_p1_Titles Career"],
        "p1_titles_ytd": row["losers_p1_Titles YTD"],
        "p1_tournaments_played": row["losers_p1_Tourns"],
        "p1_coach_raw": row["losers_p1_Coach"],
    }

    l2_attrs = {
        "p2_rank_raw": row["losers_p2_Rank"],
        "p2_country_raw": row["losers_p2_Country"],
        "p2_hand_raw": row["losers_p2_Hand"],
        "p2_backhand_raw": row["losers_p2_Backhand"],
        "p2_height_raw": row["losers_p2_HeightCm"],
        "p2_weight_raw": row["losers_p2_WeightKg"],
        "p2_experience_raw": row["losers_p2_experience"],
        "p2_wl_ytd_raw": row["losers_p2_W-L YTD_ratio"],
        "p2_wl_career_raw": row["losers_p2_W-L Career_ratio"],
        "p1_titles_career": row["losers_p2_Titles Career"],
        "p1_titles_ytd": row["losers_p2_Titles YTD"],
        "p2_tournaments_played": row["losers_p2_Tourns"],
        "p2_coach_raw": row["losers_p2_Coach"],
    }

    p1_name, p2_name, p1_attrs, p2_attrs = ordered_pair_with_attrs(
        l1_name, l2_name, l1_attrs, l2_attrs
    )

    row_dict_loss = {
        "player1": p1_name,
        "player2": p2_name,
        "year": row["year"],
        "tournament": row["tournament"],
        "surface": row["surface"],
        "stage": row["stage_code"],
        "match_id": row["match_id"],
        "result": "loss",
    }

    for k, v in p1_attrs.items():
        row_dict_loss[k.replace("_raw", "")] = v
    for k, v in p2_attrs.items():
        row_dict_loss[k.replace("_raw", "")] = v

    pairs_rows.append(row_dict_loss)

pairs_df = pd.DataFrame(pairs_rows)


# fill missing values for players appearing multiple times in the dataset

In [85]:
# Colonne che vogliamo stabilizzare per giocatore
attr_cols = ["country", "hand", "backhand", "height", "weight", "experience"]

# 1. Prendiamo i giocatori quando sono player1
p1_long = pairs_df[["player1"] + [f"p1_{c}" for c in attr_cols]].copy()
p1_long = p1_long.rename(columns={"player1": "player"})
p1_long = p1_long.rename(columns={f"p1_{c}": c for c in attr_cols})

# 2. Prendiamo i giocatori quando sono player2
p2_long = pairs_df[["player2"] + [f"p2_{c}" for c in attr_cols]].copy()
p2_long = p2_long.rename(columns={"player2": "player"})
p2_long = p2_long.rename(columns={f"p2_{c}": c for c in attr_cols})

# 3. Mettiamo insieme tutte le osservazioni per giocatore
players_long = pd.concat([p1_long, p2_long], ignore_index=True)


In [86]:
def pick_mode(series):
    s = series.dropna()
    if s.empty:
        return None
    # modalità; se ce ne sono più di una, prendi la prima
    return s.mode().iloc[0]

players_ref = (
    players_long
    .groupby("player")
    .agg({c: pick_mode for c in attr_cols})
    .reset_index()
)

players_ref.head()


Unnamed: 0,player,country,hand,backhand,height,weight,experience
0,Adam Pavlasek,Uruguay,Right-Handed,Two-Handed Backhand,180.0,80.0,17.0
1,Adhithya Ganesan,,,,,,
2,Adil Shamasdin,Canada,Right-Handed,Two-Handed Backhand,180.0,73.0,10.0
3,Adrian Mannarino,France,Left-Handed,Two-Handed Backhand,180.0,79.0,18.0
4,Aidan McHugh,,,,,,


In [87]:
# Creiamo dizionari player -> attributo
maps = {
    c: players_ref.set_index("player")[c].to_dict()
    for c in attr_cols
}

# Riempire per player1
for c in attr_cols:
    col = f"p1_{c}"
    pairs_df[col] = pairs_df[col].fillna(pairs_df["player1"].map(maps[c]))

# Riempire per player2
for c in attr_cols:
    col = f"p2_{c}"
    pairs_df[col] = pairs_df[col].fillna(pairs_df["player2"].map(maps[c]))


In [88]:
pairs_df["same_country"]  = (pairs_df["p1_country"] == pairs_df["p2_country"]).astype(int)
pairs_df["same_hand"]     = (pairs_df["p1_hand"] == pairs_df["p2_hand"]).astype(int)
pairs_df["same_backhand"] = (pairs_df["p1_backhand"] == pairs_df["p2_backhand"]).astype(int)
pairs_df["same_coach"]    = (pairs_df["p1_coach"] == pairs_df["p2_coach"]).astype(int)

pairs_df["height_diff"]      = (pairs_df["p1_height"] - pairs_df["p2_height"]).abs()
pairs_df["weight_diff"]      = (pairs_df["p1_weight"] - pairs_df["p2_weight"]).abs()
pairs_df["rank_diff_pair"]   = (pairs_df["p1_rank"]   - pairs_df["p2_rank"]).abs()
pairs_df["experience_diff"]  = (pairs_df["p1_experience"] - pairs_df["p2_experience"]).abs()
pairs_df["wl_ytd_diff"]      = (pairs_df["p1_wl_ytd"] - pairs_df["p2_wl_ytd"]).abs()
pairs_df["wl_career_diff"]  = (pairs_df["p1_wl_career"] - pairs_df["p2_wl_career"]).abs()



In [89]:
# 1. Calcola quanti valori distinti ha ogni dummy per ogni coppia
consistency = (
    pairs_df
    .groupby(["player1", "player2"])[["same_country", "same_hand", "same_backhand"]]
    .nunique()
    .reset_index()
)

# 2. Pairs with at least one inconsistency
inconsistent_pairs = consistency[
    (consistency["same_country"] > 1) |
    (consistency["same_hand"] > 1) |
    (consistency["same_backhand"] > 1)
]

print("Inconsistent pairs:")
print(inconsistent_pairs)


Inconsistent pairs:
                  player1                  player2  same_country  same_hand  \
12       Adrian Mannarino            Quentin Halys             1          2   
35   Albert Ramos-Vinolas  Bernabe Zapata Miralles             2          2   
92         Alexei Popyrin             Lloyd Harris             2          1   
237           Ben Shelton      Christopher Eubanks             1          2   
281        Cameron Norrie             Daniel Evans             1          1   
369     Diego Schwartzman           Federico Coria             1          2   
397         Dusan Lajovic         Filip Krajinovic             1          1   
468       Feliciano Lopez               Marc Lopez             2          1   
469       Feliciano Lopez            Maxime Cressy             2          2   

     same_backhand  
12               1  
35               1  
92               1  
237              2  
281              2  
369              1  
397              2  
468              1  


In [90]:
# 1. Seleziona solo coppie senza inconsistenze
ok_pairs = consistency[
    (consistency["same_country"] <= 1) &
    (consistency["same_hand"] <= 1) &
    (consistency["same_backhand"] <= 1)
][["player1", "player2"]]

pairs_clean = pairs_df.merge(ok_pairs, on=["player1", "player2"], how="inner")


In [91]:
pair_summary = (
    pairs_clean
    .groupby(["player1", "player2"])
    .agg(
        # storia della coppia
        n_matches=("match_id", "nunique"),
        n_seasons=("year", "nunique"),
        n_tournaments=("tournament", "nunique"),

        # risultati
        wins=("result", lambda x: (x == "win").sum()),
        losses=("result", lambda x: (x == "loss").sum()),
        win_rate=("result", lambda x: (x == "win").mean()),

        # omofilia (valore unico per coppia)
        same_country=("same_country", "first"),
        same_hand=("same_hand", "first"),
        same_backhand=("same_backhand", "first"),

        # differenze medie
        height_diff_mean=("height_diff", "mean"),
        weight_diff_mean=("weight_diff", "mean"),
        rank_diff_pair_mean=("rank_diff_pair", "mean"),
        experience_diff_mean=("experience_diff", "mean"),
        wl_ytd_diff_pair_mean=("wl_ytd_diff", "mean"),
        wl_career_diff_pair_mean=("wl_career_diff", "mean"),
    )
    .reset_index()
)


In [92]:
# =============================
# Save
# =============================
print(f"Saving to {OUTPUT_FILE} …")
with pd.ExcelWriter(OUTPUT_FILE, engine="xlsxwriter") as xlw:
    pair_summary.to_excel(xlw, index=False, sheet_name="players_list")

print("Done.")


Saving to C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/pairs_df.xlsx …
Done.
