In [22]:
import pandas as pd
from getpass import getuser
import os

In [23]:
# =============================
# CONFIG
# =============================
USER = getuser()
INPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/"
INPUT_FILE = os.path.join(INPUT_DIR, "men_matches_with_ranks_cleaned.xlsx") 
OUTPUT_FILE = os.path.join(INPUT_DIR, "pairs_df.xlsx")

# =============================
# Load Dataset
# =============================
print("Loading Grand Slam matches…")
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"Grand Slam file not found: {INPUT_FILE}")

df = pd.read_excel(INPUT_FILE)
print(df.shape)
display(df.head())
df.columns.tolist()


Loading Grand Slam matches…
(1248, 203)


Unnamed: 0,match_id,tournament,location,date,year,tournament_code,stage,match_duration,winners_set1,winners_set2,...,titles_career_diff_winners_z,height_diff_losers_z,weight_diff_losers_z,experience_diff_losers_z,rank_diff_losers_z,wl_ytd_diff_losers_z,wl_career_diff_losers_z,titles_career_diff_losers_z,homophily_index_winners,homophily_index_losers
0,0,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Finals,01:33:00,6,6.0,...,0.710707,0.209107,-0.018066,0.067036,-0.547214,0.0,-0.783465,-0.619038,-2.031063,5.69164
1,1,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Semi-Finals,02:26:00,4,7.0,...,0.710707,0.209107,0.867153,0.067036,0.00931,0.0,-0.758994,-0.0987,-2.031063,0.705088
2,2,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Semi-Finals,01:29:00,7,7.0,...,-0.584173,-0.941408,-0.549197,-1.198336,-0.57504,0.0,-0.846997,-0.202768,5.242303,7.313747
3,3,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Quarter-Finals,01:06:00,6,6.0,...,-0.296422,-0.941408,-0.018066,0.067036,-0.470692,0.0,-0.576152,-0.619038,7.169658,3.558321
4,4,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Quarter-Finals,02:40:00,6,6.0,...,0.710707,0.592613,1.044197,0.0,-0.463736,0.0,-0.551766,0.213502,-2.031063,0.16519


['match_id',
 'tournament',
 'location',
 'date',
 'year',
 'tournament_code',
 'stage',
 'match_duration',
 'winners_set1',
 'winners_set2',
 'winners_set3',
 'winners_set1_tiebreak',
 'winners_set2_tiebreak',
 'winners_set3_tiebreak',
 'losers_set1',
 'losers_set2',
 'losers_set3',
 'losers_set1_tiebreak',
 'losers_set2_tiebreak',
 'losers_set3_tiebreak',
 'winners_set4',
 'winners_set5',
 'losers_set4',
 'losers_set5',
 'winners_set4_tiebreak',
 'losers_set4_tiebreak',
 'losers_set5_tiebreak',
 'winners_p1_name',
 'winners_p1_surname',
 'winners_p1_ranking',
 'winners_p1_status',
 'winners_p2_name',
 'winners_p2_surname',
 'winners_p2_ranking',
 'winners_p2_status',
 'losers_p1_name',
 'losers_p1_surname',
 'losers_p1_ranking',
 'losers_p1_status',
 'losers_p2_name',
 'losers_p2_surname',
 'losers_p2_ranking',
 'losers_p2_status',
 'tourn_key',
 'winners_p1_initial',
 'winners_p2_initial',
 'losers_p1_initial',
 'losers_p2_initial',
 'winners_p1_rank',
 'winners_p1_tourns',
 'winner

In [24]:
def ordered_pair_with_attrs(name1, name2, attrs1: dict, attrs2: dict):
    """
    Restituisce (player1_name, player2_name, player1_attrs, player2_attrs)
    dove player1/player2 sono ordinati alfabeticamente,
    e gli attributi sono riordinati di conseguenza.
    """
    if name1 <= name2:
        return name1, name2, attrs1, attrs2
    else:
        return name2, name1, attrs2, attrs1


In [25]:
pairs_rows = []

def make_player_label(prefix, row):
    """
    Build a player label using the columns you have in df:
    e.g., 'winners_p1_name' + 'winners_p1_surname' -> 'name surname' (lowercase/clean if desired).
    """
    n = row.get(f"{prefix}_name")
    s = row.get(f"{prefix}_surname")

    if pd.isna(n) and pd.isna(s):
        return None
    n = "" if pd.isna(n) else str(n).strip()
    s = "" if pd.isna(s) else str(s).strip()
    return (n + " " + s).strip()


for _, row in df.iterrows():

    # =====================
    # WINNING PAIR
    # =====================
    w1_name = make_player_label("winners_p1", row)
    w2_name = make_player_label("winners_p2", row)

    # attributes of w1 in the match
    w1_attrs = {
        "p1_rank_raw": row["winners_p1_rank"],
        "p1_country_raw": row["winners_p1_country"],
        "p1_hand_raw": row["winners_p1_hand"],
        "p1_backhand_raw": row["winners_p1_backhand"],
        "p1_height_raw": row["winners_p1_height-cm"],
        "p1_weight_raw": row["winners_p1_weight-kg"],
        "p1_experience_raw": row["winners_p1_experience"],
        "p1_wl_ytd_raw": row["winners_p1_win-ratio-ytd"],
        "p1_wl_career_raw": row["winners_p1_win-ratio-career"],
        "p1_titles_career": row["winners_p1_titles-career"],
        "p1_titles_ytd": row["winners_p1_titles-ytd"],
        "p1_tournaments_played": row["winners_p1_tourns"],
        "p1_coach_raw": row["winners_p1_coach"],
    }

    # attributes of w2 in the match
    w2_attrs = {
        "p2_rank_raw": row["winners_p2_rank"],
        "p2_country_raw": row["winners_p2_country"],
        "p2_hand_raw": row["winners_p2_hand"],
        "p2_backhand_raw": row["winners_p2_backhand"],
        "p2_height_raw": row["winners_p2_height-cm"],
        "p2_weight_raw": row["winners_p2_weight-kg"],
        "p2_experience_raw": row["winners_p2_experience"],
        "p2_wl_ytd_raw": row["winners_p2_win-ratio-ytd"],
        "p2_wl_career_raw": row["winners_p2_win-ratio-career"],
        "p2_titles_career": row["winners_p2_titles-career"],
        "p2_titles_ytd": row["winners_p2_titles-ytd"],
        "p2_tournaments_played": row["winners_p2_tourns"],
        "p2_coach_raw": row["winners_p2_coach"],
    }

    # order the pair + attributes by name (stable ordering)
    p1_name, p2_name, p1_attrs, p2_attrs = ordered_pair_with_attrs(
        w1_name, w2_name, w1_attrs, w2_attrs
    )

    row_dict_win = {
        "player1": p1_name,
        "player2": p2_name,
        "year": row["year"],
        "tournament": row["tournament"],
        "surface": row["surface"],
        "stage": row["stage_code"],
        "match_id": row["match_id"],
        "result": "win",
    }

    # add attributes (remove "_raw" suffix in keys)
    for k, v in p1_attrs.items():
        row_dict_win[k.replace("_raw", "")] = v
    for k, v in p2_attrs.items():
        row_dict_win[k.replace("_raw", "")] = v

    pairs_rows.append(row_dict_win)

    # =====================
    # LOSING PAIR
    # =====================
    l1_name = make_player_label("losers_p1", row)
    l2_name = make_player_label("losers_p2", row)

    l1_attrs = {
        "p1_rank_raw": row["losers_p1_rank"],
        "p1_country_raw": row["losers_p1_country"],
        "p1_hand_raw": row["losers_p1_hand"],
        "p1_backhand_raw": row["losers_p1_backhand"],
        "p1_height_raw": row["losers_p1_height-cm"],
        "p1_weight_raw": row["losers_p1_weight-kg"],
        "p1_experience_raw": row["losers_p1_experience"],
        "p1_wl_ytd_raw": row["losers_p1_win-ratio-ytd"],
        "p1_wl_career_raw": row["losers_p1_win-ratio-career"],
        "p1_titles_career": row["losers_p1_titles-career"],
        "p1_titles_ytd": row["losers_p1_titles-ytd"],
        "p1_tournaments_played": row["losers_p1_tourns"],
        "p1_coach_raw": row["losers_p1_coach"],
    }

    l2_attrs = {
        "p2_rank_raw": row["losers_p2_rank"],
        "p2_country_raw": row["losers_p2_country"],
        "p2_hand_raw": row["losers_p2_hand"],
        "p2_backhand_raw": row["losers_p2_backhand"],
        "p2_height_raw": row["losers_p2_height-cm"],
        "p2_weight_raw": row["losers_p2_weight-kg"],
        "p2_experience_raw": row["losers_p2_experience"],
        "p2_wl_ytd_raw": row["losers_p2_win-ratio-ytd"],
        "p2_wl_career_raw": row["losers_p2_win-ratio-career"],
        "p2_titles_career": row["losers_p2_titles-career"],
        "p2_titles_ytd": row["losers_p2_titles-ytd"],
        "p2_tournaments_played": row["losers_p2_tourns"],
        "p2_coach_raw": row["losers_p2_coach"],
    }

    p1_name, p2_name, p1_attrs, p2_attrs = ordered_pair_with_attrs(
        l1_name, l2_name, l1_attrs, l2_attrs
    )

    row_dict_loss = {
        "player1": p1_name,
        "player2": p2_name,
        "year": row["year"],
        "tournament": row["tournament"],
        "surface": row["surface"],
        "stage": row["stage_code"],
        "match_id": row["match_id"],
        "result": "loss",
    }

    for k, v in p1_attrs.items():
        row_dict_loss[k.replace("_raw", "")] = v
    for k, v in p2_attrs.items():
        row_dict_loss[k.replace("_raw", "")] = v

    pairs_rows.append(row_dict_loss)

pairs_df = pd.DataFrame(pairs_rows)


# fill missing values for players appearing multiple times in the dataset

In [26]:
# Colonne che vogliamo stabilizzare per giocatore
attr_cols = ["country", "hand", "backhand", "height", "weight", "experience"]

# 1. Prendiamo i giocatori quando sono player1
p1_long = pairs_df[["player1"] + [f"p1_{c}" for c in attr_cols]].copy()
p1_long = p1_long.rename(columns={"player1": "player"})
p1_long = p1_long.rename(columns={f"p1_{c}": c for c in attr_cols})

# 2. Prendiamo i giocatori quando sono player2
p2_long = pairs_df[["player2"] + [f"p2_{c}" for c in attr_cols]].copy()
p2_long = p2_long.rename(columns={"player2": "player"})
p2_long = p2_long.rename(columns={f"p2_{c}": c for c in attr_cols})

# 3. Mettiamo insieme tutte le osservazioni per giocatore
players_long = pd.concat([p1_long, p2_long], ignore_index=True)


In [27]:
def pick_mode(series):
    s = series.dropna()
    if s.empty:
        return None
    # modalità; se ce ne sono più di una, prendi la prima
    return s.mode().iloc[0]

players_ref = (
    players_long
    .groupby("player")
    .agg({c: pick_mode for c in attr_cols})
    .reset_index()
)

players_ref.head()


Unnamed: 0,player,country,hand,backhand,height,weight,experience
0,adam pavlasek,Uruguay,Right-Handed,Two-Handed Backhand,180.0,80.0,17.0
1,adhithya ganesan,,,,,,
2,adil shamasdin,Canada,Right-Handed,Two-Handed Backhand,180.0,73.0,10.0
3,adrian mannarino,France,Left-Handed,Two-Handed Backhand,180.0,79.0,18.0
4,aidan mchugh,Great Britain,Right-Handed,Unknown Backhand,185.0,74.0,


In [28]:
# Creiamo dizionari player -> attributo
maps = {
    c: players_ref.set_index("player")[c].to_dict()
    for c in attr_cols
}

# Riempire per player1
for c in attr_cols:
    col = f"p1_{c}"
    pairs_df[col] = pairs_df[col].fillna(pairs_df["player1"].map(maps[c]))

# Riempire per player2
for c in attr_cols:
    col = f"p2_{c}"
    pairs_df[col] = pairs_df[col].fillna(pairs_df["player2"].map(maps[c]))


In [29]:
pairs_df["same_country"] = (pairs_df["p1_country"] == pairs_df["p2_country"]).astype(int)
pairs_df["same_hand"] = (pairs_df["p1_hand"] == pairs_df["p2_hand"]).astype(int)
pairs_df["same_backhand"] = (pairs_df["p1_backhand"] == pairs_df["p2_backhand"]).astype(int)
pairs_df["same_coach"] = (pairs_df["p1_coach"] == pairs_df["p2_coach"]).astype(int)

pairs_df["height_diff"] = (pairs_df["p1_height"] - pairs_df["p2_height"]).abs()
pairs_df["weight_diff"] = (pairs_df["p1_weight"] - pairs_df["p2_weight"]).abs()
pairs_df["rank_diff_pair"] = (pairs_df["p1_rank"]   - pairs_df["p2_rank"]).abs()
pairs_df["experience_diff"]= (pairs_df["p1_experience"] - pairs_df["p2_experience"]).abs()
pairs_df["wl_ytd_diff"] = (pairs_df["p1_wl_ytd"] - pairs_df["p2_wl_ytd"]).abs()
pairs_df["wl_career_diff"]  = (pairs_df["p1_wl_career"] - pairs_df["p2_wl_career"]).abs()



In [30]:
# 1. Calcola quanti valori distinti ha ogni dummy per ogni coppia
consistency = (
    pairs_df
    .groupby(["player1", "player2"])[["same_country", "same_hand", "same_backhand"]]
    .nunique()
    .reset_index()
)

# 2. Pairs with at least one inconsistency
inconsistent_pairs = consistency[
    (consistency["same_country"] > 1) |
    (consistency["same_hand"] > 1) |
    (consistency["same_backhand"] > 1)
]

print("Inconsistent pairs:")
print(inconsistent_pairs)


Inconsistent pairs:
                  player1           player2  same_country  same_hand  \
31   albert ramos-vinolas      benoit paire             1          2   
32   albert ramos-vinolas  bernabe miralles             2          2   
85         alexei popyrin      lloyd harris             2          1   
184       austin krajicek        ivan dodig             1          2   
221          bruno soares        mate pavic             2          1   
301     diego schwartzman    federico coria             1          2   
325         dusan lajovic  filip krajinovic             1          1   
372       feliciano lopez        marc lopez             2          1   
420           guido pella      hugo dellien             2          2   

     same_backhand  
31               1  
32               1  
85               1  
184              1  
221              1  
301              1  
325              2  
372              1  
420              1  


In [31]:
# 1. Seleziona solo coppie senza inconsistenze
ok_pairs = consistency[
    (consistency["same_country"] <= 1) &
    (consistency["same_hand"] <= 1) &
    (consistency["same_backhand"] <= 1)
][["player1", "player2"]]

pairs_clean = pairs_df.merge(ok_pairs, on=["player1", "player2"], how="inner")


In [32]:
# 1) Create the pair-level homophily index in pairs_clean
pairs_clean["homophily_index_pair"] = (
    pairs_clean["same_country"].fillna(0).astype(int)
    + pairs_clean["same_hand"].fillna(0).astype(int)
    + pairs_clean["same_backhand"].fillna(0).astype(int)
)

# (optional) normalize to 0–1
pairs_clean["homophily_index_pair_norm"] = pairs_clean["homophily_index_pair"] / 3


In [33]:
pair_summary = (
    pairs_clean
    .groupby(["player1", "player2"])
    .agg(
        n_matches=("match_id", "nunique"),
        n_seasons=("year", "nunique"),
        n_tournaments=("tournament", "nunique"),

        wins=("result", lambda x: (x == "win").sum()),
        losses=("result", lambda x: (x == "loss").sum()),
        win_rate=("result", lambda x: (x == "win").mean()),

        same_country=("same_country", "first"),
        same_hand=("same_hand", "first"),
        same_backhand=("same_backhand", "first"),

        homophily_index_pair=("homophily_index_pair", "first"),
        homophily_index_pair_norm=("homophily_index_pair_norm", "first"),

        height_diff_mean=("height_diff", "mean"),
        weight_diff_mean=("weight_diff", "mean"),
        rank_diff_pair_mean=("rank_diff_pair", "mean"),
        experience_diff_mean=("experience_diff", "mean"),
        wl_ytd_diff_pair_mean=("wl_ytd_diff", "mean"),
        wl_career_diff_pair_mean=("wl_career_diff", "mean"),
    )
    .reset_index()
)


In [34]:
# =============================
# Save
# =============================
print(f"Saving to {OUTPUT_FILE} …")
with pd.ExcelWriter(OUTPUT_FILE, engine="xlsxwriter") as xlw:
    pair_summary.to_excel(xlw, index=False, sheet_name="players_list")

print("Done.")


Saving to C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/pairs_df.xlsx …
Done.
