In [1]:
import pandas as pd
import os
import numpy as np
from getpass import getuser

In [44]:
# =============================
# CONFIG
# =============================
USER = getuser()
INPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/"
INPUT_FILE = os.path.join(INPUT_DIR, "men_matches_with_ranks.xlsx")
OUTPUT_FILE = os.path.join(INPUT_DIR, "win_lose_df.xlsx")
CLEANED_FILE = os.path.join(INPUT_DIR, "men_matches_with_ranks_cleaned.xlsx")
gravity_lang_lookup_dir = os.path.join(INPUT_DIR, "gravity_lang_lookup.xlsx")


In [5]:

# =============================
# Load Dataset
# =============================
print("Loading Grand Slam matches…")
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"Grand Slam file not found: {INPUT_FILE}")

df = pd.read_excel(INPUT_FILE)
print(df.shape)
display(df.head())
df.columns.tolist()


Loading Grand Slam matches…
(1405, 272)


Unnamed: 0,match_id,tournament,location,date,year,tournament_code,stage,match_duration,winners_set1,winners_set2,...,losers_p2_abs_gap_10y,losers_p2_single_specialist,losers_p2_birthplace-city,losers_p2_birthplace-country,losers_p2_hand,losers_p2_backhand,losers_p2_colonial_legacy,losers_p2_federal_legacy,losers_p2_country_std,losers_p2_iso3
0,0,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Finals,01:33:00,6,6.0,...,,0.0,Montreal,Canada,Right-Handed,Two-Handed Backhand,Spanish,,Colombia,COL
1,1,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Semi-Finals,02:26:00,4,7.0,...,,,,,,,,,,
2,2,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Semi-Finals,01:29:00,7,7.0,...,,0.0,"Camarillo, CA, USA",,Right-Handed,One-Handed Backhand,British,,United States,USA
3,3,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Quarter-Finals,01:06:00,6,6.0,...,,0.0,Lahore,Pakistan,Right-Handed,One-Handed Backhand,,,Pakistan,PAK
4,4,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Quarter-Finals,02:40:00,6,6.0,...,,0.0,London,England,Right-Handed,One-Handed Backhand,British,,Great Britain,GBR


['match_id',
 'tournament',
 'location',
 'date',
 'year',
 'tournament_code',
 'stage',
 'match_duration',
 'winners_set1',
 'winners_set2',
 'winners_set3',
 'winners_set1_tiebreak',
 'winners_set2_tiebreak',
 'winners_set3_tiebreak',
 'losers_set1',
 'losers_set2',
 'losers_set3',
 'losers_set1_tiebreak',
 'losers_set2_tiebreak',
 'losers_set3_tiebreak',
 'winners_set4',
 'winners_set5',
 'losers_set4',
 'losers_set5',
 'winners_set4_tiebreak',
 'losers_set4_tiebreak',
 'losers_set5_tiebreak',
 'winners_p1_name',
 'winners_p1_surname',
 'winners_p1_ranking',
 'winners_p1_status',
 'winners_p2_name',
 'winners_p2_surname',
 'winners_p2_ranking',
 'winners_p2_status',
 'losers_p1_name',
 'losers_p1_surname',
 'losers_p1_ranking',
 'losers_p1_status',
 'losers_p2_name',
 'losers_p2_surname',
 'losers_p2_ranking',
 'losers_p2_status',
 'tourn_key',
 'winners_p1_initial',
 'winners_p2_initial',
 'losers_p1_initial',
 'losers_p2_initial',
 'winners_p1_rank',
 'winners_p1_tourns',
 'winner

# CEPII data language

In [None]:

path = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/Gravity_V202211.csv"

# ISO3 presenti nel dataset match
iso_set = set(df["winners_p1_iso3"]) | \
          set(df["winners_p2_iso3"]) | \
          set(df["losers_p1_iso3"]) | \
          set(df["losers_p2_iso3"])

usecols = ["year", "iso3_o", "iso3_d", "comlang_off", "comlang_ethno"]

dtypes = {
    "year": "int16",
    "iso3_o": "string",
    "iso3_d": "string",
    "comlang_off": "Int8",
    "comlang_ethno": "Int8",
}

chunks = []

for chunk in pd.read_csv(path, usecols=usecols, dtype=dtypes, chunksize=300_000):

    # filtro anni
    chunk = chunk[chunk["year"].between(2018, 2023)]
    if chunk.empty:
        continue

    # filtro paesi rilevanti
    chunk = chunk[
        chunk["iso3_o"].isin(iso_set) &
        chunk["iso3_d"].isin(iso_set)
    ]

    if not chunk.empty:
        chunks.append(chunk)

gravity_lang = pd.concat(chunks, ignore_index=True)

gravity_lang.shape


(10404, 5)

In [None]:
# Creazione dataset simmetrico (iso3_o, iso3_d) <-> (iso3_d, iso3_o)
gravity_lang_sym = pd.concat(
    [
        gravity_lang,
        gravity_lang.rename(columns={"iso3_o":"iso3_d","iso3_d":"iso3_o"})
    ],
    ignore_index=True
)

gravity_lang_sym = gravity_lang_sym.drop_duplicates(
    subset=["year","iso3_o","iso3_d"]
)


Unnamed: 0,year,iso3_o,iso3_d,comlang_off,comlang_ethno
0,2018,ARG,ARG,0,0
1,2019,ARG,ARG,0,0
2,2020,ARG,ARG,0,0
3,2021,ARG,ARG,0,0
4,2018,ARG,AUS,0,0
...,...,...,...,...,...
10399,2021,ZAF,VEN,0,0
10400,2018,ZAF,ZAF,0,0
10401,2019,ZAF,ZAF,0,0
10402,2020,ZAF,ZAF,0,0


In [None]:
# symmetry: same nationality -->  same language
same = gravity_lang_sym["iso3_o"] == gravity_lang_sym["iso3_d"]
gravity_lang_sym.loc[same, ["comlang_off","comlang_ethno"]] = 1
gravity_lang_sym


Unnamed: 0,year,iso3_o,iso3_d,comlang_off,comlang_ethno
0,2018,ARG,ARG,1,1
1,2019,ARG,ARG,1,1
2,2020,ARG,ARG,1,1
3,2021,ARG,ARG,1,1
4,2018,ARG,AUS,0,0
...,...,...,...,...,...
10399,2021,ZAF,VEN,0,0
10400,2018,ZAF,ZAF,1,1
10401,2019,ZAF,ZAF,1,1
10402,2020,ZAF,ZAF,1,1


In [None]:
# make dataset lookup: (iso3_o, iso3_d) -> (comlang_off, comlang_ethno)
gravity_lang_lookup = (
    gravity_lang_sym
    .groupby(["iso3_o","iso3_d"], as_index=False)[
        ["comlang_off","comlang_ethno"]
    ]
    .max()
)
gravity_lang_lookup


Unnamed: 0,iso3_o,iso3_d,comlang_off,comlang_ethno
0,ARG,ARG,1,1
1,ARG,AUS,0,0
2,ARG,AUT,0,0
3,ARG,BEL,0,0
4,ARG,BIH,0,0
...,...,...,...,...
2299,ZAF,UKR,0,0
2300,ZAF,URY,0,0
2301,ZAF,USA,1,1
2302,ZAF,VEN,0,0


In [46]:
# =============================
# Save
# =============================
print(f"Saving to {gravity_lang_lookup_dir} …")
with pd.ExcelWriter(gravity_lang_lookup_dir, engine="xlsxwriter") as xlw:
    gravity_lang_lookup.to_excel(xlw, index=False, sheet_name="main_sheet")

print("Done.")


Saving to C:/Users/ALESSANDRO/Documents/GitHub/tennis-homophily/data/atp/gravity_lang_lookup.xlsx …
Done.


In [None]:
# gravity_lang_lookup = pd.read_excel(gravity_lang_lookup_dir, sheet_name="main_sheet")

In [26]:
# merge winners
df = df.merge(
    gravity_lang_lookup,
    left_on=["winners_p1_iso3","winners_p2_iso3"],
    right_on=["iso3_o","iso3_d"],
    how="left"
)

df = df.rename(columns={
    "comlang_off":"winners_same_language",
    "comlang_ethno":"winners_linguistic_proximity"
}).drop(columns=["iso3_o","iso3_d"])

# merge losers
df = df.merge(
    gravity_lang_lookup,
    left_on=["losers_p1_iso3","losers_p2_iso3"],
    right_on=["iso3_o","iso3_d"],
    how="left"
)

df = df.rename(columns={
    "comlang_off":"losers_same_language",
    "comlang_ethno":"losers_linguistic_proximity"
}).drop(columns=["iso3_o","iso3_d"])



In [None]:
cols = [
    "winners_same_language",
    "winners_linguistic_proximity",
    "losers_same_language",
    "losers_linguistic_proximity"
]

for c in cols:
    df[c] = df[c].fillna(0).astype(int)


Unnamed: 0,match_id,tournament,location,date,year,tournament_code,stage,match_duration,winners_set1,winners_set2,...,losers_p2_colonial_legacy,losers_p2_federal_legacy,losers_p2_country_std,losers_p2_iso3,winners_same_language,winners_linguistic_proximity,winners_same_language.1,winners_linguistic_proximity.1,losers_same_language,losers_linguistic_proximity
0,0,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Finals,01:33:00,6,6.0,...,Spanish,,Colombia,COL,0,0,0,0,1,1
1,1,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Semi-Finals,02:26:00,4,7.0,...,,,,,0,0,0,0,0,0
2,2,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Semi-Finals,01:29:00,7,7.0,...,British,,United States,USA,1,1,1,1,1,1
3,3,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Quarter-Finals,01:06:00,6,6.0,...,,,Pakistan,PAK,1,1,1,1,0,0
4,4,Australian Open,"Melbourne,Australia",2018-01-15,2018,580,Quarter-Finals,02:40:00,6,6.0,...,British,,Great Britain,GBR,0,0,0,0,1,1


  # Filter dataset

In [31]:
# Rimuovi l'anno 2020 dal dataset, COVID time, Wimblendon non si è giocato
df = df[df['year'] != 2020].copy()

# Controllo rapido
print(df['year'].value_counts())

2018    262
2019    250
2021    249
2022    249
2023    238
Name: year, dtype: int64


# Create new variables

In [245]:
# Create surface variable
surface_map = {
    "Australian Open": "Hard",
    "US Open": "Hard",
    "Roland Garros": "Clay",
    "Wimbledon": "Grass"
}

df['surface'] = df['tournament'].map(surface_map)


In [None]:
# Normalize stage names
df['stage_clean'] = df['stage'].str.lower().str.strip()

stage_order = {
    "finals": 7,
    "semi-finals": 6,
    "quarter-finals": 5,
    "round of 16": 4,
    "round of 32": 3,
    "round of 64": 2,
    "2nd round qualifying": 1,
    "1st round qualifying": 0
}

df['stage_code'] = df['stage_clean'].map(stage_order)
df['stage_code'].unique()

In [None]:
# --- Physical Homophily --- #

# --- hand ---

df['same_hand_winners'] = (df['winners_p1_hand'] == df['winners_p2_hand']).astype(int)
df['same_hand_losers']  = (df['losers_p1_hand']  == df['losers_p2_hand']).astype(int)

# --- backhand ---

df['same_backhand_winners'] = (df['winners_p1_backhand'] == df['winners_p2_backhand']).astype(int)
df['same_backhand_losers']  = (df['losers_p1_backhand']  == df['losers_p2_backhand']).astype(int)

# --- height ---
df['height_diff_winners'] = (df['winners_p1_height-cm'] - df['winners_p2_height-cm']).abs()
df['height_diff_losers']  = (df['losers_p1_height-cm']  - df['losers_p2_height-cm']).abs()

# --- weight ---
df['weight_diff_winners'] = (df['winners_p1_weight-kg'] - df['winners_p2_weight-kg']).abs()
df['weight_diff_losers']  = (df['losers_p1_weight-kg']  - df['losers_p2_weight-kg']).abs()

# --- experience ---

df['experience_diff_winners'] = (df['winners_p1_experience_double'] - df['winners_p2_experience_double']).abs()
df['experience_diff_losers']  = (df['losers_p1_experience_double']  - df['losers_p2_experience_double']).abs()

df['rank_diff_winners'] = (df['winners_p1_rank'] - df['winners_p2_rank']).abs()
df['rank_diff_losers']  = (df['losers_p1_rank']  - df['losers_p2_rank']).abs()

df["single_rank_diff_winners"] = (df['winners_p1_singles_career_high_rank_num'] - df['winners_p2_singles_career_high_rank_num']).abs()
df["single_rank_diff_losers"]  = (df['losers_p1_singles_career_high_rank_num']  - df['losers_p2_singles_career_high_rank_num']).abs()

In [None]:

# --- Social Homophily --- #

# --- nationality ---
df['same_country_winners'] = (df['winners_p1_country'] == df['winners_p2_country']).astype(int)
df['same_country_losers']  = (df['losers_p1_country']  == df['losers_p2_country']).astype(int)

# --- coach ---
df['same_coach_winners'] = (
    (df['winners_p1_coach'].notna()) &
    (df['winners_p2_coach'].notna()) &
    (df['winners_p1_coach'] == df['winners_p2_coach'])
).astype(int)

df['same_coach_losers'] = (
    (df['losers_p1_coach'].notna()) &
    (df['losers_p2_coach'].notna()) &
    (df['losers_p1_coach'] == df['losers_p2_coach'])
).astype(int)


In [None]:

# Differenza assoluta nel win ratio YTD e Career
df['wl_ytd_diff_winners'] = (df['winners_p1_doubles_win_ratio-ytd'] - df['winners_p2_doubles_win_ratio-ytd']).abs()
df['wl_ytd_diff_losers'] = (df['losers_p1_doubles_win_ratio-ytd']  - df['losers_p2_doubles_win_ratio-ytd']).abs()

df['wl_career_diff_winners'] = (df['winners_p1_doubles_win_ratio-career'] - df['winners_p2_doubles_win_ratio-career']).abs()
df['wl_career_diff_losers']  = (df['losers_p1_doubles_win_ratio-career']  - df['losers_p2_doubles_win_ratio-career']).abs()

# Differenza in titoli (YTD e Career)
df['titles_ytd_diff_winners']    = (df['winners_p1_doubles_titles-ytd'] - df['winners_p2_doubles_titles-ytd']).abs()
df['titles_ytd_diff_losers']     = (df['losers_p1_doubles_titles-ytd'] - df['losers_p2_doubles_titles-ytd']).abs()

df['titles_career_diff_winners'] = (df['winners_p1_doubles_titles-career'] - df['winners_p2_doubles_titles-career']).abs()
df['titles_career_diff_losers']  = (df['losers_p1_doubles_titles-career'] - df['losers_p2_doubles_titles-career']).abs()

# single specialists
df['single_specialist_winners'] = (
  (df['winners_p1_single_specialist'] == 1) & (df['winners_p2_single_specialist'] == 1)
).astype(int)

df['single_specialist_losers']  = (
  (df['losers_p1_single_specialist'] == 1) & (df['losers_p2_single_specialist'] == 1)
).astype(int)

 # some stats

In [None]:

cols = [
    "winners_p1_rank","winners_p2_rank",
    "losers_p1_rank","losers_p2_rank",
    "winners_p1_singles_career_high_rank_num",
    "winners_p2_singles_career_high_rank_num",
    "losers_p1_singles_career_high_rank_num",
    "losers_p2_singles_career_high_rank_num"
]

for c in cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")


## single specialist + best in pair

In [35]:
### WINNERS ###

w_p1_single = df["winners_p1_single_specialist"].eq(1)
w_p2_single = df["winners_p2_single_specialist"].eq(1)

w_p1_best = df["winners_p1_rank"] < df["winners_p2_rank"]
w_p2_best = df["winners_p2_rank"] < df["winners_p1_rank"]

df["winner_specialist_best"] = (
    (w_p1_single & w_p1_best) |
    (w_p2_single & w_p2_best)
).astype(int)


### LOSERS ###

l_p1_single = df["losers_p1_single_specialist"].eq(1)
l_p2_single = df["losers_p2_single_specialist"].eq(1)

l_p1_best = df["losers_p1_rank"] < df["losers_p2_rank"]
l_p2_best = df["losers_p2_rank"] < df["losers_p1_rank"]

df["loser_specialist_best"] = (
    (l_p1_single & l_p1_best) |
    (l_p2_single & l_p2_best)
).astype(int)


In [36]:
df["structure_diff"] = df["winner_specialist_best"] - df["loser_specialist_best"]

structure_counts = df["structure_diff"].value_counts()
print("Structure comparison:")
print(structure_counts)

advantage_wins = (df["structure_diff"] == 1).sum()
advantage_losses = (df["structure_diff"] == -1).sum()

if (advantage_wins + advantage_losses) > 0:
    advantage_rate = advantage_wins / (advantage_wins + advantage_losses)
else:
    advantage_rate = np.nan

print("\nWin rate when structure differs:", round(advantage_rate,4))


Structure comparison:
 0    1201
-1      25
 1      22
Name: structure_diff, dtype: int64

Win rate when structure differs: 0.4681


## does the team with the best current double ranked player win more often?

In [37]:
df["winner_best_rank"] = df[["winners_p1_rank","winners_p2_rank"]].min(axis=1)
df["loser_best_rank"]  = df[["losers_p1_rank","losers_p2_rank"]].min(axis=1)


In [38]:
df["winner_has_best_current_rank"] = (
    df["winner_best_rank"] < df["loser_best_rank"]
).astype(int)

current_rank_wins = df["winner_has_best_current_rank"].sum()
total_valid = df["winner_has_best_current_rank"].notna().sum()

print("\nWins by team with best CURRENT rank:", current_rank_wins)
print("Share:", round(current_rank_wins / total_valid,4))



Wins by team with best CURRENT rank: 496
Share: 0.3974


## does the team with the best single ranked player win more often?

In [39]:
df["winner_best_career"] = df[
    ["winners_p1_singles_career_high_rank_num",
     "winners_p2_singles_career_high_rank_num"]
].min(axis=1)

df["loser_best_career"] = df[
    ["losers_p1_singles_career_high_rank_num",
     "losers_p2_singles_career_high_rank_num"]
].min(axis=1)


In [40]:
df["winner_has_best_career_rank"] = (
    df["winner_best_career"] < df["loser_best_career"]
).astype(int)

career_rank_wins = df["winner_has_best_career_rank"].sum()
total_valid_career = df["winner_has_best_career_rank"].notna().sum()

print("\nWins by team with best CAREER-HIGH rank:", career_rank_wins)
print("Share:", round(career_rank_wins / total_valid_career,4))



Wins by team with best CAREER-HIGH rank: 409
Share: 0.3277


# dataset: one row for winners, one row for losers

In [259]:
team_rows = []

for i, row in df.iterrows():

    # --------------------------
    # WINNERS ROW
    # --------------------------
    team_rows.append({
        'match_id': row['match_id'],
        'team': 'winners',
        'won': 1,

        'homophily': row['homophily_index_winners'],

        # Z-differences
        'height_diff': row['height_diff_winners_z'],
        'weight_diff': row['weight_diff_winners_z'],
        'experience_diff': row['experience_diff_winners_z'],

        # Player experience
        'experience_p1': row['winners_p1_experience_double'],
        'experience_p2': row['winners_p2_experience_double'],

        # single specialists
        'single_specialist': row['single_specialist_winners'],
        'single_specialist_p1': row['winners_p1_single_specialist'],
        'single_specialist_p2': row['winners_p2_single_specialist'],

        # Rank
        'rank_p1': row['winners_p1_rank'],
        'rank_p2': row['winners_p2_rank'],
        'rank_diff': row['rank_diff_winners'],
        "rank_single_p1": row['winners_p1_singles_career_high_rank_num'],
        "rank_single_p2": row['winners_p2_singles_career_high_rank_num'],
        'rank_diff_single': row['single_rank_diff_winners'],

        # W/L career ratio
        'wl_career_diff': row['wl_career_diff_winners_z'],  
        'wl_career_p1': row['winners_p1_doubles_win_ratio-career'],
        'wl_career_p2': row['winners_p2_doubles_win_ratio-career'],

        # W/L YTD ratio
        'wl_ytd_diff': row['wl_ytd_diff_winners_z'],
        'wl_ytd_p1': row['winners_p1_doubles_win_ratio-ytd'],
        'wl_ytd_p2': row['winners_p2_doubles_win_ratio-ytd'],

        # Titles (career)
        'titles_p1': row['winners_p1_doubles_titles-career'],
        'titles_p2': row['winners_p2_doubles_titles-career'],
        'titles_career_diff': row['titles_career_diff_winners_z'],

        # Homophily components
        'same_country': row['same_country_winners'],
        'same_hand': row['same_hand_winners'],
        'same_backhand': row['same_backhand_winners'],
        'same_coach': row['same_coach_winners'],
        'same_language': row['winners_same_language'],
        'linguistic_proximity': row['winners_linguistic_proximity'],

        # Match meta
        'year': row['year'],
        'tournament': row['tournament'],
        'stage': row['stage_code'],
        'surface': row['surface']
    })


    # --------------------------
    # LOSERS ROW
    # --------------------------
    team_rows.append({
        'match_id': row['match_id'],
        'team': 'losers',
        'won': 0,

        'homophily': row['homophily_index_losers'],

        # Z-differences
        'height_diff': row['height_diff_losers_z'],
        'weight_diff': row['weight_diff_losers_z'],
        'experience_diff': row['experience_diff_losers_z'],

        # Player experience
        'experience_p1': row['losers_p1_experience_double'],
        'experience_p2': row['losers_p2_experience_double'],

        # Rank
        'rank_p1': row['losers_p1_rank'],
        'rank_p2': row['losers_p2_rank'],
        'rank_diff': row['rank_diff_losers'],
        "rank_single_p1": row['losers_p1_singles_career_high_rank_num'],
        "rank_single_p2": row['losers_p2_singles_career_high_rank_num'],
        'rank_diff_single': row['single_rank_diff_losers'],

        # single specialists
        'single_specialist': row['single_specialist_losers'],
        'single_specialist_p1': row['losers_p1_single_specialist'],
        'single_specialist_p2': row['losers_p2_single_specialist'],

        # W/L career ratio
        'wl_career_diff': row['wl_career_diff_losers_z'],   
        'wl_career_p1': row['losers_p1_doubles_win_ratio-career'],
        'wl_career_p2': row['losers_p2_doubles_win_ratio-career'],

        # W/L YTD ratio
        'wl_ytd_diff': row['wl_ytd_diff_losers_z'],
        'wl_ytd_p1': row['losers_p1_doubles_win_ratio-ytd'],
        'wl_ytd_p2': row['losers_p2_doubles_win_ratio-ytd'],

        # Titles
        'titles_p1': row['losers_p1_doubles_titles-career'],
        'titles_p2': row['losers_p2_doubles_titles-career'],
        'titles_career_diff': row['titles_career_diff_losers_z'],

        # Homophily components
        'same_country': row['same_country_losers'],
        'same_hand': row['same_hand_losers'],
        'same_backhand': row['same_backhand_losers'],
        'same_coach': row['same_coach_losers'],
        'same_language': row['losers_same_language'],
        'linguistic_proximity': row['losers_linguistic_proximity'],

        # Match meta
        'year': row['year'],
        'tournament': row['tournament'],
        'stage': row['stage_code'],
        'surface': row['surface']
    })

# Build the final team-level dataframe
team_df = pd.DataFrame(team_rows)


In [261]:
# Team-level ability (media/somma)
team_df['rank_team'] = team_df[['rank_p1', 'rank_p2']].mean(axis=1)
team_df['rank_team_best'] = team_df[['rank_p1', 'rank_p2']].min(axis=1)
team_df['rank_single_team'] = team_df[['rank_single_p1', 'rank_single_p2']].mean(axis=1)
team_df['rank_single_best'] = team_df[['rank_single_p1', 'rank_single_p2']].min(axis=1)
team_df['experience_team'] = (team_df['experience_p1'] + team_df['experience_p2']) / 2
team_df['wl_career_team'] = team_df['wl_career_p1'] + team_df['wl_career_p2']
team_df['wl_ytd_team'] = team_df['wl_ytd_p1'] + team_df['wl_ytd_p2']
team_df['titles_team'] = team_df['titles_p1'] + team_df['titles_p2']
team_df['wl_ratio_team_career'] = team_df['wl_career_p1'] + team_df['wl_career_p2']
team_df['wl_ratio_team_ytd']    = team_df['wl_ytd_p1']    + team_df['wl_ytd_p2']


# single players 

In [262]:
needed_cols = ["match_id","won","rank_single_team","rank_single_best","single_specialist"]

df = team_df.dropna(subset=needed_cols).copy()

# solo match con una squadra vincente e una perdente
valid = df.groupby("match_id")["won"].apply(lambda s: set(s) == {0,1})
df = df[df["match_id"].isin(valid[valid].index)].copy()


In [263]:
wide = df.pivot(index="match_id", columns="won",
                values=["rank_single_team","rank_single_best","single_specialist"])

# rinomina colonne
wide.columns = [f"{v}_{'winner' if w==1 else 'loser'}" for v,w in wide.columns]
wide = wide.reset_index()


In [264]:
# ranking: positivo → winner migliore
wide["rank_advantage"] = wide["rank_single_team_loser"] - wide["rank_single_team_winner"]
wide["best_player_advantage"] = wide["rank_single_best_loser"] - wide["rank_single_best_winner"]

# indicatori intuitivi
wide["winner_better_rank"] = wide["rank_advantage"] > 0
wide["winner_better_best_player"] = wide["best_player_advantage"] > 0


In [265]:
# match dove le squadre differiscono
spec_matches = wide[wide["single_specialist_winner"] != wide["single_specialist_loser"]].copy()

# 1 se vince il team specialist
spec_matches["specialist_wins"] = spec_matches["single_specialist_winner"] == 1


In [266]:
rank_table = pd.DataFrame({
    "Share of matches won": [
        wide["winner_better_rank"].mean(),
        wide["winner_better_best_player"].mean()
    ]
}, index=[
    "Team with better average singles ranking wins",
    "Team with best singles player wins"
])


In [267]:
spec_table = pd.DataFrame({
    "Share of matches won": [spec_matches["specialist_wins"].mean()]
}, index=[
    "Specialist team wins vs non-specialist"
])

In [268]:
final_table = pd.concat([rank_table, spec_table])
final_table


Unnamed: 0,Share of matches won
Team with better average singles ranking wins,0.516927
Team with best singles player wins,0.532552
Specialist team wins vs non-specialist,0.44


# language

In [269]:
needed_cols = [
    "match_id","won",
    "same_language","linguistic_proximity"
]

df_lang = team_df.dropna(subset=needed_cols).copy()

valid = df_lang.groupby("match_id")["won"].apply(lambda s: set(s) == {0,1})
df_lang = df_lang[df_lang["match_id"].isin(valid[valid].index)].copy()


In [282]:
wide_lang = df_lang.pivot(index="match_id", columns="won",
                          values=["same_language","linguistic_proximity"])

wide_lang.columns = [f"{v}_{'winner' if w==1 else 'loser'}" for v,w in wide_lang.columns]
wide_lang = wide_lang.reset_index()


In [283]:
# match dove differiscono
sl_matches = wide_lang[wide_lang["same_language_winner"] != wide_lang["same_language_loser"]].copy()

# 1 se vince la squadra con stessa lingua
sl_matches["same_language_wins"] = sl_matches["same_language_winner"] == 1


In [284]:
lp_matches = wide_lang[wide_lang["linguistic_proximity_winner"] != wide_lang["linguistic_proximity_loser"]].copy()

lp_matches["linguistic_proximity_wins"] = lp_matches["linguistic_proximity_winner"] == 1


In [285]:
language_table = pd.DataFrame({
    "Share of matches won": [
        sl_matches["same_language_wins"].mean(),
        lp_matches["linguistic_proximity_wins"].mean()
    ]
}, index=[
    "Same official language team wins vs different language",
    "Linguistically proximate team wins vs non-proximate"
])

language_table


Unnamed: 0,Share of matches won
Same official language team wins vs different language,0.646536
Linguistically proximate team wins vs non-proximate,0.643739


In [280]:
# =============================
# Save
# =============================
print(f"Saving to {OUTPUT_FILE} …")
with pd.ExcelWriter(OUTPUT_FILE, engine="xlsxwriter") as xlw:
    team_df.to_excel(xlw, index=False, sheet_name="players_list")

print("Done.")


Saving to C:/Users/ALESSANDRO/Documents/GitHub/tennis-homophily/data/atp/win_lose_df.xlsx …
Done.


In [281]:
# =============================
# Save
# =============================
print(f"Saving to {CLEANED_FILE} …")
with pd.ExcelWriter(CLEANED_FILE, engine="xlsxwriter") as xlw:
    df.to_excel(xlw, index=False, sheet_name="players_list")

print("Done.")


Saving to C:/Users/ALESSANDRO/Documents/GitHub/tennis-homophily/data/atp/men_matches_with_ranks_cleaned.xlsx …
Done.
