In [103]:
import os
import pandas as pd
from getpass import getuser

In [104]:
# =============================
# CONFIG
# =============================
USER = getuser()
INPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/"

MEN_MATCHES_FILE  = os.path.join(INPUT_DIR, "men_matches.xlsx")
MEN_RANKINGS_FILE = os.path.join(INPUT_DIR, "men_rankings.xlsx")
OUTPUT_FILE       = os.path.join(INPUT_DIR, "men_matches_with_ranks.xlsx")

TOURN_COL       = "tournament"
TOURN_CODE_COL  = "tournament_code"
STAGE_COL       = "stage"
RANK_COL        = "rank"

PLAYER_ROLES = [
    ("winners_p1_surname", "winners_p1"),
    ("winners_p2_surname", "winners_p2"),
    ("losers_p1_surname",  "losers_p1"),
    ("losers_p2_surname",  "losers_p2"),
]

In [105]:
# =============================
# Helpers
# =============================
def normalize_tournament(s):
    if pd.isna(s):
        return None
    return " ".join(str(s).lower().strip().split())


In [106]:
# =============================
# Load data
# =============================
matches  = pd.read_excel(MEN_MATCHES_FILE)
rankings = pd.read_excel(MEN_RANKINGS_FILE)

# Create composite key for each match
matches["row_key"] = list(zip(matches[TOURN_CODE_COL], matches[STAGE_COL]))

In [107]:
# =============================
# Normalize tournament name keys
# =============================
matches["tourn_key"]  = matches[TOURN_COL].map(normalize_tournament)
rankings["tourn_key"] = rankings[TOURN_COL].map(normalize_tournament)


In [108]:
# normalize surname + initial + year in rankings
rankings["surname"] = rankings["player_surname"].str.lower().str.strip()
rankings["initial"] = rankings["player_initial"].str.lower().str.strip().str[0]

# ensure year exists
if "year" not in rankings.columns:
    rankings["year"] = pd.to_datetime(rankings["dateweek"], errors="coerce").dt.year

# deterministic order (optional but nice)
rankings = rankings.sort_values(
    ["tourn_key", "surname", "initial", "year"],
    ignore_index=True
)

# keys used in the merge
rk_key_cols = ["tourn_key", "surname", "initial", "year"]

# all other ranking information kept
profile_cols = [c for c in rankings.columns if c not in rk_key_cols]


In [109]:
# Create *_initial from *_name for each role in matches 
for surname_col, role in PLAYER_ROLES:
    name_col = f"{role}_name"
    initial_col = f"{role}_initial"

    if name_col not in matches.columns:
        raise KeyError(f"Missing column: {name_col} (player first name)")

    matches[initial_col] = (
        matches[name_col]
        .astype(str)
        .str.strip()
        .str[0]          # first character
        .str.lower()
    )


In [110]:
# matches must already have 'year' column
if "year" not in matches.columns:
    raise KeyError("matches needs a 'year' column.")

long_rows = []

for surname_col, role in PLAYER_ROLES:
    if surname_col not in matches.columns:
        raise KeyError(f"Missing column: {surname_col}")

    initial_col = f"{role}_initial"
    if initial_col not in matches.columns:
        raise KeyError(f"Missing column: {initial_col} (did you create initials from player names?)")

    tmp = matches[[TOURN_CODE_COL, STAGE_COL, "tourn_key", "year", surname_col, initial_col]].copy()
    tmp.rename(columns={surname_col: "surname", initial_col: "initial"}, inplace=True)

    tmp["surname"] = tmp["surname"].str.lower().str.strip()
    tmp["initial"] = tmp["initial"].str.lower().str.strip().str[0]

    tmp["role"] = role
    tmp["row_key"] = list(zip(tmp[TOURN_CODE_COL], tmp[STAGE_COL], tmp["year"]))

    long_rows.append(tmp)

players_long = pd.concat(long_rows, ignore_index=True)
players_long = players_long.sort_values(["row_key", "role"], ignore_index=True)


In [111]:
print("players_long keys sample:")
print(
    players_long[["tourn_key", "surname", "initial", "year"]]
    .drop_duplicates()
    .head()
)

print("\nrankings keys sample:")
print(
    rankings[["tourn_key", "surname", "initial", "year"]]
    .drop_duplicates()
    .head()
)

# check overlap count on (tourn_key, surname, initial, year)
keys_matches = set(
    map(tuple, players_long[["tourn_key", "surname", "initial", "year"]]
        .drop_duplicates()
        .to_numpy())
)
keys_rank = set(
    map(tuple, rankings[["tourn_key", "surname", "initial", "year"]]
        .drop_duplicates()
        .to_numpy())
)

print("\n# (tourn_key, surname, initial, year) in matches:", len(keys_matches))
print("# (tourn_key, surname, initial, year) in rankings:", len(keys_rank))
print("# intersection:", len(keys_matches & keys_rank))


players_long keys sample:
       tourn_key  surname initial  year
0  roland garros   marach       o  2018
1  roland garros    pavic       m  2018
2  roland garros  herbert       p  2018
3  roland garros    mahut       n  2018
4  roland garros   chardy       j  2019

rankings keys sample:
         tourn_key surname initial  year
0  australian open  aboian       l  2020
1  australian open  aboian       l  2021
2  australian open  aboian       l  2022
3  australian open  aboian       l  2023
4  australian open  aboian       v  2022

# (tourn_key, surname, initial, year) in matches: 2900
# (tourn_key, surname, initial, year) in rankings: 22491
# intersection: 2775


In [112]:
merged_long = players_long.merge(
    rankings[rk_key_cols + profile_cols],
    on=["tourn_key", "surname", "initial", "year"],
    how="left",
    # validate="m:1"  # turn this on again once keys are confirmed correct
)


In [113]:
# build wide profiles per role
role_frames = []

for _, role in PLAYER_ROLES:
    role_df = (
        merged_long[merged_long["role"] == role]
        .drop_duplicates(subset=["row_key"])          # defensive
        .set_index("row_key")[profile_cols]           # index by row_key
        .add_prefix(f"{role}_")                       # winners_p1_rank, etc.
    )
    role_frames.append(role_df)

profiles_wide = pd.concat(role_frames, axis=1)

matches["row_key"] = list(zip(matches[TOURN_CODE_COL], matches[STAGE_COL], matches["year"]))

result = matches.set_index("row_key").join(profiles_wide, how="left").reset_index(drop=True)

In [114]:
# =============================
# Save output
# =============================
result.to_excel(OUTPUT_FILE, index=False)
print(f"Saved merged file with rankings:\n{OUTPUT_FILE}")


Saved merged file with rankings:
C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/men_matches_with_ranks.xlsx


# unmatched observations

In [115]:
def why_unmatched(name="Adrian Mannarino", year=None):
    """
    Debug helper for the men_matches / men_rankings pipeline.
    Looks at (surname, initial, tourn_key, year) to see why a player
    has no attached rank.

    Parameters
    ----------
    name : str
        Any representation of the player name (e.g. 'Mannarino',
        'Adrian Mannarino').
    year : int or None
        If provided, restricts the search to that season (e.g. 2019).
    """
    print(f"== Debug for: {name} ==")

    # --- 1) Extract surname and initial from name ---
    surname = extract_surname(name)
    if surname is None:
        print("Could not extract surname from:", name)
        return
    initial = str(name).strip()[0].lower() if isinstance(name, str) and name.strip() else None

    print("surname key ->", surname)
    print("initial key ->", initial)

    # --- 2) Where he appears in matches ---
    surname_cols = [
        "winners_p1_surname",
        "winners_p2_surname",
        "losers_p1_surname",
        "losers_p2_surname",
    ]

    missing_cols = [c for c in surname_cols if c not in matches.columns]
    if missing_cols:
        print("Missing surname columns in matches:", missing_cols)
        return

    mask_matches = matches[surname_cols].apply(
        lambda s: s.astype(str).str.lower().str.contains(surname, na=False)
    ).any(axis=1)

    m_rows = matches.loc[
        mask_matches,
        [TOURN_COL, "tourn_key", "year", TOURN_CODE_COL, STAGE_COL] + surname_cols
    ]

    if year is not None and "year" in m_rows.columns:
        m_rows = m_rows[m_rows["year"] == year]

    print("\n-- Matches rows (tournament / tourn_key / year / code / stage) --")
    if m_rows.empty:
        print("No matches rows found with this surname (and year filter)." if year else
              "No matches rows found with this surname.")
    else:
        display(m_rows)

    # --- 3) Ranking rows for that surname + initial ---
    if "surname" not in rankings.columns or "initial" not in rankings.columns:
        print("\nRankings must have 'surname' and 'initial' columns.")
        return

    rk_rows = rankings[
        (rankings["surname"].str.lower() == surname) &
        (rankings["initial"].str.lower() == (initial or ""))
    ]

    if year is not None and "year" in rk_rows.columns:
        rk_rows = rk_rows[rk_rows["year"] == year]

    print("\n-- Ranking rows for surname + initial --")
    if rk_rows.empty:
        print("No ranking rows found with this (surname, initial) key"
              + (" and year filter." if year else "."))
    else:
        cols_to_show = [c for c in ["player", "surname", "initial",
                                    "tourn_key", TOURN_COL, "year", RANK_COL]
                        if c in rankings.columns]
        display(rk_rows[cols_to_show].sort_values(["year", "tourn_key", RANK_COL]))

    # --- 4) Compare (tourn_key, year) sets ---
    gs_pairs = (
        set(map(tuple, m_rows[["tourn_key", "year"]].dropna().drop_duplicates().to_numpy()))
        if not m_rows.empty else set()
    )
    rk_pairs = (
        set(map(tuple, rk_rows[["tourn_key", "year"]].dropna().drop_duplicates().to_numpy()))
        if not rk_rows.empty else set()
    )

    print("\n(tourn_key, year) in MATCHES:", gs_pairs)
    print("(tourn_key, year) in RANKINGS:", rk_pairs)
    print("In MATCHES but not RANKINGS:", gs_pairs - rk_pairs)

    # --- 5) Optional fuzzy search if no rows and a 'player' column exists ---
    if rk_rows.empty and "player" in rankings.columns:
        print("\nNo exact (surname, initial) match in rankings; fuzzy search in 'player' column:")
        fuzzy = rankings[rankings["player"].str.contains(surname, case=False, na=False)]
        if year is not None and "year" in fuzzy.columns:
            fuzzy = fuzzy[fuzzy["year"] == year]

        if fuzzy.empty:
            print("Still nothing — likely a real missing ranking entry or key mismatch.")
        else:
            cols_to_show = [c for c in ["player", "surname", "initial",
                                        "tourn_key", TOURN_COL, "year", RANK_COL]
                            if c in fuzzy.columns]
            display(
                fuzzy[cols_to_show]
                .sort_values(["year", "tourn_key", RANK_COL])
                .head(30)
            )


In [None]:
# Create detailed unmatched table
unmatched_rows = []

for surname_col, role in PLAYER_ROLES:
    rank_col    = f"{role}_rank"
    initial_col = f"{role}_initial"

    if rank_col not in result.columns:
        print(f"Warning: missing {rank_col} in result")
        continue
    if initial_col not in result.columns:
        print(f"Warning: missing {initial_col} in result")
        continue

    # observations where no ranking was matched
    mask = result[rank_col].isna()

    tmp = result.loc[
        mask,
        [TOURN_COL, TOURN_CODE_COL, STAGE_COL, "year", surname_col, initial_col]
    ].copy()

    tmp = tmp.rename(columns={surname_col: "surname", initial_col: "initial"})
    tmp["role"] = role

    unmatched_rows.append(tmp)

# Combine all unmatched rows
unmatched_df = pd.concat(unmatched_rows, ignore_index=True)

print("Unmatched detailed dataset created.")
display(unmatched_df.head())

# Create unique (initial, surname) list
unmatched_df["surname"] = unmatched_df["surname"].astype(str).str.lower().str.strip()
unmatched_df["initial"] = unmatched_df["initial"].astype(str).str.lower().str.strip().str[0]

unmatched_pairs = (
    unmatched_df[["initial", "surname"]]
    .dropna()
    .drop_duplicates()
    .sort_values(["surname", "initial"])
)

# 3) Save (initial surname) to TXT
txt_path = os.path.join(INPUT_DIR, "unmatched_players.txt")
with open(txt_path, "w", encoding="utf-8") as f:
    for _, row in unmatched_pairs.iterrows():
        f.write(f"{row['initial']} {row['surname']}\n")

print(f"Saved unmatched player list to:\n{txt_path}")

# 4) Save full detailed table
xlsx_path = os.path.join(INPUT_DIR, "unmatched_details.xlsx")
unmatched_df.to_excel(xlsx_path, index=False)

print(f"Saved detailed dataset to:\n{xlsx_path}")


Unmatched detailed dataset created.


Unnamed: 0,tournament,tournament_code,stage,year,surname,initial,role
0,Roland Garros,520,Quarter-Finals,2019,kukushkin,m,losers_p1
1,Roland Garros,520,Quarter-Finals,2019,ram,r,losers_p1
2,Roland Garros,520,Quarter-Finals,2019,lajovic,d,losers_p1
3,Roland Garros,520,Quarter-Finals,2019,rojer,j,losers_p1
4,US Open,560,Round of 64,2022,coria,f,losers_p1


Saved unmatched player list to:
C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/unmatched_players.txt
Saved detailed dataset to:
C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/unmatched_details.xlsx


In [122]:
why_unmatched("r ram")
why_unmatched("r ram", year=2019)


== Debug for: r ram ==
surname key -> ram
initial key -> r

-- Matches rows (tournament / tourn_key / year / code / stage) --


Unnamed: 0,tournament,tourn_key,year,tournament_code,stage,winners_p1_surname,winners_p2_surname,losers_p1_surname,losers_p2_surname
7,Australian Open,australian open,2018,580,Round of 16,kubot,melo,ram,sharan
12,Australian Open,australian open,2018,580,Round of 16,groth,hewitt,andujar,ramos-vinolas
20,Australian Open,australian open,2018,580,Round of 32,ram,sharan,fognini,granollers
22,Australian Open,australian open,2018,580,Round of 32,andujar,ramos-vinolas,johnson,querrey
40,Australian Open,australian open,2018,580,Round of 64,ram,sharan,copil,troicki
...,...,...,...,...,...,...,...,...,...
1323,US Open,us open,2022,560,Round of 64,ram,salisbury,coria,rodriguez
1334,US Open,us open,2022,560,Round of 64,bolelli,fognini,cacic,ramanathan
1335,US Open,us open,2022,560,Round of 64,behar,escobar,ramos-vinolas,miralles
1359,US Open,us open,2023,560,Round of 32,ram,salisbury,mclachlan,nishioka



-- Ranking rows for surname + initial --


Unnamed: 0,surname,initial,tourn_key,tournament,year,rank
4349,ram,r,australian open,Australian Open,2018,22
10213,ram,r,roland garros,Roland Garros,2018,29
16182,ram,r,us open,US Open,2018,33
21343,ram,r,wimbledon,Wimbledon,2018,29
4350,ram,r,australian open,Australian Open,2019,22
10214,ram,r,roland garros,Roland Garros,2019,24
16183,ram,r,us open,US Open,2019,17
21344,ram,r,wimbledon,Wimbledon,2019,25
4351,ram,r,australian open,Australian Open,2020,22
10215,ram,r,roland garros,Roland Garros,2020,6



(tourn_key, year) in MATCHES: {('australian open', 2021), ('roland garros', 2022), ('us open', 2021), ('roland garros', 2018), ('australian open', 2018), ('us open', 2018), ('australian open', 2022), ('roland garros', 2023), ('us open', 2022), ('wimbledon', 2021), ('roland garros', 2019), ('australian open', 2019), ('us open', 2019), ('wimbledon', 2018), ('australian open', 2023), ('us open', 2023), ('wimbledon', 2022), ('roland garros', 2020), ('wimbledon', 2019), ('wimbledon', 2023), ('australian open', 2020), ('roland garros', 2021), ('us open', 2020)}
(tourn_key, year) in RANKINGS: {('australian open', 2021), ('roland garros', 2022), ('us open', 2021), ('roland garros', 2018), ('australian open', 2018), ('us open', 2018), ('us open', 2022), ('australian open', 2022), ('roland garros', 2023), ('wimbledon', 2021), ('roland garros', 2019), ('australian open', 2019), ('us open', 2019), ('wimbledon', 2018), ('us open', 2023), ('australian open', 2023), ('wimbledon', 2022), ('roland gar

Unnamed: 0,tournament,tourn_key,year,tournament_code,stage,winners_p1_surname,winners_p2_surname,losers_p1_surname,losers_p2_surname
70,Australian Open,australian open,2019,580,Round of 16,herbert,mahut,ram,salisbury
81,Australian Open,australian open,2019,580,Round of 32,ram,salisbury,cuevas,verdasco
99,Australian Open,australian open,2019,580,Round of 64,ram,salisbury,podlipnik-castillo,pella
119,Australian Open,australian open,2019,580,Round of 64,mannarino,mies,gojowczyk,ramos-vinolas
442,Roland Garros,roland garros,2019,520,Quarter-Finals,chardy,martin,ram,salisbury
447,Roland Garros,roland garros,2019,520,Round of 16,ram,salisbury,kontinen,peers
459,Roland Garros,roland garros,2019,520,Round of 32,ram,salisbury,couacaud,lamasine
475,Roland Garros,roland garros,2019,520,Round of 64,ram,salisbury,dzumhur,krajinovic
837,Wimbledon,wimbledon,2019,540,Round of 16,kontinen,peers,ram,salisbury
850,Wimbledon,wimbledon,2019,540,Round of 32,ram,salisbury,ebden,pospisil



-- Ranking rows for surname + initial --


Unnamed: 0,surname,initial,tourn_key,tournament,year,rank
4350,ram,r,australian open,Australian Open,2019,22
10214,ram,r,roland garros,Roland Garros,2019,24
16183,ram,r,us open,US Open,2019,17
21344,ram,r,wimbledon,Wimbledon,2019,25



(tourn_key, year) in MATCHES: {('us open', 2019), ('roland garros', 2019), ('wimbledon', 2019), ('australian open', 2019)}
(tourn_key, year) in RANKINGS: {('us open', 2019), ('roland garros', 2019), ('wimbledon', 2019), ('australian open', 2019)}
In MATCHES but not RANKINGS: set()
