In [50]:
import os
import pandas as pd
from getpass import getuser

In [51]:
# =============================
# CONFIG
# =============================
USER = getuser()
INPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/"

MEN_MATCHES_FILE  = os.path.join(INPUT_DIR, "men_matches.xlsx")
MEN_RANKINGS_FILE = os.path.join(INPUT_DIR, "men_rankings.xlsx")
OUTPUT_FILE       = os.path.join(INPUT_DIR, "men_matches_with_ranks.xlsx")

TOURN_COL       = "tournament"
TOURN_CODE_COL  = "tournament_code"
STAGE_COL       = "stage"
RANK_COL        = "rank"

PLAYER_ROLES = [
    ("winners_p1_surname", "winners_p1"),
    ("winners_p2_surname", "winners_p2"),
    ("losers_p1_surname",  "losers_p1"),
    ("losers_p2_surname",  "losers_p2"),
]

In [52]:
# =============================
# Helpers
# =============================
def normalize_tournament(s):
    if pd.isna(s):
        return None
    return " ".join(str(s).lower().strip().split())


In [53]:
# =============================
# Load data
# =============================
matches  = pd.read_excel(MEN_MATCHES_FILE)
rankings = pd.read_excel(MEN_RANKINGS_FILE)

# Create composite key for each match
matches["row_key"] = list(zip(matches[TOURN_CODE_COL], matches[STAGE_COL]))

In [54]:
# =============================
# Normalize tournament name keys
# =============================
matches["tourn_key"]  = matches[TOURN_COL].map(normalize_tournament)
rankings["tourn_key"] = rankings[TOURN_COL].map(normalize_tournament)


In [55]:
# normalize surname + year in rankings
rankings["surname"] = rankings["player"].str.lower().str.strip()

if "year" not in rankings.columns:
    # derive year from ranking date if needed
    rankings["year"] = pd.to_datetime(rankings["dateweek"], errors="coerce").dt.year

# sort and assign surname_slot per (tourn_key, surname, year)
rankings = rankings.sort_values(["tourn_key", "surname", "year", "player"], ignore_index=True)
rankings["surname_slot"] = rankings.groupby(["tourn_key", "surname", "year"]).cumcount()

rk_key_cols = ["tourn_key", "surname", "surname_slot", "year"]
profile_cols = [c for c in rankings.columns if c not in rk_key_cols]


In [56]:
# matches must already have 'year' column
if "year" not in matches.columns:
    raise KeyError("matches needs a 'year' column.")

long_rows = []

for surname_col, role in PLAYER_ROLES:
    if surname_col not in matches.columns:
        raise KeyError(f"Missing column: {surname_col}")

    tmp = matches[[TOURN_CODE_COL, STAGE_COL, "tourn_key", "year", surname_col]].copy()
    tmp.rename(columns={surname_col: "surname"}, inplace=True)
    tmp["surname"] = tmp["surname"].str.lower().str.strip()
    tmp["role"] = role

    # composite key for the match (code, stage, year)
    tmp["row_key"] = list(zip(tmp[TOURN_CODE_COL], tmp[STAGE_COL], tmp["year"]))

    long_rows.append(tmp)

players_long = pd.concat(long_rows, ignore_index=True)
players_long = players_long.sort_values(["row_key", "role"], ignore_index=True)

# assign surname_slot inside each match
players_long["surname_slot"] = players_long.groupby(
    ["row_key", "tourn_key", "surname"]
).cumcount()


In [57]:
merged_long = players_long.merge(
    rankings[rk_key_cols + profile_cols],
    on=["tourn_key", "surname", "surname_slot", "year"],
    how="left",
    # remove validate for now while debugging
    # validate="m:1"
)


In [58]:
# build wide profiles per role
role_frames = []

for _, role in PLAYER_ROLES:
    role_df = (
        merged_long[merged_long["role"] == role]
        .drop_duplicates(subset=["row_key"])          # defensive
        .set_index("row_key")[profile_cols]           # index by row_key
        .add_prefix(f"{role}_")                       # winners_p1_rank, etc.
    )
    role_frames.append(role_df)

profiles_wide = pd.concat(role_frames, axis=1)

matches["row_key"] = list(zip(matches[TOURN_CODE_COL], matches[STAGE_COL], matches["year"]))

result = matches.set_index("row_key").join(profiles_wide, how="left").reset_index(drop=True)

In [59]:
# =============================
# Save output
# =============================
result.to_excel(OUTPUT_FILE, index=False)
print(f"Saved merged file with rankings:\n{OUTPUT_FILE}")


Saved merged file with rankings:
C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/men_matches_with_ranks.xlsx


In [60]:
def why_unmatched(name="Adrian Mannarino", year=None):
    """
    Debug helper for the men_matches / men_rankings pipeline.
    Looks at surname + (tourn_key, year) to see why a player has no attached rank.

    Parameters
    ----------
    name : str
        Any representation of the player name (e.g. 'Mannarino', 'Adrian Mannarino').
    year : int or None
        If provided, restricts the search to that season (e.g. 2019).
    """
    print(f"== Debug for: {name} ==")

    # 1) get surname key from the provided name
    surname = extract_surname(name)
    if surname is None:
        print("Could not extract surname from:", name)
        return
    print("surname key ->", surname)

    # 2) Where he appears in matches (men_matches)
    name_cols = [
        "winners_p1_surname",
        "winners_p2_surname",
        "losers_p1_surname",
        "losers_p2_surname",
    ]

    missing_cols = [c for c in name_cols if c not in matches.columns]
    if missing_cols:
        print("Missing surname columns in matches:", missing_cols)
        return

    mask_matches = matches[name_cols].apply(
        lambda s: s.astype(str).str.lower().str.contains(surname, na=False)
    ).any(axis=1)

    m_rows = matches.loc[mask_matches, [TOURN_COL, "tourn_key", "year",
                                        TOURN_CODE_COL, STAGE_COL] + name_cols]

    if year is not None and "year" in m_rows.columns:
        m_rows = m_rows[m_rows["year"] == year]

    print("\n-- Matches rows (tournament / tourn_key / year / code / stage) --")
    if m_rows.empty:
        print("No matches rows found with this surname (and year filter)." if year else
              "No matches rows found with this surname.")
    else:
        display(m_rows)

    # 3) Ranking rows for that surname
    if "surname" not in rankings.columns:
        print("\nRankings has no 'surname' column; ensure you created it with extract_surname().")
        return

    rk_rows = rankings[rankings["surname"].str.lower() == surname]

    if year is not None and "year" in rk_rows.columns:
        rk_rows = rk_rows[rk_rows["year"] == year]

    print("\n-- Ranking rows for surname --")
    if rk_rows.empty:
        print("No ranking rows found with this surname key (and year filter)." if year else
              "No ranking rows found with this surname key.")
    else:
        display(
            rk_rows[["player", "surname", "tourn_key", TOURN_COL, "year", RANK_COL]]
            .sort_values(["year", "tourn_key", RANK_COL])
        )

    # 4) Compare (tourn_key, year) sets
    gs_pairs = (
        set(map(tuple, m_rows[["tourn_key", "year"]].dropna().drop_duplicates().to_numpy()))
        if not m_rows.empty else set()
    )
    rk_pairs = (
        set(map(tuple, rk_rows[["tourn_key", "year"]].dropna().drop_duplicates().to_numpy()))
        if not rk_rows.empty else set()
    )

    print("\n(tourn_key, year) in MATCHES:", gs_pairs)
    print("(tourn_key, year) in RANKINGS:", rk_pairs)
    print("In MATCHES but not RANKINGS:", gs_pairs - rk_pairs)

    # 5) If no exact surname matches in rankings, try fuzzy search in 'player'
    if rk_rows.empty:
        print("\nNo exact surname-match in rankings; fuzzy search in 'player' column:")
        fuzzy = rankings[rankings["player"].str.contains(surname, case=False, na=False)]
        if year is not None and "year" in fuzzy.columns:
            fuzzy = fuzzy[fuzzy["year"] == year]

        if fuzzy.empty:
            print("Still nothing â€” likely a real missing ranking entry.")
        else:
            display(
                fuzzy[["player", "surname", "tourn_key", TOURN_COL, "year", RANK_COL]]
                .sort_values(["year", "tourn_key", RANK_COL])
                .head(30)
            )


In [61]:
# 1) Create detailed unmatched table
unmatched_rows = []

for surname_col, role in PLAYER_ROLES:
    rank_col = f"{role}_rank"

    if rank_col not in result.columns:
        print(f"Warning: missing {rank_col} in result")
        continue

    # observations where no ranking was matched
    mask = result[rank_col].isna()

    tmp = result.loc[mask, [TOURN_COL, TOURN_CODE_COL, STAGE_COL, surname_col]].copy()
    tmp = tmp.rename(columns={surname_col: "surname"})
    tmp["role"] = role

    unmatched_rows.append(tmp)

# Combine all unmatched rows
unmatched_df = pd.concat(unmatched_rows, ignore_index=True)

print("Unmatched detailed dataset created.")
display(unmatched_df.head())

# 2) Create unique surname list
unmatched_surnames = sorted(unmatched_df["surname"].dropna().unique().tolist())

# 3) Save surnames to TXT
txt_path = os.path.join(INPUT_DIR, "unmatched_players.txt")
with open(txt_path, "w", encoding="utf-8") as f:
    for name in unmatched_surnames:
        f.write(name + "\n")

print(f"Saved unmatched surname list to:\n{txt_path}")

# 4) OPTIONAL: save full detailed table
xlsx_path = os.path.join(INPUT_DIR, "unmatched_details.xlsx")
unmatched_df.to_excel(xlsx_path, index=False)

print(f"Saved detailed dataset to:\n{xlsx_path}")


Unmatched detailed dataset created.


Unnamed: 0,tournament,tournament_code,stage,surname,role
0,Roland Garros,520,Quarter-Finals,kukushkin,losers_p1
1,Roland Garros,520,Quarter-Finals,ram,losers_p1
2,Roland Garros,520,Quarter-Finals,lajovic,losers_p1
3,Roland Garros,520,Quarter-Finals,rojer,losers_p1
4,Roland Garros,520,Round of 32,baena,losers_p1


Saved unmatched surname list to:
C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/unmatched_players.txt
Saved detailed dataset to:
C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/unmatched_details.xlsx


In [62]:
why_unmatched("Mannarino")
why_unmatched("Adrian Mannarino", year=2022)


== Debug for: Mannarino ==
surname key -> mannarino

-- Matches rows (tournament / tourn_key / year / code / stage) --


Unnamed: 0,tournament,tourn_key,year,tournament_code,stage,winners_p1_surname,winners_p2_surname,losers_p1_surname,losers_p2_surname
77,Australian Open,australian open,2019,580,Round of 32,bryan,bryan,mannarino,mies
119,Australian Open,australian open,2019,580,Round of 64,mannarino,mies,gojowczyk,ramos-vinolas
145,Australian Open,australian open,2020,580,Round of 32,krajicek,skugor,barrere,mannarino
170,Australian Open,australian open,2020,580,Round of 64,barrere,mannarino,humbert,tiafoe
236,Australian Open,australian open,2021,580,Round of 64,daniell,oswald,mannarino,simon
287,Australian Open,australian open,2022,580,Round of 64,dodig,melo,mannarino,nys
365,Australian Open,australian open,2023,580,Round of 64,goransson,huesler,halys,mannarino
482,Roland Garros,roland garros,2019,520,Round of 64,bonzi,hoang,humbert,mannarino
528,Roland Garros,roland garros,2020,520,Round of 32,bonzi,hoang,mannarino,paire
558,Roland Garros,roland garros,2020,520,Round of 64,mannarino,paire,tiafoe,withrow



-- Ranking rows for surname --


Unnamed: 0,player,surname,tourn_key,tournament,year,rank
3179,mannarino,mannarino,australian open,Australian Open,2018,289
9041,mannarino,mannarino,roland garros,Roland Garros,2018,239
14993,mannarino,mannarino,us open,US Open,2018,521
19820,mannarino,mannarino,wimbledon,Wimbledon,2018,335
3180,mannarino,mannarino,australian open,Australian Open,2019,276
9042,mannarino,mannarino,roland garros,Roland Garros,2019,202
14994,mannarino,mannarino,us open,US Open,2019,200
19821,mannarino,mannarino,wimbledon,Wimbledon,2019,198
3181,mannarino,mannarino,australian open,Australian Open,2020,226
9043,mannarino,mannarino,roland garros,Roland Garros,2020,214



(tourn_key, year) in MATCHES: {('australian open', 2021), ('roland garros', 2022), ('wimbledon', 2023), ('australian open', 2020), ('roland garros', 2021), ('wimbledon', 2022), ('us open', 2021), ('roland garros', 2020), ('us open', 2023), ('roland garros', 2019), ('us open', 2022), ('australian open', 2019), ('us open', 2019), ('wimbledon', 2018), ('australian open', 2023), ('australian open', 2022)}
(tourn_key, year) in RANKINGS: {('australian open', 2021), ('roland garros', 2022), ('us open', 2021), ('roland garros', 2018), ('australian open', 2018), ('us open', 2018), ('us open', 2022), ('australian open', 2022), ('roland garros', 2023), ('wimbledon', 2021), ('roland garros', 2019), ('australian open', 2019), ('us open', 2019), ('wimbledon', 2018), ('us open', 2023), ('australian open', 2023), ('roland garros', 2020), ('wimbledon', 2019), ('wimbledon', 2023), ('australian open', 2020), ('roland garros', 2021), ('us open', 2020)}
In MATCHES but not RANKINGS: {('wimbledon', 2022)}
=

Unnamed: 0,tournament,tourn_key,year,tournament_code,stage,winners_p1_surname,winners_p2_surname,losers_p1_surname,losers_p2_surname
287,Australian Open,australian open,2022,580,Round of 64,dodig,melo,mannarino,nys
643,Roland Garros,roland garros,2022,520,Round of 32,granollers,zeballos,mannarino,olivetti
679,Roland Garros,roland garros,2022,520,Round of 64,mannarino,olivetti,eysseric,halys
984,Wimbledon,wimbledon,2022,540,Round of 64,cabal,farah,humbert,mannarino
1301,US Open,us open,2022,560,Round of 16,arevalo,rojer,halys,mannarino
1320,US Open,us open,2022,560,Round of 32,halys,mannarino,bublik,rune
1345,US Open,us open,2022,560,Round of 64,halys,mannarino,mahut,roger-vasselin



-- Ranking rows for surname --


Unnamed: 0,player,surname,tourn_key,tournament,year,rank
3183,mannarino,mannarino,australian open,Australian Open,2022,164
9045,mannarino,mannarino,roland garros,Roland Garros,2022,407
14997,mannarino,mannarino,us open,US Open,2022,542



(tourn_key, year) in MATCHES: {('wimbledon', 2022), ('roland garros', 2022), ('us open', 2022), ('australian open', 2022)}
(tourn_key, year) in RANKINGS: {('roland garros', 2022), ('us open', 2022), ('australian open', 2022)}
In MATCHES but not RANKINGS: {('wimbledon', 2022)}


In [63]:
matches["year"]  = matches["year"].astype("Int64")
rankings["year"] = rankings["year"].astype("Int64")
