In [8]:
import os
import re
import glob
import unicodedata
import pandas as pd
from datetime import datetime
from getpass import getuser

# =============================
# CONFIG
# =============================
# All inputs and outputs live in one per-user data directory; the Windows
# account name is looked up at runtime so the path works for any user.
USER = getuser()
INPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/"
GRAND_SLAM_FILE = os.path.join(INPUT_DIR, "grand_slam_matches_2018_2023.xlsx")
OUTPUT_FILE = os.path.join(INPUT_DIR, "grand_slam_matches_with_ranks_2018_2023.xlsx")

# Ranking file columns (edit if headers differ). All three are mandatory:
# a ranking file missing any of them aborts the run with a KeyError.
RANKING_PLAYER_COL = "Player"     # e.g. "J. Cabal"
RANKING_DATE_COL   = "DateWeek"   # Monday of the ATP ranking week
RANKING_VALUE_COL  = "Rank"       # numeric rank

# Header names tried, in order, to locate the tournament and year columns of a
# ranking file. Matching is exact (case-sensitive), hence the spelled-out
# variants. A tournament column is required; when no year column is found the
# year is derived from RANKING_DATE_COL instead.
RANKING_TOURN_CANDIDATES = ["tournament", "Tournament", "event", "Event"]
RANKING_YEAR_CANDIDATES  = ["year", "Year", "season", "Season"]

# Keep ALL ranking columns by default (empty list). If you want a subset, list it here.
PROFILE_KEEP_WHITELIST = []  # e.g. ["W-L YTD","W-L Career","Titles YTD","Titles Career","DOB","Country","Plays","Coach"]

# If True, strip diacritics in names so "Pavlásek" == "Pavlasek"
STRIP_ACCENTS = True

# Optional manual disambiguations. Keys and values are both
# "first-initial lastname" lowercased. The map is consulted only when building
# Grand Slam-side keys (see gs_key): key = generated GS key, value = the key to
# use for the lookup in the ranking data.
MANUAL_KEY_MAP = {
    "a pavlasek": "a pavlasek",   # identity entry (no effect), kept as a template; context: "-2A. Pavlasek" vs "Adam Pavlasek"
    # add more here
}

# =============================
# Helpers: text normalization
# =============================
def strip_accents(s):
    """Return ``str(s)`` with combining marks removed; ``None`` is passed through.

    The text is decomposed with NFKD, which splits an accented letter into its
    base character plus combining marks, and the combining marks are dropped.
    """
    if s is None:
        return s
    decomposed = unicodedata.normalize("NFKD", str(s))
    kept = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return "".join(kept)

def normalize_spaces(s):
    """Collapse every whitespace run in ``str(s)`` to one space and trim both ends."""
    text = str(s)
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()

def _maybe_strip_accents(s):
    """Apply ``strip_accents`` only when the module-level STRIP_ACCENTS flag is truthy."""
    if STRIP_ACCENTS:
        return strip_accents(s)
    return s

def strip_parentheses_payload(name):
    """Delete every parenthesised chunk such as '(11)' or '(WC)' and trim the result.

    Missing values (anything ``pd.isna`` reports as NA) are returned unchanged.
    """
    if pd.isna(name):
        return name
    text = str(name)
    without_payload = re.sub(r"\([^)]*\)", "", text)
    return without_payload.strip()

def to_initial_last_key_from_full(fullname):
    """Build the match key from a full name: 'Juan Sebastian Cabal(11)' -> 'j cabal'.

    The key is "<first letter of first token> <last token>", lowercased.
    Parenthesised payloads are removed, hyphens are treated as token
    separators, and accents are stripped when STRIP_ACCENTS is on.
    Returns None for missing/blank names or when no token survives cleaning.
    """
    if pd.isna(fullname) or not str(fullname).strip():
        return None
    cleaned = strip_parentheses_payload(fullname)
    cleaned = normalize_spaces(cleaned).replace("-", " ")
    cleaned = _maybe_strip_accents(cleaned)
    tokens = [tok for tok in cleaned.split(" ") if tok]
    if not tokens:
        return None
    first, last = tokens[0], tokens[-1]
    return f"{first[0].lower()} {last.lower()}"

def clean_ranking_player_raw(raw):
    """Drop junk prefixes like '-2' or '1' from a ranking name: '-2A. Pavlasek' -> 'A. Pavlasek'.

    Only leading characters that are NOT letters are removed. The character
    class is Unicode-aware, so a non-ASCII initial ("É. Roger-Vasselin",
    "Ł. Kubot") is kept; an ASCII-only class would strip the initial as well
    and produce a wrong player key downstream.

    Returns None for missing/blank input and when nothing is left after
    cleaning (e.g. the cell held only digits or punctuation).
    """
    if pd.isna(raw) or not str(raw).strip():
        return None
    # [\W\d_] = everything that is not a (Unicode) letter.
    without_prefix = re.sub(r"^[\W\d_]+", "", str(raw))
    # Same whitespace normalisation as normalize_spaces, inlined so this
    # helper is self-contained.
    cleaned = re.sub(r"\s+", " ", without_prefix).strip()
    return cleaned or None

def to_initial_last_key_from_initialdot(name):
    """Build the match key from an abbreviated name: 'J. Cabal' or 'J Cabal' -> 'j cabal'.

    Dots are removed, hyphens become token separators, and accents are
    stripped when STRIP_ACCENTS is on. The key is
    "<first letter of first token> <last token>", lowercased; with a single
    token that token supplies both parts ('Cabal' -> 'c cabal').
    Returns None for missing/blank names or when no token survives cleaning.
    """
    if pd.isna(name) or not str(name).strip():
        return None
    cleaned = normalize_spaces(name)
    cleaned = cleaned.replace(".", "").replace("-", " ")
    cleaned = _maybe_strip_accents(cleaned)
    tokens = [tok for tok in cleaned.split(" ") if tok]
    if not tokens:
        return None
    # For a one-token name tokens[-1] is tokens[0], so one expression covers
    # both the single-token and the multi-token case.
    initial = tokens[0][0].lower()
    surname = tokens[-1].lower()
    return f"{initial} {surname}"

def gs_key(name):
    """Return the match key for a Grand Slam full name, after MANUAL_KEY_MAP overrides."""
    raw_key = to_initial_last_key_from_full(name)
    if raw_key in MANUAL_KEY_MAP:
        return MANUAL_KEY_MAP[raw_key]
    return raw_key

def rk_key(name):
    """Return the match key for a raw ranking-file player cell (junk prefix removed first)."""
    cleaned = clean_ranking_player_raw(name)
    return to_initial_last_key_from_initialdot(cleaned)

def normalize_tournament(s):
    """Turn a tournament name into a join key.

    Steps: collapse whitespace, lowercase, strip accents (when STRIP_ACCENTS
    is on), replace hyphen / en dash / em dash with a space, then collapse
    whitespace again. Missing values map to None.
    """
    if pd.isna(s):
        return None
    lowered = normalize_spaces(str(s)).lower()
    unaccented = _maybe_strip_accents(lowered)
    # Dashes become spaces so "us-open" and "us open" produce the same key.
    dash_free = re.sub(r"[–—-]", " ", unaccented)
    return normalize_spaces(dash_free)

# =============================
# Helpers: profile-field cleaning
# =============================
# Parenthesised payloads found in the ranking profile cells:
#   "(YYYY/MM/DD)" or "(YYYY-MM-DD)", "(NN kg)", "(NNN cm)".
DOB_PAT = re.compile(r"\((\d{4})[/-](\d{1,2})[/-](\d{1,2})\)")
KG_PAT  = re.compile(r"\((\d{2,3})\s*kg\)", flags=re.I)
CM_PAT  = re.compile(r"\((\d{2,3})\s*cm\)", flags=re.I)


def extract_dob_from_age(age_val):
    """Pull a '(YYYY/MM/DD)' date out of an age cell and return it as 'YYYY-MM-DD'.

    Returns None when the cell is missing, holds no parenthesised date, or
    the date cannot be turned into a timestamp (e.g. month 13).
    """
    if pd.isna(age_val):
        return None
    found = DOB_PAT.search(str(age_val))
    if found is None:
        return None
    try:
        year, month, day = (int(part) for part in found.groups())
        return pd.Timestamp(year, month, day).strftime("%Y-%m-%d")
    except Exception:
        return None


def extract_kg(weight_val):
    """Return the kilogram figure of a weight cell as float, or None if absent.

    A parenthesised '(NN kg)' is preferred; otherwise the first bare
    'NN kg' in the text is used.
    """
    if pd.isna(weight_val):
        return None
    text = str(weight_val)
    hit = KG_PAT.search(text)
    if hit is None:
        hit = re.search(r"(\d{2,3})\s*kg", text, flags=re.I)
    if hit is None:
        return None
    return float(hit.group(1))


def extract_cm(height_val):
    """Return the centimetre figure of a height cell as float, or None if absent.

    A parenthesised '(NNN cm)' is preferred; otherwise the first bare
    'NNN cm' in the text is used.
    """
    if pd.isna(height_val):
        return None
    text = str(height_val)
    hit = CM_PAT.search(text)
    if hit is None:
        hit = re.search(r"(\d{2,3})\s*cm", text, flags=re.I)
    if hit is None:
        return None
    return float(hit.group(1))

# =============================
# Load Grand Slam
# =============================
print("Loading Grand Slam matches…")
if not os.path.exists(GRAND_SLAM_FILE):
    raise FileNotFoundError(f"Grand Slam file not found: {GRAND_SLAM_FILE}")

gs = pd.read_excel(GRAND_SLAM_FILE)

# Season: use the sheet's own 'year' column, otherwise take the first
# four-digit number starting with 1 or 2 found in the 'date' text.
if "year" not in gs.columns:
    if "date" not in gs.columns:
        raise KeyError("Grand Slam file needs a 'year' or a parsable 'date' column.")
    gs["year"] = (
        gs["date"]
        .astype(str)
        .str.extract(r"([12][0-9]{3})")
        .astype(int)
    )

# Tournament join key (same normaliser is applied to the ranking files).
if "tournament" not in gs.columns:
    raise KeyError("Grand Slam file must have a 'tournament' column.")
gs["tourn_key"] = gs["tournament"].map(normalize_tournament)

# The row position becomes a stable per-match identifier.
gs = gs.reset_index().rename(columns={"index": "match_id"})

# =============================
# Load & clean ranking files
# =============================
print("Loading and stacking ranking doubles files…")
ranking_files = sorted(glob.glob(os.path.join(INPUT_DIR, "ranking_doubles_*.xlsx")))
if not ranking_files:
    raise FileNotFoundError("No ranking_doubles_*.xlsx files found in INPUT_DIR")

rk_rows = []
for fp in ranking_files:
    df = pd.read_excel(fp)

    # Required base columns
    missing = [c for c in [RANKING_PLAYER_COL, RANKING_DATE_COL, RANKING_VALUE_COL] if c not in df.columns]
    if missing:
        raise KeyError(f"Missing columns {missing} in ranking file {os.path.basename(fp)}")

    # Keep ALL columns or whitelist subset
    if PROFILE_KEEP_WHITELIST:
        # Besides the whitelist itself, keep every column the rest of this loop
        # needs: the three base columns and any tournament/year candidate.
        # Without them the tournament lookup below would fail on a file that
        # does contain the column.
        wanted = {
            RANKING_PLAYER_COL, RANKING_DATE_COL, RANKING_VALUE_COL,
            *RANKING_TOURN_CANDIDATES, *RANKING_YEAR_CANDIDATES,
            *PROFILE_KEEP_WHITELIST,
        }
        # A whitelisted derived field needs its raw source column to survive
        # until the profile cleaning below (which then drops the raw column).
        for derived, source in (("DOB", "Age"), ("WeightKg", "Weight"), ("HeightCm", "Height")):
            if derived in wanted:
                wanted.add(source)
        # Filter in the file's own column order so the output layout is deterministic.
        df = df[[c for c in df.columns if c in wanted]].copy()
    else:
        df = df.copy()

    # Profile cleaning: Age "(YYYY/MM/DD)" -> DOB, Weight -> WeightKg, Height -> HeightCm
    if "Age" in df.columns:
        dob_from_age = df["Age"].apply(extract_dob_from_age)
        if "DOB" in df.columns:
            # Keep an existing non-empty DOB; fill the gaps from the Age cell.
            df["DOB"] = df["DOB"].where(df["DOB"].notna() & (df["DOB"].astype(str).str.len() > 0), dob_from_age)
        else:
            df["DOB"] = dob_from_age
        df.drop(columns=["Age"], inplace=True)

    if "Weight" in df.columns:
        df["WeightKg"] = df["Weight"].apply(extract_kg)
        df.drop(columns=["Weight"], inplace=True)

    if "Height" in df.columns:
        df["HeightCm"] = df["Height"].apply(extract_cm)
        df.drop(columns=["Height"], inplace=True)

    # Date & key (unparsable dates become NaT and are dropped further down)
    df[RANKING_DATE_COL] = pd.to_datetime(df[RANKING_DATE_COL], errors="coerce").dt.tz_localize(None)
    df["player_key"] = df[RANKING_PLAYER_COL].map(rk_key)

    # tournament column: first candidate header present (exact, case-sensitive match)
    rk_tourn_col = next((c for c in RANKING_TOURN_CANDIDATES if c in df.columns), None)
    if rk_tourn_col is None:
        raise KeyError(f"No tournament column found in {os.path.basename(fp)}. Tried {RANKING_TOURN_CANDIDATES}")

    df["tourn_key"] = df[rk_tourn_col].map(normalize_tournament)

    # year column (use existing if present; else derive from DateWeek)
    rk_year_col = next((c for c in RANKING_YEAR_CANDIDATES if c in df.columns), None)
    if rk_year_col is None:
        df["year"] = df[RANKING_DATE_COL].dt.year
    else:
        df["year"] = pd.to_numeric(df[rk_year_col], errors="coerce")

    # Rows that cannot take part in the (player, tournament, year) join are useless.
    df = df.dropna(subset=["player_key", RANKING_DATE_COL, "tourn_key", "year"]).copy()

    rk_rows.append(df)

rk = pd.concat(rk_rows, ignore_index=True)

# For each (player_key, tourn_key, year), keep the latest DateWeek row to avoid duplicates.
# mergesort is stable, so rows with equal sort keys keep their file/row order.
rk = rk.sort_values([ "player_key", "tourn_key", "year", RANKING_DATE_COL ], kind="mergesort")
rk_latest = rk.groupby(["player_key", "tourn_key", "year"], as_index=False).tail(1).reset_index(drop=True)

# =============================
# Merge by (player_key, tournament, year)
# =============================
def attach_rank_for(gs_df, player_col, out_prefix):
    """Attach the ranking row of one player role to every match.

    Parameters
    ----------
    gs_df : DataFrame
        Match table; must contain 'match_id', 'tourn_key', 'year' and
        ``player_col``. It is not modified.
    player_col : str
        Column holding the player's full name (e.g. 'winners_p1').
    out_prefix : str
        Prefix for the added columns (e.g. 'winners_p1_').

    Returns
    -------
    DataFrame
        Copy of ``gs_df`` with one extra column ``f"{out_prefix}{c}"`` per
        non-key column ``c`` of the module-level ``rk_latest``. Matches
        without a usable key or without a ranking hit get NaN there.

    Raises
    ------
    pandas.errors.MergeError
        If ``rk_latest`` holds more than one row for a
        (player_key, tourn_key, year) combination.
    """
    key_cols = ["player_key", "tourn_key", "year"]

    tmp = gs_df[["match_id", "tourn_key", "year", player_col]].copy()
    tmp["player_key"] = tmp[player_col].map(gs_key)
    # Rows without a complete key cannot match anything; they are re-inserted
    # as NaN by the reindex below.
    tmp = tmp.dropna(subset=key_cols)

    # bring all ranking columns except the join keys
    rk_cols = [c for c in rk_latest.columns if c not in key_cols]

    merged = tmp.merge(
        rk_latest[key_cols + rk_cols],
        on=key_cols,
        how="left",
        validate="many_to_one",  # a duplicated ranking key would multiply match rows
    )

    # Align on match_id rather than on row position: after the dropna above
    # `merged` can be shorter than `gs_df`, so a positional assignment would
    # raise a length-mismatch error (or attach values to the wrong match).
    aligned = merged.set_index("match_id")[rk_cols].reindex(gs_df["match_id"])

    # add prefixed columns
    out = gs_df.copy()
    for c in rk_cols:
        out[f"{out_prefix}{c}"] = aligned[c].to_numpy()
    return out

print("Attaching ranks (tournament-year)…")
# (player column, output prefix) for each of the four players of a doubles match.
roles = [
    (name, f"{name}_")
    for name in ("winners_p1", "winners_p2", "losers_p1", "losers_p2")
]
for col, prefix in roles:
    if col in gs.columns:
        gs = attach_rank_for(gs, col, prefix)
    else:
        raise KeyError(f"Column '{col}' is missing from Grand Slam file.")

# =============================
# Unmatched names helper list
# =============================
# Collect every player name that has at least one match without an attached rank.
problem_names = set()
for col, prefix in roles:
    rank_col = f"{prefix}{RANKING_VALUE_COL}"   # e.g. winners_p1_Rank
    if rank_col not in gs.columns:
        continue
    unranked = gs.loc[gs[rank_col].isna(), col]
    problem_names.update(unranked.dropna().unique().tolist())

# Dump them, one per line, as a starting point for MANUAL_KEY_MAP entries.
if problem_names:
    problems_path = os.path.join(INPUT_DIR, "_unmatched_names_for_manual_mapping.txt")
    with open(problems_path, "w", encoding="utf-8") as f:
        f.writelines(str(n) + "\n" for n in sorted(problem_names))
    print(f"Wrote unmatched names list to: {problems_path}")

# =============================
# Save
# =============================
# Write the enriched match table to a single-sheet workbook.
print(f"Saving to {OUTPUT_FILE} …")
with pd.ExcelWriter(OUTPUT_FILE, engine="xlsxwriter") as writer:
    gs.to_excel(writer, sheet_name="matches_with_ranks", index=False)

print("Done.")


Loading Grand Slam matches…
Loading and stacking ranking doubles files…
Attaching ranks (tournament-year)…
Wrote unmatched names list to: C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/_unmatched_names_for_manual_mapping.txt
Saving to C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/grand_slam_matches_with_ranks_2018_2023.xlsx …
Done.


In [11]:
def why_unmatched(full_name="Adrian Mannarino"):
    """Print diagnostics explaining why a Grand Slam player lacks ranking matches.

    Shows, for ``full_name``: the generated player key, the Grand Slam rows
    mentioning the player's surname, the raw and the collapsed ranking rows
    for the key, and the (tourn_key, year) pairs present in the Grand Slam
    data but absent from the ranking data. When the key has no ranking rows
    at all, ranking 'Player' cells containing the surname are listed so a
    spelling difference can be spotted.

    Reads the module-level ``gs``, ``rk`` and ``rk_latest`` frames and uses the
    notebook's ``display``; it only prints and returns None.
    """
    print(f"== Debug for: {full_name} ==")
    pkey = gs_key(full_name)
    print("player_key  ->", pkey)
    if pkey is None:
        print("No usable name given — nothing to look up.")
        return

    # Surname exactly as the caller wrote it (seed/WC payload removed). All
    # substring searches below use it, so the helper works for any player
    # instead of one hard-coded name.
    surname = normalize_spaces(strip_parentheses_payload(full_name)).split(" ")[-1]

    # 1) Where the player appears in GS (and what keys we use there)
    player_cols = ["winners_p1", "winners_p2", "losers_p1", "losers_p2"]
    mask_gs = gs[player_cols].apply(
        # regex=False: the surname is matched literally, not as a pattern
        lambda names: names.astype(str).str.contains(surname, case=False, na=False, regex=False)
    ).any(axis=1)

    gs_rows = gs.loc[mask_gs, ["match_id", "tournament", "tourn_key", "year"] + player_cols]
    print("\n-- GS rows (tournament/tourn_key/year) --")
    display(gs_rows)

    # 2) What ranking rows we have for the player (raw and collapsed)
    rk_view_cols = ["player_key", "tourn_key", "year", RANKING_DATE_COL, RANKING_VALUE_COL, RANKING_PLAYER_COL]
    rk_raw_rows = rk.loc[rk["player_key"] == pkey, rk_view_cols]
    print("\n-- Ranking RAW rows for player --")
    display(rk_raw_rows.sort_values([RANKING_DATE_COL]).tail(30))

    rk_latest_rows = rk_latest.loc[rk_latest["player_key"] == pkey, rk_view_cols]
    print("\n-- Ranking LATEST rows per (player, tourn_key, year) --")
    display(rk_latest_rows.sort_values(["year", "tourn_key"]))

    # 3) Compare (tourn_key, year) sets
    gs_pairs = set(map(tuple, gs_rows[["tourn_key", "year"]].drop_duplicates().to_numpy())) if not gs_rows.empty else set()
    rk_pairs = set(map(tuple, rk_latest_rows[["tourn_key", "year"]].drop_duplicates().to_numpy())) if not rk_latest_rows.empty else set()
    print("\nGS pairs:", gs_pairs)
    print("RK pairs:", rk_pairs)
    print("Missing in RK:", gs_pairs - rk_pairs)

    # 4) If no rk rows at all, show ranking 'Player' spellings containing the surname
    if rk_raw_rows.empty:
        print("\nNo ranking rows for key:", pkey, " — checking string matches in ranking 'Player'...")
        display(
            rk[rk[RANKING_PLAYER_COL].astype(str).str.contains(surname, case=False, na=False, regex=False)]
              [[RANKING_PLAYER_COL, "tourn_key", "year", RANKING_DATE_COL, RANKING_VALUE_COL]]
              .sort_values(RANKING_DATE_COL)
              .tail(30)
        )

why_unmatched("Adrian Mannarino")


== Debug for: Adrian Mannarino ==
player_key  -> a mannarino

-- GS rows (tournament/tourn_key/year) --


Unnamed: 0,match_id,tournament,tourn_key,year,winners_p1,winners_p2,losers_p1,losers_p2
77,77,Australian Open,australian open,2019,Bob Bryan(4),Mike Bryan(4),Adrian Mannarino,Andreas Mies
119,119,Australian Open,australian open,2019,Adrian Mannarino,Andreas Mies,Peter Gojowczyk,Albert Ramos-Vinolas
145,145,Australian Open,australian open,2020,Austin Krajicek(16),Franko Skugor(16),Gregoire Barrere,Adrian Mannarino
170,170,Australian Open,australian open,2020,Gregoire Barrere,Adrian Mannarino,Ugo Humbert,Frances Tiafoe
236,236,Australian Open,australian open,2021,Marcus Daniell,Philipp Oswald,Adrian Mannarino,Gilles Simon
287,287,Australian Open,australian open,2022,Ivan Dodig(9),Marcelo Melo(9),Adrian Mannarino,Hugo Nys
365,365,Australian Open,australian open,2023,Andre Goransson,Marc-Andrea Huesler,Quentin Halys,Adrian Mannarino
482,482,Roland Garros,roland garros,2019,Benjamin Bonzi(WC),Antoine Hoang(WC),Ugo Humbert,Adrian Mannarino
528,528,Roland Garros,roland garros,2020,Benjamin Bonzi(WC),Antoine Hoang(WC),Adrian Mannarino,Benoit Paire
558,558,Roland Garros,roland garros,2020,Adrian Mannarino,Benoit Paire,Frances Tiafoe,Jackson Withrow



-- Ranking RAW rows for player --


Unnamed: 0,player_key,tourn_key,year,DateWeek,Rank,Player
288,a mannarino,australian open,2018,2018-01-15,289,A. Mannarino
537,a mannarino,roland garros,2018,2018-05-21,239,1A. Mannarino
1471,a mannarino,australian open,2019,2019-01-14,276,1A. Mannarino
1696,a mannarino,roland garros,2019,2019-05-20,202,1A. Mannarino
1991,a mannarino,wimbledon,2019,2019-07-01,198,1A. Mannarino
2292,a mannarino,us open,2019,2019-08-26,200,-2A. Mannarino
2617,a mannarino,australian open,2020,2020-01-20,226,A. Mannarino
2901,a mannarino,us open,2020,2020-08-31,211,-1A. Mannarino
3203,a mannarino,roland garros,2020,2020-09-21,214,-1A. Mannarino
3478,a mannarino,australian open,2021,2021-02-08,190,-2A. Mannarino



-- Ranking LATEST rows per (player, tourn_key, year) --


Unnamed: 0,player_key,tourn_key,year,DateWeek,Rank,Player
326,a mannarino,australian open,2018,2018-01-15,289,A. Mannarino
332,a mannarino,roland garros,2018,2018-05-21,239,1A. Mannarino
327,a mannarino,australian open,2019,2019-01-14,276,1A. Mannarino
333,a mannarino,roland garros,2019,2019-05-20,202,1A. Mannarino
337,a mannarino,us open,2019,2019-08-26,200,-2A. Mannarino
341,a mannarino,wimbledon,2019,2019-07-01,198,1A. Mannarino
328,a mannarino,australian open,2020,2020-01-20,226,A. Mannarino
334,a mannarino,roland garros,2020,2020-09-21,214,-1A. Mannarino
338,a mannarino,us open,2020,2020-08-31,211,-1A. Mannarino
329,a mannarino,australian open,2021,2021-02-08,190,-2A. Mannarino



GS pairs: {('roland garros', 2020), ('australian open', 2019), ('roland garros', 2021), ('wimbledon', 2018), ('us open', 2019), ('us open', 2022), ('us open', 2023), ('wimbledon', 2023), ('us open', 2021), ('roland garros', 2019), ('australian open', 2023), ('wimbledon', 2022), ('australian open', 2022), ('roland garros', 2022), ('australian open', 2021), ('australian open', 2020)}
RK pairs: {('roland garros', 2020), ('australian open', 2019), ('roland garros', 2021), ('us open', 2019), ('australian open', 2018), ('us open', 2023), ('wimbledon', 2019), ('us open', 2020), ('us open', 2021), ('wimbledon', 2023), ('roland garros', 2018), ('roland garros', 2019), ('wimbledon', 2021), ('australian open', 2023), ('australian open', 2022), ('australian open', 2021), ('roland garros', 2023), ('australian open', 2020)}
Missing in RK: {('roland garros', 2022), ('wimbledon', 2018), ('us open', 2022), ('wimbledon', 2022)}
