In [45]:
import os
import re
import glob
import pandas as pd
from datetime import datetime
from getpass import getuser

# -----------------------------
# CONFIG
# -----------------------------
USER = getuser()
INPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/"
GRAND_SLAM_FILE = os.path.join(INPUT_DIR, "grand_slam_matches_2018_2023.xlsx")
OUTPUT_FILE = os.path.join(INPUT_DIR, "grand_slam_matches_with_ranks_2018_2023.xlsx")

# Ranking file columns (edit if your headers differ)
RANKING_PLAYER_COL = "Player"     # e.g. "J. Cabal"
RANKING_DATE_COL   = "DateWeek"   # Monday of the ATP ranking week
RANKING_VALUE_COL  = "Rank"       # numeric rank

# Keep ALL ranking columns by default. If you want a subset, list it here.
PROFILE_KEEP_WHITELIST: list[str] = []  # e.g. ["W-L YTD","W-L Career","Titles YTD","Titles Career","DOB","Country","Plays","Coach"]

# -----------------------------
# Name normalization
# -----------------------------
def strip_parentheses_payload(name: str) -> str:
    if pd.isna(name):
        return name
    return re.sub(r"\([^)]*\)", "", str(name)).strip()

def normalize_spaces(s: str) -> str:
    return re.sub(r"\s+", " ", str(s)).strip()

def to_initial_last_key_from_full(fullname: str) -> str | None:
    """'Juan Sebastian Cabal(11)' -> 'j cabal'"""
    if pd.isna(fullname) or not str(fullname).strip():
        return None
    s = normalize_spaces(strip_parentheses_payload(fullname)).replace("-", " ")
    parts = [p for p in s.split(" ") if p]
    if not parts:
        return None
    return f"{parts[0][0].lower()} {parts[-1].lower()}"

def clean_ranking_player_raw(raw: str) -> str | None:
    if pd.isna(raw) or not str(raw).strip():
        return None
    s = re.sub(r"^[^A-Za-z]+", "", str(raw))  # drop prefixes like "-1"
    return normalize_spaces(s)

def to_initial_last_key_from_initialdot(name: str) -> str | None:
    """'J. Cabal'/'J Cabal' -> 'j cabal'"""
    if pd.isna(name) or not str(name).strip():
        return None
    s = normalize_spaces(name).replace(".", "").replace("-", " ")
    parts = [p for p in s.split(" ") if p]
    if not parts:
        return None
    if len(parts) == 1:
        return f"{parts[0][0].lower()} {parts[0].lower()}"
    return f"{parts[0][0].lower()} {parts[-1].lower()}"

# Manual disambiguations (fill as needed)
MANUAL_KEY_MAP: dict[str, str] = {
    # "j smith": "j smith",
}

def gs_key(name: str) -> str | None:
    key = to_initial_last_key_from_full(name)
    return MANUAL_KEY_MAP.get(key, key)

def rk_key(name: str) -> str | None:
    return to_initial_last_key_from_initialdot(clean_ranking_player_raw(name))

# -----------------------------
# Profile-field cleaning (ranking files)
# -----------------------------
DOB_PAT = re.compile(r"\((\d{4})[/-](\d{1,2})[/-](\d{1,2})\)")  # (YYYY/MM/DD) or (YYYY-MM-DD)
KG_PAT  = re.compile(r"\((\d{2,3})\s*kg\)", flags=re.I)
CM_PAT  = re.compile(r"\((\d{2,3})\s*cm\)", flags=re.I)

def extract_dob_from_age(age_val):
    # "32 (1993/07/04)" -> "1993-07-04"
    if pd.isna(age_val):
        return None
    m = DOB_PAT.search(str(age_val))
    if not m:
        return None
    y, mo, d = m.groups()
    try:
        return pd.Timestamp(int(y), int(mo), int(d)).strftime("%Y-%m-%d")
    except Exception:
        return None

def extract_kg(weight_val):
    # "183 lbs (83kg)" or "83 kg" -> 83.0
    if pd.isna(weight_val):
        return None
    s = str(weight_val)
    m = KG_PAT.search(s) or re.search(r"(\d{2,3})\s*kg", s, flags=re.I)
    return float(m.group(1)) if m else None

def extract_cm(height_val):
    # "6'3\" (191cm)" or "191 cm" -> 191.0
    if pd.isna(height_val):
        return None
    s = str(height_val)
    m = CM_PAT.search(s) or re.search(r"(\d{2,3})\s*cm", s, flags=re.I)
    return float(m.group(1)) if m else None

# -----------------------------
# Parse tournament start (from gs 'date')
# -----------------------------
def parse_tournament_start(date_str: str, fallback_year: int) -> pd.Timestamp:
    # e.g., "15-28 Jan, 2018" -> 2018-01-15
    if pd.isna(date_str):
        return pd.Timestamp(year=fallback_year, month=1, day=1)
    s = str(date_str)
    m = re.search(r"(\d{1,2})\s*[-–]\s*(\d{1,2})?\s*([A-Za-z]{3,})[^0-9]*([12][0-9]{3})", s)
    if m:
        day1 = int(m.group(1))
        mon  = m.group(3)
        year = int(m.group(4))
        try:
            month_num = datetime.strptime(mon[:3], "%b").month
        except ValueError:
            month_num = 1
        return pd.Timestamp(year=year, month=month_num, day=day1)
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%d-%m-%Y"):
        try:
            return pd.to_datetime(s, format=fmt)
        except Exception:
            pass
    return pd.Timestamp(year=fallback_year, month=1, day=1)

# -----------------------------
# Load Grand Slam
# -----------------------------
print("Loading Grand Slam matches…")
if not os.path.exists(GRAND_SLAM_FILE):
    raise FileNotFoundError(f"Grand Slam file not found: {GRAND_SLAM_FILE}")

gs = pd.read_excel(GRAND_SLAM_FILE)
if "year" not in gs.columns:
    if "date" in gs.columns:
        gs["year"] = gs["date"].astype(str).str.extract(r"([12][0-9]{3})").astype(int)
    else:
        raise KeyError("Grand Slam file needs a 'year' or a parsable 'date' column.")

if "match_date" not in gs.columns:
    gs["match_date"] = gs.apply(lambda r: parse_tournament_start(r.get("date", None), int(r["year"])), axis=1)
gs = gs.reset_index().rename(columns={"index": "match_id"})

# -----------------------------
# Load & clean ranking files
# -----------------------------
print("Loading and stacking ranking doubles files…")
ranking_files = sorted(glob.glob(os.path.join(INPUT_DIR, "ranking_doubles_*.xlsx")))
if not ranking_files:
    raise FileNotFoundError("No ranking_doubles_*.xlsx files found in INPUT_DIR")

rk_rows = []
for fp in ranking_files:
    df = pd.read_excel(fp)

    # Required columns check
    missing = [c for c in [RANKING_PLAYER_COL, RANKING_DATE_COL, RANKING_VALUE_COL] if c not in df.columns]
    if missing:
        raise KeyError(f"Missing columns {missing} in ranking file {os.path.basename(fp)}")

    # Keep ALL columns or whitelist subset
    if PROFILE_KEEP_WHITELIST:
        base_cols = {RANKING_PLAYER_COL, RANKING_DATE_COL, RANKING_VALUE_COL}
        keep_cols = list(base_cols.union([c for c in PROFILE_KEEP_WHITELIST if c in df.columns]))
        df = df[keep_cols].copy()
    else:
        df = df.copy()

    # Field cleaning BEFORE keying
    if "Age" in df.columns:
        dob_from_age = df["Age"].apply(extract_dob_from_age)
        if "DOB" in df.columns:
            df["DOB"] = df["DOB"].where(df["DOB"].notna() & (df["DOB"].astype(str).str.len() > 0), dob_from_age)
        else:
            df["DOB"] = dob_from_age
        df.drop(columns=["Age"], inplace=True)

    if "Weight" in df.columns:
        df["WeightKg"] = df["Weight"].apply(extract_kg)
        df.drop(columns=["Weight"], inplace=True)

    if "Height" in df.columns:
        df["HeightCm"] = df["Height"].apply(extract_cm)
        df.drop(columns=["Height"], inplace=True)

    # Date & key
    df[RANKING_DATE_COL] = pd.to_datetime(df[RANKING_DATE_COL], errors="coerce").dt.tz_localize(None)
    df["player_key"] = df[RANKING_PLAYER_COL].map(rk_key)
    df = df.dropna(subset=["player_key", RANKING_DATE_COL]).copy()

    rk_rows.append(df)

rk = pd.concat(rk_rows, ignore_index=True)
rk = rk.sort_values(["player_key", RANKING_DATE_COL], kind="mergesort").reset_index(drop=True)

# -----------------------------
# Per-player as-of attach (keep ALL ranking columns, prefixed)
# -----------------------------
def attach_rank_for(gs_df: pd.DataFrame, player_col: str, out_prefix: str) -> pd.DataFrame:
    tmp = gs_df[["match_id", "match_date", player_col]].copy()
    tmp["match_date"] = pd.to_datetime(tmp["match_date"], errors="coerce").dt.tz_localize(None)
    tmp["player_key"] = tmp[player_col].map(gs_key)
    tmp = tmp.dropna(subset=["player_key", "match_date"]).copy()

    # bring all ranking columns except helper key
    rk_cols = [c for c in rk.columns if c not in ("player_key",)]

    out_chunks = []
    for pkey, g in tmp.groupby("player_key", sort=False):
        g = g.sort_values("match_date").reset_index(drop=True)
        r = rk.loc[rk["player_key"] == pkey, rk_cols].copy()

        if r.empty:
            for c in rk_cols:
                g[f"{out_prefix}{c}"] = pd.NA
        else:
            r[RANKING_DATE_COL] = pd.to_datetime(r[RANKING_DATE_COL], errors="coerce").dt.tz_localize(None)
            r = r.dropna(subset=[RANKING_DATE_COL]).sort_values(RANKING_DATE_COL).reset_index(drop=True)

            merged = pd.merge_asof(
                g, r,
                left_on="match_date",
                right_on=RANKING_DATE_COL,
                direction="backward",
                allow_exact_matches=True,
            )
            for c in rk_cols:
                g[f"{out_prefix}{c}"] = merged[c].values

        out_chunks.append(g[["match_id"] + [f"{out_prefix}{c}" for c in rk_cols]])

    out_df = pd.concat(out_chunks, ignore_index=True) if out_chunks else pd.DataFrame({"match_id": []})
    return gs_df.merge(out_df, on="match_id", how="left")

# -----------------------------
# Attach for all four players
# -----------------------------
print("Attaching ranks…")
roles = [
    ("winners_p1", "winners_p1_"),
    ("winners_p2", "winners_p2_"),
    ("losers_p1",  "losers_p1_"),
    ("losers_p2",  "losers_p2_"),
]
for col, prefix in roles:
    if col not in gs.columns:
        raise KeyError(f"Column '{col}' is missing from Grand Slam file.")
    gs = attach_rank_for(gs, col, prefix)

# -----------------------------
# Unmatched names helper list
# -----------------------------
problem_names = set()
for col, prefix in roles:
    rank_col = f"{prefix}{RANKING_VALUE_COL}"   # e.g. winners_p1_Rank
    if rank_col in gs.columns:
        mask = gs[rank_col].isna()
        problem_names.update(gs.loc[mask, col].dropna().unique().tolist())
    else:
        date_col = f"{prefix}{RANKING_DATE_COL}"
        if date_col in gs.columns:
            mask = gs[date_col].isna()
            problem_names.update(gs.loc[mask, col].dropna().unique().tolist())

if problem_names:
    problems_path = os.path.join(INPUT_DIR, "_unmatched_names_for_manual_mapping.txt")
    with open(problems_path, "w", encoding="utf-8") as f:
        for n in sorted(problem_names):
            f.write(str(n) + "\n")
    print(f"Wrote unmatched names list to: {problems_path}")

# -----------------------------
# Save
# -----------------------------
print(f"Saving to {OUTPUT_FILE} …")
with pd.ExcelWriter(OUTPUT_FILE, engine="xlsxwriter") as xlw:
    gs.to_excel(xlw, index=False, sheet_name="matches_with_ranks")

print("Done.")


Loading Grand Slam matches…
Loading and stacking ranking doubles files…
Attaching ranks…


  out_df = pd.concat(out_chunks, ignore_index=True) if out_chunks else pd.DataFrame({"match_id": []})
  out_df = pd.concat(out_chunks, ignore_index=True) if out_chunks else pd.DataFrame({"match_id": []})
  out_df = pd.concat(out_chunks, ignore_index=True) if out_chunks else pd.DataFrame({"match_id": []})
  out_df = pd.concat(out_chunks, ignore_index=True) if out_chunks else pd.DataFrame({"match_id": []})
  out_df = pd.concat(out_chunks, ignore_index=True) if out_chunks else pd.DataFrame({"match_id": []})
  out_df = pd.concat(out_chunks, ignore_index=True) if out_chunks else pd.DataFrame({"match_id": []})
  out_df = pd.concat(out_chunks, ignore_index=True) if out_chunks else pd.DataFrame({"match_id": []})
  out_df = pd.concat(out_chunks, ignore_index=True) if out_chunks else pd.DataFrame({"match_id": []})


Wrote unmatched names list to: C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/_unmatched_names_for_manual_mapping.txt
Saving to C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/grand_slam_matches_with_ranks_2018_2023.xlsx …
Done.
