In [18]:
import os
import re
import numpy as np
import pandas as pd
from getpass import getuser

In [19]:
# =============================
# CONFIG
# =============================
USER = getuser()

# Directory holding the ATP spreadsheets. The default is the original per-user
# Windows checkout location; set the TENNIS_ATP_DATA_DIR environment variable to
# run the notebook from another machine or OS without editing this cell.
# An empty value falls back to the default.
INPUT_DIR = (
    os.environ.get("TENNIS_ATP_DATA_DIR")
    or f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/"
)

# Input workbooks and the output workbook all live in INPUT_DIR.
MEN_MATCHES_FILE  = os.path.join(INPUT_DIR, "men_matches.xlsx")
MEN_RANKINGS_FILE = os.path.join(INPUT_DIR, "men_rankings.xlsx")
OUTPUT_FILE       = os.path.join(INPUT_DIR, "men_matches_with_ranks.xlsx")

# Column names referenced by the cells below.
TOURN_COL       = "tournament"
TOURN_CODE_COL  = "tournament_code"
STAGE_COL       = "stage"
RANK_COL        = "rank_num"

# (surname column in matches, role label) for each of the four players of a match.
PLAYER_ROLES = [
    ("winners_p1_surname", "winners_p1"),
    ("winners_p2_surname", "winners_p2"),
    ("losers_p1_surname",  "losers_p1"),
    ("losers_p2_surname",  "losers_p2"),
]

In [20]:
# =============================
# Helpers
# =============================

def normalize_tournament(s):
    """
    Build a comparison key from a tournament name.

    The value is stringified, lower-cased, and every run of whitespace
    (including leading/trailing) is collapsed to a single space.
    Missing values (None / NaN) yield None.
    """
    if not pd.isna(s):
        words = str(s).lower().split()
        return " ".join(words)
    return None

def extract_initial(s):
    """
    Return the first alphabetic character of a name string, lower-cased.

    Works with 'Mike Bryan(6)', 'R. Ram', etc. Any Unicode letter counts, so
    'Édouard Roger-Vasselin' yields 'é' rather than skipping ahead to the
    first ASCII letter ('d'). This matches how the rankings table derives its
    initial (first character of the lower-cased `player_initial`).

    Returns None for missing values (None / NaN) and for strings that contain
    no letter at all.
    """
    if pd.isna(s):
        return None
    for ch in str(s):
        # str.isalpha is Unicode-aware, unlike the ASCII-only class [A-Za-z].
        if ch.isalpha():
            return ch.lower()
    return None

def get_unique_rankings(rk_df, keys):
    """
    Restrict a rankings frame to rows whose `keys` combination occurs exactly once.

    Key combinations containing missing values are counted as well
    (groupby runs with dropna=False). The result comes from an inner merge,
    so it carries a fresh index. Used for level-1 and level-2 matching.
    """
    key_sizes = (
        rk_df
        .groupby(keys, dropna=False)[RANK_COL]
        .size()
        .rename("n")
        .reset_index()
    )
    singleton_keys = key_sizes.loc[key_sizes["n"] == 1, keys]
    return pd.merge(rk_df, singleton_keys, how="inner", on=keys)

In [21]:
# =============================
# Load data
# =============================
# Read both workbooks (paths come from the CONFIG cell).
matches, rankings = (
    pd.read_excel(workbook)
    for workbook in (MEN_MATCHES_FILE, MEN_RANKINGS_FILE)
)

In [22]:
# =============================
# tourn_key in both tables
# =============================
# Both tables get the same treatment: keep an existing 'tourn_key', otherwise
# derive it from the tournament-name column via normalize_tournament.
for table_name, table in (("matches", matches), ("rankings", rankings)):
    if "tourn_key" in table.columns:
        continue
    if TOURN_COL not in table.columns:
        raise KeyError(f"'tourn_key' or '{TOURN_COL}' must be in {table_name}.")
    table["tourn_key"] = table[TOURN_COL].map(normalize_tournament)


In [23]:
# =============================
# Ensure 'year' exists
# =============================
# matches must already carry a year; it is never derived.
if "year" not in matches.columns:
    raise KeyError("matches needs a 'year' column.")

# rankings may instead provide 'dateweek', from which the calendar year is taken
# (unparseable dates become missing years).
rankings_has_year = "year" in rankings.columns
if not rankings_has_year and "dateweek" not in rankings.columns:
    raise KeyError("rankings needs a 'year' column or 'dateweek' to derive it.")
if not rankings_has_year:
    rankings["year"] = pd.to_datetime(rankings["dateweek"], errors="coerce").dt.year


In [24]:
# =============================
# Create unique match_id
# =============================
# Renumber rows 0..n-1 and expose that position as the match identifier.
matches = (
    matches
    .reset_index(drop=True)
    .assign(match_id=lambda frame: frame.index)
)

In [25]:
# =============================
# Create initials in matches if missing
# =============================
for surname_col, role in PLAYER_ROLES:
    base = surname_col.replace("_surname", "")   # e.g. 'winners_p1'
    initial_col = f"{base}_initial"
    if initial_col in matches.columns:
        continue  # nothing to derive for this role

    # Source preference: dedicated '<role>_name' column first, then the raw
    # role column itself (e.g. 'winners_p1').
    name_col = f"{base}_name"
    source_col = next(
        (candidate for candidate in (name_col, base) if candidate in matches.columns),
        None,
    )
    if source_col is None:
        raise KeyError(
            f"Cannot create {initial_col}: neither '{name_col}' nor '{base}' found in matches."
        )

    matches[initial_col] = matches[source_col].apply(extract_initial)


In [26]:

# =============================
# Prepare rankings: surname, initial, rank, dtypes
# =============================

def _clean_text(series):
    """Stringify, lower-case and strip a Series."""
    return series.astype(str).str.lower().str.strip()


# surname / initial are taken directly from player_surname / player_initial
if not {"player_surname", "player_initial"}.issubset(rankings.columns):
    raise KeyError("rankings must contain 'player_surname' and 'player_initial' columns.")

# NOTE(review): astype(str) turns missing values into the text "nan", so a
# missing player_initial ends up as initial "n" — confirm this cannot collide
# with genuine "n" initials in the data.
rankings["surname"] = _clean_text(rankings["player_surname"])
rankings["initial"] = _clean_text(rankings["player_initial"]).str[0]

# Second pass over all text keys (also brings tourn_key into the same form).
for key_name in ("tourn_key", "surname", "initial"):
    rankings[key_name] = _clean_text(rankings[key_name])

rankings["year"] = pd.to_numeric(rankings["year"], errors="coerce").astype("Int64")

# Drop any trailing non-digit characters from the rank text before converting;
# whatever still fails to parse becomes missing.
rank_text = (
    rankings[RANK_COL]
    .astype(str)
    .str.strip()
    .str.replace(r"[^0-9]+$", "", regex=True)
)
rankings[RANK_COL] = pd.to_numeric(rank_text, errors="coerce").astype("Int64")


In [27]:

# Players with several ranking rows per (tournament, year), keyed by
# (surname, initial). The value is the set of years in which
# collapse_special_rank keeps the numerically smallest rank (idxmin);
# in every other year it keeps the numerically largest rank (idxmax).
SPECIAL_RANK_RULES = {
    ("lopez", "m"): set(range(2018, 2022)),
    ("chung", "h"): {2018},
    ("zhang", "z"): {2019, 2023},
    ("martin", "a"): {2021},
    ("smith", "j"): {2022, 2023},
    ("martinez", "l"): {2023},
    ("nakashima", "b"): {2023},
}

def collapse_special_rank(rk_df, special_rules, rank_col="rank_num"):
    """
    Collapse duplicate ranking rows for the players listed in `special_rules`.

    Parameters
    ----------
    rk_df : DataFrame
        Rankings with at least the columns 'tourn_key', 'surname', 'initial',
        'year' and `rank_col`.
    special_rules : dict
        Maps (surname, initial) -> set of years. May be empty.
    rank_col : str
        Name of the rank column.

    Returns
    -------
    DataFrame
        `rk_df` itself when there are no rules or no listed player is present.
        Otherwise a new frame with a fresh 0..n-1 index in which every
        (tourn_key, surname, initial, year) group of a listed player is
        reduced to one row:

          - year in the player's year set -> row with the numerically smallest rank
          - any other year                -> row with the numerically largest rank
          - no usable rank in the group   -> first row of the group

        Rows of other players are kept untouched. Rows of listed players whose
        grouping key has a missing component (e.g. no year) are also kept
        untouched; they are not dropped.
    """

    if not special_rules:
        return rk_df

    # Work on a fresh 0..n-1 index so that every label returned by
    # idxmin/idxmax below identifies exactly one row, even if the caller's
    # index had duplicates. The output index is reset anyway.
    rk = rk_df.reset_index(drop=True)

    # Boolean mask of rows belonging to any listed player.
    mask_special = pd.Series(False, index=rk.index)
    for s, i in special_rules:
        mask_special |= (rk["surname"] == s) & (rk["initial"] == i)
    # Nullable string columns produce NA in the comparison; NA means "not listed".
    mask_special = mask_special.fillna(False).astype(bool)

    if not mask_special.any():
        return rk_df

    special = rk[mask_special].copy()
    rest    = rk[~mask_special]

    grp_keys = ["tourn_key", "surname", "initial", "year"]

    # Ensure rank is numeric for min/max
    special[rank_col] = pd.to_numeric(special[rank_col], errors="coerce")

    # groupby silently discards groups whose key contains a missing value, which
    # would delete those ranking rows. Set them aside and pass them through.
    has_full_key = special[grp_keys].notna().all(axis=1)
    keyless = special[~has_full_key]
    keyed   = special[has_full_key]

    keep_idx = []
    for (_tk, s, i, yr), g in keyed.groupby(grp_keys):
        years_min = special_rules.get((s, i), set())

        if g[rank_col].notna().any():
            idx = g[rank_col].idxmin() if yr in years_min else g[rank_col].idxmax()
        else:
            # all ranks missing: nothing to compare, keep the first row
            idx = g.index[0]

        keep_idx.append(idx)

    special_single = special.loc[keep_idx]

    # Combine back, leaving out empty pieces so they cannot affect result dtypes.
    # `special` is non-empty here, so at least one piece always remains.
    pieces = [part for part in (rest, special_single, keyless) if not part.empty]
    return pd.concat(pieces, ignore_index=True)

# Collapse the duplicate ranking rows of the players in SPECIAL_RANK_RULES.
rankings = collapse_special_rank(rankings, SPECIAL_RANK_RULES, rank_col=RANK_COL)


# Full player key (including initial) plus the rank column; every other
# rankings column is treated as a profile attribute to carry over to matches.
key_cols_full = ["tourn_key", "surname", "initial", "year"]
rk_key_cols   = [*key_cols_full, RANK_COL]
profile_cols  = list(filter(lambda name: name not in rk_key_cols, rankings.columns))

In [28]:
def assign_twins_min_rank_per_match(
    merged_long,
    rankings,
    twins,
    profile_cols,
    rank_col="rank",
    roles=("losers_p1", "losers_p2"),
):
    """
    Fill in rankings for players who share surname and initial (e.g. twins).

    For every (surname, initial) pair in `twins`, look at the rows of
    `merged_long` for that player that still have no value in `rank_col` and
    whose 'role' is in `roles`. These rows are processed one match at a time
    (grouped by tourn_key, year, match_id). For each match, the player's
    ranking rows of the same tourn_key and year that have a rank are sorted by
    rank ascending and handed out one per unmatched row, copying `rank_col`
    and every column in `profile_cols`.

    - If a match has more unmatched rows than ranking rows, only as many rows
      as there are ranking rows are filled.
    - Rows that already have a rank are never touched.
    - `merged_long` is not modified; a filled copy is returned.
    """
    filled = merged_long.copy()
    copy_cols = [rank_col] + profile_cols
    per_match = ["tourn_key", "year", "match_id"]

    for raw_surname, raw_initial in twins:
        s = raw_surname.lower()
        i = raw_initial.lower()

        # ranked rows of this player across all tournaments/years
        is_player = (rankings["surname"] == s) & (rankings["initial"] == i)
        ranked = rankings[is_player & rankings[rank_col].notna()].copy()
        if ranked.empty:
            continue

        # rows of this player still lacking a rank, limited to the given roles
        pending_mask = (
            (filled["surname"] == s)
            & (filled["initial"] == i)
            & filled[rank_col].isna()
            & filled["role"].isin(roles)
        )
        if not pending_mask.any():
            continue

        # snapshot taken once; assignments below do not alter the grouping
        for (event, season, _match), pending in filled[pending_mask].groupby(per_match):
            same_event = (ranked["tourn_key"] == event) & (ranked["year"] == season)
            pool = ranked[same_event].sort_values(rank_col)
            if pool.empty:
                continue  # no ranking rows for this event/year

            n_fill = min(len(pending), len(pool))
            donors = pool.head(n_fill).iterrows()

            # pair unmatched rows with ranking rows in order, best rank first
            for target, (_, donor) in zip(pending.index[:n_fill], donors):
                for col in copy_cols:
                    filled.at[target, col] = donor[col]

    return filled


# =============================
# Build players_long with surname, initial, rank_hint
# =============================
# One row per (match, role): the four player slots of every match are stacked
# into a single long table with uniform column names.
long_rows = []

for surname_col, role in PLAYER_ROLES:
    if surname_col not in matches.columns:
        raise KeyError(f"Missing column: {surname_col}")

    base = surname_col.replace("_surname", "")     # winners_p1, etc.
    initial_col = f"{base}_initial"
    rank_col    = f"{base}_ranking"

    for required in (initial_col, rank_col):
        if required not in matches.columns:
            raise KeyError(f"Missing column: {required}")

    role_players = (
        matches[["match_id", "tourn_key", "year", surname_col, initial_col, rank_col]]
        .rename(columns={
            surname_col: "surname",
            initial_col: "initial",
            rank_col:    "rank_hint",
        })
    )

    # NOTE(review): astype(str) renders missing values as text ("nan"/"None"),
    # so a missing initial becomes "n" here — confirm this is acceptable.
    role_players["surname"] = role_players["surname"].astype(str).str.lower().str.strip()
    role_players["initial"] = role_players["initial"].astype(str).str.lower().str.strip().str[0]

    # strip trailing non-digits from the ranking text; unparseable -> missing
    hint_text = (
        role_players["rank_hint"]
        .astype(str)
        .str.strip()
        .str.replace(r"[^0-9]+$", "", regex=True)
    )
    role_players["rank_hint"] = pd.to_numeric(hint_text, errors="coerce").astype("Int64")

    role_players["role"] = role
    long_rows.append(role_players)

players_long = (
    pd.concat(long_rows, ignore_index=True)
    .sort_values(["match_id", "role"], ignore_index=True)
)

# align dtypes on keys
for key_name in ("tourn_key", "surname", "initial"):
    players_long[key_name] = players_long[key_name].astype(str).str.lower().str.strip()
for int_name in ("year", "rank_hint"):
    players_long[int_name] = pd.to_numeric(players_long[int_name], errors="coerce").astype("Int64")

In [29]:


# =============================
# Hierarchical merge:
# 1) surname-only  2) surname+initial  3) surname+initial+rank
# =============================
# Players are matched to ranking rows in three passes of increasing key
# specificity. Rows matched in one pass are removed from `unmatched` before the
# next pass runs; every pass appends its matches to `matched_parts`.

# Normalise rankings types so the merge keys have the same form/dtype as players_long
for col in ["tourn_key", "surname", "initial"]:
    rankings[col] = rankings[col].astype(str).str.lower().str.strip()
rankings["year"]    = pd.to_numeric(rankings["year"], errors="coerce").astype("Int64")
rankings[RANK_COL]  = pd.to_numeric(rankings[RANK_COL], errors="coerce").astype("Int64")

# `unmatched` shrinks after each level; `matched_parts` collects the hits
unmatched     = players_long.copy()
matched_parts = []

# ---- LEVEL 1: (tourn_key, surname, year) ----
# Only ranking rows whose (tourn_key, surname, year) occurs exactly once are
# eligible, so the left merge cannot duplicate player rows.
key1 = ["tourn_key", "surname", "year"]
rk1  = get_unique_rankings(rankings, key1)

m1 = unmatched.merge(
    rk1[key1 + [RANK_COL] + profile_cols],
    on=key1,
    how="left",
    indicator=True,
    suffixes=("", "_rk")
)

# `_merge` == "both" marks a hit; "left_only" rows go on to the next level
matched1  = m1[m1["_merge"] == "both"].drop(columns="_merge")
unmatched = m1[m1["_merge"] == "left_only"][players_long.columns]  # drop ranking cols
matched_parts.append(matched1)

# ---- LEVEL 2: (tourn_key, surname, initial, year) ----
# Same procedure with the initial added to the key, again restricted to
# ranking rows that are unique on that key.
key2 = ["tourn_key", "surname", "initial", "year"]
rk2  = get_unique_rankings(rankings, key2)

m2 = unmatched.merge(
    rk2[key2 + [RANK_COL] + profile_cols],
    on=key2,
    how="left",
    indicator=True,
    suffixes=("", "_rk")
)

matched2  = m2[m2["_merge"] == "both"].drop(columns="_merge")
unmatched = m2[m2["_merge"] == "left_only"][players_long.columns]
matched_parts.append(matched2)

# ---- LEVEL 3: (tourn_key, surname, initial, year, rank_hint) ----
# For players that carry a rank_hint, match it against the ranking's own rank.
# All ranking rows with a rank are candidates here (no uniqueness filter).
u3 = unmatched[unmatched["rank_hint"].notna()].copy()
rk3 = rankings[rankings[RANK_COL].notna()].copy()

m3 = u3.merge(
    rk3[key2 + [RANK_COL] + profile_cols],
    left_on=key2 + ["rank_hint"],
    right_on=key2 + [RANK_COL],
    how="left",
    indicator=True,
    suffixes=("", "_rk")
)

matched3 = m3[m3["_merge"] == "both"].drop(columns="_merge")
matched_parts.append(matched3)

# rows still unmatched after level 3: everything left in `unmatched` whose
# (match_id, role) did not get a level-3 hit (includes rows without rank_hint)
matched3_idx = matched3.set_index(["match_id", "role"]).index
unmatched_final = unmatched[
    ~unmatched.set_index(["match_id", "role"]).index.isin(matched3_idx)
].copy()

# ensure unmatched_final has ranking/profile columns (as NaN)
for col in [RANK_COL] + profile_cols:
    if col not in unmatched_final.columns:
        unmatched_final[col] = pd.NA

# combined long table with rankings attached (unmatched rows keep missing rank/profile)
merged_long = pd.concat(matched_parts + [unmatched_final], ignore_index=True)

# =============================================
# Special handling for the Ratiwatana twins
# =============================================
# Both players share surname and initial, so the key-based merge cannot tell
# them apart; hand out the available ranking rows per match instead.
TWINS = {("ratiwatana", "s")}  # (surname, initial) in lowercase

merged_long = assign_twins_min_rank_per_match(
    merged_long,
    rankings,
    TWINS,
    profile_cols,
    rank_col=RANK_COL,
    roles=("losers_p1", "losers_p2"),  # extend to winners_* if needed
)


# =============================
# Build wide profiles per role (index = match_id)
# =============================
# For each role: take that role's rows, keep the first row per match_id, and
# prefix the rank/profile columns with the role name
# (e.g. winners_p1_<RANK_COL>, winners_p1_<profile column>).
wide_cols = [RANK_COL] + profile_cols

role_frames = [
    merged_long.loc[merged_long["role"] == role]
    .drop_duplicates(subset=["match_id"])
    .set_index("match_id")[wide_cols]
    .add_prefix(f"{role}_")
    for _surname_col, role in PLAYER_ROLES
]

# side-by-side: one block of columns per role, aligned on match_id
profiles_wide = pd.concat(role_frames, axis=1)

# =============================
# Join back to matches => final result
# =============================
# Left join keeps every match, including those with no ranking attached.
matches_by_id = matches.set_index("match_id")
result = matches_by_id.join(profiles_wide, how="left").reset_index()  # match_id stays as a column

# =============================
# Save output
# =============================
result.to_excel(OUTPUT_FILE, index=False)
print(f"Saved merged file with rankings:\n{OUTPUT_FILE}")


Saved merged file with rankings:
C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/men_matches_with_ranks.xlsx


In [30]:

def build_unmatched_report(merged_long, rankings, key_cols, rank_col="rank_num"):
    """
    Return a dataframe with one row per unmatched player (match_id, role)
    and diagnostics about why the ranking was not attached.

    Parameters
    ----------
    merged_long : DataFrame
        Long player table; must contain match_id, role, tourn_key, year,
        surname, initial, rank_hint and `rank_col`.
    rankings : DataFrame
        Rankings table with the columns in `key_cols`, plus tourn_key, surname,
        initial, year and `rank_col`. It is NOT modified: normalisation is done
        on an internal copy.
    key_cols : list of str
        Columns identifying a player entry in both tables.
    rank_col : str
        Name of the attached rank column.

    Returns
    -------
    DataFrame
        Columns: match_id, role, tourn_key, year, surname, initial, rank_hint,
        n_candidates, reason. If every player has a rank, a message is printed
        and the empty selection (without n_candidates/reason) is returned.

    Logic:
      - n_candidates = number of rows in rankings sharing `key_cols`
      - If n_candidates == 0  -> key does not exist in rankings
      - If n_candidates > 1 and rank_hint is NaN -> ambiguous key, no rank to disambiguate
      - If n_candidates > 1 and rank_hint not NaN -> multiple candidates; rank did not match any
      - If n_candidates == 1 but still unmatched -> generic data inconsistency flag
    """

    # 1) players with NO rank attached in merged_long
    unmatched = merged_long[
        merged_long[rank_col].isna()
    ][["match_id", "role", "tourn_key", "year", "surname", "initial", "rank_hint"]].copy()

    if unmatched.empty:
        print("All players matched a ranking row.")
        return unmatched

    # Normalise on a private copy: assigning into `rankings` directly would
    # silently re-type the caller's key columns as a side effect of reporting.
    rk = rankings.copy()

    # Normalise key columns to strings so that grouping/merging is stable
    for col in ["tourn_key", "surname", "initial"]:
        unmatched[col] = unmatched[col].astype("string").str.lower().str.strip()
        rk[col]        = rk[col].astype("string").str.lower().str.strip()

    unmatched["year"] = pd.to_numeric(unmatched["year"], errors="coerce").astype("Int64")
    rk["year"]        = pd.to_numeric(rk["year"], errors="coerce").astype("Int64")

    # 2) count number of ranking candidates per key (ignoring rank)
    cand_counts = (
        rk
        .groupby(key_cols, dropna=False)[rank_col]
        .size()
        .reset_index(name="n_candidates")
    )

    # merge candidate count onto unmatched players; keys absent from rankings get 0
    diag = unmatched.merge(cand_counts, on=key_cols, how="left")
    diag["n_candidates"] = diag["n_candidates"].fillna(0).astype(int)

    # 3) classify the reason
    conds = [
        diag["n_candidates"] == 0,
        (diag["n_candidates"] > 1) & diag["rank_hint"].isna(),
        (diag["n_candidates"] > 1) & diag["rank_hint"].notna(),
        (diag["n_candidates"] == 1),
    ]
    choices = [
        "no key in rankings (surname+initial+year+tourn_key not present)",
        "ambiguous key (multiple players share key, no rank_hint to disambiguate)",
        "multiple candidates; rank_hint did not match any ranking row",
        "single candidate but still unmatched (check data/merge logic)",
    ]

    diag["reason"] = np.select(conds, choices, default="unclassified")

    # final tidy columns
    cols_order = [
        "match_id", "role",
        "tourn_key", "year",
        "surname", "initial", "rank_hint",
        "n_candidates", "reason"
    ]
    diag = diag[cols_order]

    return diag


In [31]:
# Full player key used to look up ranking candidates for each unmatched player.
key_cols = ["tourn_key", "surname", "initial", "year"]

unmatched_report = build_unmatched_report(
    merged_long,
    rankings,
    key_cols,
    rank_col=RANK_COL,  # "rank_num" per the CONFIG cell
)

display(unmatched_report)
print("Total unmatched players:", len(unmatched_report))


Unnamed: 0,match_id,role,tourn_key,year,surname,initial,rank_hint,n_candidates,reason
0,1,losers_p2,australian open,2018,struff,j,,0,no key in rankings (surname+initial+year+tourn...
1,5,losers_p2,australian open,2018,hewitt,l,,0,no key in rankings (surname+initial+year+tourn...
2,6,winners_p2,australian open,2018,struff,j,,0,no key in rankings (surname+initial+year+tourn...
3,8,losers_p1,australian open,2018,chardy,j,,0,no key in rankings (surname+initial+year+tourn...
4,12,losers_p1,australian open,2018,andujar,p,,0,no key in rankings (surname+initial+year+tourn...
...,...,...,...,...,...,...,...,...,...
1582,1402,winners_p2,us open,2023,nishioka,y,,0,no key in rankings (surname+initial+year+tourn...
1583,1403,losers_p1,us open,2023,fils,a,,0,no key in rankings (surname+initial+year+tourn...
1584,1403,losers_p2,us open,2023,assche,l,,0,no key in rankings (surname+initial+year+tourn...
1585,1404,winners_p1,us open,2023,tsitsipas,s,,0,no key in rankings (surname+initial+year+tourn...


Total unmatched players: 1587


In [32]:
unmatched_report.loc[unmatched_report["surname"].eq("nakashima")]


Unnamed: 0,match_id,role,tourn_key,year,surname,initial,rank_hint,n_candidates,reason
1191,1048,losers_p2,wimbledon,2023,nakashima,b,,0,no key in rankings (surname+initial+year+tourn...
1426,1276,losers_p2,us open,2021,nakashima,b,,0,no key in rankings (surname+initial+year+tourn...


In [33]:
def inspect_player_keys(surname="surname", initial="i"):
    """
    Print a side-by-side diagnostic for one player.

    Shows the player's rows in `players_long` and in `rankings` (both matched
    on lower-cased surname and initial), then prints the (tourn_key, year)
    pairs found in each table and the pairs that occur in the matches but not
    in the rankings. Reads the notebook-level `players_long`, `rankings` and
    `RANK_COL`; returns nothing.
    """
    s, i = surname.lower(), initial.lower()

    def _rows_for(frame):
        # rows of `frame` belonging to the requested player
        return frame[(frame["surname"] == s) & (frame["initial"] == i)].copy()

    def _event_pairs(frame):
        # distinct (tourn_key, year) tuples; an empty frame yields an empty set
        if frame.empty:
            return set()
        return set(map(tuple, frame[["tourn_key", "year"]].drop_duplicates().to_numpy()))

    # rows for this player in matches (all roles)
    pl = _rows_for(players_long)

    print("=== In players_long (matches) ===")
    if not pl.empty:
        display(
            pl[["match_id", "role", "tourn_key", "year", "rank_hint"]]
            .sort_values(["year", "tourn_key", "role"])
        )
    else:
        print("No rows for", s, i)

    # rows for this player in rankings
    rk = _rows_for(rankings)

    print("\n=== In rankings ===")
    if not rk.empty:
        display(
            rk[["tourn_key", "year", RANK_COL]]
            .sort_values(["year", "tourn_key", RANK_COL])
        )
    else:
        print("No rows for", s, i, "in rankings.")

    # compare (tourn_key, year) pairs
    match_pairs = _event_pairs(pl)
    rank_pairs  = _event_pairs(rk)

    print("\n(tourn_key, year) in MATCHES:", match_pairs)
    print("(tourn_key, year) in RANKINGS:", rank_pairs)
    print("In MATCHES but not RANKINGS:", match_pairs - rank_pairs)




In [34]:
# Example: compare the keys of one unmatched player across both tables
inspect_player_keys(surname="nakashima", initial="b")

=== In players_long (matches) ===


Unnamed: 0,match_id,role,tourn_key,year,rank_hint
5105,1276,losers_p2,us open,2021,
4193,1048,losers_p2,wimbledon,2023,



=== In rankings ===
No rows for nakashima b in rankings.

(tourn_key, year) in MATCHES: {('us open', 2021), ('wimbledon', 2023)}
(tourn_key, year) in RANKINGS: set()
In MATCHES but not RANKINGS: {('us open', 2021), ('wimbledon', 2023)}
