In [8]:
import os
import re
from datetime import date
from getpass import getuser

import pandas as pd

In [9]:
USER = getuser()

# Location of the ATP data. The default assumes the repository is cloned under
# C:/Users/<user>/Documents/GitHub on Windows; set the environment variable
# TENNIS_HOMOPHILY_DATA_DIR to run the notebook against a clone that lives
# elsewhere (another drive, another OS, a CI checkout) without editing code.
INPUT_DIR = os.environ.get(
    "TENNIS_HOMOPHILY_DATA_DIR",
    f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/",
)
MATCHES = os.path.join(INPUT_DIR, "grand_slam_matches_2018_2023.xlsx")

# =============================
# Load Grand Slam
# =============================
print("Loading Grand Slam matches…")
# isfile (not exists): a directory that happens to carry this name should be
# reported here rather than as an opaque error from read_excel.
if not os.path.isfile(MATCHES):
    raise FileNotFoundError(f"Grand Slam file not found: {MATCHES}")

df = pd.read_excel(MATCHES)

Loading Grand Slam matches…


In [10]:
# Full English month names; abbreviations are resolved by prefix so the result
# does not depend on the process locale (unlike strptime's %b).
_MONTH_NAMES = (
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
)

# <start day> [start month] [ - <end day> [end month] ] [,] <year>
# Accepts "15-28 Jan, 2018", "27 May - 10 Jun, 2018" and "15 Jan, 2018";
# the separator may be a hyphen, an en dash or an em dash.
_DATE_RANGE_RE = re.compile(
    r"(\d{1,2})\s*([A-Za-z]+)?\.?\s*"
    r"(?:[-\u2013\u2014]\s*\d{1,2}\s*([A-Za-z]+)?\.?\s*)?"
    r",?\s*(\d{4})"
)


def _month_number(word):
    """Return 1-12 for an English month name or abbreviation, else None."""
    if not word:
        return None
    word = word.lower()
    if len(word) < 3:
        return None
    for number, full_name in enumerate(_MONTH_NAMES, start=1):
        if full_name.startswith(word):
            return number
    return None


def convert_date_range(x):
    """Convert a date or date-range label into the timestamp of its first day.

    Parameters
    ----------
    x : str, date-like or missing value
        Text such as "15-28 Jan, 2018", "27 May - 10 Jun, 2018" or
        "15 Jan, 2018". Values that are already dates (``datetime.date``,
        ``datetime.datetime``, ``pandas.Timestamp``) are passed through, so
        applying the function twice to the same column is harmless.

    Returns
    -------
    pandas.Timestamp or None
        The start date of the range, or None when the value is missing or
        cannot be interpreted as a date.
    """
    if pd.isna(x):
        return None

    # Already a date: keep it instead of stringifying it into a form the
    # pattern below would not recognise.
    if isinstance(x, date):
        return pd.Timestamp(x)

    match = _DATE_RANGE_RE.match(str(x).strip())
    if match is None:
        return None

    day, start_month, end_month, year = match.groups()
    # "15-28 Jan" names the month only once, after the end day; a cross-month
    # range names it right after the start day, which takes precedence.
    month = _month_number(start_month or end_month)
    if month is None:
        return None

    try:
        return pd.Timestamp(year=int(year), month=month, day=int(day))
    except ValueError:
        # Impossible calendar date (e.g. day 31 in a 30-day month).
        return None

# Replace the raw date labels with the value convert_date_range returns for
# each row (unparseable or missing labels come back as None).
start_dates = df["date"].apply(convert_date_range)
df["date"] = start_dates


In [11]:
# Raw player columns: both members of the winning pair, then of the losing pair.
player_vars = [
    f"{side}_{slot}"
    for side in ("winners", "losers")
    for slot in ("p1", "p2")
]

# A single alphanumeric token in parentheses, e.g. "(6)", "(WC)", "( PR )".
_PAREN_TOKEN_RE = re.compile(r"\(\s*([A-Za-z0-9]+)\s*\)")


def split_player_info(x):
    """Split a player label into name parts and its bracketed annotation.

    Examples:
       'Mike Bryan(6)'        -> ('mike', 'bryan', 6, None)
       'Mike Bryan (WC)'      -> ('mike', 'bryan', None, 'wc')
       'Mike Bryan ( PR )'    -> ('mike', 'bryan', None, 'pr')
       'Mike Bryan (Q)'       -> ('mike', 'bryan', None, 'q')
       'Mike Bryan (6) (WC)'  -> ('mike', 'bryan', 6, 'wc')
       'Federer'              -> ('federer', None, None, None)
       '(WC)' or ''           -> (None, None, None, 'wc') / all None

    Parameters:
       x: the raw cell value; missing values (None / NaN) are accepted.

    Returns:
       (first_name, surname, ranking_number, status_code)

       * first_name is the first whitespace-separated word, lower-cased.
       * surname is the LAST word, lower-cased; middle names and particles
         are dropped ('Juan Martin del Potro' -> 'juan', 'potro').
       * ranking_number is the first all-digit bracketed token as an int.
         NOTE(review): in draw sheets this number is usually the seed, not
         the ranking - confirm before using it as a ranking.
       * status_code is the first non-numeric bracketed token, lower-cased.
       Any element that is not present is None.
    """
    if pd.isna(x):
        return (None, None, None, None)

    text = str(x).strip()

    # Collect every bracketed token so that a label carrying both a number
    # and a status code keeps both instead of only the first bracket.
    tokens = _PAREN_TOKEN_RE.findall(text)
    ranking = next((int(tok) for tok in tokens if tok.isdigit()), None)
    status = next((tok.lower() for tok in tokens if not tok.isdigit()), None)

    # Whatever remains after removing the brackets is the player's name.
    parts = _PAREN_TOKEN_RE.sub("", text).split()
    if not parts:
        # Empty / whitespace-only label, or a label that was only a bracket:
        # there is no name to index into.
        return (None, None, ranking, status)

    first = parts[0].lower()
    last = parts[-1].lower() if len(parts) >= 2 else None

    return first, last, ranking, status


# For every raw player column, parse each cell with split_player_info and
# store the four resulting fields in "<column>_name", "<column>_surname",
# "<column>_ranking" and "<column>_status".
for var in player_vars:
    parsed_cells = df[var].apply(split_player_info)
    # zip(*...) transposes the per-row 4-tuples into four per-column tuples.
    names, surnames, rankings, statuses = zip(*parsed_cells)
    df[f"{var}_name"] = names
    df[f"{var}_surname"] = surnames
    df[f"{var}_ranking"] = rankings
    df[f"{var}_status"] = statuses


In [12]:
# Remove the four raw player columns from df in place.
raw_player_columns = ["winners_p1", "winners_p2", "losers_p1", "losers_p2"]
df.drop(labels=raw_player_columns, axis="columns", inplace=True)


In [13]:
# Write the processed table to an Excel workbook inside INPUT_DIR; the
# DataFrame index is not written.
output_path = os.path.join(INPUT_DIR, "men_matches.xlsx")
df.to_excel(excel_writer=output_path, index=False)

print(f"\nMerged file saved as: {output_path}")


Merged file saved as: C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/men_matches.xlsx
