In [22]:
import os
import pandas as pd
from getpass import getuser
import re

In [23]:

USER = getuser()
# NOTE(review): absolute, machine-specific path; consider making the data
# directory configurable so the notebook runs on other machines.
INPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp/"

# Columns removed from every file when present (loop-invariant, so built once).
cols_to_drop = ["Unnamed: 15", "Player Profile Link"]

# Collect all ranking_doubles files
files = [
    f for f in os.listdir(INPUT_DIR)
    if f.startswith("ranking_doubles_") and f.endswith(".xlsx")
]

print(f"Found {len(files)} files.")

dfs = []
failed_files = []  # files that raised while being read or normalised
for file in sorted(files):
    path = os.path.join(INPUT_DIR, file)

    try:
        df = pd.read_excel(path)

        # Drop unwanted columns if they exist (errors="ignore" skips absent ones)
        df = df.drop(columns=cols_to_drop, errors="ignore")

        # Standardize column names: lower case + replace space with -
        df.columns = df.columns.str.lower().str.replace(" ", "-", regex=False)

        dfs.append(df)
        print(f"Processed: {file}")

    except Exception as e:
        # Best-effort: keep going, but remember the failure so it is reported
        # below instead of being silently left out of the merge.
        print(f"Error reading {file}: {e}")
        failed_files.append(file)

if failed_files:
    print(
        f"WARNING: {len(failed_files)} file(s) could not be loaded and are "
        f"NOT part of the merge: {failed_files}"
    )

# pd.concat raises an opaque error on an empty list; fail with a clear message.
if not dfs:
    raise RuntimeError(
        f"No ranking_doubles_*.xlsx file could be loaded from {INPUT_DIR}"
    )

# Count rows in each single df
rows_per_df = [len(d) for d in dfs]

# Total expected rows
expected_rows = sum(rows_per_df)

print("Rows per file:", rows_per_df)
print("Expected total rows:", expected_rows)

# Merge
merged_df = pd.concat(dfs, ignore_index=True)

actual_rows = len(merged_df)

print("Actual merged rows:", actual_rows)

# This check only covers the frames that loaded; failed files are reported above.
if actual_rows == expected_rows:
    print("✅ Row count matches! No rows lost or duplicated during merge.")
else:
    print("❌ Mismatch! Expected:", expected_rows, "but got:", actual_rows)



Found 23 files.
Processed: ranking_doubles_2018-01-15.xlsx
Processed: ranking_doubles_2018-05-21.xlsx
Processed: ranking_doubles_2018-07-02.xlsx
Processed: ranking_doubles_2018-09-24.xlsx
Processed: ranking_doubles_2019-01-14.xlsx
Processed: ranking_doubles_2019-05-20.xlsx
Processed: ranking_doubles_2019-07-01.xlsx
Processed: ranking_doubles_2019-08-26.xlsx
Processed: ranking_doubles_2020-01-20.xlsx
Processed: ranking_doubles_2020-08-31.xlsx
Processed: ranking_doubles_2020-09-21.xlsx
Processed: ranking_doubles_2021-02-08.xlsx
Processed: ranking_doubles_2021-05-24.xlsx
Processed: ranking_doubles_2021-06-28.xlsx
Processed: ranking_doubles_2021-08-30.xlsx
Processed: ranking_doubles_2022-01-17.xlsx
Processed: ranking_doubles_2022-05-16.xlsx
Processed: ranking_doubles_2022-06-27.xlsx
Processed: ranking_doubles_2022-08-22.xlsx
Processed: ranking_doubles_2023-01-16.xlsx
Processed: ranking_doubles_2023-05-22.xlsx
Processed: ranking_doubles_2023-07-03.xlsx
Processed: ranking_doubles_2023-08-28.xlsx

# Clean players' names

In [24]:

# Names of every column that begins with 'player'; these get cleaned below
player_cols = list(
    filter(lambda name: name.startswith("player"), merged_df.columns)
)

def clean_player_name(x):
    """Return a lower-cased player name with leading prefixes stripped.

    Missing values (per ``pd.isna``) are returned unchanged. Any other value is
    converted to ``str``, lower-cased and trimmed, after which three leading
    patterns are removed in order:

    1. optional dashes/digits, one letter and a dot ("1r. ", "-1r. ", "l. ");
    2. optional dashes/digits, optional letters, optional digits and a dot
       ("1. ", "2q. ");
    3. a single remaining initial followed by a dot ("l.kubot" -> "kubot").
    """
    if pd.isna(x):
        return x

    name = str(x).lower().strip()

    # Order matters: each pattern sees the output of the previous one.
    leading_patterns = (
        r"^[\-\d]*[a-zA-Z]\.\s*",
        r"^[\-\d]*[a-zA-Z]*\d*\.\s*",
        r"^[a-z]\.",
    )
    for pattern in leading_patterns:
        name = re.sub(pattern, "", name)

    # Whitespace is trimmed only once, after the last substitution.
    return name.strip()

# Run the name cleaner element-wise over each player column
for col in player_cols:
    merged_df[col] = merged_df[col].map(clean_player_name)


# Split W-L columns into two separate variables

In [25]:
# Every column whose name starts with the 'w-l-' prefix holds a win-loss record
wl_cols = list(
    filter(lambda name: name.startswith("w-l-"), merged_df.columns)
)

def split_wl(value):
    """
    Parse a win-loss string such as '24 - 28' or '435-305' into ``(24, 28)``.

    The value is converted to ``str`` and every run of digits is collected.
    A pair of ints is returned only when exactly two runs are found; missing
    values and anything else yield ``(None, None)``.
    """
    if pd.isna(value):
        return (None, None)

    numbers = re.findall(r"\d+", str(value).strip())
    if len(numbers) != 2:
        return (None, None)

    wins, losses = (int(number) for number in numbers)
    return wins, losses

# Replace each 'w-l-<suffix>' column with separate 'w-<suffix>' / 'l-<suffix>' columns.
for col in wl_cols:
    # Example: w-l-ytd → ytd
    suffix = col.replace("w-l-", "")

    # Parse every cell once into a (wins, losses) pair.
    wl_pairs = merged_df[col].apply(split_wl).tolist()

    # Build the two columns with comprehensions rather than `zip(*pairs)`:
    # unpacking an empty zip raises ValueError when the frame has no rows.
    merged_df[f"w-{suffix}"] = [wins for wins, _ in wl_pairs]
    merged_df[f"l-{suffix}"] = [losses for _, losses in wl_pairs]

    # Remove the original column (rebinding instead of mutating in place)
    merged_df = merged_df.drop(columns=col)

print("W-L columns successfully split.")


W-L columns successfully split.


# Fill in missing dob values from the age column

In [26]:
def extract_dob_from_age(age_value):
    """
    Pull a parenthesised ``YYYY/MM/DD`` date out of an age string.

    Example: '42 (1983/09/23)' → '1983/09/23'

    Returns the date as a string, or ``None`` when the value is missing or
    contains no such parenthesised date.
    """
    if pd.isna(age_value):
        return None

    # The date must be wrapped in parentheses: 4 digits / 2 digits / 2 digits
    found = re.search(r"\((\d{4}/\d{2}/\d{2})\)", str(age_value))
    return found.group(1) if found else None

# Only rows with a missing dob are touched; existing dob values are kept as-is.
# NOTE(review): the recovered values are 'YYYY/MM/DD' strings; confirm this
# matches the type/format of the dob values already present.
missing_dob_mask = merged_df["dob"].isna()

recovered_dob = merged_df.loc[missing_dob_mask, "age"].apply(extract_dob_from_age)
merged_df.loc[missing_dob_mask, "dob"] = recovered_dob

print("DOB values updated where possible.")


DOB values updated where possible.


In [27]:
# 'age' has already been used to fill missing dob values, so remove it
merged_df = merged_df.drop("age", axis="columns")


# Split weight and height into separate units of measure


In [28]:
# ---------- WEIGHT CLEANING ----------
def extract_weight_lbs(x):
    """Return the integer that precedes 'lbs' in ``x`` (case-insensitive).

    Returns ``None`` for missing values or when no '<digits> lbs' is found.
    """
    if pd.isna(x):
        return None

    text = str(x).lower()
    found = re.search(r"(\d+)\s*lbs", text)
    if found is None:
        return None
    return int(found.group(1))

def extract_weight_kg(x):
    """Return the integer inside a parenthesised '(<digits>kg)' in ``x``.

    Matching is case-insensitive and the parentheses are required. Returns
    ``None`` for missing values or when the pattern is absent.
    """
    if pd.isna(x):
        return None

    text = str(x).lower()
    found = re.search(r"\((\d+)\s*kg\)", text)
    if found is None:
        return None
    return int(found.group(1))

if "weight" in merged_df.columns:
    # pop() removes the raw column from the frame and hands it back for parsing
    raw_weight = merged_df.pop("weight")
    merged_df["weight-lbs"] = raw_weight.apply(extract_weight_lbs)
    merged_df["weight-kg"] = raw_weight.apply(extract_weight_kg)


# ---------- HEIGHT CLEANING ----------
def extract_height_ft(x):
    """Return the feet part of a height written as <feet>'<inches>".

    Returns ``None`` for missing values or when the pattern is not found.
    """
    if pd.isna(x):
        return None

    text = str(x).lower()
    found = re.search(r"(\d+)'\s*(\d+)\"", text)
    if found is None:
        return None
    # Group 1 is the number before the apostrophe
    return int(found.group(1))

def extract_height_in(x):
    """Return the inches part of a height written as <feet>'<inches>".

    Returns ``None`` for missing values or when the pattern is not found.
    """
    if pd.isna(x):
        return None

    text = str(x).lower()
    found = re.search(r"(\d+)'\s*(\d+)\"", text)
    if found is None:
        return None
    # Group 2 is the number between the apostrophe and the double quote
    return int(found.group(2))

def extract_height_cm(x):
    """Return the integer inside a parenthesised '(<digits>cm)' in ``x``.

    Matching is case-insensitive and the parentheses are required. Returns
    ``None`` for missing values or when the pattern is absent.
    """
    if pd.isna(x):
        return None

    text = str(x).lower()
    found = re.search(r"\((\d+)\s*cm\)", text)
    if found is None:
        return None
    return int(found.group(1))

if "height" in merged_df.columns:
    # pop() removes the raw column from the frame and hands it back for parsing
    raw_height = merged_df.pop("height")
    merged_df["height-ft"] = raw_height.apply(extract_height_ft)
    merged_df["height-in"] = raw_height.apply(extract_height_in)
    merged_df["height-cm"] = raw_height.apply(extract_height_cm)

print("Weight and height cleaned successfully.")


Weight and height cleaned successfully.


# variable experience = year - turned pro

In [29]:
# experience = year - turned-pro; skipped with a message if either column is absent
if "year" not in merged_df.columns or "turned-pro" not in merged_df.columns:
    print("Column 'year' or 'turned-pro' is missing.")
else:
    merged_df["experience"] = merged_df["year"].sub(merged_df["turned-pro"])


# birthplace

In [30]:
def split_birthplace(x):
    """Split a birthplace string into a ``(city, country)`` tuple.

    The text after the LAST comma is the country and everything before it is
    the city, so 'City, State, Country' keeps its country instead of being
    stored whole in the city slot. Values without a comma are returned as the
    city with a ``None`` country.

    Parameters
    ----------
    x : any
        Raw birthplace value; missing values (per ``pd.isna``) are accepted.

    Returns
    -------
    tuple
        ``(city, country)``; each element is a stripped string, or ``None``
        when that part is missing or empty.
    """
    if pd.isna(x):
        return (None, None)

    text = str(x).strip()

    if "," not in text:
        # No separator: the whole value is treated as the city.
        return (text or None, None)

    # Split once from the right so extra commas stay with the city part.
    city, country = (part.strip() for part in text.rsplit(",", 1))

    # Empty pieces (e.g. a trailing comma) are reported as missing, not "".
    return (city or None, country or None)

# Parse every birthplace once into a (city, country) pair.
birthplace_pairs = merged_df["birthplace"].apply(split_birthplace).tolist()

# Build the two columns with comprehensions rather than `zip(*pairs)`:
# unpacking an empty zip raises ValueError when the frame has no rows.
merged_df["birthplace-city"] = [city for city, _ in birthplace_pairs]
merged_df["birthplace-country"] = [country for _, country in birthplace_pairs]

# Drop the original column (rebinding instead of mutating in place)
merged_df = merged_df.drop(columns=["birthplace"])


# hand and backhand 

In [31]:
import pandas as pd

def split_plays(x):
    """Split a 'plays' string into a ``(hand, backhand)`` tuple.

    The value is split on commas. With exactly two pieces, both are stripped
    and returned. With any other number of pieces the whole stripped string is
    returned as the hand and the backhand is ``None``. Missing values give
    ``(None, None)``.
    """
    if pd.isna(x):
        return (None, None)

    pieces = [piece.strip() for piece in str(x).split(",")]

    if len(pieces) != 2:
        # Unexpected format: keep everything in "hand"
        return str(x).strip(), None

    hand, backhand = pieces
    return hand, backhand

# Parse every 'plays' value once into a (hand, backhand) pair.
plays_pairs = merged_df["plays"].apply(split_plays).tolist()

# Build the two columns with comprehensions rather than `zip(*pairs)`:
# unpacking an empty zip raises ValueError when the frame has no rows.
merged_df["hand"] = [hand for hand, _ in plays_pairs]
merged_df["backhand"] = [backhand for _, backhand in plays_pairs]

# Drop the original column (rebinding instead of mutating in place)
merged_df = merged_df.drop(columns=["plays"])


In [32]:
# Write the merged frame next to the input files, without the index column
output_path = os.path.join(INPUT_DIR, "men_rankings.xlsx")
merged_df.to_excel(excel_writer=output_path, index=False)

print(f"\nMerged file saved as: {output_path}")


Merged file saved as: C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp/men_rankings.xlsx
