In [None]:
# Install the third-party packages for this notebook via the %pip magic
# (which targets the environment backing the running kernel).
# pandas is imported by a later cell; pyarrow is not referenced directly in the
# cells visible here -- presumably an optional pandas backend, TODO confirm it is needed.
# NOTE(review): versions are unpinned, so a fresh install may differ between runs.
%pip install pandas pyarrow


In [None]:
import os

# Report the kernel's working directory: every data path used further down is
# relative, so it is resolved against this location.
print(f"Notebook working directory: {os.getcwd()}")


In [None]:
def _show_dir(heading, path):
    """Print `heading`, then the raw `os.listdir` result for `path` on the next line."""
    print(heading)
    print(os.listdir(path))


# Sanity check that the expected input folders are reachable from the current
# working directory before any CSV is read.
print("Current dir:", os.getcwd())
_show_dir("\ndata/:", "data")
_show_dir("\ndata/tournaments/:", os.path.join("data", "tournaments"))


In [None]:
import pandas as pd

# The tournaments CSV is read with header=None, so these labels are assigned
# positionally to its columns.
# NOTE(review): if the file actually starts with a header row, that row would be
# loaded as data -- confirm the file is header-less.
column_names = [
    "tourney_year_id",
    "tourney_order",
    "tourney_type",
    "tourney_name",
    "tourney_id",
    "tourney_slug",
    "tourney_location",
    "tourney_date",
    "year",
    "tourney_month",
    "tourney_day",
    "tourney_singles_draw",
    "tourney_doubles_draw",
    "tourney_conditions",
    "tourney_surface",
    "tourney_fin_commit_raw",
    "currency",
    "tourney_fin_commit",
    "tourney_url_suffix",
    "singles_winner_name",
    "singles_winner_url",
    "singles_winner_player_slug",
    "singles_winner_player_id",
    "doubles_winner_1_name",
    "doubles_winner_1_url",
    "doubles_winner_1_player_slug",
    "doubles_winner_1_player_id",
    "doubles_winner_2_name",
    "doubles_winner_2_url",
    "doubles_winner_2_player_slug",
    "doubles_winner_2_player_id",
]

tourn_path = "data/tournaments/tournaments_2020-2022.csv"
tournaments = pd.read_csv(tourn_path, names=column_names, header=None)

# Express the two row conditions separately, then keep the rows meeting both.
# The copy detaches the result from `tournaments`.
is_us_open = tournaments["tourney_name"].eq("US Open")
is_2022 = tournaments["year"].eq(2022)
usopen_tourn = tournaments.loc[is_us_open & is_2022].copy()

print("Filtered US Open 2022 Tournament:")
print(usopen_tourn[["tourney_year_id", "tourney_name", "year"]])


In [None]:
# Load the 2020-2022 match results and keep only the rows that belong to the
# US Open 2022 tournament selected in `usopen_tourn`.
match_scores_path = "data/match_scores/match_scores_2020-2022.csv"

# The file is read with header=None, so these labels are assigned positionally
# to its columns.
match_scores_columns = [
    "tourney_year_id", "tourney_order", "tourney_name", "tourney_slug",
    "tourney_url_suffix", "start_date", "start_year", "start_month", "start_day",
    "end_date", "end_year", "end_month", "end_day", "currency", "prize_money",
    "match_index", "tourney_round_name", "round_order", "match_order",
    "winner_name", "winner_player_id", "winner_slug", "loser_name",
    "loser_player_id", "loser_slug", "winner_seed", "loser_seed",
    "match_score_tiebreaks", "winner_sets_won", "loser_sets_won",
    "winner_games_won", "loser_games_won", "winner_tiebreaks_won",
    "loser_tiebreaks_won", "match_id", "match_stats_url_suffix"
]

# Fail early, and with an actionable message, when the tournament filter matched
# nothing. Without this guard `.iloc[0]` below raises a bare IndexError, and only
# after the match-scores file has already been read.
if usopen_tourn.empty:
    raise ValueError(
        "usopen_tourn is empty: no 'US Open' 2022 row was found in the tournaments "
        "data, so there is no tourney_year_id to filter match scores by."
    )

match_scores = pd.read_csv(match_scores_path, header=None, names=match_scores_columns)

# If several tournament rows matched, the first one's id is used.
usopen_id = usopen_tourn["tourney_year_id"].iloc[0]

# The copy detaches the subset from `match_scores`.
usopen_matches = match_scores[match_scores["tourney_year_id"] == usopen_id].copy()

print("US Open 2022 Match Scores (First 5 rows):")
display(usopen_matches.head())


In [None]:
# Attach the per-match statistics from the 2022 stats file to the US Open match
# rows. The cell is safe to re-run: stats columns merged by a previous run are
# removed before merging again.
pd.set_option("display.max_columns", None)  # show every column of the wide merged frame

# Prefix given to every stats column so that names shared with the match-scores
# frame (e.g. tourney_slug, winner_slug) do not collide on merge.
STATS_PREFIX = "stats_"

stats_raw = pd.read_csv("data/stats/match_stats_extended_2022.csv", header=None)

# Labels for the header-less stats file: 25 names, including the match_id key.
stat_cols = [
    "match_id", "tourney_slug", "match_stats_url_suffix", "winner_slug",
    "winner_serve_rating", "winner_aces", "winner_double_faults",
    "winner_first_serves_in", "winner_first_serves_total",
    "winner_first_serve_points_won", "winner_first_serve_points_total",
    "winner_second_serve_points_won", "winner_second_serve_points_total",
    "winner_break_points_saved", "winner_break_points_serve_total",
    "winner_service_games_played", "winner_return_rating",
    "winner_first_serve_return_won", "winner_first_serve_return_total",
    "winner_second_serve_return_won", "winner_second_serve_return_total",
    "winner_break_points_converted", "winner_break_points_return_total",
    "winner_return_games_played", "winner_service_points_won"
]
# Direct assignment raises if the file's column count differs from len(stat_cols).
stats_raw.columns = stat_cols

# Prefix everything, then restore the join key to its plain name.
stats = stats_raw.add_prefix(STATS_PREFIX)
stats = stats.rename(columns={f"{STATS_PREFIX}match_id": "match_id"})

# Only statistics for matches present in the US Open subset are needed.
stats = stats[stats["match_id"].isin(usopen_matches["match_id"])]

# `usopen_matches` is overwritten with the merge result below, so on a re-run it
# already carries stats_* columns; merging again would make pandas add
# _x/_y-suffixed duplicates. Dropping them first keeps the result the same on
# every run (the list is empty on the first run).
previously_merged = [
    col for col in usopen_matches.columns if str(col).startswith(STATS_PREFIX)
]
usopen_matches = (
    usopen_matches
    .drop(columns=previously_merged)
    # Left join: matches without a stats row are kept, with NaN stats.
    # NOTE(review): duplicate match_id values in the stats file would duplicate
    # match rows here -- confirm the key is unique.
    .merge(stats, on="match_id", how="left")
    # Remove columns that are entirely empty for this tournament.
    .dropna(axis=1, how="all")
)

# The reported shape can have fewer columns than were merged, because all-NaN
# columns were dropped just above.
print("After merging all 25 stat fields:", usopen_matches.shape)
display(usopen_matches.head())


In [None]:
# Work out where the processed US Open CSV is expected to live and report whether
# it is already on disk. Nothing is written by this cell.
# NOTE(review): earlier cells read inputs from "data/..." relative to the working
# directory, whereas this cell resolves "data/processed" under the PARENT of the
# working directory -- confirm which layout is intended.
cwd = os.getcwd()
repo_root = os.path.abspath(os.path.join(cwd, os.pardir))

processed_dir = os.path.join(repo_root, "data")
processed_dir = os.path.join(processed_dir, "processed")
out_path = os.path.join(processed_dir, "usopen_2022_matches.csv")

# One "label value" line per path; labels are padded so the values line up.
path_report = (
    ("Notebook cwd:         ", cwd),
    ("Expected repo root:   ", repo_root),
    ("Processed directory:  ", processed_dir),
    ("Full CSV output path: ", out_path),
)
print(*(f"{label} {location}" for label, location in path_report), sep="\n")
print("Exists?:", os.path.exists(out_path))
