# InfoSys API Pre Cleaning
The aim of this notebook is to acquire and clean current year match data from the infosys API to prep for general processing

In [1]:
import pandas as pd
from infotennis.scrapers.scraping_functions_atp import scrape_ATP_tournament, scrape_ATP_calendar
from infotennis.scrapers.scrape_match_data import scrape_ATP_match_data
from rapidfuzz import process, fuzz
from tqdm import tqdm
import time
import numpy as np
import re
from json import JSONDecodeError
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

### Tournament Calendar
Load the current year tournament calendar

In [2]:
year = 2025
df_tournaments = scrape_ATP_calendar(year)
df_tournaments[['start', 'end']] = df_tournaments['date_start'].str.split(' - ', expand=True)

### Matches Calendar
Load the current years meta match data

In [4]:
all_matches = []
for _, tourn in df_tournaments.iterrows():
    # If the “Results” URL is blank, skip
    if not tourn.url:
        continue
    df_matches = scrape_ATP_tournament(
        url=tourn.url,
        tournament=tourn.tournament,
        tournament_id=tourn.tournament_id,
        year=year,
        format="S"
    )
    all_matches.append(df_matches)

matches_raw = pd.concat(all_matches, ignore_index=True)

In [5]:
matches = matches_raw.copy()

# Extract match id's from urls
matches['match_id'] = matches['url'].str.split('/').str[-1]

# drop duplicate match entry
mask = (matches['tournament_id'] == '321') & (matches['match_id'] == 'ms013')
idx_to_drop = matches[mask].index[:1]
matches = matches.drop(idx_to_drop)

# drop doubles & womens matches
matches = matches[~matches['player1_name'].str.contains(',', na=False)]
matches['player1_id'].replace('', np.nan, inplace=True)
matches['player2_id'].replace('', np.nan, inplace=True)
matches = matches.dropna(subset=['player1_id', 'player2_id'])

#drop unnecessary columns
matches = matches.drop(columns=['url', 'player1_nation', 'player2_nation', 'player1_seed', 'player2_seed', 'court_vision'])

In [6]:
matches['tournament_id'] = matches['tournament_id'].astype(str)
df_tournaments['tournament_id'] = df_tournaments['tournament_id'].astype(str)

matches = pd.merge(matches, df_tournaments[['tournament_id', 'start', 'category', 'draw', 'surface']], on='tournament_id', how='left')

matches['draw'] = matches['draw'].str.extract(r'SGL\s*(\d+)', expand=False).astype(float)
matches = matches.rename(columns={'start': 'tourney_date'})
matches['surface'] = matches['surface'].replace({
    'Outdoor Hard': 'Hard',
    'Outdoor Clay': 'Clay',
    'Outdoor Grass': 'Grass',
    'Indoor Hard': 'Hard',
    'Indoor ' : 'Hard'
})
matches.head()

Unnamed: 0,year,tournament,tournament_id,round,player1_name,player1_id,player2_name,player2_id,score,match_id,tourney_date,category,draw,surface
0,2025,United Cup,9900,Final,Taylor Fritz,fb98,Hubert Hurkacz,hb71,64 57 76(4),ms011,2024.12.27,Other,18.0,Hard
1,2025,United Cup,9900,Semifinals,Taylor Fritz,fb98,Tomas Machac,m0fh,67(4) 65,ms021,2024.12.27,Other,18.0,Hard
2,2025,United Cup,9900,Semifinals,Hubert Hurkacz,hb71,Alexander Shevchenko,s0h2,63 62,ms031,2024.12.27,Other,18.0,Hard
3,2025,United Cup,9900,Quarterfinals,Tomas Machac,m0fh,Flavio Cobolli,c0e9,61 62,ms051,2024.12.27,Other,18.0,Hard
4,2025,United Cup,9900,Quarterfinals,Hubert Hurkacz,hb71,Billy Harris,hd68,76(3) 75,ms071,2024.12.27,Other,18.0,Hard


In [7]:
def convert_score_format(score):
    """
    Convert tennis score from format '65 46 76(6)' to '6-5 4-6 7-6(6)'
    
    Args:
        score (str): Tennis score in format like '65 46 76(6)'
        
    Returns:
        str: Tennis score in format like '6-5 4-6 7-6(6)'
    """
    if pd.isna(score) or score == '':
        return score
    
    # Split the score into individual sets
    sets = score.split()
    converted_sets = []
    
    for set_score in sets:
        # Handle tiebreak notation (e.g., '76(6)')
        if '(' in set_score:
            # Extract main score and tiebreak
            main_score = set_score.split('(')[0]
            tiebreak = set_score.split('(')[1].rstrip(')')
            
            # Convert main score (e.g., '76' -> '7-6')
            if len(main_score) == 2:
                converted_main = f"{main_score[0]}-{main_score[1]}"
                converted_sets.append(f"{converted_main}({tiebreak})")
            else:
                converted_sets.append(set_score)  # Keep original if unexpected format
        else:
            # Regular set score (e.g., '65' -> '6-5')
            if len(set_score) == 2:
                converted_sets.append(f"{set_score[0]}-{set_score[1]}")
            else:
                converted_sets.append(set_score)  # Keep original if unexpected format
    
    return ' '.join(converted_sets)

matches['score'] = matches['score'].apply(convert_score_format)

In [8]:
matches['round'] = matches['round'].replace({
    'Quarterfinals': 'QF',
    'Semifinals': 'SF',
    'Final': 'F',
    'Round Of 16': 'R16',
    'Round Of 32': 'R32',
    'Round Of 64': 'R64',
    'Round Of 128': 'R128',
    '1st Round Qualifying': 'ER',
    '2nd Round Qualifying': 'ER',
    '3rd Round Qualifying': 'ER',
    'Round Robin': 'RR',
    'Round Robin Day 2': 'RR',
    'Round Robin Day 3': 'RR',
    'Round Robin Day 4': 'RR',
    'Round Robin Day 5': 'RR',
    'Round Robin Day 6': 'RR',
})

matches['tourney_level'] = matches['category'].replace({
    'Grand Slam': "G",
    'ATP Masters 1000': "M",
    'ATP 500': "A",
    'ATP 250': "A",
    'United Cup': "A",    
})

matches['best_of'] = np.where(
    matches['tourney_level'] == 'G', 5, 3
)

matches.rename(columns={
    'tournament_id': 'tourney_id',
    'tournament': 'tourney_name',
    'draw': 'draw_size',
}, inplace=True)
matches.drop(columns=['category'], inplace=True)

In [9]:
matches = matches[matches['round'] != 'ER']
matches = matches[matches['tourney_name'] != 'United Cup']

In [15]:
#United Cup + 4 Grand Slams
SKIP_TOURNAMENTS = {"9900", "580", "520", "540", "560"}

# JSON Stats tag keys
SIMPLE_COUNT_STATS = {
    "Aces":                   "ace",
    "Double Faults":          "df",
    "Service Games Played":   "SvGms",
}

RATIO_STATS = {
    "1st Serve":                "1stIn",
    "1st Serve Points Won":     "1stWon",
    "2nd Serve Points Won":     "2ndWon",
    "Service Points Won":       "svpt",
    "Break Points Saved":       "bpSaved",
}

_frac_re = re.compile(r"(\d+)\/(\d+)")

# ── HELPERS ────────────────────────────────────────────────────────────────────

def _aggregate_set_counts(raw: dict) -> dict:
    """
    Pull only the overall‐match stats block ('set0' if present, else first block)
    and return a dict of all your count‐based stats.
    """
    # Initialise Stats
    agg = {}
    for short in SIMPLE_COUNT_STATS.values():
        agg[f"p1_{short}"] = 0
        agg[f"p2_{short}"] = 0
    for short in RATIO_STATS.values():
        agg[f"p1_{short}"] = 0
        agg[f"p2_{short}"] = 0
        if short == "bpSaved":
            agg[f"p1_{short}Faced"] = 0
            agg[f"p2_{short}Faced"] = 0

    # Get Stats and Process
    set_stats = raw.get("setStats", {})
    if "set0" in set_stats:
        stats_list = set_stats["set0"]
    else:
        stats_list = next(iter(set_stats.values()), [])

    for stat in stats_list:
        name, p1, p2 = stat["name"], stat["player1"], stat["player2"]

        if name in SIMPLE_COUNT_STATS:
            key = SIMPLE_COUNT_STATS[name]
            agg[f"p1_{key}"] += int(p1)
            agg[f"p2_{key}"] += int(p2)

        elif name in RATIO_STATS:
            key = RATIO_STATS[name]
            m1 = _frac_re.match(p1)
            m2 = _frac_re.match(p2)
            if m1:
                n1, d1 = map(int, m1.groups())
                agg[f"p1_{key}"] += n1
                if key == "bpSaved":
                    agg[f"p1_{key}Faced"] += d1
            if m2:
                n2, d2 = map(int, m2.groups())
                agg[f"p2_{key}"] += n2
                if key == "bpSaved":
                    agg[f"p2_{key}Faced"] += d2

    # Output dict format
    out = {}
    for display, short in SIMPLE_COUNT_STATS.items():
        out[f"player1_{short}"] = agg[f"p1_{short}"]
        out[f"player2_{short}"] = agg[f"p2_{short}"]

    for display, short in RATIO_STATS.items():
        out[f"player1_{short}"] = agg[f"p1_{short}"]
        out[f"player2_{short}"] = agg[f"p2_{short}"]
        if short == "bpSaved":
            out["player1_bpFaced"] = agg[f"p1_{short}Faced"]
            out["player2_bpFaced"] = agg[f"p2_{short}Faced"]

    return out

# ── MAIN SCRAPER ───────────────────────────────────────────────────────────────

def scrape_and_append_key_stats(
    matches_df: pd.DataFrame,
    year: int = 2025
) -> (pd.DataFrame, pd.DataFrame):
    """
    For each match in matches_df (with columns
      ['tourney_id','match_id','player1_id','player2_id']),
    fetch the JSON key-stats, aggregate them, and assign each JSON
    player’s stats to the correct side based on id match.
    Returns:
      - merged_df: original matches_df plus player1_*/player2_* stats
      - failed_df: subset of rows where neither JSON ID matched
    """
    
    results = []
    failures = []
    decode_failures = 0

    subset = matches_df.copy()
    for _, row in tqdm(subset.iterrows(),
                       total=len(subset),
                       desc="Scraping key-stats"):
        tourn_id = row["tourney_id"]
        match_id = row["match_id"]
        sys1     = str(row["player1_id"]).strip().upper()
        sys2     = str(row["player2_id"]).strip().upper()

        # Fetch JSON
        raw = None
        for _ in range(2):
            try:
                raw = scrape_ATP_match_data(year, str(tourn_id), match_id, "key-stats")
                break
            except JSONDecodeError:
                time.sleep(3)
        if raw is None:
            decode_failures += 1
            failures.append({
                **row.to_dict(),
                **{f"player{i}_{k}": None
                   for i in (1,2)
                   for k in list(SIMPLE_COUNT_STATS.values())+list(RATIO_STATS.values())}
            })
            continue

        # Extract JSON player blocks and IDs
        players_json = raw.get("players", [])
        if len(players_json) < 2:
            failures.append({**row.to_dict(), **{}})
            continue

        j1 = str(players_json[0].get("player1Id")).strip().upper()
        j2 = str(players_json[1].get("player1Id")).strip().upper()

        # Map Stats to correct player based on ID
        mapping = {}
        mapping["player1"] = ("player1" if j1 == sys1 else
                              "player2" if j1 == sys2 else None)
        mapping["player2"] = ("player1" if j2 == sys1 else
                              "player2" if j2 == sys2 else None)

        stats = _aggregate_set_counts(raw)

        remapped = {}
        for json_side in ("player1","player2"):
            match_side = mapping[json_side]
            for key, val in stats.items():
                prefix, metric = key.split("_",1)
                if prefix == json_side:
                    if match_side is None:
                        remapped[f"{json_side}_{metric}"] = None
                    else:
                        remapped[f"{match_side}_{metric}"] = val

        # Output rows formatting
        out = {
            "tourney_id":    tourn_id,
            "match_id":      match_id,
            "player1_id": sys1,
            "player2_id": sys2,
        }

        all_metrics = list(SIMPLE_COUNT_STATS.values()) + list(RATIO_STATS.values())
        for m in all_metrics:
            out.setdefault(f"player1_{m}", None)
            out.setdefault(f"player2_{m}", None)

        out.update(remapped)
        results.append(out)

        # Record failed match stat scrapes
        if mapping["player1"] is None and mapping["player2"] is None:
            failures.append(out)

        time.sleep(0.2)

    if decode_failures:
        print(f"⚠️ {decode_failures} JSON decode failures")

    # Assemble DataFrames
    stats_df  = pd.DataFrame(results)
    failed_df = pd.DataFrame(failures).drop_duplicates(subset=["tourney_id","match_id"])

    merged = matches_df.merge(
        stats_df,
        on=["tourney_id","match_id"],
        how="left"
    )
    return merged, failed_df


In [17]:
matches_save = matches.copy()
full, fail = scrape_and_append_key_stats(matches)
full.to_csv("./data/current_year/TempPostScrape", index=False)
full.head()

Scraping key-stats: 100%|██████████| 2288/2288 [1:18:18<00:00,  2.05s/it]


⚠️ 644 JSON decode failures


Unnamed: 0,year,tourney_name,tourney_id,round,player1_name,player1_id_x,player2_name,player2_id_x,score,match_id,tourney_date,draw_size,surface,tourney_level,best_of,player1_id_y,player2_id_y,player1_ace,player2_ace,player1_df,player2_df,player1_SvGms,player2_SvGms,player1_1stIn,player2_1stIn,player1_1stWon,player2_1stWon,player1_2ndWon,player2_2ndWon,player1_svpt,player2_svpt,player1_bpSaved,player2_bpSaved,player1_bpFaced,player2_bpFaced
0,2025,Brisbane International presented by Evie,339,F,Jiri Lehecka,l0bv,Reilly Opelka,o522,4-1,ms001,2024.12.29,32.0,Hard,A,3,,,,,,,,,,,,,,,,,,,,
1,2025,Brisbane International presented by Evie,339,SF,Jiri Lehecka,l0bv,Grigor Dimitrov,d875,6-4 4-4,ms003,2024.12.29,32.0,Hard,A,3,L0BV,D875,4.0,6.0,3.0,2.0,9.0,9.0,26.0,32.0,22.0,24.0,17.0,12.0,39.0,36.0,0.0,1.0,0.0,2.0
2,2025,Brisbane International presented by Evie,339,SF,Reilly Opelka,o522,Giovanni Mpetshi Perricard,m0gz,6-3 7-6(4),ms002,2024.12.29,32.0,Hard,A,3,O522,M0GZ,12.0,10.0,2.0,5.0,11.0,10.0,45.0,48.0,38.0,38.0,13.0,9.0,51.0,47.0,4.0,1.0,4.0,2.0
3,2025,Brisbane International presented by Evie,339,QF,Reilly Opelka,o522,Novak Djokovic,d643,7-6(6) 6-3,ms004,2024.12.29,32.0,Hard,A,3,O522,D643,16.0,8.0,1.0,1.0,11.0,10.0,49.0,53.0,38.0,38.0,13.0,12.0,51.0,50.0,1.0,4.0,1.0,5.0
4,2025,Brisbane International presented by Evie,339,QF,Grigor Dimitrov,d875,Jordan Thompson,tc61,6-1 2-1,ms007,2024.12.29,32.0,Hard,A,3,D875,TC61,1.0,1.0,1.0,1.0,5.0,5.0,21.0,12.0,19.0,7.0,5.0,6.0,24.0,13.0,0.0,1.0,0.0,4.0


In [None]:
full = pd.read_csv("./data/CurrnetYear/TempPostScrape")
matches = full.drop(columns=['match_id', 'player1_id_y', 'player2_id_y', 'year'])
matches = matches.rename(columns={'player1_id_x': 'player1_id', 'player2_id_x': 'player2_id'})
matches['match_num'] = matches.groupby('tourney_id').cumcount() + 1
exclude_cols = ['player1_id', 'player2_id', 'player1_name', 'player2_name']  # columns to keep unchanged
new_columns = []
for col in matches.columns:
    if col in exclude_cols:
        new_columns.append(col)  # keep original name
    else:
        new_columns.append(col.replace('player1', 'p1').replace('player2', 'p2'))
matches.columns = new_columns

In [19]:
matches['player1_rank'] = ''
matches['player2_rank'] = ''
matches['player1_atp_points'] = ''
matches['player2_atp_points'] = ''
matches['player1_ht'] = ''
matches['player2_ht'] = ''
matches['player1_hand'] = ''
matches['player2_hand'] = ''
matches['player1_hand'] = ''
matches['player2_hand'] = ''
matches['minutes'] = ''

In [22]:
matches['source'] = 1
matches = matches.reindex(columns=['tourney_name', 'tourney_id', 'tourney_date', 'tourney_level', 'surface', 'draw_size', 'match_num', 'round', 'best_of', 'minutes', 'player1_id', 'player1_name', 'player2_id', 'player2_name', 'player1_rank', 'player1_atp_points', 'player2_rank', 'player2_atp_points', 'player1_ht', 'player1_hand', 'player2_ht', 'player2_hand', 'p1_ace', 'p2_ace', 'p1_df', 'p2_df', 'p1_SvGms', 'p2_SvGms', 'p1_1stIn', 'p2_1stIn', 'p1_1stWon', 'p2_1stWon', 'p1_2ndWon', 'p2_2ndWon', 'p1_svpt', 'p2_svpt', 'p1_bpSaved', 'p2_bpSaved', 'p1_bpFaced', 'p2_bpFaced', 'score', 'source'], copy=False)
matches.head()

Unnamed: 0,tourney_name,tourney_id,tourney_date,tourney_level,surface,draw_size,match_num,round,best_of,minutes,player1_id,player1_name,player2_id,player2_name,player1_rank,player1_atp_points,player2_rank,player2_atp_points,player1_ht,player1_hand,player2_ht,player2_hand,p1_ace,p2_ace,p1_df,p2_df,p1_SvGms,p2_SvGms,p1_1stIn,p2_1stIn,p1_1stWon,p2_1stWon,p1_2ndWon,p2_2ndWon,p1_svpt,p2_svpt,p1_bpSaved,p2_bpSaved,p1_bpFaced,p2_bpFaced,score,source
0,Brisbane International presented by Evie,339,2024.12.29,A,Hard,32.0,1,F,3,,l0bv,Jiri Lehecka,o522,Reilly Opelka,,,,,,,,,,,,,,,,,,,,,,,,,,,4-1,1
1,Brisbane International presented by Evie,339,2024.12.29,A,Hard,32.0,2,SF,3,,l0bv,Jiri Lehecka,d875,Grigor Dimitrov,,,,,,,,,4.0,6.0,3.0,2.0,9.0,9.0,26.0,32.0,22.0,24.0,17.0,12.0,39.0,36.0,0.0,1.0,0.0,2.0,6-4 4-4,1
2,Brisbane International presented by Evie,339,2024.12.29,A,Hard,32.0,3,SF,3,,o522,Reilly Opelka,m0gz,Giovanni Mpetshi Perricard,,,,,,,,,12.0,10.0,2.0,5.0,11.0,10.0,45.0,48.0,38.0,38.0,13.0,9.0,51.0,47.0,4.0,1.0,4.0,2.0,6-3 7-6(4),1
3,Brisbane International presented by Evie,339,2024.12.29,A,Hard,32.0,4,QF,3,,o522,Reilly Opelka,d643,Novak Djokovic,,,,,,,,,16.0,8.0,1.0,1.0,11.0,10.0,49.0,53.0,38.0,38.0,13.0,12.0,51.0,50.0,1.0,4.0,1.0,5.0,7-6(6) 6-3,1
4,Brisbane International presented by Evie,339,2024.12.29,A,Hard,32.0,5,QF,3,,d875,Grigor Dimitrov,tc61,Jordan Thompson,,,,,,,,,1.0,1.0,1.0,1.0,5.0,5.0,21.0,12.0,19.0,7.0,5.0,6.0,24.0,13.0,0.0,1.0,0.0,4.0,6-1 2-1,1


In [27]:
matches.loc[matches['tourney_level'] == 'Other', 'tourney_level'] = 'A'

In [29]:
matches.to_csv("./data/PreCleanedMatches/CurrentYear.csv", index=False)