In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import random
import time
from datetime import datetime, timedelta
import pandas as pd
import json
import requests
import re
import os
import numpy as np
import pickle
from scraping_utils import get_tournament, create_driver, get_live_player_rankings, extract_results_url, get_tournament_results, get_stat_names, get_stats

## Get Tournaments


Tournaments are fetched from the ATP results-archive link below. This returns all finished tournaments.

In [2]:
                                                              # Launch browser
# ATP tournament archive page for the 2025 season, wrapped in a list because
# that is the form handed to get_tournament below.
tourn_archive_url = [
    "https://www.atptour.com/en/scores/results-archive?year=2025",
]
# get_tournament comes from scraping_utils; save_cache=False is forwarded as-is
# (its exact effect is defined there, not in this notebook).
df_tournaments = get_tournament(
    tourn_archive_url=tourn_archive_url,
    save_cache=False,
)


In [3]:
# Bare last expression: displays the tournaments table built in the previous cell.
df_tournaments

Unnamed: 0,id,name,level,location,end_date,url
0,339,Brisbane International presented by Evie,ATP 250,"Brisbane, Australia",2025-01-05,https://www.atptour.com/en/tournaments/brisban...
1,336,Bank of China Hong Kong Tennis Open,ATP 250,"Hong Kong, Hong Kong",2025-01-05,https://www.atptour.com/en/tournaments/hong-ko...
2,8998,Adelaide International,ATP 250,"Adelaide, Australia",2025-01-11,https://www.atptour.com/en/tournaments/adelaid...
3,301,ASB Classic,ATP 250,"Auckland, New Zealand",2025-01-11,https://www.atptour.com/en/tournaments/aucklan...
4,580,Australian Open,Grand Slam,"Melbourne, Australia",2025-01-26,https://www.atptour.com/en/tournaments/austral...
5,375,Open Occitanie,ATP 250,"Montpellier, France",2025-02-02,https://www.atptour.com/en/tournaments/montpel...
6,424,Dallas Open,ATP 500,"Dallas, United States",2025-02-09,https://www.atptour.com/en/tournaments/dallas/...
7,407,ABN AMRO Open,ATP 500,"Rotterdam, Netherlands",2025-02-09,https://www.atptour.com/en/tournaments/rotterd...
8,496,Open 13 Provence,ATP 250,"Marseille, France",2025-02-16,https://www.atptour.com/en/tournaments/marseil...
9,499,Delray Beach Open,ATP 250,"Delray Beach, United States",2025-02-16,https://www.atptour.com/en/tournaments/delray-...


## Get Top 500

In [4]:
# Fetch the live rankings (called with 500), keep only the 'Name' and 'id'
# columns, and rename 'Name' to lower-case 'name'.
df_top_500_players = (
    get_live_player_rankings(500)
    .filter(['Name', 'id'])
    .rename(columns={'Name': 'name'})
)

In [9]:
# Bare last expression: displays the name/id table of ranked players.
df_top_500_players

Unnamed: 0,name,id
0,J. Sinner,s0ag
1,C. Alcaraz,a0e2
2,A. Zverev,z355
3,T. Fritz,fb98
4,J. Draper,d0co
...,...,...
493,E. Vasa,v993
494,J. Delaney,dc38
495,F. Peliwo,pf65
496,C. Sigsgaard,sy00


## Get Results

### Get URLs for Automation

This step reads the last-scraped date saved by a previous run and uses it as the start date when collecting results URLs.

In [None]:
date_file = "s3://matchedge-pipeline/logs/last_scraped_date.csv"        # Path to the file storing the last scraped date


def _read_last_scraped_date(path):
    """Return the first parseable date stored in the first column of ``path``.

    The file is opened through pandas instead of being probed with
    ``os.path.exists``: that function only inspects the local filesystem, so
    it is always False for an ``s3://`` URL and the date would never be read.

    Args:
        path: Location of the CSV (a local path or a URL pandas can open).

    Returns:
        pandas.Timestamp: the first value in column 0 that parses as a date.

    Raises:
        FileNotFoundError: pandas / the storage backend reported the file as missing.
        ValueError: the file is empty or contains no parseable date.
    """
    try:
        # header=None: Step 3 writes the file WITHOUT a header row, so the
        # first line is data and must not be consumed as column names.
        # dtype=str keeps the raw text so date parsing happens in one place.
        stored = pd.read_csv(path, header=None, dtype=str)
    except FileNotFoundError as exc:
        raise FileNotFoundError(
            f"{path} not found. Cannot determine start date for scraping results."
        ) from exc
    except pd.errors.EmptyDataError as exc:
        raise ValueError(
            f"{path} is empty. Cannot determine start date for scraping results."
        ) from exc

    # Non-date cells become NaT and are dropped, so a hand-seeded file that
    # still carries a header line is tolerated as well.
    parsed = pd.to_datetime(stored.iloc[:, 0], errors="coerce").dropna()
    if parsed.empty:
        raise ValueError(
            f"{path} contains no parseable date. Cannot determine start date for scraping results."
        )
    return parsed.iloc[0]


# --- Step 1: Read the last scrape date (fails loudly if it cannot be determined) ---
start_date = _read_last_scraped_date(date_file)

# --- Step 2: Use start_date in your scraping function ---
results_url = extract_results_url(df_tournament=df_tournaments, start_date=start_date)

# --- Step 3: Save today's date to file for next run ---
# NOTE(review): the date is recorded before the results/stats cells below have
# run; if one of them fails, the next run will still start from today.
# `today` is also reused by the Save cell for output file names.
today = datetime.today().date()
pd.DataFrame([today]).to_csv(date_file, index=False, header=False)

In [7]:
# Bare last expression: displays the results URLs collected in the previous cell.
results_url

['https://www.atptour.com/en/scores/archive/madrid/1536/2025/results',
 'https://www.atptour.com/en/scores/archive/rome/416/2025/results',
 'https://www.atptour.com/en/scores/archive/hamburg/414/2025/results',
 'https://www.atptour.com/en/scores/archive/geneva/322/2025/results',
 'https://www.atptour.com/en/scores/archive/roland-garros/520/2025/results',
 'https://www.atptour.com/en/scores/archive/stuttgart/321/2025/results',
 'https://www.atptour.com/en/scores/archive/s-hertogenbosch/440/2025/results',
 'https://www.atptour.com/en/scores/archive/london/311/2025/results',
 'https://www.atptour.com/en/scores/archive/halle/500/2025/results',
 'https://www.atptour.com/en/scores/archive/mallorca/8994/2025/results',
 'https://www.atptour.com/en/scores/archive/eastbourne/741/2025/results',
 'https://www.atptour.com/en/scores/archive/wimbledon/540/2025/results',
 'https://www.atptour.com/en/scores/archive/los-cabos/7480/2025/results',
 'https://www.atptour.com/en/scores/archive/gstaad/314/202

### Get The Results

In [None]:
# Pass the collected results URLs to get_tournament_results (from scraping_utils;
# its scraping behaviour is defined there). The result is later indexed by
# 'stats_link' and written with .to_csv, so it is presumably a DataFrame.
df_tournament_results = get_tournament_results(results_url)

## Get Stats

In [None]:
# Pull the 'stats_link' column out as a plain Python list and hand it to
# get_stats (from scraping_utils).
stat_url = df_tournament_results['stats_link'].tolist()
stats = get_stats(stat_url)

## Save

In [None]:
# Write each table to the raw-data prefix, stamped with `today` (set in the
# results-URL cell above). Write order matches the order the tables were built.
raw_prefix = "s3://matchedge-pipeline/data/raw"
df_tournaments.to_csv(f"{raw_prefix}/All_Tournaments_{today}.csv")
df_top_500_players.to_csv(f"{raw_prefix}/top_500_players_{today}.csv")
df_tournament_results.to_csv(f"{raw_prefix}/all_results_{today}.csv")
stats.to_csv(f"{raw_prefix}/all_stats_GS_{today}.csv")