In [2]:

import os
import time
import random
import pandas as pd
from getpass import getuser
from typing import List, Dict, Tuple

from bs4 import BeautifulSoup

# Selenium essentials
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By          
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException


In [3]:

# User-based Windows paths
USER = getuser()

OUTPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# How many players to scrape from the rankings page
N_ROWS_DEFAULT = 5  # adjust as needed

# Global timeouts and retries
PAGELOAD_TIMEOUT = 20
IMPLICIT_WAIT = 2
SLEEP_MIN, SLEEP_MAX = 0.6, 2.0  # polite randomized delay
RETRIES = 3  # number of retries for loading a page

# The schedule of weeks/tournaments/years to scrape
DATEWEEKS_TOURNAMENTS: List[Tuple[str, str, str]] = [
    ("2018-01-15", "Australian Open", "2018"), ("2019-01-14", "Australian Open", "2019"),
    ("2020-01-20", "Australian Open", "2020"), ("2021-02-08", "Australian Open", "2021"),
    ("2022-01-17", "Australian Open", "2022"), ("2023-01-16", "Australian Open", "2023"),
    ("2018-05-21", "Roland Garros", "2018"), ("2019-05-20", "Roland Garros", "2019"),
    ("2020-09-21", "Roland Garros", "2020"), ("2021-05-24", "Roland Garros", "2021"),
    ("2022-05-16", "Roland Garros", "2022"), ("2023-05-22", "Roland Garros", "2023"),
    ("2018-09-24", "US Open", "2018"), ("2019-08-26", "US Open", "2019"),
    ("2020-08-31", "US Open", "2020"), ("2021-08-30", "US Open", "2021"),
    ("2022-08-22", "US Open", "2022"), ("2023-08-28", "US Open", "2023"),
    ("2018-07-02", "Wimbledon", "2018"), ("2019-07-01", "Wimbledon", "2019"),
    ("2021-06-28", "Wimbledon", "2021"), ("2022-06-27", "Wimbledon", "2022"),
    ("2023-07-03", "Wimbledon", "2023")
]


In [4]:
# --- Cell 2: Selenium driver configuration ---

def configure_driver(headless: bool = True) -> Firefox:
    opts = Options()
    opts.add_argument("--headless=new")  # headless; remove if you want to see the browser

    # IMPORTANT: don't pass executable_path; Selenium Manager will fetch a correct geckodriver
    service = Service()  # no path

    drv = Firefox(service=service, options=opts)
    drv.set_page_load_timeout(PAGELOAD_TIMEOUT)
    drv.implicitly_wait(IMPLICIT_WAIT)
    return drv


def random_sleep(min_seconds: float = SLEEP_MIN, max_seconds: float = SLEEP_MAX) -> None:
    time.sleep(random.uniform(min_seconds, max_seconds))


In [5]:
# --- Cell 3: HTML parsing helpers ---

def parse_rankings_table(html: str, max_rows: int) -> List[Dict]:
    """
    Parse the rankings HTML (page_source) to extract:
    Rank, Player, Player Profile Link. 'Tourns' will be filled using Selenium column later.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("tbody")

    if not table:
        return []

    rows = table.find_all("tr")
    out = []
    for row in rows[:max_rows]:
        rank_cell = row.find("td", class_="rank")
        player_cell = row.find("td", class_="player")
        # Fallback to more specific classes if needed:
        if not rank_cell:
            rank_cell = row.find("td", class_="rank bold heavy tiny-cell")
        if not player_cell:
            player_cell = row.find("td", class_="player bold heavy large-cell")

        if not (rank_cell and player_cell):
            continue

        link_tag = player_cell.find("a")
        profile_link = ""
        if link_tag and link_tag.get("href"):
            href = link_tag.get("href").strip()
            profile_link = href if href.startswith("http") else "https://www.atptour.com" + href

        out.append({
            "Rank": rank_cell.get_text(strip=True),
            "Player": player_cell.get_text(strip=True),
            "Player Profile Link": profile_link
        })
    return out


def parse_player_profile_overview(soup: BeautifulSoup) -> Dict:
    """
    Extract 'W-L YTD', 'W-L Career', 'Titles YTD', 'Titles Career'
    """
    data = {}
    wins = soup.find_all("div", class_='wins')
    titles = soup.find_all("div", class_='titles')

    for timerange, win in zip(['YTD', 'Career'], wins):
        # original code stripped 4 chars; safer: split by whitespace or use replace
        data[f'W-L {timerange}'] = win.get_text(strip=True).replace("W-L", "").strip()

    for timerange, title in zip(['YTD', 'Career'], titles):
        data[f'Titles {timerange}'] = title.get_text(strip=True).replace("Titles", "").strip()

    return data


def parse_player_profile_details(soup: BeautifulSoup) -> Dict:
    """
    Extract key/value pairs from the left/right detail lists (ul.pd_left / ul.pd_right).
    Skips 'Follow player'.
    """
    data = {}
    for html_class in ("pd_left", "pd_right"):
        section = soup.find("ul", class_=html_class)
        if not section:
            continue
        for item in section.find_all("li"):
            spans = item.find_all("span")
            if len(spans) > 1:
                key = spans[0].get_text(strip=True)
                if key == "Follow player":
                    continue
                value = spans[1].get_text(strip=True)
                data[key] = value
    return data


In [6]:
# --- Cell 4: Scrape top-N player rows (rank, player, links, 'Tourns') ---

def scrape_player_urls(dateweek: str, tournament: str, year: str, n_rows: int = N_ROWS_DEFAULT) -> List[Dict]:
    ranking_url = f"https://www.atptour.com/en/rankings/doubles?RankRange=1-100&Region=all&DateWeek={dateweek}"
    driver = configure_driver()
    try:
        driver.get(ranking_url)
        random_sleep()

        # Wait for the table to render
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "tbody")))

        # Parse the HTML table (Rank / Player / Link)
        page_source = driver.page_source
        players = parse_rankings_table(page_source, n_rows)

        # Collect the “Tourns” column using Selenium (it may be dynamic)
        tourn_cells = driver.find_elements(By.XPATH, "//td[contains(@class,'tourns')]")
        # Align with the first n_rows
        tourn_texts = [cell.text.strip() for cell in tourn_cells[:len(players)]]

        # Merge back and add static info
        merged = []
        for i, p in enumerate(players):
            p = p.copy()
            p["Tourns"] = tourn_texts[i] if i < len(tourn_texts) else ""
            p["Tournament"] = tournament
            p["Year"] = year
            p["DateWeek"] = dateweek
            merged.append(p)

        return merged

    except (WebDriverException, TimeoutException) as e:
        print(f"[Rankings] Error for {dateweek} {tournament} {year}: {e}")
        return []
    finally:
        driver.quit()


In [7]:
# --- Cell 5: Scrape a single player profile page ---

def scrape_player_profile(profile_link: str) -> Dict:
    driver = configure_driver()
    try:
        for attempt in range(1, RETRIES + 1):
            try:
                driver.get(profile_link)
                # Wait for any profile section to appear
                WebDriverWait(driver, 15).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "ul.pd_left, ul.pd_right, div.wins, div.titles"))
                )
                random_sleep(0.4, 1.2)
                soup = BeautifulSoup(driver.page_source, "html.parser")

                data = {}
                data.update(parse_player_profile_overview(soup))
                data.update(parse_player_profile_details(soup))
                return data

            except (TimeoutException, WebDriverException) as e:
                print(f"[Profile] Attempt {attempt}/{RETRIES} failed: {e}")
                random_sleep(1.5, 3.0)

        print(f"[Profile] Failed to scrape after {RETRIES} attempts: {profile_link}")
        return {}

    except WebDriverException as e:
        print(f"[Profile] Critical driver error: {e}")
        return {}
    finally:
        driver.quit()


In [8]:
# --- Cell 6: Save results to Excel ---

def save_player_info_to_excel(records: List[Dict], dateweek: str, out_dir: str = OUTPUT_DIR) -> str:
    if not records:
        print(f"[Save] No data to save for {dateweek}.")
        return ""

    df = pd.DataFrame(records)
    out_path = os.path.join(out_dir, f"ranking_doubles_{dateweek}.xlsx")
    df.to_excel(out_path, index=False)
    print(f"[Save] {len(df)} rows saved to: {out_path}")
    return out_path


In [9]:
# --- Cell 8: Full run over DATEWEEKS_TOURNAMENTS (self-contained, no run_one_week) ---

def run_full_schedule(dateweeks_tournaments=DATEWEEKS_TOURNAMENTS, n_rows=N_ROWS_DEFAULT) -> None:
    for dateweek, tournament, year in dateweeks_tournaments:
        try:
            # 1) Rankings page: collect players
            players = scrape_player_urls(dateweek, tournament, year, n_rows=n_rows)
            print(f"[Week] {dateweek} {tournament} {year}: found {len(players)} players.")

            # 2) For each player, enrich with profile details
            for idx, p in enumerate(players, start=1):
                link = p.get("Player Profile Link") or ""
                if link:
                    profile = scrape_player_profile(link)
                    p.update(profile)
                print(f"    - ({idx}/{len(players)}) {p.get('Player','?')} processed.")
                random_sleep(0.5, 1.3)

            # 3) Save results
            if players:
                save_player_info_to_excel(players, dateweek, out_dir=OUTPUT_DIR)
            else:
                print(f"[Week] No records for {dateweek}. Skipping save.")

        except Exception as e:
            print(f"[Week] Unexpected error for {dateweek} {tournament} {year}: {e}")
            continue


In [10]:
# --- Cell 9: Kick it off ---

# Start with a small test to verify everything runs:
# test_subset = DATEWEEKS_TOURNAMENTS[:1]
# run_full_schedule(test_subset, n_rows=5)

# Run full list when ready:
run_full_schedule(DATEWEEKS_TOURNAMENTS, n_rows=100)


[Week] 2018-01-15 Australian Open 2018: found 99 players.
    - (1/99) L. Kubot processed.
    - (2/99) M. Melo processed.
    - (3/99) H. Kontinen processed.
    - (4/99) J. Peers processed.
    - (5/99) I. Dodig processed.
    - (6/99) N. Mahut processed.
    - (7/99) J. Rojer processed.
    - (8/99) H. Tecau processed.
    - (9/99) J. Murray processed.
    - (10/99) B. Soares processed.
    - (11/99) P. Herbert processed.
    - (12/99) B. Bryan processed.
    - (13/99) M. Bryan processed.
    - (14/99) M. Granollers processed.
    - (15/99) M. Venus processed.
    - (16/99) M. Pavic processed.
    - (17/99) 1O. Marach processed.
    - (18/99) -1R. Harrison processed.
    - (19/99) R. Bopanna processed.
    - (20/99) M. Lopez processed.
    - (21/99) P. Cuevas processed.
    - (22/99) R. Ram processed.
    - (23/99) J. Cabal processed.
    - (24/99) F. Lopez processed.
    - (25/99) R. Klaasen processed.
    - (26/99) E. Roger-Vasselin processed.
    - (27/99) R. Farah processed.
   