In [144]:
# =========================
# LIBRARIES
# =========================
import os
import re
import time
import random
import pandas as pd
from getpass import getuser
from typing import List, Dict, Tuple, Optional

from bs4 import BeautifulSoup

# Selenium essentials
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

In [145]:
DEBUG = False

In [146]:
def dbg(*args):
    if DEBUG:
        print(*args)


In [147]:
# =========================
# CONFIG
# =========================
USER = getuser()

OUTPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp"
os.makedirs(OUTPUT_DIR, exist_ok=True)

N_ROWS_DEFAULT = 1000

PAGELOAD_TIMEOUT = 20
IMPLICIT_WAIT = 2
SLEEP_MIN, SLEEP_MAX = 0.6, 2.0
RETRIES = 3

DATEWEEKS_TOURNAMENTS: List[Tuple[str, str, str]] = [
    ("2018-01-15", "Australian Open", "2018"), ("2019-01-14", "Australian Open", "2019"),
    ("2020-01-20", "Australian Open", "2020"), ("2021-02-08", "Australian Open", "2021"),
    ("2022-01-17", "Australian Open", "2022"), ("2023-01-16", "Australian Open", "2023"),
    ("2018-05-21", "Roland Garros", "2018"), ("2019-05-20", "Roland Garros", "2019"),
    ("2020-09-21", "Roland Garros", "2020"), ("2021-05-24", "Roland Garros", "2021"),
    ("2022-05-16", "Roland Garros", "2022"), ("2023-05-22", "Roland Garros", "2023"),
    ("2018-09-24", "US Open", "2018"), ("2019-08-26", "US Open", "2019"),
    ("2020-08-31", "US Open", "2020"), ("2021-08-30", "US Open", "2021"),
    ("2022-08-22", "US Open", "2022"), ("2023-08-28", "US Open", "2023"),
    ("2018-07-02", "Wimbledon", "2018"), ("2019-07-01", "Wimbledon", "2019"),
    ("2021-06-28", "Wimbledon", "2021"), ("2022-06-27", "Wimbledon", "2022"),
    ("2023-07-03", "Wimbledon", "2023")
]


In [148]:
# =========================
# DRIVER + UTILS
# =========================
def configure_driver(headless: bool = True) -> Firefox:
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")

    # Your local geckodriver path
    service = Service(executable_path=f"C:/Users/{USER}/Downloads/geckodriver.exe")

    drv = Firefox(service=service, options=opts)
    drv.set_page_load_timeout(PAGELOAD_TIMEOUT)
    drv.implicitly_wait(IMPLICIT_WAIT)
    return drv


def random_sleep(min_seconds: float = SLEEP_MIN, max_seconds: float = SLEEP_MAX) -> None:
    time.sleep(random.uniform(min_seconds, max_seconds))


In [149]:
# =========================
# RANKINGS TABLE PARSER
# =========================
def parse_rankings_table(html: str, max_rows: int) -> List[Dict]:
    """
    Parse rankings HTML to extract: Rank, Player, Player Profile Link.
    """
    soup = BeautifulSoup(html, "html.parser")
    tbody = soup.find("tbody")
    if not tbody:
        return []

    rows = tbody.find_all("tr")
    out: List[Dict] = []

    for row in rows[:max_rows]:
        rank_cell = row.find("td", class_="rank")
        player_cell = row.find("td", class_="player")

        if not rank_cell:
            rank_cell = row.find("td", class_="rank bold heavy tiny-cell")
        if not player_cell:
            player_cell = row.find("td", class_="player bold heavy large-cell")

        if not (rank_cell and player_cell):
            continue

        link_tag = player_cell.find("a")
        profile_link = ""
        if link_tag and link_tag.get("href"):
            href = link_tag.get("href").strip()
            profile_link = href if href.startswith("http") else "https://www.atptour.com" + href

        out.append({
            "Rank": rank_cell.get_text(strip=True),
            "Player": player_cell.get_text(strip=True),
            "Player Profile Link": profile_link
        })

    return out

In [150]:
# =========================
# PROFILE PARSERS (DOUBLES)
# =========================
def parse_player_profile_overview(soup: BeautifulSoup) -> Dict:
    """
    Extract tab-dependent overview: W-L YTD/Career and Titles YTD/Career.
    WARNING: values depend on the currently active tab (Singles/Doubles).
    """
    data: Dict = {}
    wins = soup.find_all("div", class_="wins")
    titles = soup.find_all("div", class_="titles")

    for timerange, win in zip(["YTD", "Career"], wins):
        data[f"W-L {timerange}"] = win.get_text(strip=True).replace("W-L", "").strip()

    for timerange, title in zip(["YTD", "Career"], titles):
        data[f"Titles {timerange}"] = title.get_text(strip=True).replace("Titles", "").strip()

    return data


def parse_player_profile_details(soup: BeautifulSoup) -> Dict:
    """
    Extract key/value pairs from ul.pd_left and ul.pd_right (DOB, Turned Pro, Height, Weight, Plays, Coach, etc.)
    """
    data: Dict = {}
    for html_class in ("pd_left", "pd_right"):
        section = soup.find("ul", class_=html_class)
        if not section:
            continue
        for item in section.find_all("li"):
            spans = item.find_all("span")
            if len(spans) > 1:
                key = spans[0].get_text(strip=True)
                if key == "Follow player":
                    continue
                value = spans[1].get_text(strip=True)
                data[key] = value
    return data


In [151]:
# =========================
# ROBUST TAB SWITCHING
# =========================
def click_tab(driver: Firefox, tab_name: str, timeout: int = 12) -> None:
    """
    Click Singles/Doubles tab and WAIT until it becomes active.
    """
    wait = WebDriverWait(driver, timeout)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a.tab-switcher-link")))

    tab_xpath = f"//a[contains(@class,'tab-switcher-link')][contains(normalize-space(.), '{tab_name}')]"
    el = wait.until(EC.element_to_be_clickable((By.XPATH, tab_xpath)))
    driver.execute_script("arguments[0].click();", el)

    # Wait until active/selected (ATP markup varies; check common patterns)
    def _is_active(drv):
        try:
            e = drv.find_element(By.XPATH, tab_xpath)
            cls = (e.get_attribute("class") or "").lower()
            aria = (e.get_attribute("aria-selected") or "").lower()
            return ("active" in cls) or (aria == "true")
        except Exception:
            return False

    wait.until(_is_active)
    time.sleep(0.5)  # buffer for re-render

In [152]:
# =========================
# DEBUGGING + WAITING UTILS
# =========================


def dump_dom_snapshot(driver, label: str, enabled: bool = True, max_chars: int = 800):
    """Print small, safe snippets that help confirm we are on the right tab."""
    if not enabled:
        return
    html = driver.page_source
    dbg(f"\n--- SNAPSHOT [{label}] ---", enabled)
    dbg(f"URL: {driver.current_url}", enabled)
    dbg(f"HTML length: {len(html)}", enabled)
    # quick “fingerprints”
    m = re.search(r"Career High Rank", html)
    dbg(f"Contains 'Career High Rank'? {bool(m)}", enabled)
    dbg(f"First {max_chars} chars:\n{html[:max_chars]}\n", enabled)

def active_tab_name(driver) -> Optional[str]:
    """
    Tries to infer which tab is active.
    This is heuristic because ATP-like pages vary.
    """
    try:
        tabs = driver.find_elements(By.CSS_SELECTOR, "a.tab-switcher-link")
        for t in tabs:
            cls = (t.get_attribute("class") or "").lower()
            if "active" in cls or "is-active" in cls or "selected" in cls:
                return t.text.strip()
        # fallback: return all tab texts if no active class
        return None
    except Exception:
        return None

def wait_for_tab_content_change(driver, before_html: str, timeout: float = 8.0) -> bool:
    """Wait until page_source changes meaningfully after a tab click."""
    end = time.time() + timeout
    while time.time() < end:
        after_html = driver.page_source
        if after_html != before_html and abs(len(after_html) - len(before_html)) > 50:
            return True
        time.sleep(0.15)
    return False

def print_context_around(html: str, needle: str, window: int = 400, enabled: bool = True):
    if not enabled:
        return
    idx = html.find(needle)
    dbg(f"[CTX] '{needle}' index in HTML: {idx}")
    if idx == -1:
        return
    start = max(0, idx - window)
    end = min(len(html), idx + len(needle) + window)
    snippet = html[start:end]
    dbg(f"[CTX] HTML around '{needle}' (len={len(snippet)}):\n{snippet}\n")

from bs4 import BeautifulSoup

def print_text_context(html: str, needle: str, window: int = 200, enabled: bool = True):
    if not enabled:
        return
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(" ", strip=True)
    idx = text.find(needle)
    dbg(f"[CTX-TXT] '{needle}' index in TEXT: {idx}")
    if idx == -1:
        return
    start = max(0, idx - window)
    end = min(len(text), idx + len(needle) + window)
    dbg(f"[CTX-TXT] TEXT around '{needle}':\n{text[start:end]}\n")



In [153]:
# =========================
# SINGLES: DOM LABEL->VALUE EXTRACTOR + PARSER
# =========================
def extract_label_value_stats(soup: BeautifulSoup) -> Dict[str, str]:
    """
    Generic extractor of label/value stat blocks.
    We try several selector patterns used across ATP layouts.
    """
    stats: Dict[str, str] = {}

    # Candidate containers
    candidates = []
    candidates += soup.select("div.stat")
    candidates += soup.select("div.stat-item")
    candidates += soup.select("div.player-stats-item")
    if not candidates:
        candidates = soup.select("div[class*='stat']")

    for c in candidates:
        label_el = (
            c.select_one(".stat-label")
            or c.select_one(".label")
            or c.select_one("span.label")
            or c.find("div", class_=re.compile("label", re.I))
            or c.find("span", class_=re.compile("label", re.I))
        )
        value_el = (
            c.select_one(".stat-value")
            or c.select_one(".value")
            or c.select_one("span.value")
            or c.find("div", class_=re.compile("value", re.I))
            or c.find("span", class_=re.compile("value", re.I))
        )

        if not label_el or not value_el:
            continue

        label = label_el.get_text(" ", strip=True)
        value = value_el.get_text(" ", strip=True)

        if label and value:
            stats[label] = value

    return stats

In [154]:

def parse_singles_from_page_source(html: str) -> dict:
    soup = BeautifulSoup(html, "html.parser")

    out = {
        "Singles_Career_High_Rank": None,
        "Singles_Date_Career_High_Rank": None,
        "Singles_WL_Career": None,
        "Singles_Titles_Career": None,
    }

    # 1) pick the correct "Career" box among multiple player-stats-details
    boxes = soup.select("div.player-stats-details")
    career_box = None
    for b in boxes:
        t = b.select_one("div.type")
        if t and t.get_text(" ", strip=True).lower() == "career":
            career_box = b
            break

    if career_box is None:
        # fallback: sometimes the word Career is present in the box text
        for b in boxes:
            if "Career High Rank" in b.get_text(" ", strip=True):
                career_box = b
                break

    if career_box is None:
        return out

    # 2) Career High Rank (value comes before label span)
    stat = career_box.select_one("div.stat")
    if stat:
        full = stat.get_text(" ", strip=True)  # e.g. "41 Career High Rank (2010.04.12)"
        m_rank = re.search(r"\b(\d{1,4})\b", full)
        if m_rank:
            out["Singles_Career_High_Rank"] = int(m_rank.group(1))

        m_date = re.search(r"\((\d{4}\.\d{2}\.\d{2})\)", full)
        if m_date:
            out["Singles_Date_Career_High_Rank"] = m_date.group(1)

    # 3) Career W-L
    wins = career_box.select_one("div.wins")
    if wins:
        full = wins.get_text(" ", strip=True)  # e.g. "97 - 130 W-L"
        m = re.search(r"\b(\d+)\s*-\s*(\d+)\b", full)
        if m:
            out["Singles_WL_Career"] = f"{m.group(1)}-{m.group(2)}"

    # 4) Career Titles
    titles = career_box.select_one("div.titles")
    if titles:
        full = titles.get_text(" ", strip=True)  # e.g. "0 Titles"
        m = re.search(r"\b(\d+)\b", full)
        if m:
            out["Singles_Titles_Career"] = int(m.group(1))

    return out



In [155]:
# =========================
# SCRAPE RANKINGS PAGE (DOUBLES)
# =========================
def scrape_player_urls(dateweek: str, tournament: str, year: str, n_rows: int = N_ROWS_DEFAULT) -> List[Dict]:
    ranking_url = f"https://www.atptour.com/en/rankings/doubles?RankRange=1-1000&Region=all&DateWeek={dateweek}"
    driver = configure_driver()
    try:
        driver.get(ranking_url)
        random_sleep()

        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "tbody")))
        players = parse_rankings_table(driver.page_source, n_rows)

        # "Tourns" dynamic column via Selenium
        tourn_cells = driver.find_elements(By.XPATH, "//td[contains(@class,'tourns')]")
        tourn_texts = [cell.text.strip() for cell in tourn_cells[:len(players)]]

        merged = []
        for i, p in enumerate(players):
            pp = p.copy()
            pp["Tourns"] = tourn_texts[i] if i < len(tourn_texts) else ""
            pp["Tournament"] = tournament
            pp["Year"] = year
            pp["DateWeek"] = dateweek
            merged.append(pp)

        return merged

    except (WebDriverException, TimeoutException) as e:
        print(f"[Rankings] Error for {dateweek} {tournament} {year}: {e}")
        return []
    finally:
        driver.quit()

In [156]:
# =========================
# SCRAPE PLAYER PROFILE (DOUBLES + SINGLES)
# =========================
def scrape_player_profile(profile_link: str, debug: bool = True) -> Dict:
    driver = configure_driver()
    try:
        for attempt in range(1, RETRIES + 1):
            try:
                dbg(f"\n[Profile] Attempt {attempt}/{RETRIES} -> {profile_link}", debug)
                driver.get(profile_link)

                WebDriverWait(driver, 12).until(
                    EC.presence_of_all_elements_located(
                        (By.CSS_SELECTOR, "ul.pd_left, ul.pd_right, a.tab-switcher-link")
                    )
                )
                dbg("[Profile] Base elements loaded (pd_left/pd_right/tabs).", debug)

                data: Dict = {}

                # 1) DOUBLES first
                try:
                    before = driver.page_source
                    dbg("[Doubles] Clicking tab...", debug)
                    click_tab(driver, "Doubles", timeout=12)
                    changed = wait_for_tab_content_change(driver, before, timeout=8.0)
                    dbg(f"[Doubles] DOM changed after click? {changed}", debug)
                    dbg(f"[Doubles] Active tab (heuristic): {active_tab_name(driver)}", debug)
                except Exception as e:
                    dbg(f"[Doubles] Click failed (continuing): {repr(e)}", debug)

                soup = BeautifulSoup(driver.page_source, "html.parser")
                doubles_overview = parse_player_profile_overview(soup)
                doubles_details = parse_player_profile_details(soup)

                data.update({f"Doubles_{k}": v for k, v in doubles_overview.items()})
                data.update(doubles_details)

                dbg(f"[Doubles] Parsed overview keys: {list(doubles_overview.keys())}", debug)
                dbg(f"[Doubles] Parsed details keys: {list(doubles_details.keys())}", debug)

                # 2) SINGLES (debuggable)
                singles_data = {
                    "Singles_Career_High_Rank": None,
                    "Singles_WL_Career": None,
                    "Singles_Titles_Career": None
                }

                try:
                    dbg("[Singles] Clicking tab...", debug)
                    before = driver.page_source
                    click_tab(driver, "Singles", timeout=12)

                    changed = wait_for_tab_content_change(driver, before, timeout=10.0)
                    dbg(f"[Singles] DOM changed after click? {changed}", debug)
                    dbg(f"[Singles] Active tab (heuristic): {active_tab_name(driver)}", debug)

                    # IMPORTANT: wait for something *specific* to singles panel
                    # If your site has a clear container, replace selector accordingly.
                    # Example: "div.player-stats-details" (you mentioned this earlier)
                    wait_selectors = [
                        "div.player-stats-details",
                        "div.wins", "div.titles",
                        "div[class*='stat']"
                    ]

                    found_any = False
                    for sel in wait_selectors:
                        try:
                            WebDriverWait(driver, 6).until(
                                EC.presence_of_element_located((By.CSS_SELECTOR, sel))
                            )
                            dbg(f"[Singles] Found selector: {sel}", debug)
                            found_any = True
                            break
                        except Exception:
                            continue

                    dbg(f"[Singles] Found any expected selector? {found_any}", debug)
                    dump_dom_snapshot(driver, "after_singles_click", enabled=debug, max_chars=600)

                    html = driver.page_source
                    print_context_around(html, "Career High Rank", enabled=debug)
                    print_text_context(html, needle="Career High Rank", enabled=debug)
                    singles_data = parse_singles_from_page_source(html)

                    if debug:
                        soup = BeautifulSoup(driver.page_source, "html.parser")
                        boxes = soup.select("div.player-stats-details")
                        print(f"[TEST] player-stats-details count: {len(boxes)}")
                        for i, b in enumerate(boxes):
                            t = b.select_one("div.type")
                            dbg(f"[TEST] box[{i}] type:", t.get_text(" ", strip=True) if t else None)
                            dbg(f"[TEST] box[{i}] text head:", b.get_text(" ", strip=True)[:80])


                    dbg(f"[Singles] Parsed singles_data: {singles_data}", debug)

                    # sanity checks
                    if all(v is None for v in singles_data.values()):
                        dbg("[Singles][WARN] All parsed Singles values are None. Likely wrong DOM/text.", debug)

                except Exception as e:
                    dbg(f"[Singles] Failed: {repr(e)}", debug)
                    # keep fallback None values

                data.update(singles_data)
                return data

            except (TimeoutException, WebDriverException) as e:
                dbg(f"[Profile] Attempt {attempt}/{RETRIES} failed: {repr(e)}", debug)
                random_sleep(1.5, 3.0)

        dbg(f"[Profile] Failed after {RETRIES} attempts: {profile_link}", debug)
        return {}

    finally:
        driver.quit()


In [157]:
# =========================
# SAVE TO EXCEL
# =========================
def save_player_info_to_excel(
    records: List[Dict],
    dateweek: str,
    out_dir: str = OUTPUT_DIR,
    suffix: str = ""
) -> str:
    if not records:
        print(f"[Save] No data to save for {dateweek}.")
        return ""

    df_out = pd.DataFrame(records)
    fname = f"ranking_doubles_{dateweek}{suffix}.xlsx"
    out_path = os.path.join(out_dir, fname)

    df_out.to_excel(out_path, index=False)
    print(f"[Save] {len(df_out)} rows saved to: {out_path}")
    return out_path


# =========================
# FULL RUN
# =========================
def run_full_schedule(
    dateweeks_tournaments=DATEWEEKS_TOURNAMENTS,
    n_rows=N_ROWS_DEFAULT,
    test_run: bool = False
) -> None:
    for dateweek, tournament, year in dateweeks_tournaments:
        try:
            players = scrape_player_urls(dateweek, tournament, year, n_rows=n_rows)
            print(f"[Week] {dateweek} {tournament} {year}: found {len(players)} players.")

            for idx, p in enumerate(players, start=1):
                link = p.get("Player Profile Link") or ""
                if link:
                    profile = scrape_player_profile(link)
                    p.update(profile)

                print(f"    - ({idx}/{len(players)}) {p.get('Player','?')} processed.")
                random_sleep(0.5, 1.3)

            if players:
                suffix = "_TEST" if test_run else ""
                save_player_info_to_excel(players, dateweek, suffix=suffix)

        except Exception as e:
            print(f"[Week] Unexpected error for {dateweek} {tournament} {year}: {e}")
            continue



In [158]:
# =========================
# TEST RUN (10 players)
# =========================
# test_subset = DATEWEEKS_TOURNAMENTS[:1]
# run_full_schedule(test_subset, n_rows=1, test_run=True)

# Full run when ready:
run_full_schedule(DATEWEEKS_TOURNAMENTS, n_rows=100, test_run=False)

[Week] 2018-01-15 Australian Open 2018: found 99 players.
[TEST] player-stats-details count: 2
    - (1/99) L. Kubot processed.
[TEST] player-stats-details count: 2
    - (2/99) M. Melo processed.
[TEST] player-stats-details count: 2
    - (3/99) H. Kontinen processed.
[TEST] player-stats-details count: 2
    - (4/99) J. Peers processed.
[TEST] player-stats-details count: 2
    - (5/99) I. Dodig processed.
[TEST] player-stats-details count: 2
    - (6/99) N. Mahut processed.
[TEST] player-stats-details count: 2
    - (7/99) J. Rojer processed.
[TEST] player-stats-details count: 2
    - (8/99) H. Tecau processed.
[TEST] player-stats-details count: 2
    - (9/99) J. Murray processed.
[TEST] player-stats-details count: 2
    - (10/99) B. Soares processed.
[TEST] player-stats-details count: 2
    - (11/99) P. Herbert processed.
[TEST] player-stats-details count: 2
    - (12/99) B. Bryan processed.
[TEST] player-stats-details count: 2
    - (13/99) M. Bryan processed.
[TEST] player-stats-de

KeyboardInterrupt: 