In [13]:
# ============================================================
# FULL CODE (REWRITTEN) — Doubles scraping preserved + Singles info fixed
# - Robust tab switching (waits until active)
# - Singles parsed via DOM label->value extraction + regex fallback
# - Doubles overview is prefixed to avoid being overwritten
# ============================================================

import os
import re
import time
import random
import pandas as pd
from getpass import getuser
from typing import List, Dict, Tuple

from bs4 import BeautifulSoup

# Selenium essentials
from selenium.webdriver import Firefox
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException


# =========================
# CONFIG
# =========================
# OS account name; interpolated into the local Windows paths used below.
USER = getuser()

# Folder that receives the Excel exports. It is created at import time so the
# save step can write into it without further checks.
OUTPUT_DIR = f"C:/Users/{USER}/Documents/GitHub/tennis-homophily/data/atp"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Default cap on how many ranking rows are parsed per ranking week.
N_ROWS_DEFAULT = 1000

# Passed to driver.set_page_load_timeout() and driver.implicitly_wait().
PAGELOAD_TIMEOUT = 20
IMPLICIT_WAIT = 2
# Default bounds for random_sleep().
SLEEP_MIN, SLEEP_MAX = 0.6, 2.0
# Maximum number of load attempts per player profile.
RETRIES = 3

# Schedule of (ranking DateWeek, tournament name, year) triples; each entry is
# scraped as one ranking week and saved to its own file.
# NOTE(review): ("2018-09-24", "US Open", "2018") is a September date while
# every other US Open entry is in late August — confirm it is intentional.
# NOTE(review): there is no Wimbledon 2020 entry — presumably deliberate; confirm.
DATEWEEKS_TOURNAMENTS: List[Tuple[str, str, str]] = [
    ("2018-01-15", "Australian Open", "2018"), ("2019-01-14", "Australian Open", "2019"),
    ("2020-01-20", "Australian Open", "2020"), ("2021-02-08", "Australian Open", "2021"),
    ("2022-01-17", "Australian Open", "2022"), ("2023-01-16", "Australian Open", "2023"),
    ("2018-05-21", "Roland Garros", "2018"), ("2019-05-20", "Roland Garros", "2019"),
    ("2020-09-21", "Roland Garros", "2020"), ("2021-05-24", "Roland Garros", "2021"),
    ("2022-05-16", "Roland Garros", "2022"), ("2023-05-22", "Roland Garros", "2023"),
    ("2018-09-24", "US Open", "2018"), ("2019-08-26", "US Open", "2019"),
    ("2020-08-31", "US Open", "2020"), ("2021-08-30", "US Open", "2021"),
    ("2022-08-22", "US Open", "2022"), ("2023-08-28", "US Open", "2023"),
    ("2018-07-02", "Wimbledon", "2018"), ("2019-07-01", "Wimbledon", "2019"),
    ("2021-06-28", "Wimbledon", "2021"), ("2022-06-27", "Wimbledon", "2022"),
    ("2023-07-03", "Wimbledon", "2023")
]


# =========================
# DRIVER + UTILS
# =========================
def configure_driver(headless: bool = True) -> Firefox:
    """
    Start a Firefox WebDriver configured with the module-level timeouts.

    Args:
        headless: When True, ask Firefox to run without a visible window.

    Returns:
        A started Firefox driver with the page-load timeout and implicit wait
        applied. The caller owns it and must call ``quit()`` when done.

    Raises:
        Whatever Selenium raises if the browser cannot be started or
        configured; in the latter case the browser is shut down first.
    """
    opts = Options()
    if headless:
        # Firefox's headless switch is "-headless". "--headless=new" is the
        # Chrome/Chromium spelling and is not a documented Firefox flag.
        opts.add_argument("-headless")

    # Prefer the local geckodriver binary. If it is not there, fall back to
    # Selenium's default driver lookup instead of failing on a missing file.
    gecko_path = f"C:/Users/{USER}/Downloads/geckodriver.exe"
    if os.path.isfile(gecko_path):
        service = Service(executable_path=gecko_path)
    else:
        service = Service()

    drv = Firefox(service=service, options=opts)
    try:
        drv.set_page_load_timeout(PAGELOAD_TIMEOUT)
        drv.implicitly_wait(IMPLICIT_WAIT)
    except Exception:
        # Do not leave an orphaned browser process behind if setup fails.
        drv.quit()
        raise
    return drv


def random_sleep(min_seconds: float = SLEEP_MIN, max_seconds: float = SLEEP_MAX) -> None:
    """Pause for a duration drawn uniformly between min_seconds and max_seconds."""
    pause = random.uniform(min_seconds, max_seconds)
    time.sleep(pause)


# =========================
# RANKINGS TABLE PARSER
# =========================
def parse_rankings_table(html: str, max_rows: int) -> List[Dict]:
    """
    Turn rankings-page HTML into a list of player records.

    Only the first <tbody> is read, and only its first `max_rows` rows are
    inspected. Rows lacking a rank cell or a player cell are skipped.

    Returns a list of dicts with the keys "Rank", "Player" and
    "Player Profile Link" (an empty string when the row has no link).
    """
    table_body = BeautifulSoup(html, "html.parser").find("tbody")
    if not table_body:
        return []

    records: List[Dict] = []
    for tr in table_body.find_all("tr")[:max_rows]:
        # Try the short class name first, then the full class string.
        rank_td = tr.find("td", class_="rank") or tr.find("td", class_="rank bold heavy tiny-cell")
        player_td = tr.find("td", class_="player") or tr.find("td", class_="player bold heavy large-cell")
        if not rank_td or not player_td:
            continue

        anchor = player_td.find("a")
        raw_href = anchor.get("href") if anchor else None
        if raw_href:
            cleaned = raw_href.strip()
            # Relative links are resolved against the ATP site root.
            url = cleaned if cleaned.startswith("http") else "https://www.atptour.com" + cleaned
        else:
            url = ""

        records.append({
            "Rank": rank_td.get_text(strip=True),
            "Player": player_td.get_text(strip=True),
            "Player Profile Link": url
        })

    return records


# =========================
# PROFILE PARSERS (DOUBLES)
# =========================
def parse_player_profile_overview(soup: BeautifulSoup) -> Dict:
    """
    Read the overview figures (win-loss record and titles, YTD and Career).

    The first two div.wins / div.titles elements are taken, in document order,
    as the YTD and Career values. The numbers shown depend on which tab
    (Singles/Doubles) is active when the page source was captured.
    """
    periods = ("YTD", "Career")
    overview: Dict = {}

    # (CSS class of the block, label that prefixes the key and is stripped from the text)
    for css_class, label in (("wins", "W-L"), ("titles", "Titles")):
        blocks = soup.find_all("div", class_=css_class)
        for period, block in zip(periods, blocks):
            text = block.get_text(strip=True)
            overview[f"{label} {period}"] = text.replace(label, "").strip()

    return overview


def parse_player_profile_details(soup: BeautifulSoup) -> Dict:
    """
    Collect the label/value pairs listed under ul.pd_left and ul.pd_right.

    Each <li> contributes its first <span> as the key and its second <span>
    as the value. Items with fewer than two spans and the "Follow player"
    entry are ignored.
    """
    details: Dict = {}
    for list_class in ("pd_left", "pd_right"):
        listing = soup.find("ul", class_=list_class)
        if not listing:
            continue

        for li in listing.find_all("li"):
            spans = li.find_all("span")
            if len(spans) < 2:
                continue
            label = spans[0].get_text(strip=True)
            if label != "Follow player":
                details[label] = spans[1].get_text(strip=True)

    return details


# =========================
# ROBUST TAB SWITCHING
# =========================
def click_tab(driver: Firefox, tab_name: str, timeout: int = 12) -> None:
    """
    Activate the Singles/Doubles tab whose text contains `tab_name`.

    The tab is clicked through JavaScript, then the function blocks until the
    tab reports itself as active, followed by a short fixed pause. A Selenium
    timeout propagates to the caller if any wait expires.
    """
    waiter = WebDriverWait(driver, timeout)
    tab_locator = (
        By.XPATH,
        f"//a[contains(@class,'tab-switcher-link')][contains(normalize-space(.), '{tab_name}')]",
    )

    # Make sure the tab strip exists before looking for the specific tab.
    waiter.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a.tab-switcher-link")))
    tab = waiter.until(EC.element_to_be_clickable(tab_locator))
    driver.execute_script("arguments[0].click();", tab)

    def _tab_is_active(drv):
        # Active state is signalled either by an "active" class or aria-selected="true".
        try:
            current = drv.find_element(*tab_locator)
            css_classes = (current.get_attribute("class") or "").lower()
            aria_selected = (current.get_attribute("aria-selected") or "").lower()
        except Exception:
            return False
        return "active" in css_classes or aria_selected == "true"

    waiter.until(_tab_is_active)
    time.sleep(0.5)  # give the page a moment to re-render


# =========================
# SINGLES: DOM LABEL->VALUE EXTRACTOR + PARSER
# =========================
def extract_label_value_stats(soup: BeautifulSoup) -> Dict[str, str]:
    """
    Collect label -> value pairs from stat blocks in the parsed page.

    Stat blocks are div.stat, div.stat-item and div.player-stats-item; if none
    of those exist, any div whose class contains "stat" is used instead.
    Blocks missing a label or value element, or with empty text, are skipped.
    When a label occurs more than once, the last occurrence wins.
    """
    def _pick(node, keyword):
        # Try increasingly loose ways to find the label/value child element.
        class_pattern = re.compile(keyword, re.I)
        return (
            node.select_one(f".stat-{keyword}")
            or node.select_one(f".{keyword}")
            or node.select_one(f"span.{keyword}")
            or node.find("div", class_=class_pattern)
            or node.find("span", class_=class_pattern)
        )

    blocks = [
        *soup.select("div.stat"),
        *soup.select("div.stat-item"),
        *soup.select("div.player-stats-item"),
    ]
    if not blocks:
        blocks = soup.select("div[class*='stat']")

    found: Dict[str, str] = {}
    for block in blocks:
        label_node = _pick(block, "label")
        value_node = _pick(block, "value")
        if not (label_node and value_node):
            continue

        label_text = label_node.get_text(" ", strip=True)
        value_text = value_node.get_text(" ", strip=True)
        if label_text and value_text:
            found[label_text] = value_text

    return found


def parse_singles_from_page_source(html: str) -> Dict:
    """
    Extract career singles figures from page HTML captured on the Singles tab.

    Strategy:
      1) read label/value stat blocks from the DOM;
      2) if that yields nothing, run regexes over the whitespace-normalised
         page text.

    Returns a dict with the keys Singles_Career_High_Rank (int or None),
    Singles_WL_Career (str or None) and Singles_Titles_Career (int or None).
    """
    soup = BeautifulSoup(html, "html.parser")

    rank_key = "Singles_Career_High_Rank"
    wl_key = "Singles_WL_Career"
    titles_key = "Singles_Titles_Career"
    result = {rank_key: None, wl_key: None, titles_key: None}

    # ---- Step 1: DOM label/value blocks ----
    label_to_field = {
        "Career High Rank": rank_key,
        "Career High Ranking": rank_key,
        "Career W-L": wl_key,
        "Career W/L": wl_key,
        "Career Win/Loss": wl_key,
        "Career Titles": titles_key,
    }
    for raw_label, raw_value in extract_label_value_stats(soup).items():
        field = label_to_field.get(raw_label.strip())
        if field is not None:
            result[field] = raw_value

    # Rank and titles are reduced to their first run of digits.
    for numeric_field in (rank_key, titles_key):
        if result[numeric_field] is None:
            continue
        digits = re.search(r"(\d+)", str(result[numeric_field]))
        result[numeric_field] = int(digits.group(1)) if digits else None

    # Any hit from the DOM pass is enough; skip the text fallback.
    if not all(value is None for value in result.values()):
        return result

    # ---- Step 2: regex fallback over flattened page text ----
    page_text = soup.get_text(" ", strip=True).replace("\xa0", " ")
    page_text = re.sub(r"\s+", " ", page_text)

    def _find(pattern):
        return re.search(pattern, page_text, flags=re.IGNORECASE)

    rank_match = _find(r"Career High Rank[: ]+(\d+)")
    result[rank_key] = int(rank_match.group(1)) if rank_match else None

    wl_match = _find(r"Career\s+W-?L[: ]+(\d+)\s*-\s*(\d+)")
    result[wl_key] = f"{wl_match.group(1)}-{wl_match.group(2)}" if wl_match else None

    titles_match = _find(r"Career\s+Titles[: ]+(\d+)")
    result[titles_key] = int(titles_match.group(1)) if titles_match else None

    return result


# =========================
# SCRAPE RANKINGS PAGE (DOUBLES)
# =========================
def scrape_player_urls(dateweek: str, tournament: str, year: str, n_rows: int = N_ROWS_DEFAULT) -> List[Dict]:
    """
    Load the doubles rankings page for `dateweek` and return one record per player.

    Each record holds the parsed ranking fields plus "Tourns", "Tournament",
    "Year" and "DateWeek". At most `n_rows` table rows are parsed. On a
    Selenium error the problem is printed and an empty list is returned.
    The browser is always closed before returning.
    """
    ranking_url = f"https://www.atptour.com/en/rankings/doubles?RankRange=1-1000&Region=all&DateWeek={dateweek}"
    driver = configure_driver()
    try:
        driver.get(ranking_url)
        random_sleep()

        table_ready = EC.presence_of_element_located((By.CSS_SELECTOR, "tbody"))
        WebDriverWait(driver, 10).until(table_ready)
        players = parse_rankings_table(driver.page_source, n_rows)

        # The "Tourns" column is read from the live DOM via Selenium.
        # NOTE(review): cells are paired with parsed rows by position — this
        # assumes the n-th 'tourns' cell belongs to the n-th parsed player; confirm.
        cells = driver.find_elements(By.XPATH, "//td[contains(@class,'tourns')]")
        tourns = [cell.text.strip() for cell in cells[:len(players)]]
        tourns += [""] * (len(players) - len(tourns))  # pad when cells are missing

        return [
            {**player, "Tourns": tourn, "Tournament": tournament, "Year": year, "DateWeek": dateweek}
            for player, tourn in zip(players, tourns)
        ]

    except (WebDriverException, TimeoutException) as e:
        print(f"[Rankings] Error for {dateweek} {tournament} {year}: {e}")
        return []
    finally:
        driver.quit()


# =========================
# SCRAPE PLAYER PROFILE (DOUBLES + SINGLES)
# =========================
def scrape_player_profile(profile_link: str) -> Dict:
    """
    Scrape one player profile, collecting doubles and singles information.

    - Doubles overview fields are stored with a ``Doubles_`` prefix.
    - Singles fields are stored as ``Singles_*`` (None when unavailable).
    - Profile details (DOB, Plays, Coach, etc.) keep their original labels.

    The page load is retried up to RETRIES times on Selenium errors. Tab
    switching is best-effort: a failure is logged and scraping continues, so
    one missing tab does not discard the rest of the profile.

    Args:
        profile_link: Absolute URL of the player's profile page.

    Returns:
        A dict of scraped fields, or an empty dict if every attempt failed.
    """
    singles_fields = ("Singles_Career_High_Rank", "Singles_WL_Career", "Singles_Titles_Career")

    driver = configure_driver()
    try:
        for attempt in range(1, RETRIES + 1):
            try:
                driver.get(profile_link)

                WebDriverWait(driver, 12).until(
                    EC.presence_of_all_elements_located(
                        (By.CSS_SELECTOR, "ul.pd_left, ul.pd_right, a.tab-switcher-link")
                    )
                )
                random_sleep(0.4, 1.0)

                data: Dict = {}

                # 1) DOUBLES first
                try:
                    click_tab(driver, "Doubles", timeout=12)
                except Exception as e:
                    # Keep going, but leave a trace: the overview parsed below
                    # comes from whichever tab is currently shown.
                    print(
                        f"[Profile] Could not activate Doubles tab "
                        f"({type(e).__name__}); Doubles_* values may reflect "
                        f"another tab: {profile_link}"
                    )

                soup = BeautifulSoup(driver.page_source, "html.parser")
                doubles_overview = parse_player_profile_overview(soup)
                doubles_details = parse_player_profile_details(soup)

                # Prefix the overview so singles values cannot overwrite it.
                data.update({f"Doubles_{k}": v for k, v in doubles_overview.items()})
                data.update(doubles_details)

                # 2) SINGLES
                singles_data = dict.fromkeys(singles_fields)  # all None by default
                try:
                    click_tab(driver, "Singles", timeout=12)
                    WebDriverWait(driver, 12).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "div.wins, div.titles, div[class*='stat']"))
                    )
                    singles_data = parse_singles_from_page_source(driver.page_source)
                except Exception as e:
                    print(
                        f"[Profile] Singles data unavailable "
                        f"({type(e).__name__}): {profile_link}"
                    )

                data.update(singles_data)
                return data

            except (TimeoutException, WebDriverException) as e:
                print(f"[Profile] Attempt {attempt}/{RETRIES} failed: {e}")
                random_sleep(1.5, 3.0)

        print(f"[Profile] Failed after {RETRIES} attempts: {profile_link}")
        return {}

    finally:
        driver.quit()


# =========================
# SAVE TO EXCEL
# =========================
def save_player_info_to_excel(
    records: List[Dict],
    dateweek: str,
    out_dir: str = OUTPUT_DIR,
    suffix: str = ""
) -> str:
    """
    Write the scraped records for one ranking week to an Excel workbook.

    Args:
        records: One dict per player; becomes one row each.
        dateweek: Ranking week, used in the file name.
        out_dir: Destination folder; created if it does not exist yet.
        suffix: Optional text appended to the file name before ".xlsx".

    Returns:
        The path of the written file, or "" when `records` is empty and
        nothing was written.
    """
    if not records:
        print(f"[Save] No data to save for {dateweek}.")
        return ""

    # A caller-supplied folder may not exist yet; only the default one is
    # created at import time. An empty out_dir means the current directory.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df_out = pd.DataFrame(records)
    fname = f"ranking_doubles_{dateweek}{suffix}.xlsx"
    out_path = os.path.join(out_dir, fname)

    df_out.to_excel(out_path, index=False)
    print(f"[Save] {len(df_out)} rows saved to: {out_path}")
    return out_path


# =========================
# FULL RUN
# =========================
def run_full_schedule(
    dateweeks_tournaments=DATEWEEKS_TOURNAMENTS,
    n_rows=N_ROWS_DEFAULT,
    test_run: bool = False
) -> None:
    """
    Scrape every scheduled ranking week and save one Excel file per week.

    For each (dateweek, tournament, year) entry the rankings page is scraped,
    each player's profile is fetched and merged into their ranking record,
    and the week's records are saved.

    Args:
        dateweeks_tournaments: Iterable of (dateweek, tournament, year) triples.
        n_rows: Maximum number of ranking rows to parse per week.
        test_run: When True, "_TEST" is appended to the output file name.

    A failure on a single profile is logged and that player keeps only the
    ranking-table fields, so the rest of the week is still scraped and saved.
    A failure at week level is logged and the next week is processed.
    """
    suffix = "_TEST" if test_run else ""

    for dateweek, tournament, year in dateweeks_tournaments:
        try:
            players = scrape_player_urls(dateweek, tournament, year, n_rows=n_rows)
            total = len(players)
            print(f"[Week] {dateweek} {tournament} {year}: found {total} players.")

            for idx, p in enumerate(players, start=1):
                link = p.get("Player Profile Link") or ""
                try:
                    if link:
                        p.update(scrape_player_profile(link))
                    print(f"    - ({idx}/{total}) {p.get('Player','?')} processed.")
                except Exception as e:
                    # Isolate the failure: one bad profile must not throw away
                    # every record already collected for this week.
                    print(
                        f"    - ({idx}/{total}) {p.get('Player','?')} profile failed, "
                        f"keeping ranking data only: {e}"
                    )
                random_sleep(0.5, 1.3)

            if players:
                save_player_info_to_excel(players, dateweek, suffix=suffix)

        except Exception as e:
            print(f"[Week] Unexpected error for {dateweek} {tournament} {year}: {e}")
            continue


# =========================
# SMOKE TEST (first schedule entry only, 10 players)
# =========================
test_subset = DATEWEEKS_TOURNAMENTS[:1]
run_full_schedule(dateweeks_tournaments=test_subset, n_rows=10, test_run=True)

# To scrape the complete schedule, run this instead:
# run_full_schedule(DATEWEEKS_TOURNAMENTS, n_rows=1000, test_run=False)


[Week] 2018-01-15 Australian Open 2018: found 10 players.
    - (1/10) L. Kubot processed.
    - (2/10) M. Melo processed.
    - (3/10) H. Kontinen processed.
    - (4/10) J. Peers processed.
    - (5/10) I. Dodig processed.
    - (6/10) N. Mahut processed.
    - (7/10) J. Rojer processed.
    - (8/10) H. Tecau processed.
    - (9/10) J. Murray processed.
    - (10/10) B. Soares processed.
[Save] 10 rows saved to: C:/Users/ALESSANDRO/Documents/GitHub/tennis-homophily/data/atp\ranking_doubles_2018-01-15_TEST.xlsx
