In [9]:
import csv
import os
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [10]:
# Name of the local user-profile folder; interpolated into the paths configured below.
user = 'aldi'

In [11]:
# Location of the ChromeDriver executable inside the user's Downloads folder.
chrome_driver_path = "C:/Users/{}/Downloads/chromedriver.exe".format(user)

In [12]:
# Folder that receives the scraped ATP data (the Excel export is written here).
output_folder = "C:/Users/{}/Documents/GitHub/tennis-homophily/data/atp".format(user)


In [13]:
# Doubles rankings landing page; loaded first to read the list of available ranking dates.
url = (
    "https://www.atptour.com/en/rankings/doubles"
    "?rankRange=1-5000"
)

In [14]:
def get_highest_dates(date_values):
    """Return, for each year-month present, the date with the largest day.

    Every entry of ``date_values`` is a string whose first seven characters
    form the year-month key and whose last two characters are the day number
    (for example ``"2023-07-31"``).

    Returns:
        A list with one ``"<year-month>-DD"`` string per distinct year-month,
        ordered by the first appearance of that year-month in the input.

    Raises:
        ValueError: if the last two characters of an entry are not an integer.
    """
    latest_day_by_month = {}
    for date_string in date_values:
        month_key, day_number = date_string[:7], int(date_string[-2:])
        best_so_far = latest_day_by_month.get(month_key)
        # Record the day on first sight of the month, or when it beats the current best.
        if best_so_far is None or best_so_far < day_number:
            latest_day_by_month[month_key] = day_number

    # Re-attach the zero-padded winning day to its year-month key.
    return ["{}-{:02d}".format(month_key, day_number)
            for month_key, day_number in latest_day_by_month.items()]

In [15]:
# Browser options shared by every Chrome session started in this notebook.
# Headless mode is intentionally left off; uncomment the next line to enable it.
options = Options()
#options.add_argument("--headless")

# "--verbose" and "--log-path" are ChromeDriver command-line flags, not Chrome
# browser switches, so they must be passed to the driver service. When they
# were added to ``options`` they went to the browser, which does not act on
# them, and no diagnostic log was produced.
service = Service(
    chrome_driver_path,
    service_args=["--verbose", "--log-path=chromedriver.log"],
)

In [16]:
# Seconds to wait after a page load so client-side rendering can finish.
PAGE_SETTLE_SECONDS = 5

# Column order of the exported sheet (the 'Date' column is appended afterwards).
OUTPUT_COLUMNS = [
    "Rank", "Player", "Age", "Points", "Tournaments Played", "Year Turned Pro",
    "Weight (kg)", "Height (cm)", "City of Birthplace", "Country of Birthplace",
    "Hand", "Backhand", "Coach1", "Coach2",
]

# Profile fields used when a ranking row has no profile link (nine blanks,
# matching the list returned by _parse_player_profile).
_BLANK_PROFILE = [""] * 9


def _cell_text(row, css_class):
    """Return the stripped text of the first <td class=css_class> in ``row``, or "" if absent."""
    cell = row.find("td", {"class": css_class})
    return cell.text.strip() if cell is not None else ""


def _parse_player_profile(page_html):
    """Extract the biography fields from a player profile page.

    Args:
        page_html: HTML source of the profile page.

    Returns:
        A list ``[year_pro, weight_kg, height_cm, city_birthplace,
        country_birthplace, hand, backhand, coach1, coach2]``. Any field whose
        element is missing from the page is returned as an empty string
        instead of raising.
    """
    soup_player = BeautifulSoup(page_html, "html.parser")
    table_values = soup_player.find_all("div", class_="table-value")

    def value_at(position):
        # The page exposes birthplace, plays and coach as the 1st/2nd/3rd
        # "table-value" divs; tolerate profiles that have fewer of them.
        if position < len(table_values):
            return table_values[position].get_text(strip=True)
        return ""

    # partition() leaves the space that follows the comma on the country part,
    # so both halves are stripped.
    city_birthplace, _, country_birthplace = value_at(0).partition(",")

    year_pro = ""
    # ``string=`` is the current name of the deprecated ``text=`` argument.
    turned_pro_label_div = soup_player.find("div", class_="table-big-label", string="Turned Pro")
    if turned_pro_label_div is not None:
        year_pro_div = turned_pro_label_div.find_next_sibling("div", class_="table-big-value")
        if year_pro_div is not None:
            year_pro = year_pro_div.get_text(strip=True)

    weight_span = soup_player.select_one("span.table-weight-kg-wrapper")
    weight_kg = weight_span.get_text(strip=True).strip("()") if weight_span else ""

    height_span = soup_player.select_one("span.table-height-cm-wrapper")
    height_cm = height_span.get_text(strip=True).strip("()") if height_span else ""

    # partition() never fails to unpack, unlike split(",") when the value is
    # empty or contains more than one comma.
    hand, _, backhand = value_at(1).partition(",")

    coaches = value_at(2).split(", ")
    coach1 = coaches[0]
    coach2 = coaches[1] if len(coaches) > 1 else ""

    return [
        year_pro, weight_kg, height_cm,
        city_birthplace.strip(), country_birthplace.strip(),
        hand.strip(), backhand.strip(),
        coach1, coach2,
    ]


all_data = []
dates = []  # Ranking date for each row of all_data (parallel list)
profile_cache = {}  # profile link -> parsed fields; players recur across ranking dates
scrape_completed = False

try:
    # A single browser session serves every page. Ranking pages are parsed
    # from a page_source snapshot, so navigating on to profile pages afterwards
    # does not invalidate the rows being iterated.
    with webdriver.Chrome(service=service, options=options) as driver:
        driver.get(url)
        driver.minimize_window()
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "mega-table")))
        date_dropdown_ul = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul[data-value='rankDate']"))
        )

        # Keep only ranking dates whose year is after 2013.
        date_values = []
        for option in date_dropdown_ul.find_elements(By.TAG_NAME, "li"):
            date_value = option.get_attribute("data-value")
            if date_value is None:
                continue
            try:
                year = int(date_value[:4])
            except ValueError:
                continue
            if year > 2013:
                date_values.append(date_value)

        filtered_dates = get_highest_dates(date_values)
        print("Filtered Dates:", filtered_dates)

        # For a quick test run, restrict to one date: filtered_dates = [filtered_dates[-1]]

        for date in filtered_dates:
            complete_url = f"https://www.atptour.com/en/rankings/doubles?rankRange=1-5000&rankDate={date}"
            print("Accessing URL for date:", date)
            driver.get(complete_url)
            try:
                # Wait for the table to be present
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "mega-table")))
            except TimeoutException:
                # One slow or blocked page must not abort the whole run.
                print(f"Timed out waiting for the rankings table for date: {date}")
                continue
            time.sleep(PAGE_SETTLE_SECONDS)

            soup = BeautifulSoup(driver.page_source, "html.parser")
            rankings_table = soup.find("table", {"class": "mega-table"})
            if rankings_table is None:
                print(f"Table not found for date: {date}")
                continue  # Skip to the next date if the table is not found

            # Index 0 is skipped; the slice keeps at most 149 rows.
            rows = rankings_table.find_all("tr")[1:150]

            for index, row in enumerate(rows, start=1):
                rank = _cell_text(row, "rank-cell")
                player_name = _cell_text(row, "player-cell")
                age = _cell_text(row, "age-cell")
                points = _cell_text(row, "points-cell")
                tournaments_played = _cell_text(row, "tourn-cell")
                link_tag = row.find("a", href=True)
                player_profile_link = link_tag["href"] if link_tag is not None else None
                print(f"Processing row {index}: Rank {rank}, Player {player_name}")

                if player_profile_link is None:
                    profile_fields = _BLANK_PROFILE
                else:
                    if player_profile_link not in profile_cache:
                        # The profile page is not date-specific, so one fetch
                        # per player is enough for all ranking dates.
                        driver.get(f"https://www.atptour.com{player_profile_link}")
                        time.sleep(PAGE_SETTLE_SECONDS)
                        profile_cache[player_profile_link] = _parse_player_profile(driver.page_source)
                    profile_fields = profile_cache[player_profile_link]

                all_data.append([rank, player_name, age, points, tournaments_played] + list(profile_fields))
                dates.append(date)  # Append date for each row

    scrape_completed = True

finally:
    try:
        # Persist whatever was collected even if the run was interrupted. An
        # incomplete run goes to a separate file so it never overwrites a
        # previously exported complete data set.
        if all_data or scrape_completed:
            df = pd.DataFrame(all_data, columns=OUTPUT_COLUMNS)
            df['Date'] = dates  # Add 'Date' column to the DataFrame

            output_name = "atp_doubles.xlsx" if scrape_completed else "atp_doubles_partial.xlsx"
            output_excel_filename = os.path.join(output_folder, output_name)
            df.to_excel(output_excel_filename, index=False)

            if scrape_completed:
                print(f"All data saved to {output_excel_filename}")
            else:
                print(f"Run interrupted; partial data saved to {output_excel_filename}")
    finally:
        service.stop()


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF67E681F52+60322]
	(No symbol) [0x00007FF67E5FCEC9]
	(No symbol) [0x00007FF67E4B7EBA]
	(No symbol) [0x00007FF67E507676]
	(No symbol) [0x00007FF67E50773C]
	(No symbol) [0x00007FF67E54E967]
	(No symbol) [0x00007FF67E52C25F]
	(No symbol) [0x00007FF67E54BC80]
	(No symbol) [0x00007FF67E52BFC3]
	(No symbol) [0x00007FF67E4F9617]
	(No symbol) [0x00007FF67E4FA211]
	GetHandleVerifier [0x00007FF67E9994AD+3301629]
	GetHandleVerifier [0x00007FF67E9E36D3+3605283]
	GetHandleVerifier [0x00007FF67E9D9450+3563680]
	GetHandleVerifier [0x00007FF67E734326+790390]
	(No symbol) [0x00007FF67E60750F]
	(No symbol) [0x00007FF67E603404]
	(No symbol) [0x00007FF67E603592]
	(No symbol) [0x00007FF67E5F2F9F]
	BaseThreadInitThunk [0x00007FFCFEA87344+20]
	RtlUserThreadStart [0x00007FFCFF5A26B1+33]


In [None]:
# Shut down the operating system after 360 seconds (6 minutes).
# WARNING: running this cell powers off the machine. The command uses the
# Windows `shutdown` syntax (/s = shut down, /t = delay in seconds).
os.system("shutdown /s /t 360")
