In [40]:
import os
import csv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pandas as pd


In [41]:
# Windows account name; interpolated into the ChromeDriver and output-folder paths
# defined in the following cells.
# NOTE(review): machine-specific value — change it (or derive it from the environment)
# before running on another computer.
user = "aldi"

In [42]:
# Absolute path to the ChromeDriver executable in the user's Downloads folder.
# It is handed to selenium's Service(...) when the browser service is configured.
# NOTE(review): hard-coded Windows path; the file must exist at this location.
chrome_driver_path = f"C:/Users/{user}/Downloads/chromedriver.exe"

In [43]:
# Folder that receives the scraped rankings workbook ("atp_doubles.xlsx").
# NOTE(review): hard-coded absolute path inside a local GitHub checkout — adjust per machine.
output_folder = f"C:/Users/{user}/Documents/GitHub/tennis-homophily/data/atp"


In [44]:
# ATP doubles rankings page (rank range 1-5000). It is loaded first so the list of
# available ranking dates can be read from the page's date dropdown.
url = "https://www.atptour.com/en/rankings/doubles?rankRange=1-5000"

In [45]:
# Execute the helper notebook so that its definitions are available in this kernel.
# Presumably this is where get_highest_dates (used by the scraping cell) comes from —
# TODO confirm against functions.ipynb.
%run functions.ipynb

In [55]:
# Browser options shared by the Chrome sessions started in the scraping cell.
options = Options()
# Uncomment the next line to run Chrome without a visible window.
#options.add_argument("--headless")

# Diagnostics: `--verbose` and `--log-path` are flags of the ChromeDriver executable,
# not Chrome browser switches, so they are passed to the driver Service instead of
# ChromeOptions. The log is written to chromedriver.log in the working directory.
service = Service(
    chrome_driver_path,
    service_args=["--verbose", "--log-path=chromedriver.log"],
)

In [56]:
# Scrape the ATP doubles rankings for a set of ranking dates, enrich every ranked
# player with details from their profile page, and write the result to one Excel file.
#
# Inputs taken from earlier cells: `service`, `options`, `url`, `output_folder`
# and `get_highest_dates`.
# Names left in the kernel afterwards: `all_data`, `dates`, `filtered_dates`,
# `output_excel_filename` and (when anything was saved) `df`.

WAIT_TIMEOUT_SECONDS = 10   # upper bound when waiting for the rankings table / date dropdown
PAGE_SETTLE_SECONDS = 5     # fixed pause after a page load before its HTML is read
MAX_RANK_ROWS = 24          # number of ranking rows processed per date (header row excluded)
MIN_YEAR_EXCLUSIVE = 2015   # only ranking dates whose year is greater than this are kept
ATP_BASE_URL = "https://www.atptour.com"
OUTPUT_COLUMNS = [
    "Rank", "Player", "Age", "Points", "Tournaments Played", "Year Turned Pro",
    "Weight (kg)", "Height (cm)", "City of Birthplace", "Country of Birthplace",
    "Hand", "Backhand", "Coach1", "Coach2",
]


def _cell_text(row, css_class):
    """Return the stripped text of the first ``<td>`` with ``css_class`` in ``row``.

    Returns "" when the row has no such cell, so an unexpected row layout yields an
    empty field instead of an AttributeError.
    """
    cell = row.find("td", {"class": css_class})
    return cell.text.strip() if cell is not None else ""


def _parse_player_profile(soup_player):
    """Extract the profile fields from a parsed player page.

    Args:
        soup_player: BeautifulSoup document of a player's profile page.

    Returns:
        list[str]: ``[year_pro, weight_kg, height_cm, city_birthplace,
        country_birthplace, hand, backhand, coach1, coach2]``. A field whose markup
        is missing is returned as "" rather than raising.
    """
    # The page is read positionally: the 1st/2nd/3rd "table-value" divs are taken to be
    # birthplace, "hand, backhand" and coach(es) respectively.
    table_values = soup_player.find_all("div", class_="table-value")

    def value_text(position):
        if position < len(table_values):
            return table_values[position].get_text(strip=True)
        return ""

    # "City, Country" -> ("City", "Country"); strip removes the space left after the comma.
    city_birthplace, _, country_birthplace = (
        part.strip() for part in value_text(0).partition(",")
    )

    # `string=` replaces the deprecated `text=` keyword; the callable also tolerates
    # surrounding whitespace in the label.
    turned_pro_label_div = soup_player.find(
        "div",
        class_="table-big-label",
        string=lambda s: s is not None and s.strip() == "Turned Pro",
    )
    year_pro_div = (
        turned_pro_label_div.find_next_sibling("div", class_="table-big-value")
        if turned_pro_label_div is not None
        else None
    )
    year_pro = year_pro_div.get_text(strip=True) if year_pro_div is not None else ""

    weight_span = soup_player.select_one("span.table-weight-kg-wrapper")
    weight_kg = weight_span.get_text(strip=True).strip("()") if weight_span else ""

    height_span = soup_player.select_one("span.table-height-cm-wrapper")
    height_cm = height_span.get_text(strip=True).strip("()") if height_span else ""

    # partition (instead of split + 2-name unpacking) cannot raise when the text has
    # zero or several commas.
    hand, _, backhand = (part.strip() for part in value_text(1).partition(","))

    coaches = value_text(2).split(", ")
    coach1 = coaches[0]
    coach2 = coaches[1] if len(coaches) > 1 else ""

    return [year_pro, weight_kg, height_cm, city_birthplace, country_birthplace,
            hand, backhand, coach1, coach2]


def _fetch_player_profile(driver, profile_link, cache):
    """Return the parsed profile fields for ``profile_link``, visiting the page at most once.

    Args:
        driver: live selenium WebDriver used to load the page.
        profile_link: site-relative href of the player's profile.
        cache: dict mapping profile links to already parsed field lists; updated in place.

    Returns:
        list[str]: see ``_parse_player_profile``.
    """
    if profile_link in cache:
        return cache[profile_link]

    driver.get(f"{ATP_BASE_URL}{profile_link}")
    time.sleep(PAGE_SETTLE_SECONDS)
    profile_fields = _parse_player_profile(BeautifulSoup(driver.page_source, "html.parser"))

    # Only remember pages that yielded something, so a page that failed to render
    # is retried the next time the player appears.
    if any(profile_fields):
        cache[profile_link] = profile_fields
    return profile_fields


all_data = []
dates = []  # ranking date of each entry in all_data (parallel list)
profile_cache = {}  # profile link -> parsed profile fields
scrape_completed = False
output_excel_filename = os.path.join(output_folder, "atp_doubles.xlsx")

try:
    # One browser session serves every page. Starting a new Chrome per date and per
    # player is slow, and opening a second driver on the same Service object while
    # another one is still live is fragile.
    with webdriver.Chrome(service=service, options=options) as driver:
        driver.get(url)
        driver.minimize_window()
        WebDriverWait(driver, WAIT_TIMEOUT_SECONDS).until(
            EC.presence_of_element_located((By.CLASS_NAME, "mega-table"))
        )
        date_dropdown_ul = WebDriverWait(driver, WAIT_TIMEOUT_SECONDS).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul[data-value='rankDate']"))
        )

        # Collect the ranking dates offered by the dropdown, skipping entries without
        # a value or whose first four characters are not a year.
        date_values = []
        for option in date_dropdown_ul.find_elements(By.TAG_NAME, "li"):
            date_value = option.get_attribute("data-value")
            if date_value is None:
                continue
            try:
                year = int(date_value[:4])
            except ValueError:
                continue
            if year > MIN_YEAR_EXCLUSIVE:
                date_values.append(date_value)

        filtered_dates = get_highest_dates(date_values)
        print("Filtered Dates:", filtered_dates)

        for date in filtered_dates:
            complete_url = f"https://www.atptour.com/en/rankings/doubles?rankRange=1-5000&rankDate={date}"
            print("Accessing URL for date:", date)
            driver.get(complete_url)
            WebDriverWait(driver, WAIT_TIMEOUT_SECONDS).until(
                EC.presence_of_element_located((By.CLASS_NAME, "mega-table"))
            )
            time.sleep(PAGE_SETTLE_SECONDS)

            # Parse a snapshot of the rankings page; the browser is then free to
            # navigate to profile pages while the rows are processed.
            soup = BeautifulSoup(driver.page_source, "html.parser")
            rankings_table = soup.find("table", {"class": "mega-table"})
            if rankings_table is None:
                print(f"Table not found for date: {date}")
                continue  # Skip to the next date if the table is not found

            rows = rankings_table.find_all("tr")[1:MAX_RANK_ROWS + 1]  # [0] is the header row

            for index, row in enumerate(rows, start=1):
                profile_anchor = row.find("a", href=True)
                if profile_anchor is None:
                    print(f"Skipping row {index} for date {date}: no player profile link")
                    continue

                rank = _cell_text(row, "rank-cell")
                player_name = _cell_text(row, "player-cell")
                age = _cell_text(row, "age-cell")
                points = _cell_text(row, "points-cell")
                tournaments_played = _cell_text(row, "tourn-cell")
                print(f"Processing row {index}: Rank {rank}, Player {player_name}")

                profile_fields = _fetch_player_profile(driver, profile_anchor["href"], profile_cache)

                all_data.append([rank, player_name, age, points, tournaments_played, *profile_fields])
                dates.append(date)  # keep dates aligned with all_data

    scrape_completed = True

finally:
    try:
        # Save whatever was collected, even if the run failed or was interrupted,
        # but never overwrite an existing file with nothing after an early failure.
        if scrape_completed or all_data:
            row_count = min(len(all_data), len(dates))  # guards against an interrupt between the two appends
            df = pd.DataFrame(all_data[:row_count], columns=OUTPUT_COLUMNS)
            df['Date'] = dates[:row_count]  # Add 'Date' column to the DataFrame

            os.makedirs(output_folder, exist_ok=True)
            df.to_excel(output_excel_filename, index=False)

            if scrape_completed:
                print(f"All data saved to {output_excel_filename}")
            else:
                print(f"Run did not finish; partial data ({row_count} rows) saved to {output_excel_filename}")
    finally:
        service.stop()


Filtered Dates: ['2023-12-11', '2023-11-27', '2023-10-30', '2023-09-25', '2023-08-28', '2023-07-31', '2023-06-26', '2023-05-29', '2023-04-24', '2023-03-20', '2023-02-27', '2023-01-30', '2022-12-26', '2022-11-28', '2022-10-31', '2022-09-26', '2022-08-29', '2022-07-25', '2022-06-27', '2022-05-23', '2022-04-25', '2022-03-21', '2022-02-28', '2022-01-31', '2021-12-27', '2021-11-29', '2021-10-25', '2021-09-27', '2021-08-30', '2021-07-26', '2021-06-28', '2021-05-31', '2021-04-26', '2021-03-22', '2021-02-22', '2021-01-25', '2020-12-28', '2020-11-30', '2020-10-26', '2020-09-28', '2020-08-31', '2020-03-16', '2020-02-24', '2020-01-20', '2019-12-30', '2019-11-25', '2019-10-28', '2019-09-30', '2019-08-26', '2019-07-29', '2019-06-24', '2019-05-27', '2019-04-29', '2019-03-18', '2019-02-25', '2019-01-28', '2018-12-31', '2018-11-26', '2018-10-29', '2018-09-24', '2018-08-27', '2018-07-30', '2018-06-25', '2018-05-28', '2018-04-30', '2018-03-19', '2018-02-26', '2018-01-29', '2017-12-25', '2017-11-27', '20

KeyboardInterrupt: 