In [1]:
import os
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time


In [2]:
user = "aldi"

In [3]:
# Path to the ChromeDriver
chrome_driver_path = f"C:/Users/{user}/Downloads/chromedriver.exe"

In [4]:
output_folder = f"C:/Users/{user}/Documents/GitHub/tennis-homophily/data/atp"


In [5]:
%run functions.ipynb

In [6]:
# Use the Service object for ChromeDriver
service = Service(chrome_driver_path)

try:
    # Initialize the list of dates to be scraped
    date_values = []

    # Open the first part to get the list of dates
    with webdriver.Chrome(service=service) as driver1:
        # Open the URL and wait for the content to load
        driver1.get("https://www.atptour.com/en/rankings/doubles?rankRange=1-5000")
        driver1.maximize_window()
        WebDriverWait(driver1, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "mega-table")))

        # Find the date dropdown menu and retrieve the date options
        date_dropdown_ul = WebDriverWait(driver1, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul[data-value='rankDate']"))
        )
        date_options = date_dropdown_ul.find_elements(By.TAG_NAME, "li")
        
        # Extract date values from the dropdown and store them in a list
#         date_values = [option.get_attribute("data-value") for option in date_options]

        # Extract date values from the dropdown and store them in a list - applying year filter
        for option in date_options:
            date_value = option.get_attribute("data-value")
            if date_value is None:
                continue
            try:
                year = int(date_value[:4])
            except ValueError:
                continue
            if year > 2022:
                date_values.append(date_value)
                
    # Filter the dates to keep only the highest number for each pair of year and month
    filtered_dates = get_highest_dates(date_values)


    print("List of Dates:")
    for date in filtered_dates :
        print(date)

    # Now, loop through different dates and scrape the data
    for date in filtered_dates :
        # Form the complete URL with the selected date
        complete_url = f"https://www.atptour.com/en/rankings/doubles?rankRange=1-5000&rankDate={date}"

        # Open a new WebDriver for each date
        with webdriver.Chrome(service=service) as driver2:
            # Open the URL and wait for the content to load
            driver2.get(complete_url)
            time.sleep(5)  # Wait for 5 seconds for dynamic content to load, adjust as needed

            # Get the page source and create BeautifulSoup object
            page_source = driver2.page_source
            soup = BeautifulSoup(page_source, "html.parser")

            # Continue with extracting data as before
            rankings_table = soup.find("table", {"class": "mega-table"})
            rows = rankings_table.find_all("tr")[1:]  # Skip the header row

            # Create a CSV file for each date and save the data
            filename = os.path.join(output_folder, f"rankings_data_{date}.csv")
            with open(filename, mode="w", newline="", encoding="utf-8") as file:
                writer = csv.writer(file)
                writer.writerow(["Rank", "Player", "Age", "Points", "Tournaments Played"])

                for row in rows:
                    rank = row.find("td", {"class": "rank-cell"}).text.strip()
                    player_name = row.find("td", {"class": "player-cell"}).text.strip()
                    age = row.find("td", {"class": "age-cell"}).text.strip()
                    points = row.find("td", {"class": "points-cell"}).text.strip()
                    tournaments_played = row.find("td", {"class": "tourn-cell"}).text.strip()

                    writer.writerow([rank, player_name, age, points, tournaments_played])

            print(f"Rank Range: 1-5000, Rank Date: {date} - Data saved to {filename}")
            print()

finally:
    # Don't forget to stop the service once you are done.
    service.stop()


List of Dates:
2023-12-04
2023-11-27
2023-10-30
2023-09-25
2023-08-28
2023-07-31
2023-06-26
2023-05-29
2023-04-24
2023-03-20
2023-02-27
2023-01-30
2022-12-26
2022-11-28
2022-10-31
2022-09-26
2022-08-29
2022-07-25
2022-06-27
2022-05-23
2022-04-25
2022-03-21
2022-02-28
2022-01-31
2021-12-27
2021-11-29
2021-10-25
2021-09-27
2021-08-30
2021-07-26
2021-06-28
2021-05-31
2021-04-26
2021-03-22
2021-02-22
2021-01-25
Rank Range: 1-5000, Rank Date: 2023-12-04 - Data saved to C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp\rankings_data_2023-12-04.csv

Rank Range: 1-5000, Rank Date: 2023-11-27 - Data saved to C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp\rankings_data_2023-11-27.csv

Rank Range: 1-5000, Rank Date: 2023-10-30 - Data saved to C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp\rankings_data_2023-10-30.csv

Rank Range: 1-5000, Rank Date: 2023-09-25 - Data saved to C:/Users/aldi/Documents/GitHub/tennis-homophily/data/atp\rankings_data_2023-09-25.csv

Rank Rang

KeyboardInterrupt: 

In [10]:

# Initialize Chrome WebDriver
driver = webdriver.Chrome(executable_path=chrome_driver_path)

try:
    url = "https://www.atptour.com/en/players/wesley-koolhof/kc41/overview"

    # Open the URL using Selenium
    driver.get(url)

    # Get the page source after the dynamic content has loaded
    page_source = driver.page_source

    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(page_source, "html.parser")

    # Find the player profile hero table
    player_hero_table = soup.find("div", class_="player-profile-hero-table")

    # Find the div with class "table-big-label" for "Turned Pro"
    turned_pro_label_div = player_hero_table.find("div", class_="table-big-label", text="Turned Pro")

    # Extract the "Turned Pro" information
    year_pro_div = turned_pro_label_div.find_next_sibling("div", class_="table-big-value")
    year_pro = year_pro_div.get_text(strip=True)
    
    # Find the span with class "table-weight-kg-wrapper" for weight in kg
    weight_span = player_hero_table.select_one("span.table-weight-kg-wrapper")

    # Extract weight in kg information
    weight_kg = ""
    if weight_span:
        weight_kg = weight_span.get_text(strip=True).strip("()")

    # Find the span with class "table-height-cm-wrapper" for height in cm
    height_span = player_hero_table.select_one("span.table-height-cm-wrapper")

    # Extract height in cm information
    height_cm = ""
    if height_span:
        height_cm = height_span.get_text(strip=True).strip("()")

    # Find the player birthplace div with class "table-value"
    birthplace_div = soup.find("div", class_="table-value")

    # Extract birthplace information
    birthplace = birthplace_div.get_text(strip=True)

    # Split the birthplace into city and country
    city_birthplace, country_birthplace = birthplace.split(",")

    # Remove any leading or trailing whitespaces from city and country
    city_birthplace = city_birthplace.strip()
    country_birthplace = country_birthplace.strip()

    # Find the second "div" with class "table-value" for hand and backhand
    hand_backhand_div = player_hero_table.find_all("div", class_="table-value")[1]
    
    # Extract hand and backhand information if available
    hand, backhand = "", ""
    if hand_backhand_div:
        hand, backhand = [item.strip() for item in hand_backhand_div.get_text(strip=True).split(",")]

    # Find the third "div" with class "table-value" for coach information
    coach_div = player_hero_table.find_all("div", class_="table-value")[2]

    # Extract coach information if available
    coaches = coach_div.get_text(strip=True).split(", ")

    # Print the extracted information
    print(f"Year Turned Pro: {year_pro}")
    print(f"Weight (kg): {weight_kg}")
    print(f"Height (cm): {height_cm}")
    print(f"City of Birthplace: {city_birthplace}")
    print(f"Country of Birthplace: {country_birthplace}")
    print(f"Hand: {hand}")
    print(f"Backhand: {backhand}")
    print("Coaches:")
    for idx, coach in enumerate(coaches, 1):
        print(f"Coach {idx}: {coach}")

finally:
    # Don't forget to close the WebDriver once you are done.
    driver.quit()


  driver = webdriver.Chrome(executable_path=chrome_driver_path)


Year Turned Pro: 2008
Weight (kg): 78kg
Height (cm): 180cm
City of Birthplace: Zevenaar
Country of Birthplace: Netherlands
Hand: Right-Handed
Backhand: Two-Handed Backhand
Coaches:
Coach 1: Rob Morgan
Coach 2: Mariusz Fyrstenberg


combine the two chunks above in order to get all the information for each player in the rank table. (Test the first 10 players of the last 2023 tournament)

In [13]:


# Use the Service object for ChromeDriver
service = Service(chrome_driver_path)

try:
    # Initialize the list of dates to be scraped
    date_values = []

    # Open the first part to get the list of dates
    with webdriver.Chrome(service=service) as driver1:
        # Open the URL and wait for the content to load
        driver1.get("https://www.atptour.com/en/rankings/doubles?rankRange=1-5000")
        WebDriverWait(driver1, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "mega-table")))

        # Find the date dropdown menu and retrieve the date options
        date_dropdown_ul = WebDriverWait(driver1, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ul[data-value='rankDate']"))
        )
        date_options = date_dropdown_ul.find_elements(By.TAG_NAME, "li")

        # Extract date values from the dropdown and store them in a list - applying year filter
        for option in date_options:
            date_value = option.get_attribute("data-value")
            if date_value is None:
                continue
            try:
                year = int(date_value[:4])
            except ValueError:
                continue
            if year == 2023:
                date_values.append(date_value)

    # Filter the dates to keep only the highest number for each pair of year and month
    filtered_dates = get_highest_dates(date_values)

    # Use only one date for testing purposes
    filtered_dates = [filtered_dates[-1]]

    # Now, loop through different dates and scrape the data
    for date in filtered_dates:
        # Create a CSV file for each date and save the data
        output_folder = "C:\\Users\\ALESSANDRO\\Documents\\GitHub\\tennis-homophily\\data\\atp"
        filename = os.path.join(output_folder, f"rankings_data_{date}.csv")
        with open(filename, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(["Rank", "Player", "Age", "Points", "Tournaments Played", "Year Turned Pro", "Weight (kg)", "Height (cm)", "City of Birthplace", "Country of Birthplace", "Hand", "Backhand", "Coach1", "Coach2"])

            # Open a new WebDriver for each date
            with webdriver.Chrome(service=service) as driver2:
                # Form the complete URL with the selected date
                complete_url = f"https://www.atptour.com/en/rankings/doubles?rankRange=1-5000&rankDate={date}"

                # Open the URL and wait for the content to load
                driver2.get(complete_url)
                time.sleep(5)  # Wait for 5 seconds for dynamic content to load, adjust as needed

                # Get the page source and create BeautifulSoup object
                page_source = driver2.page_source
                soup = BeautifulSoup(page_source, "html.parser")

                # Continue with extracting data as before
                rankings_table = soup.find("table", {"class": "mega-table"})
#                 rows = rankings_table.find_all("tr")[1:]  # Skip the header row
                rows = rankings_table.find_all("tr")[1:11]  # Process only the first 10 rows


                for row in rows:
                    rank = row.find("td", {"class": "rank-cell"}).text.strip()
                    player_name = row.find("td", {"class": "player-cell"}).text.strip()
                    age = row.find("td", {"class": "age-cell"}).text.strip()
                    points = row.find("td", {"class": "points-cell"}).text.strip()
                    tournaments_played = row.find("td", {"class": "tourn-cell"}).text.strip()

                    # Extract player profile link
                    player_profile_link = row.find("a", href=True)["href"]

                    # Open the player profile URL
                    with webdriver.Chrome(service=service) as driver3:
                        driver3.get(f"https://www.atptour.com{player_profile_link}")
                        time.sleep(5)  # Wait for 5 seconds for dynamic content to load, adjust as needed

                        # Get the player profile page source and create BeautifulSoup object
                        player_page_source = driver3.page_source
                        soup_player = BeautifulSoup(player_page_source, "html.parser")

                        # Continue with extracting player information
                        birthplace_div = soup_player.find("div", class_="table-value")
                        birthplace = birthplace_div.get_text(strip=True)
                        city_birthplace, _, country_birthplace = birthplace.partition(",")

                        turned_pro_label_div = soup_player.find("div", class_="table-big-label", text="Turned Pro")
                        year_pro_div = turned_pro_label_div.find_next_sibling("div", class_="table-big-value")
                        year_pro = year_pro_div.get_text(strip=True)

                        weight_span = soup_player.select_one("span.table-weight-kg-wrapper")
                        weight_kg = ""
                        if weight_span:
                            weight_kg = weight_span.get_text(strip=True).strip("()")

                        height_span = soup_player.select_one("span.table-height-cm-wrapper")
                        height_cm = ""
                        if height_span:
                            height_cm = height_span.get_text(strip=True).strip("()")

                        # Find the div with class "table-value" for hand and backhand
                        hand_backhand_div = soup_player.find_all("div", class_="table-value")[1]

                        # Extract hand and backhand information if available
                        hand, backhand = "", ""
                        if hand_backhand_div:
                            hand, backhand = [item.strip() for item in hand_backhand_div.get_text(strip=True).split(",")]

                        # Find the div with class "table-value" for coach information
                        coach_div = soup_player.find_all("div", class_="table-value")[2]

                        # Extract coach information if available
                        coaches = coach_div.get_text(strip=True).split(", ")
                        coach1 = coaches[0]
                        coach2 = "" if len(coaches) < 2 else coaches[1]


                        writer.writerow([rank, player_name, age, points, tournaments_played, year_pro, weight_kg, height_cm, city_birthplace, country_birthplace, hand, backhand, coach1, coach2])

        print(f"Rank Range: 1-5000, Rank Date: {date} - Data saved to {filename}")
        print()

finally:
    # Don't forget to stop the Service once you are done.
    service.stop()


Rank Range: 1-5000, Rank Date: 2023-01-30 - Data saved to C:\Users\ALESSANDRO\Documents\GitHub\tennis-homophily\data\atp\rankings_data_2023-01-30.csv

