In [27]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Function to retrieve th and td elements from the table using Selenium
def retrieve_table_data_selenium(player_url, *, timeout=10, headless=True):
    """Load a player's profile tab in Chrome and return its table as a dict.

    "&tab=profile" is appended to ``player_url``, the page is opened in a
    fresh Chrome session, and the first element matching
    ``table.table.table-condensed.text-nowrap`` is read. Header cells
    (``th``) become keys and data cells (``td``) become values.

    Args:
        player_url: Player page URL. It must already contain a query string,
            because the profile tab is selected by appending "&tab=profile".
        timeout: Seconds to wait for the table to appear in the DOM.
        headless: Run Chrome without a window when true (the default).

    Returns:
        A dict mapping stripped header text to stripped cell text, or None
        if loading the page or reading the table failed (the error is
        printed).
    """
    chrome_service = ChromeService("C:/Users/ALESSANDRO/Downloads/chromedriver.exe")  # Replace with the path to your ChromeDriver executable
    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")

    # webdriver.Chrome starts the service itself, so it must not be started
    # here as well: a second start would launch another chromedriver process.
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

    # Append "&tab=profile" to the player's URL
    player_url += "&tab=profile"
    table_selector = "table.table.table-condensed.text-nowrap"

    try:
        # Navigation happens inside the try block so that the browser is
        # always closed by the finally clause, even if the page fails to load.
        driver.get(player_url)

        # Wait until the table is present in the DOM; until() returns the
        # located element, so no second lookup is needed.
        table = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, table_selector))
        )

        # Pair headers with cells row by row, so a row with unequal th/td
        # counts cannot shift the pairing of every row after it.
        player_data = {}
        for table_row in table.find_elements(By.TAG_NAME, "tr"):
            headers = table_row.find_elements(By.TAG_NAME, "th")
            cells = table_row.find_elements(By.TAG_NAME, "td")
            for header, cell in zip(headers, cells):
                player_data[header.text.strip()] = cell.text.strip()

        if not player_data:
            # No row holds both a th and a td (e.g. headers live in their own
            # row): fall back to pairing all headers with all cells in order.
            th_values = [th.text.strip() for th in table.find_elements(By.TAG_NAME, "th")]
            td_values = [td.text.strip() for td in table.find_elements(By.TAG_NAME, "td")]
            player_data = dict(zip(th_values, td_values))

        return player_data
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip
        # this player instead of aborting the whole run.
        print(f"Error retrieving table data for URL {player_url}: {str(e)}")
        return None
    finally:
        # Close the Selenium WebDriver
        driver.quit()

# URL of the webpage containing the table
url = "https://www.ultimatetennisstatistics.com/tournamentEvent?tournamentEventId=4626&tab=results"

# Player links in the results table are site-relative; they are appended to this root.
SITE_ROOT = "https://www.ultimatetennisstatistics.com"

# After this many seconds the loop saves a partial file and stops.
TIME_LIMIT_SECONDS = 60

# Stage of a match, keyed by the 1-based position of its <tr> in the results
# table. Any row not listed here is treated as a round-robin match.
STAGE_BY_ROW = {
    **dict.fromkeys(
        (3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, 52, 59, 60),
        "quarterfinal",
    ),
    **dict.fromkeys((5, 6, 13, 14, 21, 22, 29, 30), "semifinal"),
    **dict.fromkeys((7, 8, 15, 16), "final"),
}


def _split_tiebreak(score):
    """Split score text such as "7(5)" into ("7", 5).

    Text without a parenthesis is returned unchanged together with None.
    """
    if "(" not in score:
        return score, None
    parts = score.split("(")
    return parts[0].strip(), int(parts[1].split(")")[0])


def _build_player_record(row, row_index, profile_cache):
    """Build the output record for one results-table row.

    Returns None when the row has no player cell, no usable player link, or
    when the player's profile could not be retrieved. Profiles that were
    retrieved successfully are stored in ``profile_cache`` keyed by href, so
    a player appearing in several rows is fetched with Selenium only once.
    """
    player_cell = row.find("td", {"class": "player"})
    if player_cell is None:
        return None

    # Find the player's href link (first anchor in the row)
    player_link = row.find("a")
    href = player_link.get("href") if player_link is not None else None
    if not href:
        return None

    profile = profile_cache.get(href)
    if profile is None:
        profile = retrieve_table_data_selenium(SITE_ROOT + href)
        if profile is None:
            # Failures are not cached, so a later row for the same player
            # triggers another attempt.
            return None
        profile_cache[href] = profile

    # Nationality is the title attribute of the img tag, when there is one
    flag_img = player_cell.find("img")
    nationality = flag_img.get("title") if flag_img is not None else None

    # Always rebuilt per row; an empty list when the row has no score cells
    scores = [cell.text.strip() for cell in row.find_all("td", {"class": "score"})]

    set_scores = [None, None, None]
    tiebreak_flags = [0, 0, 0]
    tiebreak_numbers = [None, None, None]
    for set_no, raw_score in enumerate(scores[:3]):
        set_scores[set_no], tiebreak_numbers[set_no] = _split_tiebreak(raw_score)
        if tiebreak_numbers[set_no] is not None:
            tiebreak_flags[set_no] = 1

    # Copy the cached profile so records of the same player stay independent
    record = dict(profile)
    record["Row_Index"] = row_index
    # Kept for output compatibility: this column has always held 1, because
    # the counter was reset on every player row before being incremented.
    record["Cell_Index"] = 1
    record.update({
        "Player": player_cell.text.strip(),
        "Nationality": nationality,
        "Set1": set_scores[0],
        "Set2": set_scores[1],
        "Set3": set_scores[2],
        "Tiebreak_Set1": tiebreak_flags[0],
        "Tiebreak_Set2": tiebreak_flags[1],
        "Tiebreak_Set3": tiebreak_flags[2],
        # The Games_Set* columns hold the number found inside the parentheses
        "Games_Set1": tiebreak_numbers[0],
        "Games_Set2": tiebreak_numbers[1],
        "Games_Set3": tiebreak_numbers[2],
        "Stage": STAGE_BY_ROW.get(row_index, "roundrobin"),
        "Player_Href": href,
    })
    return record


# Send an HTTP GET request to the URL (with a timeout so a stalled server
# cannot hang the script forever)
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the webpage
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table by its ID ("resultsTable")
    table = soup.find("table", {"id": "resultsTable"})

    if table is not None:
        # Collected records, one per player row with a retrievable profile
        player_data_list = []

        # Profiles already fetched in this run, keyed by player href
        profile_cache = {}

        # Initialize a timer (monotonic: unaffected by system clock changes)
        start_time = time.monotonic()

        for row_index, row in enumerate(table.find_all("tr"), start=1):
            record = _build_player_record(row, row_index, profile_cache)
            if record is not None:
                player_data_list.append(record)

            # Check if the time limit has elapsed and save the DataFrame
            elapsed_time = time.monotonic() - start_time
            if elapsed_time >= TIME_LIMIT_SECONDS:
                # Create a Pandas DataFrame from the list of player data
                df = pd.DataFrame(player_data_list)

                # Display the first 5 rows of the DataFrame for testing
                print(df.head())

                # Save the DataFrame to an Excel file
                df.to_excel("tennis_data_partial.xlsx", index=False)

                print("Data saved to 'tennis_data_partial.xlsx'")
                break

        # NOTE: this also runs after a time-limit break, in which case
        # 'tennis_data.xlsx' contains the same partial data.
        df = pd.DataFrame(player_data_list)

        # Display the first 5 rows of the DataFrame for testing
        print(df.head())

        # Save the final DataFrame to an Excel file
        df.to_excel("tennis_data.xlsx", index=False)

        print("Data saved to 'tennis_data.xlsx'")
    else:
        print("Table not found on the webpage.")
else:
    print("Failed to fetch the webpage. Status code:", response.status_code)

    

Error retrieving table data for URL https://www.ultimatetennisstatistics.com/playerProfile?playerId=46888&tab=profile: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF7E3F17D12+55474]
	(No symbol) [0x00007FF7E3E877C2]
	(No symbol) [0x00007FF7E3D3E0EB]
	(No symbol) [0x00007FF7E3D7EBAC]
	(No symbol) [0x00007FF7E3D7ED2C]
	(No symbol) [0x00007FF7E3DB9F77]
	(No symbol) [0x00007FF7E3D9F19F]
	(No symbol) [0x00007FF7E3DB7EF2]
	(No symbol) [0x00007FF7E3D9EF33]
	(No symbol) [0x00007FF7E3D73D41]
	(No symbol) [0x00007FF7E3D74F84]
	GetHandleVerifier [0x00007FF7E427B762+3609346]
	GetHandleVerifier [0x00007FF7E42D1A80+3962400]
	GetHandleVerifier [0x00007FF7E42C9F0F+3930799]
	GetHandleVerifier [0x00007FF7E3FB3CA6+694342]
	(No symbol) [0x00007FF7E3E92218]
	(No symbol) [0x00007FF7E3E8E484]
	(No symbol) [0x00007FF7E3E8E5B2]
	(No symbol) [0x00007FF7E3E7EE13]
	BaseThreadInitThunk [0x00007FFD7717257D+29]
	RtlUserThreadStart [0x00007FFD7780AA68+40]

Error retrieving table data for URL https://www.ultimate