# In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 20 10:27:54 2023

@author: drolldodo
"""

'''
- the match information could not be captured together with the match_status because both are in
  the same tag -> OK
- extend the code to all div class = "rubber-header" and "rubber-body" -> OK
- collect the players' information from the bottom of the page and, if needed, from their links
'''


import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By

# Initialize Selenium.
# webdriver.Chrome() starts the chromedriver service it is given, so the
# service is not started by hand here (a second start would launch a second
# chromedriver process on the same port).
chrome_service = ChromeService("/usr/local/bin/chromedriver")
chrome_options = Options()
#chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# URL of the webpage
url = "https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-FRA-NED-01"

# Navigate to the webpage
driver.get(url)

# Wait for the page to be loaded: poll once per second for the "main" element,
# but give up after PAGE_LOAD_TIMEOUT seconds instead of looping forever.
PAGE_LOAD_TIMEOUT = 30  # seconds
deadline = time.monotonic() + PAGE_LOAD_TIMEOUT
while True:
    try:
        main_element = driver.find_element(By.CLASS_NAME, "main")
    except NoSuchElementException:
        # Only "element not there yet" is retried; any other error (or Ctrl-C)
        # propagates immediately.
        if time.monotonic() >= deadline:
            # Do not leave the browser running when the page never shows up.
            driver.quit()
            raise TimeoutError(
                f"Page did not load within {PAGE_LOAD_TIMEOUT} s: {url}"
            )
        time.sleep(1)
        print("waiting...")
    else:
        print("Loaded!")
        break

# ---------------------------------------------------------------------------------------------

# Find the div element with class "main"
main_element = driver.find_element(By.CLASS_NAME, "main")

# The text of the "component-title" element is stored as the stage
component_title_element = main_element.find_element(By.CLASS_NAME, "component-title")
stage = component_title_element.text.strip()

# ---------------------------------------------------------------------------------------------

# Find the element with class "details" within the main element
tie_element = main_element.find_element(By.CLASS_NAME, "details")

# Find all div elements nested inside the "details" element
sub_div_elements = tie_element.find_elements(By.TAG_NAME, "div")

# Every div whose text contains a colon contributes one column: the text
# before the first colon is the column name, the rest is the value.
column_data = {}

for sub_div_element in sub_div_elements:
    column_name, separator, column_value = sub_div_element.text.partition(":")
    if separator:
        # Strip both sides so neither the name nor the value keeps the
        # whitespace that surrounds the colon on the page.
        column_data[column_name.strip()] = [column_value.strip()]

# ---------------------------------------------------------------------------------------------

# Create a single-row DataFrame from the collected data plus the "Stage"
# column.  Building the row in one go keeps the stage even when no detail div
# matched (assigning a scalar to an empty frame would drop it).
df = pd.DataFrame({**column_data, "Stage": [stage]})

# ---------------------------------------------------------------------------------------------

# Collect the match label and the match status from every "rubber-header" div.
# The two lists are indexed by header position, so each header must contribute
# exactly one entry to both of them.
rubber_header_elements = main_element.find_elements(By.CLASS_NAME, "rubber-header")
match_num = []
match_status = []

for rubber_header_element in rubber_header_elements:

    # The first span holds the match label, the second one the match status
    spans = rubber_header_element.find_elements(By.TAG_NAME, "span")
    span_texts = [span.text.strip() for span in spans[:2]]

    # Pad with empty strings when a header has fewer than two spans; skipping
    # the header instead would shift every later entry by one position.
    span_texts += [""] * (2 - len(span_texts))

    match_num.append(span_texts[0])
    match_status.append(span_texts[1])

# ---------------------------------------------------------------------------------------------
    
# Number of "Set N" / "Tie-Break N" column pairs in the output table.
# NOTE(review): sets beyond this count are dropped, as before; raise the value
# if longer matches have to be captured.
MAX_SETS = 3

# Score columns, in output order: all sets first, then all tie-breaks.
SCORE_COLUMNS = [f"Set {n}" for n in range(1, MAX_SETS + 1)] + [
    f"Tie-Break {n}" for n in range(1, MAX_SETS + 1)
]

FIRST_NUMBER = re.compile(r"\d+")


def _first_number(text):
    """Return the first run of digits in *text*, or "" when there is none."""
    found = FIRST_NUMBER.search(text)
    return found.group() if found else ""


def _parse_set_score(set_span):
    """Return ``(games, tie_break)`` as strings for one set's <span> element.

    ``tie_break`` is "" when the span has no <sup> child or the <sup> holds no
    digits; ``games`` is "" when the span text holds no digits.
    """
    set_text = set_span.text.strip()
    tie_break = ""

    sup_elements = set_span.find_elements(By.TAG_NAME, "sup")
    if sup_elements:
        sup_text = sup_elements[0].text.strip()
        tie_break = _first_number(sup_text)
        # The span's text includes the text of its <sup> child.  Cut that
        # suffix off so the tie-break digits cannot be read as part of the
        # games count (e.g. 7 games + tie-break 5 rendered as "75").
        if sup_text and set_text.endswith(sup_text):
            set_text = set_text[: -len(sup_text)]

    return _first_number(set_text), tie_break


# One dict per player row; score keys are only present for sets that exist.
tables_data = []

try:
    # Now, let's find the div elements with class "rubber-body" within the main element
    rubber_body_elements = main_element.find_elements(By.CLASS_NAME, "rubber-body")

    for match_idx, rubber_body_element in enumerate(rubber_body_elements):
        # Headers and bodies are paired by position; fall back to "" when a
        # body has no corresponding header entry.
        status = match_status[match_idx] if match_idx < len(match_status) else ""
        number = match_num[match_idx] if match_idx < len(match_num) else ""

        # Scores are not read for matches that haven't been played, but the
        # players are still recorded so every match appears in the result.
        played = status != "NOT PLAYED"
        if not played:
            print(f"Skipping match {match_idx+1}")

        # Find all tables with class "dc" within the rubber-body
        for table_element in rubber_body_element.find_elements(By.CLASS_NAME, "dc"):
            tbody_element = table_element.find_element(By.TAG_NAME, "tbody")

            for row in tbody_element.find_elements(By.TAG_NAME, "tr"):
                td_elements = row.find_elements(By.TAG_NAME, "td")

                # The player name is read from the second cell; rows without
                # one carry no player information.
                if len(td_elements) < 2:
                    continue

                player_row = {
                    "Player": td_elements[1].text.strip(),
                    "match status": status,
                    "match": number,
                }

                # The third cell holds one span per set.
                if played and len(td_elements) > 2:
                    set_spans = td_elements[2].find_elements(By.TAG_NAME, "span")
                    for set_no, set_span in enumerate(set_spans[:MAX_SETS], start=1):
                        games, tie_break = _parse_set_score(set_span)
                        player_row[f"Set {set_no}"] = games
                        # Each set keeps its own tie-break; None when absent.
                        player_row[f"Tie-Break {set_no}"] = tie_break or None

                tables_data.append(player_row)
finally:
    # Close the Selenium WebDriver even if scraping failed half-way
    driver.quit()

# One row per player.  The explicit column list fixes the column order and
# keeps the frame well-formed when no rows were collected; sets that were not
# present for a row are left as missing values.
tables_df = pd.DataFrame(
    tables_data,
    columns=["Player", *SCORE_COLUMNS, "match status", "match"],
)

# Repeat the tie-level information once per player row and put it side by side
# with the per-player data.
tie_info_df = df.loc[df.index.repeat(len(tables_df))].reset_index(drop=True)
combined_df = pd.concat([tie_info_df, tables_df], axis=1)

# Display the combined DataFrame
print("Combined DataFrame:")
print(combined_df)


