# Official Davis Cup website

In [25]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 20 10:27:54 2023

@author: drolldodo
"""

'''
- l'informazione del match non la riesca a prendere insieme a quella del match_status essendo nello
  stesso tag -> OK
- estendere il codice a tutti i div class = "rubber-header" e "rubber-body" -> OK
- raccogliere le informazioni sui giocatori dal fondo della pagina ed eventualmente dai loro link
'''


from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import re
import time

# Initialize Selenium
chrome_service = ChromeService("C:/Users/aldi/Downloads/chromedriver.exe")
chrome_service.start()
chrome_options = Options()
#chrome_options.add_argument("--headless")
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

# URL of the webpage
url = "https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-FRA-NED-01"

# Navigate to the webpage
driver.get(url)

# Wait for the page to be loaded
loaded = False
while not loaded:
    try:
        main_element = driver.find_element(By.CLASS_NAME, "main")
        loaded = True
        print("Loaded!")
    except:
        time.sleep(1)
        print("waiting...")

# ---------------------------------------------------------------------------------------------

# Find the div element with class "main"
main_element = driver.find_element(By.CLASS_NAME, "main")

# Include the component_title_text in the stage variable
component_title_element = main_element.find_element(By.CLASS_NAME, "component-title")
stage = component_title_element.text.strip()

# ---------------------------------------------------------------------------------------------

# Now, let's find the div element with class "tie" within the main element
tie_element = main_element.find_element(By.CLASS_NAME, "details")

# Find all sub div elements within the "tie" element
sub_div_elements = tie_element.find_elements(By.TAG_NAME, "div")

# Initialize variables to store data
column_data = {}

for sub_div_element in sub_div_elements:
    sub_div_text = sub_div_element.text.strip()
    if ":" in sub_div_text:
        column_name, column_value = sub_div_text.split(":", 1)
        column_data[column_name] = [column_value]

# ---------------------------------------------------------------------------------------------

# Create a DataFrame from the collected data
df = pd.DataFrame(column_data)

# Add the "Stage" column with the component_title_text
df["Stage"] = stage

# ---------------------------------------------------------------------------------------------

# Find the div element with class "rubber-header"
rubber_header_elements = main_element.find_elements(By.CLASS_NAME, "rubber-header")
match_num = []
match_status = []

for rubber_header_element in rubber_header_elements:

    # Extract "match" and "match status" from the span elements
    spans = rubber_header_element.find_elements(By.TAG_NAME, "span")
#   span_element = rubber_header_element.find_element(By.CSS_SELECTOR, "span.label.ng-binding")
 
    if len(spans) >= 2:
        match_num.append(spans[0].text.strip())
        match_status.append(spans[1].text.strip())
#       match = span_element.text.strip()

# ---------------------------------------------------------------------------------------------
    
# Now, let's find the div element with class "rubber-body" within the main element
rubber_body_elements = main_element.find_elements(By.CLASS_NAME, "rubber-body")
tables_data = []

match_idx = -1
for rubber_body_element in rubber_body_elements:
    match_idx += 1 
        
    # Find all tables with class "dc" within the rubber-body
    table_elements = rubber_body_element.find_elements(By.CLASS_NAME, "dc")  

    for table_element in table_elements:
        # Initialize data for each table
        table_data = {
            "Player": [],
            "Set 1": [],
            "Set 2": [],
            "Set 3": [],
            "Tie-Break 1": [],
            "Tie-Break 2": [],
            "Tie-Break 3": []
        }

        # Find the table body
        tbody_element = table_element.find_element(By.TAG_NAME, "tbody")

        # Find all rows (tr elements) within the tbody
        rows = tbody_element.find_elements(By.TAG_NAME, "tr")

        for row in rows:
            # Find all td elements within the row
            td_elements = row.find_elements(By.TAG_NAME, "td")

            # Extract and store the information starting from td_elements[1]
            player = td_elements[1].text.strip()
            
            # Skip set and tie-break infos if match hasn't been played
            if match_status[match_idx] == "NOT PLAYED":
                print(f"Skipping match {match_idx+1}")
                continue

            # Extract results from td class "results"
            results = td_elements[2]
            set_scores = results.find_elements(By.TAG_NAME, "span")

            set_results = []
            tie_breaks = []

            for set_score in set_scores:
                set_result = set_score.text.strip()
                tie_break = ""

                # Use regular expressions to extract the first number in set_result
                match = re.search(r'\d+', set_result)
                if match:
                    set_result = match.group()
                else:
                    set_result = ""

                if set_score.find_elements(By.TAG_NAME, "sup"):
                    tie_break = set_score.find_element(By.TAG_NAME, "sup").text.strip()
                    # Use regular expressions to extract the first number in tie_break
                    match = re.search(r'\d+', tie_break)
                    if match:
                        tie_break = match.group()
                    else:
                        tie_break = ""

                set_results.append(set_result)
                tie_breaks.append(tie_break)

            # Ensure there are at most 3 sets
            set_results = set_results[:3]
            tie_breaks = tie_breaks[:3]

            # Assign the extracted values to the dictionary
            table_data["Player"].append(player)
            table_data["Set 1"].append(set_results[0])
            table_data["Set 2"].append(set_results[1])
            table_data["Set 3"].append(set_results[2])
            
            # Keep only the first element in the list for tie-breaks
            for i, tie_break in enumerate(tie_breaks):
                if i == 0 and tie_break:
                    table_data["Tie-Break 1"].append(tie_break)
                else:
                    table_data[f"Tie-Break {i+1}"].append(None)

        # Append the table data to the list
        tables_data.append(table_data)

# Close the Selenium WebDriver
driver.quit()

# Create a DataFrame from the collected data
tables_df = pd.DataFrame(tables_data)

# Combine the information from both DataFrames
combined_df = pd.concat([df]  * len(tables_df), ignore_index=True)
combined_df = pd.concat([combined_df, tables_df], axis=1)

# Add match and match status columns
combined_df["match status"] = ""
combined_df["match"] = ""
for i in range(len(match_status)):
    combined_df.loc[i*2:(i*2)+1, "match status"] = match_status[i]
    combined_df.loc[i*2:(i*2)+1, "match"] = match_num[i]

# Display the combined DataFrame
print("Combined DataFrame:")
print(combined_df)




Loaded!
Skipping match 5
Skipping match 5
Combined DataFrame:
                    Date                                  Venue  \
0   02 Feb - 04 Feb 2018   Halle Olympique, Albertville, France   
1   02 Feb - 04 Feb 2018   Halle Olympique, Albertville, France   
2   02 Feb - 04 Feb 2018   Halle Olympique, Albertville, France   
3   02 Feb - 04 Feb 2018   Halle Olympique, Albertville, France   
4   02 Feb - 04 Feb 2018   Halle Olympique, Albertville, France   
5   02 Feb - 04 Feb 2018   Halle Olympique, Albertville, France   
6   02 Feb - 04 Feb 2018   Halle Olympique, Albertville, France   
7   02 Feb - 04 Feb 2018   Halle Olympique, Albertville, France   
8   02 Feb - 04 Feb 2018   Halle Olympique, Albertville, France   
9   02 Feb - 04 Feb 2018   Halle Olympique, Albertville, France   

                               Surface               Ball  \
0   Hard - Rebound Ace Synpave, Indoor   Tecnifibre X-One   
1   Hard - Rebound Ace Synpave, Indoor   Tecnifibre X-One   
2   Hard - Reboun

In [72]:
combined_df

Unnamed: 0,Date,Venue,Surface,Ball,Stage,Player,Set 1,Set 2,Set 3,Tie-Break 1,Tie-Break 2,Tie-Break 3,match status,match
0,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,[Adrian MANNARINO],[6],[3],[3],[4],[None],[None],PLAYED & COMPLETED,MATCH 1
1,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,[Thiemo DE BAKKER],[7],[6],[6],[7],[None],[None],PLAYED & COMPLETED,MATCH 1
2,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,[Richard GASQUET],[6],[7],[3],[None],[None],[None],PLAYED & COMPLETED,MATCH 2
3,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,[Robin HAASE],[4],[6],[6],[None],[None],[None],PLAYED & COMPLETED,MATCH 2
4,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,[Pierre-Hugues HERBERT\nNicolas MAHUT],[7],[6],[6],[8],[None],[None],PLAYED & COMPLETED,MATCH 3
5,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,[Robin HAASE\nJean-Julien ROJER],[6],[3],[7],[6],[None],[None],PLAYED & COMPLETED,MATCH 3
6,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,[Adrian MANNARINO],[4],[7],[7],[None],[None],[None],PLAYED & COMPLETED,MATCH 4
7,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,[Robin HAASE],[6],[6],[5],[None],[None],[None],PLAYED & COMPLETED,MATCH 4
8,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,[],[],[],[],[],[],[],NOT PLAYED,MATCH 5
9,02 Feb - 04 Feb 2018,"Halle Olympique, Albertville, France","Hard - Rebound Ace Synpave, Indoor",Tecnifibre X-One,WORLD GROUP 1ST ROUND,[],[],[],[],[],[],[],NOT PLAYED,MATCH 5


In [87]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import pandas as pd

# Initialize Selenium WebDriver
driver = webdriver.Chrome(executable_path="C:/Users/aldi/Downloads/chromedriver.exe")

# URL of the webpage
url = "https://www.daviscup.com/en/draws-results/tie.aspx?id=M-DC-2018-WG-M-FRA-NED-01"

# Navigate to the webpage
driver.get(url)

try:
    # Wait for the page to load
    driver.implicitly_wait(10)  # You can adjust the waiting time as needed

    # Find all div elements with class "team-nominations-col"
    team_nominations_col_elements = driver.find_elements(By.CLASS_NAME, "team-nominations-col")

    # Initialize a list to store the paired data
    paired_data = []

    # Initialize a list to store the column names
    column_names = set()

    # Loop through each "team-nominations-col" element
    for team_nominations_col_element in team_nominations_col_elements:
        # Extract the team name
        team_name_element = team_nominations_col_element.find_element(By.CLASS_NAME, "team-name")
        team_name = team_name_element.text.strip()

        # Find "players-info" elements and extract text from "ng-binding" elements
        players_info_elements = team_nominations_col_element.find_elements(By.CLASS_NAME, "players-info")

        for players_info_element in players_info_elements:
            ng_binding_elements = players_info_element.find_elements(By.CLASS_NAME, "ng-binding")
            
            # Create a dictionary for the row
            row_data = {"Team Name": team_name}
            
            for i, ng_binding_element in enumerate(ng_binding_elements, start=1):
                row_data[f"Info {i}"] = ng_binding_element.text.strip()
                column_names.add(f"Info {i}")

            paired_data.append(row_data)

    # Create a Pandas DataFrame from the paired data
    df = pd.DataFrame(paired_data)

    # Reorder columns to match the column names
    df = df[["Team Name"] + sorted(column_names)]

    # Now you have a DataFrame with team names and player information in separate columns
    print(df)

except Exception as e:
    print("Error:", str(e))
finally:
    driver.quit()


  


      Team Name                 Info 1                      Info 2  \
0        FRANCE          Lucas POUILLE  Date of birth: 23 Feb 1994   
1        FRANCE       Adrian MANNARINO  Date of birth: 29 Jun 1988   
2        FRANCE        Richard GASQUET  Date of birth: 18 Jun 1986   
3        FRANCE  Pierre-Hugues HERBERT  Date of birth: 18 Mar 1991   
4        FRANCE          Nicolas MAHUT  Date of birth: 21 Jan 1982   
5        FRANCE                Captain                Yannick NOAH   
6   NETHERLANDS            Robin HAASE  Date of birth: 06 Apr 1987   
7   NETHERLANDS      Tallon GRIEKSPOOR  Date of birth: 02 Jul 1996   
8   NETHERLANDS       Thiemo DE BAKKER  Date of birth: 19 Sep 1988   
9   NETHERLANDS       Matwe MIDDELKOOP  Date of birth: 03 Sep 1983   
10  NETHERLANDS      Jean-Julien ROJER  Date of birth: 25 Aug 1981   
11  NETHERLANDS                Captain               Paul HAARHUIS   

                  Info 3                Info 4  
0   Singles ranking: 335  Doubles rankin

# Data cleaning

In [89]:
df

Unnamed: 0,Team Name,Name,DOB,Single Ranking,Doubles Ranking
0,FRANCE,Lucas POUILLE,23 Feb 1994,335.0,834.0
1,FRANCE,Adrian MANNARINO,29 Jun 1988,24.0,251.0
2,FRANCE,Richard GASQUET,18 Jun 1986,62.0,
3,FRANCE,Pierre-Hugues HERBERT,18 Mar 1991,375.0,76.0
4,FRANCE,Nicolas MAHUT,21 Jan 1982,,33.0
6,NETHERLANDS,Robin HAASE,06 Apr 1987,538.0,42.0
7,NETHERLANDS,Tallon GRIEKSPOOR,02 Jul 1996,25.0,88.0
8,NETHERLANDS,Thiemo DE BAKKER,19 Sep 1988,715.0,894.0
9,NETHERLANDS,Matwe MIDDELKOOP,03 Sep 1983,,39.0
10,NETHERLANDS,Jean-Julien ROJER,25 Aug 1981,,16.0


In [88]:
# Remove rows where Info 1 is equal to "captain"
df = df[df["Info 1"] != "Captain"]
# Rename the columns
df = df.rename(columns={"Info 1": "Name", "Info 2": "DOB", "Info 3": "Single Ranking", "Info 4": "Doubles Ranking"})


In [None]:
# Remove text before ":" in the specified columns
df["DOB"] = df["DOB"].str.split(":", expand=True)[1].str.strip()
df["Single Ranking"] = df["Single Ranking"].str.split(":", expand=True)[1].str.strip()
df["Doubles Ranking"] = df["Doubles Ranking"].str.split(":", expand=True)[1].str.strip()

In [None]:
df

In [None]:
# combine information in df and combined_df