# Libraries

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import pandas as pd
import re
import time
import sys
from getpass import getuser


# Functions to extract match and player information

## matches 

In [2]:
# Get the current user's login name; the scraping functions interpolate it
# into the chromedriver path under C:/Users/<user>/Downloads.
user = getuser()

In [3]:
def extract_matches_df(url, load_timeout=60):
    """Scrape the matches listed on one tie page into a DataFrame.

    A headless Chrome session is opened for the call and is always shut down
    before returning, whether or not scraping succeeded.

    Args:
        url: Address of the page to load.
        load_timeout: Maximum number of seconds to wait for the element with
            class "main" to appear before giving up.

    Returns:
        A DataFrame with one row per scraped result-table row, or ``None``
        when anything goes wrong (the error is printed, never raised).
        Columns are, in order: one column per "name: value" entry found in
        the "details" block, "stage", "player", "set1".."set5",
        "tb1".."tb5", "match status" and "match". The player/set/tb cells
        hold lists, because each table's values are accumulated in lists
        before the frame is built.
    """
    # Imported locally so the block does not depend on a file-level import.
    from selenium.common.exceptions import NoSuchElementException

    chrome_service = None
    driver = None
    try:
        # Initialize Selenium
        print("Initializing Selenium...")
        chrome_service = ChromeService(f"C:/Users/{user}/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

        # Navigate to the webpage
        print("Navigating to the webpage...")
        driver.get(url)

        # Poll once per second until the "main" container exists. The wait
        # is bounded so a page that never renders cannot hang the caller,
        # and only "element not found" is retried: any other driver error
        # goes straight to the handler below.
        print("Waiting for the page to load...")
        deadline = time.monotonic() + load_timeout
        while True:
            try:
                main_element = driver.find_element(By.CLASS_NAME, "main")
                break
            except NoSuchElementException:
                if time.monotonic() >= deadline:
                    raise TimeoutError(
                        f'No element with class "main" after {load_timeout} seconds: {url}'
                    )
                time.sleep(1)

        # ---------------------------------------------------------------------------------------------

        # The text of the "component-title" element becomes the stage
        component_title_element = main_element.find_element(By.CLASS_NAME, "component-title")
        stage = component_title_element.text.strip()

        # ---------------------------------------------------------------------------------------------

        # Every div inside the "details" element whose text contains ":"
        # contributes one column: text before the first ":" is the column
        # name, the remainder is its (single) value.
        details_element = main_element.find_element(By.CLASS_NAME, "details")
        column_data = {}
        for sub_div_element in details_element.find_elements(By.TAG_NAME, "div"):
            sub_div_text = sub_div_element.text.strip()
            if ":" in sub_div_text:
                column_name, column_value = sub_div_text.split(":", 1)
                column_data[column_name] = [column_value]

        # One-row frame describing the page as a whole
        df = pd.DataFrame(column_data)
        df["stage"] = stage

        # ---------------------------------------------------------------------------------------------

        # Each "rubber-header" carries the match label and the match status
        # in its first two span elements.
        match_num = []
        match_status = []
        for rubber_header_element in main_element.find_elements(By.CLASS_NAME, "rubber-header"):
            spans = rubber_header_element.find_elements(By.TAG_NAME, "span")
            if len(spans) >= 2:
                match_num.append(spans[0].text.strip())
                match_status.append(spans[1].text.strip())

        # ---------------------------------------------------------------------------------------------

        # Each "rubber-body" holds the result tables (class "dc") of a match
        rubber_body_elements = main_element.find_elements(By.CLASS_NAME, "rubber-body")
        tables_data = []
        # Index of the match each entry of tables_data came from. Skipped
        # matches add no entries, so the header lists cannot be mapped onto
        # the rows by position alone.
        row_match_idx = []

        for match_idx, rubber_body_element in enumerate(rubber_body_elements):
            for table_element in rubber_body_element.find_elements(By.CLASS_NAME, "dc"):
                # Fresh accumulator for this table
                table_data = {
                    "player": [],
                    "set1": [],
                    "set2": [],
                    "set3": [],
                    "set4": [],
                    "set5": [],
                    "tb1": [],
                    "tb2": [],
                    "tb3": [],
                    "tb4": [],
                    "tb5": [],
                }

                tbody_element = table_element.find_element(By.TAG_NAME, "tbody")
                for row in tbody_element.find_elements(By.TAG_NAME, "tr"):
                    td_elements = row.find_elements(By.TAG_NAME, "td")

                    # The second cell holds the player text
                    player = td_elements[1].text.strip()

                    # Rows of matches that were not played are dropped entirely
                    if match_status[match_idx] in ["NOT PLAYED", "WALKOVER"]:
                        print(f"Skipping match {match_idx+1}")
                        continue

                    # The third cell holds one span per set
                    set_scores = td_elements[2].find_elements(By.TAG_NAME, "span")

                    set_results = [None, None, None, None, None]
                    tie_breaks = [None, None, None, None, None]

                    # Only five sets are stored; spans beyond that are ignored
                    for set_idx, set_score in enumerate(set_scores[:len(set_results)]):
                        # First whitespace-separated token is stored as the
                        # set result, an optional second one as the tie-break
                        scores = set_score.text.strip().split()
                        if not scores:
                            # An empty span leaves the slot as None instead
                            # of aborting the whole page with an IndexError.
                            continue
                        set_results[set_idx] = scores[0]
                        if len(scores) > 1:
                            tie_breaks[set_idx] = scores[1]

                    table_data["player"].append(player)
                    for i in range(5):
                        table_data[f"set{i+1}"].append(set_results[i])
                        table_data[f"tb{i+1}"].append(tie_breaks[i])

                    # NOTE(review): the same dict is appended once per row, so
                    # a table with several rows yields several identical
                    # entries. Kept as-is because downstream cleaning consumes
                    # these list-valued cells; confirm the intended shape.
                    tables_data.append(table_data)
                    row_match_idx.append(match_idx)

        # Create a DataFrame from the collected data
        tables_df = pd.DataFrame(tables_data)

        # Repeat the page-level row once per table row, then place the two
        # frames side by side
        matches_df = pd.concat([df] * len(tables_df), ignore_index=True)
        matches_df = pd.concat([matches_df, tables_df], axis=1)

        # Label every row with the header of the match it was scraped from
        matches_df["match status"] = [match_status[idx] for idx in row_match_idx]
        matches_df["match"] = [match_num[idx] for idx in row_match_idx]
        print("matches df downloaded")
        return matches_df
    except Exception as e:
        print(f"[ERROR] Exception occurred: {str(e)}")
        return None
    finally:
        # Release the browser even when scraping failed, so no headless
        # Chrome/chromedriver process is left behind. Cleanup errors are
        # reported but never raised, keeping the "returns None on failure"
        # contract.
        try:
            if driver is not None:
                driver.quit()
            elif chrome_service is not None:
                chrome_service.stop()
        except Exception as cleanup_error:
            print(f"[ERROR] Failed to close Selenium: {cleanup_error}")


## players

In [4]:
def extract_players_df(url, load_timeout=60):
    """Scrape the team nominations of one tie page into a DataFrame.

    A headless Chrome session is opened for the call and is always shut down
    before returning, whether or not scraping succeeded.

    Args:
        url: Address of the page to load.
        load_timeout: Maximum number of seconds to wait for the element with
            class "main" to appear before giving up.

    Returns:
        A DataFrame with one row per "players-info" block: a "team_name"
        column followed by "Info 1", "Info 2", ... holding the text of the
        block's "ng-binding" elements in document order. ``None`` is
        returned when anything goes wrong (the error is printed, never
        raised).
    """
    # Imported locally so the block does not depend on a file-level import.
    from selenium.common.exceptions import NoSuchElementException

    chrome_service = None
    driver = None
    try:
        # Initialize Selenium
        chrome_service = ChromeService(f"C:/Users/{user}/Downloads/chromedriver.exe")
        chrome_service.start()
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

        # Navigate to the webpage
        driver.get(url)

        # Poll once per second until the "main" container exists. The wait
        # is bounded so a page that never renders cannot hang the caller,
        # and only "element not found" is retried.
        deadline = time.monotonic() + load_timeout
        while True:
            try:
                driver.find_element(By.CLASS_NAME, "main")
                break
            except NoSuchElementException:
                if time.monotonic() >= deadline:
                    raise TimeoutError(
                        f'No element with class "main" after {load_timeout} seconds: {url}'
                    )
                time.sleep(1)

        # Let the element lookups below wait up to 10 seconds for content
        # that is not in the DOM yet
        driver.implicitly_wait(10)  # You can adjust the waiting time as needed

        # One "team-nominations-col" element per team
        team_nominations_col_elements = driver.find_elements(By.CLASS_NAME, "team-nominations-col")

        # Row dictionaries, and the set of "Info N" columns seen in any row
        paired_data = []
        column_names = set()

        for team_nominations_col_element in team_nominations_col_elements:
            # Extract the team name
            team_name_element = team_nominations_col_element.find_element(By.CLASS_NAME, "team-name")
            team_name = team_name_element.text.strip()

            # Each "players-info" element becomes one row; its "ng-binding"
            # children become the numbered Info columns
            players_info_elements = team_nominations_col_element.find_elements(By.CLASS_NAME, "players-info")
            for players_info_element in players_info_elements:
                ng_binding_elements = players_info_element.find_elements(By.CLASS_NAME, "ng-binding")

                row_data = {"team_name": team_name}
                for i, ng_binding_element in enumerate(ng_binding_elements, start=1):
                    row_data[f"Info {i}"] = ng_binding_element.text.strip()
                    column_names.add(f"Info {i}")

                paired_data.append(row_data)

        players_df = pd.DataFrame(paired_data)

        # Order the Info columns by their number; a plain string sort would
        # put "Info 10" before "Info 2".
        info_columns = sorted(column_names, key=lambda name: int(name.split()[1]))
        players_df = players_df[["team_name"] + info_columns]

        print("players df downloaded")
        return players_df

    except Exception as e:
        print(f"[ERROR] Exception occurred: {str(e)}")
        return None
    finally:
        # Release the browser on every path so no headless Chrome or
        # chromedriver process is left behind. Cleanup errors are reported
        # but never raised, keeping the "returns None on failure" contract.
        try:
            if driver is not None:
                driver.quit()
            elif chrome_service is not None:
                chrome_service.stop()
        except Exception as cleanup_error:
            print(f"[ERROR] Failed to close Selenium: {cleanup_error}")


# Create and clean dataframe function

In [5]:
def to_df(links):
    """Scrape every link and stack the results into two DataFrames.

    Each link is passed to ``extract_matches_df`` and ``extract_players_df``;
    both open and close their own browser session, so no driver is created
    here. Links whose extraction returns ``None`` or raises are skipped.

    Args:
        links: Iterable of page URLs to scrape.

    Returns:
        A ``(matches_df, players_df)`` tuple. Each element is the row-wise
        concatenation of the per-link frames, or an empty DataFrame when no
        link produced data. ``(None, None)`` is returned if an unexpected
        error occurs outside the per-link handling.
    """
    try:
        # Per-link frames, collected in input order
        match_results = []
        player_results = []

        for link in links:
            try:
                print(f"Processing match: {link}")  # Print the link being analyzed

                # Apply the functions to extract match and player data
                matches_df = extract_matches_df(link)
                players_df = extract_players_df(link)

                if matches_df is not None:
                    match_results.append(matches_df)
                if players_df is not None:
                    player_results.append(players_df)

            except Exception as inner_e:
                # A failing link must not abort the remaining ones
                print(f"[ERROR] Failed to process link {link}: {str(inner_e)}")
                continue

        # pd.concat rejects an empty list, hence the explicit empty frames
        matches_df = pd.concat(match_results, ignore_index=True) if match_results else pd.DataFrame()
        players_df = pd.concat(player_results, ignore_index=True) if player_results else pd.DataFrame()

        # Check the number of rows in the DataFrames
        print("Total number of rows in matches_df:", len(matches_df))
        print("Total number of rows in players_df:", len(players_df))

        return matches_df, players_df

    except Exception as e:
        # Handle any errors that occur in the main processing
        print(f"[ERROR] Line {e.__traceback__.tb_lineno}: {str(e)}")
        return None, None  # Return None if there was an error


# Clean players and matches dataframes

## clean players dataframe

In [6]:
def clean_players_df(players_df):
    """Return a cleaned copy of the scraped players table.

    Args:
        players_df: DataFrame with the columns "Info 1" to "Info 4" (any
            other columns are carried through unchanged).

    Returns:
        A new DataFrame (the input is not modified) in which:
        * rows whose "Info 1" equals "Captain" are removed;
        * "Info 1".."Info 4" are renamed to "player", "dob",
          "single_ranking" and "doubles_ranking";
        * "dob" and both ranking columns keep only the stripped text after
          the first ":" (missing when a value has no ":");
        * "player" is title-cased.
    """
    # Drop the captain rows and give the generic Info columns real names
    not_captain = players_df["Info 1"] != "Captain"
    cleaned = players_df.loc[not_captain].rename(
        columns={
            "Info 1": "player",
            "Info 2": "dob",
            "Info 3": "single_ranking",
            "Info 4": "doubles_ranking",
        }
    ).copy()

    # Remove the "label:" prefix. Splitting once and taking element 1 also
    # works when no value contains ":" (for example an empty frame), where
    # indexing an expanded split by column 1 would raise a KeyError.
    for column in ("dob", "single_ranking", "doubles_ranking"):
        cleaned[column] = cleaned[column].str.split(":", n=1).str[1].str.strip()

    # Keep the first letter of each word in uppercase
    cleaned["player"] = cleaned["player"].str.title()

    return cleaned



## clean matches dataframe

In [7]:

def clean_matches_df(matches_df):
    """Normalise the scraped matches table in place and return it.

    The "player", "set1".."set5" and "tb1".."tb5" columns are stringified
    and stripped of square brackets, single quotes and the text " and ".
    "player" is then split at a literal backslash-n sequence into the
    title-cased columns "player1" and "player2" ("player2" is empty when
    there is nothing to split) and removed. Finally the set and tie-break
    columns are converted to numbers, with unparsable values becoming NaN.

    Args:
        matches_df: DataFrame containing the columns listed above; it is
            modified by this call.

    Returns:
        The same DataFrame object that was passed in.
    """
    score_columns = [f"{prefix}{number}" for prefix in ("set", "tb") for number in range(1, 6)]

    # Stringify, then strip list/quote residue from every text column
    for name in ["player"] + score_columns:
        as_text = matches_df[name].astype(str)
        matches_df[name] = as_text.str.replace(r'[\[\]\']+| and ', '', regex=True)

    # Take the combined player column out of the frame and split it in two
    combined = matches_df.pop('player')
    halves = combined.str.extract(r'^(.*?)\\n(.*)$')

    # No split point means a single player: keep the full text as player1
    first_player = halves[0].fillna(combined)
    second_player = halves[1].fillna('')
    matches_df['player1'] = first_player.str.title()
    matches_df['player2'] = second_player.str.title()

    # Scores become numeric; anything that is not a number turns into NaN
    for name in score_columns:
        matches_df[name] = pd.to_numeric(matches_df[name], errors="coerce")

    return matches_df



# Merge players and matches dataframes

In [8]:
def merge_data_frames(df1, df2):
    """Attach player details to both players of every match row.

    ``df2`` is left-joined onto ``df1`` twice: first on "player1", then on
    "player2", each time against ``df2``'s "player" column. Duplicate rows
    are dropped after each join.

    Args:
        df1: Matches frame with "player1" and "player2" columns.
        df2: Players frame with "player" and "team_name" columns plus any
            further detail columns.

    Returns:
        A new DataFrame in which the detail columns from the first join end
        in "_player1" and those from the second in "_player2". The join key
        columns and the second "team_name" are removed, and the first
        "team_name" keeps its plain name.
    """
    # First merge based on 'player1'
    merged_df = df1.merge(df2, left_on='player1', right_on='player', how='left')
    merged_df = merged_df.drop_duplicates()

    # Second merge based on 'player2'; columns present on both sides get
    # pandas' default '_x' (left) and '_y' (right) suffixes
    final_merged = merged_df.merge(df2, left_on='player2', right_on='player', how='left')
    final_merged = final_merged.drop_duplicates()

    # Dropping the join keys and the second team name
    columns_to_drop = ['player_x', 'player_y', 'team_name_y']
    final_merged = final_merged.drop(columns_to_drop, axis=1)

    def _player_suffix(column):
        # Only a trailing merge suffix is rewritten; a column that merely
        # contains "_x" or "_y" (e.g. "match_year") must keep its name.
        if column.endswith('_x'):
            return column[:-2] + '_player1'
        if column.endswith('_y'):
            return column[:-2] + '_player2'
        return column

    final_merged = final_merged.rename(columns=_player_suffix)

    # Renaming 'team_name_player1' back to 'team_name'
    final_merged = final_merged.rename(columns={'team_name_player1': 'team_name'})

    return final_merged
