In [9]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from webdriver_manager.microsoft import EdgeChromiumDriverManager
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

In [2]:

# Initialize Selenium WebDriver
def init_webdriver():
    """Create and return a headless Microsoft Edge WebDriver.

    The driver executable path handed to the Edge ``Service`` is whatever
    ``EdgeChromiumDriverManager().install()`` returns.

    Returns:
        The ``webdriver.Edge`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    edge_options = Options()
    edge_options.add_argument("--headless")  # no visible browser window
    return webdriver.Edge(
        service=Service(EdgeChromiumDriverManager().install()),
        options=edge_options,
    )

In [3]:

# Load the page and extract HTML content
def get_page_content(driver, url, wait_seconds=10):
    """Navigate ``driver`` to ``url`` and return the resulting page source.

    Args:
        driver: Object exposing ``get(url)`` and a ``page_source`` attribute
            (a Selenium WebDriver in this notebook).
        url: Address to load.
        wait_seconds: Fixed pause, in seconds, between issuing the
            navigation and reading the page source. Defaults to 10, the
            value that used to be hard-coded. Zero, a negative number or
            ``None`` skips the pause entirely.

    Returns:
        The value of ``driver.page_source`` after the pause.
    """
    driver.get(url)
    # A fixed sleep gives the page time to finish rendering before the
    # markup is captured; callers that do not need it can pass 0.
    if wait_seconds and wait_seconds > 0:
        time.sleep(wait_seconds)
    return driver.page_source

In [4]:

# Parse the table using BeautifulSoup
def extract_table_data(html, table_name):
    """Parse one HTML table into a header list and a list of row lists.

    Args:
        html: Page markup as a string.
        table_name: Value of the ``id`` attribute of the ``<table>`` to read.

    Returns:
        A ``(headers, data)`` tuple. ``headers`` holds the text of every
        ``<th>`` found inside the table's ``<thead>``. ``data`` holds one
        list per kept body row: the stripped text of the row's leading
        ``<th>`` followed by the stripped text of each ``<td>``.

    Raises:
        ValueError: If no table with that id exists in ``html``, or the
            table has no ``<thead>`` or no ``<tbody>``.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {"id": table_name})
    if table is None:
        # find() returns None on a miss; fail with a clear message instead
        # of an AttributeError on the next line.
        raise ValueError(f"No <table> with id {table_name!r} found in the page")

    thead = table.find("thead")
    tbody = table.find("tbody")
    if thead is None or tbody is None:
        raise ValueError(f"Table {table_name!r} is missing a <thead> or <tbody> section")

    # Extract table headers
    headers = [th.get_text() for th in thead.find_all("th")]

    data = []
    for row in tbody.find_all("tr"):
        # The first column ("Date") is a <th> element rather than a <td>.
        date_cell = row.find("th")
        cells = row.find_all("td")

        # Rows lacking either the leading <th> or any <td> carry no record.
        if date_cell is None or not cells:
            continue

        match_data = [date_cell.get_text().strip()]
        match_data.extend(cell.get_text().strip() for cell in cells)

        # A row whose width differs from the header cannot be placed in the
        # DataFrame built later, so report it and move on.
        if len(match_data) != len(headers):
            print(f"Skipping row with mismatched columns: {len(match_data)} columns")
            continue

        data.append(match_data)

    return headers, data

In [5]:

# Convert data to a DataFrame
def create_dataframe(headers, data):
    """Build a DataFrame from scraped rows, indexed by match number.

    Args:
        headers: Column labels, one per value in each row.
        data: List of rows, each a list of cell values.

    Returns:
        A DataFrame whose column labels have every occurrence of
        ``"Unnamed: "`` removed and whose index, named ``"Match Number"``,
        counts rows starting at 1.
    """
    frame = pd.DataFrame(data, columns=headers)
    # Strip the "Unnamed: " prefix from any column label that carries it.
    frame.columns = frame.columns.str.replace("Unnamed: ", "")
    # The match number is stored as the index (1-based), not as a column.
    frame.index = pd.RangeIndex(1, len(frame) + 1, name="Match Number")
    return frame

In [6]:

# Save data to Excel
def save_data(df, filename="gwangju_fc_matches_2024.xlsx"):
    """Write ``df`` to an Excel file, index included, and return the path.

    Args:
        df: Object exposing ``to_excel`` (a pandas DataFrame in this notebook).
        filename: Destination path for the workbook.

    Returns:
        ``filename``, unchanged, so the caller can report where the data went.
    """
    df.to_excel(excel_writer=filename, index=True)
    return filename

In [7]:

# Check if the table exists
def check_table(driver, table_name):
    """Report whether an element with id ``table_name`` is in the live page.

    ``find_elements`` is used rather than ``find_element`` because it returns
    an empty list on a miss instead of raising, so no exception handling is
    needed to express "not found".

    Args:
        driver: Selenium WebDriver already pointed at the page.
        table_name: Value of the ``id`` attribute to look for.

    Returns:
        ``True`` if at least one matching element exists, ``False`` otherwise.
        A message stating the outcome is also printed.
    """
    found = bool(driver.find_elements(By.ID, table_name))
    if found:
        print(f"Table {table_name} found using Selenium!")
    else:
        print(f"Table {table_name} not found using Selenium")
    return found

In [8]:

# id attribute of the match-log <table> on the squad page
table_name = "matchlogs_for"

# Main script: load the squad page, pull the match-log table out of it and
# write the result to an Excel workbook.
if __name__ == "__main__":
    url = "https://fbref.com/en/squads/ae306ede/Gwangju-FC-Stats#all_matchlogs"
    driver = init_webdriver()

    try:
        html = get_page_content(driver, url)
        # Diagnostic only: prints whether the table id is present in the
        # live DOM before the captured markup is parsed.
        check_table(driver, table_name)
        headers, data = extract_table_data(html, table_name)
        df = create_dataframe(headers, data)
        filename = save_data(df)
        print(f"Data saved to {filename}")
    finally:
        # Shut the browser down even if any step above raised.
        driver.quit()

Table <selenium.webdriver.remote.webelement.WebElement (session="0624d7640be74943b93ad85c73b58e13", element="f.7FA5AB98A0240E9A4A52429894B1DC5F.d.10575541D3382057910E2EAD452FFA38.e.48")> found using Selenium!
Data saved to gwangju_fc_matches_2024.xlsx
