In [None]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import csv
import re
from datetime import datetime

def get_season_match_urls(base_url, session):
    """Fetch the match-report URLs linked from a season schedule page.

    Args:
        base_url: URL of the season page to download.
        session: Object exposing ``get(url, timeout=...)`` whose response has
            ``status_code`` and ``text`` attributes (a ``requests.Session``
            in this file).

    Returns:
        A list of absolute match-report URLs, in the order they first appear
        on the page and without duplicates.

    Raises:
        Exception: If the page does not answer with HTTP 200.
    """
    from urllib.parse import urljoin

    response = session.get(base_url, timeout=10)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve data from {base_url}")

    soup = BeautifulSoup(response.text, 'html.parser')
    site_root = "https://www.transfermarkt.com"

    # urljoin leaves already-absolute hrefs intact instead of prefixing the
    # host a second time, and resolves site-relative ones against the root.
    match_links = [
        urljoin(site_root, link['href'])
        for link in soup.find_all("a", href=True)
        if "/spielbericht/" in link['href']
    ]

    # A match linked more than once on the page must only be scraped once;
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(match_links))

def _parse_date_and_season(soup, match_id):
    """Return ``(date, season)`` from the match header, or ``(None, None)``."""
    date_tag = soup.find("p", class_="sb-datum hide-for-small")
    if not date_tag:
        return None, None

    date_links = date_tag.find_all("a")
    if not date_links:
        return None, None

    # The last link inside the date paragraph is the one parsed as the date.
    date_text = date_links[-1].text.strip()
    try:
        match_date = datetime.strptime(date_text, "%a, %m/%d/%y").date()
    except ValueError:
        print(f"Date format error for Match ID {match_id}: '{date_text}'")
        return None, None

    # Matches up to and including July are counted in the season that
    # started the previous calendar year.
    start_year = match_date.year - 1 if match_date.month <= 7 else match_date.year
    season = f"{str(start_year)[-2:]}/{str(start_year + 1)[-2:]}"
    return match_date, season


def _parse_venue_info(soup):
    """Return the Stadium / Number of Attendance / Referee fields that are present."""
    info = {}
    info_tag = soup.find("p", class_="sb-zusatzinfos")
    if not info_tag:
        return info

    stadium_link = info_tag.find("a", href=lambda href: href and "/stadion/" in href)
    if stadium_link:
        info["Stadium"] = stadium_link.text.strip()

    # Take the digits that directly follow the "Attendance:" label rather than
    # whatever follows the last colon of the paragraph, so other labelled
    # fields in the same paragraph cannot leak into the value.
    attendance_match = re.search(
        r"Attendance:\s*([\d.,]+)", info_tag.get_text(" ", strip=True)
    )
    if attendance_match:
        # Strip thousands separators, keeping the value as a digit string.
        info["Number of Attendance"] = (
            attendance_match.group(1).replace(".", "").replace(",", "")
        )

    referee_link = info_tag.find(
        "a", href=lambda href: href and "/profil/schiedsrichter/" in href
    )
    if referee_link:
        info["Referee"] = referee_link.text.strip()

    return info


def _parse_fulltime_score(soup):
    """Return the final score text, or ``None`` when no score element exists."""
    score_tag = soup.select_one('.sb-endstand')
    if not score_tag:
        return None

    score_text = score_tag.get_text(strip=True)
    leading_score = re.match(r'^(\d+):(\d+)', score_text)
    if leading_score:
        # Keep only the leading "H:A" part, dropping any trailing text.
        return leading_score.group(0)
    # No numeric score: keep the text that precedes any parenthesised part.
    return score_text.split('(')[0].strip()


def process_match_url(_match_url, session):
    """Scrape the overview of a single match report page.

    This is best-effort: any failure is printed and the fields collected so
    far are returned, so one bad page never aborts a batch.

    Args:
        _match_url: URL of the match report; its last path segment is used
            as the match ID.
        session: Object exposing ``get(url, timeout=...)`` whose response has
            ``status_code`` and ``text`` attributes.

    Returns:
        A dict with the keys "Match ID", "Date", "Season", "Stadium",
        "Number of Attendance", "Referee", "Home Team", "Away Team",
        "Home Lineup", "Away Lineup" and "Fulltime Score". Fields that could
        not be determined keep their defaults (``None`` or "Unknown").
    """
    data = {
        "Match ID": None,
        "Date": None,
        "Season": None,
        "Stadium": None,
        "Number of Attendance": None,
        "Referee": None,
        "Home Team": "Unknown",
        "Away Team": "Unknown",
        "Home Lineup": None,
        "Away Lineup": None,
        "Fulltime Score": "Unknown"
    }

    try:
        # Derive the ID before any network access so that records of failed
        # requests can still be identified.
        match_id = str(_match_url.rstrip('/').split("/")[-1])
        data["Match ID"] = match_id

        response = session.get(_match_url, timeout=10)
        if response.status_code != 200:
            print(f"Failed to retrieve data from {_match_url}")
            return data

        soup = BeautifulSoup(response.text, 'html.parser')

        data["Date"], data["Season"] = _parse_date_and_season(soup, match_id)
        data.update(_parse_venue_info(soup))

        # Home and Away Team Names
        home_team_tag = soup.select_one('.sb-team.sb-heim .sb-vereinslink')
        away_team_tag = soup.select_one('.sb-team.sb-gast .sb-vereinslink')
        if home_team_tag:
            data["Home Team"] = home_team_tag.get_text(strip=True)
        if away_team_tag:
            data["Away Team"] = away_team_tag.get_text(strip=True)

        # Full-Time Score
        fulltime_score = _parse_fulltime_score(soup)
        if fulltime_score is not None:
            data["Fulltime Score"] = fulltime_score

    except Exception as e:
        print(f"Error processing {_match_url}: {e}")

    return data

def determine_winner(row):
    """Return the winning team of a match row, "Draw", or "Unknown".

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            "Fulltime Score" (formatted as "H:A"), "Home Team" and
            "Away Team".

    Returns:
        The home or away team name depending on which side scored more,
        "Draw" for equal scores, or "Unknown" when the score is missing or
        cannot be parsed as two integers.
    """
    try:
        home_score, away_score = map(int, row["Fulltime Score"].split(":"))
    except (KeyError, AttributeError, ValueError, TypeError):
        # Missing column, non-string score, or a score that is not exactly
        # two integers. Anything else (e.g. KeyboardInterrupt) propagates.
        return "Unknown"

    try:
        if home_score > away_score:
            return row["Home Team"]
        if home_score < away_score:
            return row["Away Team"]
    except KeyError:
        return "Unknown"
    return "Draw"

def scrape_all_match_overview(base_url, headers, max_workers=15):
    """Scrape every match of a season into a DataFrame.

    Args:
        base_url: URL of the season page passed to ``get_season_match_urls``.
        headers: HTTP headers applied to every request of the session.
        max_workers: Number of threads used to fetch match pages.

    Returns:
        A DataFrame with one row per match that has both a date and a
        fulltime score, in the order the matches were listed. The "Stadium",
        "Number of Attendance" and "Referee" columns are dropped and a
        "Winner" column is added. If no match qualifies, the frame is empty
        but still carries the "Winner" column.

    Raises:
        Exception: Propagated from ``get_season_match_urls`` when the season
            page cannot be retrieved.
    """
    matches = []

    # The context manager closes the session even when fetching the season
    # page raises.
    with requests.Session() as session:
        session.headers.update(headers)

        # Step 1: Get all match URLs for the specified season
        match_urls = get_season_match_urls(base_url, session)

        # Step 2: Use ThreadPoolExecutor for multithreading
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(process_match_url, url, session): url
                for url in match_urls
            }

            # All downloads already run concurrently; collecting results in
            # submission order keeps the row order stable between runs.
            for future, url in futures.items():
                try:
                    match_data = future.result()
                except Exception as e:
                    print(f"Error processing URL {url}: {e}")
                    continue

                # Exclude matches with missing date or fulltime score
                if match_data["Date"] and match_data["Fulltime Score"] != "Unknown":
                    matches.append(match_data)

    # Step 3: Create DataFrame
    match_df = pd.DataFrame(matches)

    # Drop unnecessary columns; errors="ignore" keeps an empty result (which
    # has no columns at all) from raising KeyError.
    match_df = match_df.drop(
        columns=["Stadium", "Number of Attendance", "Referee"], errors="ignore"
    )

    # Add Winner column
    if match_df.empty:
        match_df["Winner"] = pd.Series(dtype="object")
    else:
        match_df["Winner"] = match_df.apply(determine_winner, axis=1)

    return match_df

# Main execution
import os

base_url = "https://www.transfermarkt.com/premier-league/gesamtspielplan/wettbewerb/GB1/saison_id/2024"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

output_csv = "./data/clean_data/match_overview_2024.csv"

# Create the output directory before scraping so that a missing folder does
# not make to_csv fail after all pages have been downloaded.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)

match_overview_df = scrape_all_match_overview(base_url, headers, max_workers=10)
match_overview_df.to_csv(output_csv, index=False, encoding='utf-8', header=True)
print("Scraping complete. Data saved to", output_csv)
