In [7]:
import requests
from bs4 import BeautifulSoup
import time

In [8]:

def _text_or_none(tag):
    """Return the stripped text of a BeautifulSoup tag, or None if the tag is missing."""
    return tag.get_text(strip=True) if tag is not None else None


def _nth_link_text(container, position):
    """Return the stripped text of the position-th <a> inside container, or None."""
    links = container.find_all('a')
    return links[position].get_text(strip=True) if len(links) > position else None


def _goal_rows(goal_row, cell_class, team, match_info):
    """Build one output row per goal listed in the cells of one 'fgoals' row.

    Args:
        goal_row: the <tr class="fgoals"> tag to inspect.
        cell_class: 'fhgoal' for the home side or 'fagoal' for the away side.
        team: value stored as 'scorer_nationality' on every row produced.
        match_info: match-level fields copied into every row.

    Returns:
        A list of dicts, one per <span class="fb-goal"> found under a scorer
        that has a non-empty linked name.
    """
    rows = []
    for cell in goal_row.find_all('td', {'class': cell_class}):
        plainlist = cell.find('div', {'class': 'plainlist'})
        if plainlist is None:
            continue
        for scorer in plainlist.find_all('li'):
            scorer_name = _text_or_none(scorer.find('a'))
            if not scorer_name:
                # Entries without a linked scorer name are skipped.
                continue
            for minute in scorer.find_all('span', {'class': 'fb-goal'}):
                rows.append({
                    **match_info,
                    'scorer_name': scorer_name,
                    'scorer_nationality': team,
                    'goal_minute': minute.get_text(strip=True),
                })
    return rows


def scraper(start_year, end_year, url_template):
    """Scrape per-goal match data from one page per year.

    For every year in the inclusive range, the URL is built with
    ``url_template.format(year=year)``, fetched, and every
    ``<div class="footballbox">`` on the page is parsed.

    Args:
        start_year: first year to fetch (inclusive).
        end_year: last year to fetch (inclusive).
        url_template: string formatted with the keyword ``year``.

    Returns:
        A list of dicts, one per goal. Every dict carries the same keys:
        'year', 'stage', 'score', 'date', 'time', 'home_team', 'away_team',
        'stadium_name', 'stadium_city', 'stadium_attendance', 'referee_name',
        'referee_nationality', 'scorer_name', 'scorer_nationality' and
        'goal_minute'. Fields whose markup is missing are None. Matches with
        no listed goals contribute no rows. Years that fail to download or
        contain no 'footballbox' divs are skipped with a printed message.
    """
    all_matches_data = []

    for year in range(start_year, end_year + 1):
        url = url_template.format(year=year)
        print(f"Attempting to scrape {url}")

        try:
            response = requests.get(url, allow_redirects=True, timeout=10)
        except requests.RequestException as e:
            print(f"Network error while fetching {url}: {e}")
            continue  # Skip to the next year if there's a network error

        if response.status_code == 404:
            print(f"Page not found: {url} (status code 404)")
            continue
        if response.status_code != 200:
            print(f"Failed to retrieve {url}, status code: {response.status_code}")
            continue

        # A non-empty history means at least one redirect was followed.
        if response.history:
            print(f"Redirected for {url} - Final URL: {response.url}")

        soup = BeautifulSoup(response.content, 'html.parser')

        footballbox_divs = soup.find_all('div', {'class': 'footballbox'})
        if not footballbox_divs:
            print(f"No 'footballbox' divs found on {url}. Page structure may have changed.")
            continue

        # Stage label taken from the nearest preceding <h3>; when a box has
        # none, the previous box's stage is carried over.
        current_stage = None

        for idx, box in enumerate(footballbox_divs, start=1):
            print(f"Processing match box {idx} on page for {year}")

            previous_h3 = box.find_previous('h3')
            if previous_h3 is not None:
                current_stage = previous_h3.get_text(strip=True)
            else:
                print(f"No stage found for match box {idx} on {url}")

            # Every key is pre-populated so all rows share one schema even
            # when parts of the markup are missing.
            match_info = {
                'year': year,
                'stage': current_stage,
                'score': None,
                'date': None,
                'time': None,
                'home_team': None,
                'away_team': None,
                'stadium_name': None,
                'stadium_city': None,
                'stadium_attendance': None,
                'referee_name': None,
                'referee_nationality': None,
            }

            # Step 1: date and time from div class 'fleft'
            fleft = box.find('div', {'class': 'fleft'})
            if fleft is not None:
                match_info['date'] = _text_or_none(fleft.find('div', {'class': 'fdate'}))
                match_info['time'] = _text_or_none(fleft.find('div', {'class': 'ftime'}))
            else:
                print(f"Warning: No 'fleft' div found for match box {idx} on {url}")

            # Step 2: home team, away team, and score from table class 'fevent'
            fevent_table = box.find('table', {'class': 'fevent'})
            if fevent_table is not None:
                match_info['home_team'] = _text_or_none(fevent_table.find('th', {'class': 'fhome'}))
                match_info['away_team'] = _text_or_none(fevent_table.find('th', {'class': 'faway'}))
                match_info['score'] = _text_or_none(fevent_table.find('th', {'class': 'fscore'}))

                if not match_info['home_team'] or not match_info['away_team']:
                    print(f"Warning: Missing team information for match box {idx} on {url}")

                if match_info['score'] is None:
                    print(f"Warning: No score found for match box {idx} on {url}")
            else:
                print(f"Warning: No 'fevent' table found for match box {idx} on {url}")

            # Step 3: stadium and referee information from div class 'fright'
            fright = box.find('div', {'class': 'fright'})
            if fright is not None:
                location_div = fright.find('div', {'itemprop': 'location'})
                if location_div is not None:
                    match_info['stadium_name'] = _nth_link_text(location_div, 0)
                    match_info['stadium_city'] = _nth_link_text(location_div, 1)
                else:
                    print(f"Warning: No stadium location found for match box {idx} on {url}")

                # Attendance and referee are located purely by position among
                # the divs under 'fright' (second and third respectively).
                fright_divs = fright.find_all('div')
                if len(fright_divs) > 1:
                    match_info['stadium_attendance'] = fright_divs[1].get_text(strip=True)

                if len(fright_divs) > 2:
                    referee_div = fright_divs[2]
                    match_info['referee_name'] = _nth_link_text(referee_div, 0)
                    match_info['referee_nationality'] = _nth_link_text(referee_div, 1)
                else:
                    print(f"Warning: No referee information found for match box {idx} on {url}")
            else:
                print(f"Warning: No 'fright' div found for match box {idx} on {url}")

            # Step 4: one row per goal, home side first within each goals row
            if fevent_table is not None:
                for goal_row in fevent_table.find_all('tr', {'class': 'fgoals'}):
                    all_matches_data.extend(
                        _goal_rows(goal_row, 'fhgoal', match_info['home_team'], match_info)
                    )
                    all_matches_data.extend(
                        _goal_rows(goal_row, 'fagoal', match_info['away_team'], match_info)
                    )

    if not all_matches_data:
        print(f"No match data found in the specified range {start_year} - {end_year}.")

    return all_matches_data


In [9]:
def clean_data(df):
    """Normalise a per-goal DataFrame and derive numeric columns from it.

    The input is expected to have string columns 'score', 'goal_minute',
    'stadium_attendance' and 'date'. The input frame is not modified; a
    cleaned copy is returned.

    Derived/cleaned columns:
        extra_time      1 if the score text contains '(a.e.t.)', else 0.
        score           digits and the en dash only; missing becomes '0–0'.
        goals_home/away integers split from 'score' on the en dash
                        (unparsable or missing parts become 0).
        own_goal        1 if the minute text contains '(o.g.)', else 0.
        penalty         1 if the minute text contains '(pen.)', else 0.
        goal_minute_et  minutes after a '+' (e.g. 3 for "90+3"), else 0.
        goal_et         1 if the minute before any '+' is >= 90, else 0.
        goal_minute     base minute plus goal_minute_et, as an integer.
        stadium_attendance  digits of the text as an integer (0 when the
                        value is missing or contains no digits).
        short_date      text inside the first parentheses of 'date'.
        long_date       text of 'date' before the first '('.
    The original 'date' column is dropped.

    Rows whose 'goal_minute' holds several comma-separated minutes are
    expanded to one row per minute; the expanded rows keep the original
    index label, so the returned index may contain duplicates.

    Raises:
        ValueError: if a goal minute cannot be parsed as an integer.
    """
    # Imported locally so the function does not rely on a module-level `pd`.
    import pandas as pd

    def _split_minute(text):
        """Return (base_minute, added_minutes) for text such as "45" or "90+3"."""
        parts = str(text).split('+')
        added = int(parts[1]) if len(parts) > 1 else 0
        return int(parts[0]), added

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Flag extra time, then reduce the score to digits and the en dash.
    df['extra_time'] = df['score'].apply(lambda x: 1 if '(a.e.t.)' in str(x) else 0)
    df['score'] = df['score'].str.replace('(a.e.t.)', '', regex=False).str.strip()
    df['score'] = df['score'].str.replace(r'[^\d–]', '', regex=True).str.strip()
    df['score'] = df['score'].fillna('0–0')

    # Split at most once and force exactly two columns, so scores with no
    # dash (or more than one) cannot break the assignment.
    score_parts = df['score'].str.split('–', n=1, expand=True).reindex(columns=[0, 1])
    df['goals_home'] = pd.to_numeric(score_parts[0], errors='coerce').fillna(0).astype(int)
    df['goals_away'] = pd.to_numeric(score_parts[1], errors='coerce').fillna(0).astype(int)

    # One row per minute when goal_minute lists several, comma-separated.
    df = df.assign(goal_minute=df['goal_minute'].str.split(',')).explode('goal_minute')

    # Flags must be read before the markers are stripped from the text.
    df['own_goal'] = df['goal_minute'].apply(lambda x: 1 if '(o.g.)' in str(x) else 0)
    df['penalty'] = df['goal_minute'].apply(lambda x: 1 if '(pen.)' in str(x) else 0)

    # Literal (non-regex) removal of the markers and the minute apostrophe.
    for marker in ('(o.g.)', '(pen.)', "'"):
        df['goal_minute'] = df['goal_minute'].str.replace(marker, '', regex=False)
    df['goal_minute'] = df['goal_minute'].str.strip()

    # Separate "90+3" into base minute 90 and added minutes 3.
    df['goal_minute_et'] = df['goal_minute'].apply(lambda m: _split_minute(m)[1])
    base_minute = df['goal_minute'].apply(lambda m: _split_minute(m)[0])

    # goal_et is decided on the base minute, before added time is summed in.
    df['goal_et'] = (base_minute >= 90).astype(int)
    df['goal_minute'] = base_minute + df['goal_minute_et']

    # Keep only digits of the attendance text; missing/empty values become 0.
    attendance_digits = df['stadium_attendance'].str.replace(r'\D', '', regex=True)
    df['stadium_attendance'] = (
        pd.to_numeric(attendance_digits, errors='coerce').fillna(0).astype(int)
    )

    # Date text is split into the parenthesised part and the part before it.
    df['short_date'] = df['date'].str.extract(r'\((.*?)\)', expand=False)
    df['long_date'] = df['date'].str.split('(').str[0].str.strip()

    df = df.drop(columns=['date'])

    return df
