In [4]:
import mechanicalsoup
from bs4 import BeautifulSoup
import pandas as pd


# Hyphenated player slug. It is inserted into the match-log URL
# ("<slug>-Match-Logs") and reused as the output CSV file name ("<slug>.csv").
player_name = "Phil-Foden"

def generate_yearly_url(year, player=None, player_id="ed1e53f3"):
    """Build the fbref match-log URL for one season.

    Args:
        year: Season label placed in the URL path, e.g. "2021-2022".
        player: Hyphenated player slug used in the last path segment. When
            omitted, the module-level ``player_name`` is used.
        player_id: Player identifier segment of the URL. Defaults to the id
            that was previously hard-coded here, so existing calls are
            unaffected.

    Returns:
        The URL as a string, shaped like
        ``https://fbref.com/en/players/<player_id>/matchlogs/<year>/<player>-Match-Logs``.
    """
    if player is None:
        # Looked up at call time so that re-assigning the global in a later
        # cell takes effect without redefining this function.
        player = player_name

    base_url = f"https://fbref.com/en/players/{player_id}/matchlogs"
    return f"{base_url}/{year}/{player}-Match-Logs"

def _rows_ending_at(cells, terminator, width):
    """Return each `width`-long slice of `cells` whose final item equals `terminator`."""
    rows = []
    for end, text in enumerate(cells):
        if text != terminator:
            continue
        start = end - width + 1
        # A terminator that shows up before a full row's worth of cells has
        # been seen cannot be the end of a complete row; skip it.
        if start >= 0:
            rows.append(cells[start:end + 1])
    return rows


def scrape_data_for_year(year):
    """Download one season's match-log page and parse it into a DataFrame.

    The page's ``<td>`` cells are flattened into a single list of strings.
    Every cell whose text is exactly 'Match Report' is treated as the last
    cell of a row, and the ``len(col_names)`` cells ending there become one
    record. Only ``<td>`` cells are collected; the ``<th>`` row-header cell
    (presumably the match date) is not part of the result.

    Args:
        year: Season label passed to ``generate_yearly_url``, e.g. "2021-2022".

    Returns:
        A DataFrame with one row per detected record, the column labels listed
        in ``col_names`` and a 1-based integer index. Returns ``None`` if any
        exception is raised while fetching or parsing (including a non-success
        HTTP status); the error is printed rather than re-raised.
    """
    # NOTE(review): 'Att' appears twice below (after 'Cmp' and before 'Succ'),
    # so the frame has duplicate column labels and ``df['Att']`` selects both
    # columns. Kept as-is because renaming would change the returned schema.
    col_names = [
        'Day', 'Comp', 'Round', 'Venue', 'Result', 'Squad', 'Opponent',
        'Start', 'Pos', 'Min', 'Gls', 'Ast', 'PK', 'PKatt', 'Sh', 'SoT',
        'CrdY', 'CrdR', 'Touches', 'Tkl', 'Int', 'Blocks', 'xG', 'npxG',
        'xAG', 'SCA', 'GCA', 'Cmp', 'Att', 'Cmp%', 'PrgP', 'Carries',
        'PrgC', 'Att', 'Succ', 'Match Report',
    ]

    browser = mechanicalsoup.StatefulBrowser()
    url = generate_yearly_url(year)

    try:
        response = browser.open(url)
        # Surface HTTP errors (e.g. rate limiting) through the except branch
        # instead of silently parsing an error page into an empty frame.
        response.raise_for_status()

        # NOTE(review): BeautifulSoup appears to call a class predicate once per
        # individual CSS class token, so these multi-word exclusions probably
        # never reject a cell -- confirm before relying on them. The predicate
        # is kept unchanged so the set of collected cells stays the same.
        td_elements = browser.page.find_all(
            "td",
            attrs={
                "class": lambda x: x
                and 'left iz' not in x
                and 'left iz group_start' not in x
                and 'center iz' not in x
            },
        )
        columns = [value.text for value in td_elements]

        # Row width is derived from the header list so the two cannot drift apart.
        consistent_rows = _rows_ending_at(columns, 'Match Report', len(col_names))

        df = pd.DataFrame(consistent_rows, columns=col_names)
        df.index = df.index + 1  # 1-based row labels
        return df

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller skip
        # this season instead of aborting the whole run.
        print(f"Error scraping data for {year}: {e}")
        return None
    finally:
        browser.close()

# Seasons to download, written the way they appear in the match-log URL.
years = ["2019-2020", "2020-2021", "2021-2022", "2022-2023"]

# One DataFrame per season; a season whose scrape returned None is skipped.
dfs = []
for year in years:
    df = scrape_data_for_year(year)
    if df is None:
        continue
    dfs.append(df)

if not dfs:
    print("No data available.")
else:
    # Stack all seasons into a single frame with a fresh 0..n-1 index.
    final_df = pd.concat(dfs, axis=0, ignore_index=True)

    # Keep a row only when its 'Day' cell is a weekday abbreviation (drops
    # rows that did not parse into a real match) and its competition is the
    # Premier League.
    days_to_keep = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
    has_weekday = final_df['Day'].isin(days_to_keep)
    is_premier_league = final_df['Comp'] == 'Premier League'
    final_df = final_df[has_weekday & is_premier_league]

    # Write the result next to the notebook, named after the player slug.
    final_df.to_csv(f"{player_name}.csv", index=False, encoding='utf-8')


In [29]:
# Display the filtered Premier League match log. `final_df` is only defined
# when the scraping cell above collected at least one season of data.
final_df

Unnamed: 0,Day,Comp,Round,Venue,Result,Squad,Opponent,Start,Pos,Min,...,GCA,Cmp,Att,Cmp%,PrgP,Carries,PrgC,Att.1,Succ,Match Report
1,Sat,Premier League,Matchweek 1,Away,W 5–0,Manchester City,West Ham,N,AM,11,...,0,5,7,71.4,0,8,0,0,0,Match Report
10,Sat,Premier League,Matchweek 9,Away,W 2–0,Manchester City,Crystal Palace,N,LM,1,...,0,1,1,100.0,0,1,0,0,0,Match Report
12,Sat,Premier League,Matchweek 10,Home,W 3–0,Manchester City,Aston Villa,N,DM,15,...,0,18,20,90.0,1,18,1,2,2,Match Report
14,Sat,Premier League,Matchweek 11,Home,W 2–1,Manchester City,Southampton,N,"RM,CM",7,...,1,4,4,100.0,2,4,0,0,0,Match Report
16,Sat,Premier League,Matchweek 13,Home,W 2–1,Manchester City,Chelsea,N,"AM,LM",24,...,0,8,10,80.0,1,11,0,0,0,Match Report
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,Sat,Premier League,Matchweek 35,Home,W 2–1,Manchester City,Leeds United,Y,LW,90,...,0,38,51,74.5,2,40,4,3,3,Match Report
240,Sun,Premier League,Matchweek 36,Away,W 3–0,Manchester City,Everton,Y,"LW,AM",90,...,1,49,58,84.5,4,52,7,5,3,Match Report
242,Sun,Premier League,Matchweek 37,Home,W 1–0,Manchester City,Chelsea,Y,"AM,LW",90,...,0,52,59,88.1,9,44,5,1,0,Match Report
243,Wed,Premier League,Matchweek 32,Away,D 1–1,Manchester City,Brighton,Y,LW,50,...,0,21,25,84.0,5,26,6,2,2,Match Report
