In [23]:
import mechanicalsoup
from bs4 import BeautifulSoup
import pandas as pd


# Player name, written the way it appears in the fbref URL (hyphen-separated).
# It is interpolated into the match-log URL built by yearly_url_func and is
# also used as the stem of the output CSV filename.
# NOTE(review): base_url in yearly_url_func hard-codes "e342ad68", which looks
# like the fbref id of this player - presumably both must be changed together.
player_name = "Mohamed-Salah"

def yearly_url_func(year):
    """Return the fbref match-log URL for one season.

    Args:
        year: Season label placed in the URL path, e.g. "2017-2018".

    Returns:
        The URL string, built from the fixed match-log base path, the season
        label and the module-level ``player_name``.
    """
    season_page = f"{year}/{player_name}-Match-Logs"
    return f"https://fbref.com/en/players/e342ad68/matchlogs/{season_page}"


def scrape_data_for_year(year):
    """Scrape one season's match-log table into a DataFrame.

    The page for ``year`` (URL from ``yearly_url_func``) is fetched, the text
    of every ``<td>`` cell that carries a class attribute is collected into
    one flat list, and a row is cut out of that list for each cell whose text
    is 'Match Report': the row is that cell plus the 35 cells before it.

    Args:
        year: Season label, e.g. "2017-2018".

    Returns:
        A DataFrame with the 36 columns listed in ``col_names`` and an index
        starting at 1, or ``None`` if anything went wrong (the error is
        printed, not raised).
    """
    browser = mechanicalsoup.StatefulBrowser()
    url = yearly_url_func(year)

    try:
        response = browser.open(url)
        # Without this check an HTTP error page (e.g. rate limiting) would be
        # parsed like a normal page and silently produce an empty DataFrame.
        response.raise_for_status()

        # NOTE(review): BeautifulSoup tests a class filter against individual
        # class tokens, so the multi-word strings below probably never match
        # and this likely selects every <td> that has a class. Left unchanged
        # so the set of collected cells stays the same - confirm intent.
        td_elements = browser.page.find_all("td", attrs={"class": lambda x: x and 'left iz' not in x and 'left iz group_start' not in x and 'center iz' not in x})

        # Only the cell text is needed, not the tags.
        columns = [cell.text for cell in td_elements]

        # NOTE: 'Att' appears twice (passes attempted and, further down,
        # take-ons attempted - TODO confirm against the page header). The
        # names are kept as they are so the returned frame and the CSV written
        # from it do not change; be aware that df['Att'] selects both columns.
        col_names = ['Day',
             'Comp',
             'Round',
             'Venue',
             'Result',
             'Squad',
             'Opponent',
             'Start',
             'Pos',
             'Min',
             'Gls',
             'Ast',
             'PK',
             'PKatt',
             'Sh',
             'SoT',
             'CrdY',
             'CrdR',
             'Touches',
             'Tkl',
             'Int',
             'Blocks',
             'xG',
             'npxG',
             'xAG',
             'SCA',
             'GCA',
             'Cmp',
             'Att',
             'Cmp%',
             'PrgP',
             'Carries',
             'PrgC',
             'Att',
             'Succ',
             'Match Report']

        target_gap = 36  # We have a total of 36 columns

        # Every 'Match Report' cell closes a row: take it together with the
        # preceding target_gap - 1 cells. Occurrences that sit too close to
        # the start of the list to have a full row before them are skipped.
        consistent_rows = [
            columns[end - target_gap + 1:end + 1]
            for end, text in enumerate(columns)
            if text == 'Match Report' and end - target_gap + 1 >= 0
        ]

        # Create a DataFrame with column names and a 1-based index
        df = pd.DataFrame(consistent_rows, columns=col_names)
        df.index = df.index + 1
        return df

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip
        # this season instead of aborting the whole run.
        print(f"Error scraping data for {year}: {e}")
        return None
    finally:
        browser.close()

# Example usage: scrape these seasons and write the league matches to a CSV.
years = [
    "2016-2017",
    "2017-2018",
    "2018-2019",
    "2019-2020",
    "2020-2021",
    "2021-2022",
    "2022-2023",
]

# One DataFrame per season that was scraped successfully
dfs = []

for year in years:
    df = scrape_data_for_year(year)
    if df is None:
        # The scraper returns None when a season could not be fetched
        continue
    dfs.append(df)

if not dfs:
    print("No data available.")
else:
    # Stack all seasons into one frame with a fresh 0..n-1 index
    final_df = pd.concat(dfs, axis=0, ignore_index=True)

    # Data preprocessing:
    # keep a row only if its Day is a weekday abbreviation and its
    # competition is the Premier League.
    days_to_keep = ['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']
    final_df = final_df[
        final_df['Day'].isin(days_to_keep)
        & (final_df['Comp'] == 'Premier League')
    ]

    # Save the final DataFrame, named after the player
    final_df.to_csv(f"{player_name}.csv", index=False, encoding='utf-8')


In [24]:
final_df

Unnamed: 0,Day,Comp,Round,Venue,Result,Squad,Opponent,Start,Pos,Min,...,GCA,Cmp,Att,Cmp%,PrgP,Carries,PrgC,Att.1,Succ,Match Report
56,Sat,Premier League,Matchweek 1,Away,D 3–3,Liverpool,Watford,Y,RW,85,...,1,14,20,70.0,2,21,3,1,1,Match Report
58,Sat,Premier League,Matchweek 2,Home,W 1–0,Liverpool,Crystal Palace,N,"RW,RM",30,...,0,9,12,75.0,1,12,0,1,1,Match Report
60,Sun,Premier League,Matchweek 3,Home,W 4–0,Liverpool,Arsenal,Y,"RW,LW",90,...,1,19,23,82.6,0,23,5,1,1,Match Report
63,Sat,Premier League,Matchweek 4,Away,L 0–5,Liverpool,Manchester City,Y,"RW,RM",45,...,0,11,15,73.3,1,12,3,5,3,Match Report
65,Sat,Premier League,Matchweek 5,Home,D 1–1,Liverpool,Burnley,Y,RW,90,...,0,34,47,72.3,5,43,3,3,3,Match Report
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,Wed,Premier League,Matchweek 28,Home,W 1–0,Liverpool,Fulham,Y,RW,83,...,0,41,57,71.9,4,41,2,5,5,Match Report
393,Sat,Premier League,Matchweek 35,Home,W 1–0,Liverpool,Brentford,Y,RW,90,...,0,34,46,73.9,6,38,2,3,2,Match Report
394,Mon,Premier League,Matchweek 36,Away,W 3–0,Liverpool,Leicester City,Y,RW,87,...,3,31,40,77.5,6,41,5,2,0,Match Report
395,Sat,Premier League,Matchweek 37,Home,D 1–1,Liverpool,Aston Villa,Y,RW,90,...,1,22,34,64.7,2,38,6,8,2,Match Report
