In [2]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scraping from FBRef Website

In [None]:
# URL of the website to scrape (the team links are read from this page)
url = "https://fbref.com/en/comps/9/Premier-League-Stats"

# Send an HTTP GET request to the website and parse it.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error (e.g. when the site is throttling requests)
# instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Get an array of the teams url and the teams name
teams = []
teams_name = []
href = 'https://fbref.com/'
for team in soup.find_all("td", attrs={"data-stat": "team"}):
    team_link = team.find('a')
    if team_link is None or not team_link.get("href"):
        # Cell without a usable link: there is no team page to record
        continue
    # Join base and path with exactly one slash, so a root-relative href
    # ("/en/...") does not produce "https://fbref.com//en/...".
    team_href = href.rstrip('/') + '/' + team_link.get("href").lstrip('/')
    teams.append(team_href)
    teams_name.append(team_link.text)
# Short pause before any further request is sent to the site
time.sleep(1)

In [None]:
# Create a function to get a dataframe for the player stats

def merge_levels(column):
    """Collapse a '?'-joined two-level column label into a single name.

    The label is split on '?'. The second part is returned unchanged, except
    when the first part is 'Per 90 Minutes', in which case '/90' is appended.
    """
    parts = column.split('?')
    suffix = '/90' if parts[0] == 'Per 90 Minutes' else ''
    return parts[1] + suffix

def get_team_stats(url,team_name,table_tag):
    """Scrape one stats table from a team page into a flat DataFrame.

    Parameters
    ----------
    url : str
        Address of the team page to download.
    team_name : str
        Value written to the 'Team' column of every returned row.
    table_tag : str
        HTML ``id`` of the table to read (e.g. 'stats_shooting_9').

    Returns
    -------
    pandas.DataFrame or None
        The table rows (without the 'Opponent Total' / 'Squad Total' rows)
        with single-level column names, or None (after printing a message)
        when the page could not be fetched or has no table with that id.
    """
    # Retrieve HTML content from the URL
    response = requests.get(url, timeout=30)
    if not response.ok:
        # An error response has no stats table; report the real cause
        # instead of a misleading "no table found".
        print(f"Request for {url} failed with HTTP status {response.status_code}.")
        return None
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the table containing the team stats
    table_tags = soup.find_all('table', attrs={'id': table_tag})
    if not table_tags:
        print(f"No table found with id '{table_tag}'.")
        return None

    # Read the table into a DataFrame. The markup is wrapped in StringIO
    # because passing a literal HTML string to read_html is deprecated.
    df = pd.read_html(StringIO(str(table_tags[0])))[0]

    # Merge MultiIndex levels and drop 'Matches' column (if the table has one)
    df.columns = df.columns.map('?'.join)
    df.columns = df.columns.map(merge_levels)
    df = df.drop(columns='Matches', errors='ignore')

    # Tag every row with the team it belongs to
    df['Team'] = team_name

    # Keep individual players only; the two total rows are aggregates
    return df[~df['Player'].isin(['Opponent Total', 'Squad Total'])]

# Example function call: shooting table of the first team in the list
df1 = get_team_stats(
    url=teams[0],
    team_name=teams_name[0],
    table_tag='stats_shooting_9',
)


I have to gather the data for each statistic manually because if I try to automate the process and make too many requests in a loop, the website blocks me from scraping more data. So, I have to wait a few seconds between each request before continuing to gather the information.

In [None]:
# Get the shooting stats dataframe

# Collect the per-team frames and concatenate once at the end, instead of
# growing a DataFrame inside the loop (which re-copies every row on each pass).
team_frames = []
for team, team_name in zip(teams, teams_name):
    df_team = get_team_stats(team, team_name, 'stats_shooting_9')
    if df_team is not None:  # None means the table could not be scraped
        team_frames.append(df_team)
    print(team)
    # Pause between requests so the site is not hit in a burst.
    # NOTE(review): the notebook reports being blocked when requesting too
    # fast; 1 s may be too short - confirm against the site's rate limits.
    time.sleep(1)

df_teams = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

In [None]:
# Write the shooting table to disk; with index_label=False no header field
# is written for the index column.
df_teams.to_csv(path_or_buf='shooting_stat.csv', index_label=False)

In [None]:
# Get the passing stats dataframe

import pandas as pd

# Collect the per-team frames and concatenate once at the end, instead of
# growing a DataFrame inside the loop (which re-copies every row on each pass).
team_frames = []
for team, team_name in zip(teams, teams_name):
    df_team = get_team_stats(team, team_name, 'stats_passing_9')
    if df_team is not None:  # None means the table could not be scraped
        team_frames.append(df_team)
    print(team)
    time.sleep(1)  # pause between requests so the site is not hit in a burst

df_teams = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

In [None]:
# Write the passing table to disk; with index_label=False no header field
# is written for the index column.
df_teams.to_csv(path_or_buf='passing_stats.csv', index_label=False)

In [None]:
# Get the gca stats dataframe

# Collect the per-team frames and concatenate once at the end, instead of
# growing a DataFrame inside the loop (which re-copies every row on each pass).
team_frames = []
for team, team_name in zip(teams, teams_name):
    df_team = get_team_stats(team, team_name, 'stats_gca_9')
    if df_team is not None:  # None means the table could not be scraped
        team_frames.append(df_team)
    print(team)
    time.sleep(1)  # pause between requests so the site is not hit in a burst

df_teams = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

In [None]:
# Write the GCA table to disk; with index_label=False no header field
# is written for the index column.
df_teams.to_csv(path_or_buf='gca_stats.csv', index_label=False)

In [None]:
# Get the defense stats dataframe

# Collect the per-team frames and concatenate once at the end, instead of
# growing a DataFrame inside the loop (which re-copies every row on each pass).
team_frames = []
for team, team_name in zip(teams, teams_name):
    df_team = get_team_stats(team, team_name, 'stats_defense_9')
    if df_team is not None:  # None means the table could not be scraped
        team_frames.append(df_team)
    print(team)
    time.sleep(1)  # pause between requests so the site is not hit in a burst

df_teams = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

In [None]:
# Save the defense table to its own file: writing it to 'gca_stats.csv'
# would overwrite the GCA export with defense data.
df_teams.to_csv('defense_stats.csv',index_label=False)

In [None]:
# Get the possession stats dataframe

# Collect the per-team frames and concatenate once at the end, instead of
# growing a DataFrame inside the loop (which re-copies every row on each pass).
team_frames = []
for team, team_name in zip(teams, teams_name):
    df_team = get_team_stats(team, team_name, 'stats_possession_9')
    if df_team is not None:  # None means the table could not be scraped
        team_frames.append(df_team)
    print(team)
    time.sleep(1)  # pause between requests so the site is not hit in a burst

df_teams = pd.concat(team_frames, ignore_index=True) if team_frames else pd.DataFrame()

In [None]:
# Write the possession table to disk; with index_label=False no header field
# is written for the index column.
df_teams.to_csv(path_or_buf='possession_stats.csv', index_label=False)

The model's Mean Absolute Error (MAE) is still quite large. Hence, I'm also scraping data on player wages, as it could prove useful in predicting player valuation.

In [6]:
# URL of the website to scrape
url = "https://fbref.com/en/comps/9/wages/Premier-League-Wages"

# Send an HTTP GET request to the website and parse it.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [7]:
# Locate the wages table by its element id
table_tags = soup.find('table', attrs={'id': 'player_wages'})

In [8]:
# Parse the table markup into a DataFrame. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated.
df = pd.read_html(StringIO(str(table_tags)))[0]


In [9]:
# Keep only the digits (with thousands separators) that follow the "£ " sign
wage_pattern = r'£ (\d[\d,]*)'
df['Weekly Wages'] = df['Weekly Wages'].str.extract(wage_pattern)

In [10]:
# Keep only the two columns of interest. copy() makes the result an
# independent frame, so assigning to its columns afterwards does not raise
# SettingWithCopyWarning.
df = df[['Player', 'Weekly Wages']].copy()

In [14]:
# Strip the thousands separators, then convert the wage strings to floats
wages_without_commas = df['Weekly Wages'].str.replace(',', '')
df['Weekly Wages'] = wages_without_commas.astype(float)

In [15]:
# Write the wages table to disk; with index_label=False no header field
# is written for the index column.
df.to_csv(path_or_buf='./data/player_wages.csv', index_label=False)

### Scraping Country Codes

In [None]:
import requests

from bs4 import BeautifulSoup
import pandas as pd
import time

# URL of the website to scrape
url = "https://www.iban.com/country-codes"

# Send an HTTP GET request to the website and parse it.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')



In [None]:
# Imported locally because this section re-imports its own dependencies
from io import StringIO

# Read the first table found on the page. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated.
table_tags = soup.find_all('table')
df = pd.read_html(StringIO(str(table_tags)))[0]

In [None]:
# Write the country-code table to disk (index column included)
df.to_csv(path_or_buf='country_code.csv')

### Scraping from Transfermarkt

In [3]:
# URL of the website to scrape
url = "https://www.transfermarkt.com/premier-league/marktwerte/wettbewerb/GB1/pos//detailpos/0/altersklasse/alle/plus/1"

# Browser-like User-Agent sent with every Transfermarkt request
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'
}

# Send an HTTP GET request to the website and parse it.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error instead of silently parsing an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')



In [4]:
url = "https://www.transfermarkt.com/premier-league/marktwerte/wettbewerb/GB1/pos//detailpos/0/altersklasse/alle/plus/1/page/"

# Fetch result pages 1 to 4, collect one frame per page, and concatenate once
# at the end instead of growing a DataFrame inside the loop.
page_frames = []
for i in range(1,5):
    # Send an HTTP GET request to the website; fail fast on an HTTP error
    response = requests.get(url+str(i), headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    table_tags = soup.find('table', class_='items')
    if table_tags is None:
        # Same exception type read_html would raise, with a clearer message
        raise ValueError(f"No table with class 'items' found on page {i}.")

    # Wrapped in StringIO because passing a literal HTML string to read_html
    # is deprecated.
    df = pd.read_html(StringIO(str(table_tags)))[0]
    # Keep only the rows that have a value in the '#' column
    df = df.dropna(subset=['#'])
    page_frames.append(df)

df_players = pd.concat(page_frames, ignore_index=True)

In [None]:
# Write the valuation table to disk; with index_label=False no header field
# is written for the index column.
df_players.to_csv(path_or_buf='player_valuation.csv', index_label=False)