Premier League (EPL) - Web Scraper

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
# Module-level accumulators: one list per attribute that is stored for every player.
# The scraping functions append to these lists; each list becomes a dataset column.
# NOTE(review): the numbers in the trailing comments are carried over from the original
# notes and look like 1-based column positions in the scraped stats row - confirm.

names = []              # 1
nationality = []        # 2
position = []           # 3
age = []                # 4
matches = []            # 5
starts = []             # 6
mins_played = []        # 7
goals = []              # 9
assists = []            # 10
penalty_goals = []      # 12
penalty_attempted = []  # 13
yellow_cards = []       # 14
red_cards = []          # 15
xG = []                 # 21
xA = []                 # 23
attempted = []          # passes attempted
passes_comp = []        # percentage of passes completed
club = []               # team name of the player

In [3]:
def getPassingStats(link, threshold_d):
    """
    Gets the passing statistics for the players of one team. We have a separate
    function for this as passing stats are in a different table altogether.

    For every row of the passing table one value is added to the module-level
    list `attempted` (passes attempted) and one to `passes_comp` (percentage of
    passes completed). A value that is missing or cannot be parsed is stored as
    -1 / -1.0 respectively.

    Parameters
    ----------
    link: string
        The link for the team from which we have to retrieve player's passing stats.
    threshold_d: integer
        The number of rows the passing table is expected to have, i.e. the number
        of players who have played at least 1 match for the team.

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    AssertionError
        If the number of rows found differs from threshold_d. The module-level
        lists are left untouched in that case.

    See also
    --------
    -
    """

    # Getting the page for the team; fail loudly on an HTTP error instead of
    # trying to parse an error page
    page = requests.get(link)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    # Getting the rows of data for the passing stats table
    # NOTE(review): assumes the passing table is the sixth <table> of the page - confirm
    table = soup.find_all('table')[5]
    body = table.find_all('tbody')[0]
    rows = body.find_all('tr')

    # Collect into local lists first so that a failed run cannot leave the
    # module-level lists longer than the other attribute lists
    team_attempted = list()
    team_passes_comp = list()

    for row in rows:
        data = row.find_all('td')

        # Passes attempted (thousands separators are removed, as is done for minutes played)
        try:
            team_attempted.append(int(data[5].text.replace(',', '')))
        except (IndexError, ValueError):
            team_attempted.append(-1)

        # Percentage Passes Completed
        try:
            team_passes_comp.append(float(data[6].text))
        except (IndexError, ValueError):
            team_passes_comp.append(-1.0)

    # Assertion to ensure we got every player's passing stats
    assert threshold_d == len(team_attempted), 'Passing stats for all players was not retrieved'

    attempted.extend(team_attempted)
    passes_comp.extend(team_passes_comp)

In [4]:
def fetchWebsiteData(link, threshold_d):
    """
    Gets the general statistics rows for all the players listed on a team page
    and, as a side effect, records the team's passing statistics.

    Every row of the first table on the page becomes one list: the text of the
    row's first link (the player name) followed by the text of each of its cells.
    The passing statistics are recorded by calling getPassingStats, which
    appends to module-level lists.

    Parameters
    ----------
    link: string
        The link for the team from which we have to retrieve the players' stats.
    threshold_d: integer
        The number of players who have played at least 1 match for the team.
        It is only forwarded to getPassingStats; the returned list is not cut
        down to this number.

    Returns
    -------
    all_players: list of lists
        The list that holds rows of data for all the players of the team

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.

    See also
    --------
    getPassingStats
    """

    # Getting the page for the team; fail loudly on an HTTP error instead of
    # trying to parse an error page
    page = requests.get(link)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    # Getting rows of general statistics data for each player from the team
    table = soup.find_all('table')[0]
    body = table.find_all('tbody')[0]
    rows = body.find_all('tr')

    # Storing the data for all players in a list to later on distribute it into different attributes
    all_players = list()
    for row in rows:
        player = [row.find('a').get_text()]
        player.extend(cell.text for cell in row.find_all('td'))
        all_players.append(player)

    # Getting the passing statistics. This call mutates module-level lists, so
    # it is done last: a failure while parsing the general table above then
    # leaves those lists untouched.
    getPassingStats(link, threshold_d)

    return all_players

In [5]:
def appendData(team, threshold, all_players):
    """
    Distribute the data retrieved in fetchWebsiteData function to all the attribute lists

    Only the first `threshold` rows are used. For each of them exactly one value
    is appended to every module-level attribute list (names, club, nationality,
    position, age, matches, starts, mins_played, goals, assists, penalty_goals,
    penalty_attempted, yellow_cards, red_cards, xG, xA), so the lists always
    grow together.

    Parameters
    ----------
    team: string
        The team name of the players.
    threshold: integer
        The number of players who have played at least 1 match for the team.
    all_players: list of lists
        The list that holds rows of data for all the players of the team

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If a row has fewer than 15 values. Nothing is appended for that row.

    See also
    --------
    -
    """

    for player in all_players[:threshold]:

        # Resolve every value before touching the lists, so that a malformed
        # row cannot leave the lists with different lengths.

        # The nationality text holds two space-separated tokens and the second
        # one is kept; fall back to the raw text when there is no second token.
        nationality_tokens = player[1].split(' ')
        if len(nationality_tokens) > 1:
            player_nationality = nationality_tokens[1]
        else:
            player_nationality = player[1]

        # A missing age is stored as '-1'
        player_age = '-1' if player[3] == '' else player[3]

        # Minutes are written with a thousands separator
        player_mins = player[6].replace(',', '')

        player_position = player[2]
        player_matches = player[4]
        player_starts = player[5]
        player_goals = player[8]
        player_assists = player[9]
        player_penalty_goals = player[11]
        player_penalty_attempted = player[12]
        player_yellow_cards = player[13]
        player_red_cards = player[14]

        # xG and xA are stored as a pair: when the row is too short to hold
        # both of them, both are recorded as -1.
        if len(player) > 25:
            player_xG, player_xA = player[24], player[25]
        else:
            player_xG, player_xA = -1, -1

        names.append(player[0])
        club.append(team)
        nationality.append(player_nationality)
        position.append(player_position)
        age.append(player_age)
        matches.append(player_matches)
        starts.append(player_starts)
        mins_played.append(player_mins)
        goals.append(player_goals)
        assists.append(player_assists)
        penalty_goals.append(player_penalty_goals)
        penalty_attempted.append(player_penalty_attempted)
        yellow_cards.append(player_yellow_cards)
        red_cards.append(player_red_cards)
        xG.append(player_xG)
        xA.append(player_xA)

In [6]:
def convertDataFrame():
    """
    Builds one pandas DataFrame out of the module-level attribute lists.

    Parameters
    ----------
    -

    Returns
    -------
    df: pandas DataFrame
        One row per player. The count columns are cast to int, xG, xA and the
        pass completion percentage to float; the remaining columns are left
        as they were collected.

    See also
    --------
    -
    """

    # Column name -> collected values, in the order the columns appear in the dataset
    collected = {
        'Name': names,
        'Club': club,
        'Nationality': nationality,
        'Position': position,
        'Age': age,
        'Matches': matches,
        'Starts': starts,
        'Mins': mins_played,
        'Goals': goals,
        'Assists': assists,
        'Passes_Attempted': attempted,
        'Perc_Passes_Completed': passes_comp,
        'Penalty_Goals': penalty_goals,
        'Penalty_Attempted': penalty_attempted,
        'xG': xG,
        'xA': xA,
        'Yellow_Cards': yellow_cards,
        'Red_Cards': red_cards,
    }

    # The scraped values are mostly strings, so the numeric columns are typecast
    whole_number_columns = (
        'Age', 'Matches', 'Starts', 'Mins', 'Goals', 'Assists',
        'Penalty_Goals', 'Penalty_Attempted', 'Yellow_Cards', 'Red_Cards',
        'Passes_Attempted',
    )
    decimal_columns = ('xG', 'xA', 'Perc_Passes_Completed')

    target_types = {column: int for column in whole_number_columns}
    target_types.update({column: float for column in decimal_columns})

    return pd.DataFrame(collected).astype(target_types)

In [14]:
def get_player_count(links):
    """
    Returns the number of players who played at least 1 match for each team.

    For every link the first table of the page is read and the rows whose
    matches-played cell (the fourth cell of the row) is not '0' are counted.

    Parameters
    ----------
    links: list of string
        Links of all the teams in the league.

    Returns
    -------
    all_player_counts: list of integers
        The list of number of players who played at least 1 match for each team,
        in the same order as `links`.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code for one of the links.

    See also
    --------
    -
    """

    all_player_counts = list()
    for link_table in links:
        # Fail loudly on an HTTP error instead of trying to parse an error page
        page = requests.get(link_table)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')
        all_players = soup.find_all('table')[0].find('tbody').find_all('tr')

        # A player counts when the matches-played cell holds anything but '0'
        count = sum(
            1
            for player_row in all_players
            if player_row.find_all('td')[3].text != '0'
        )
        all_player_counts.append(count)

    return all_player_counts

In [25]:
# This is the manual part: the links of the teams have to be pasted by hand from the league table page.
# League Table page: https://fbref.com/en/comps/9/Premier-League-Stats
# It is also the opportunity to name the teams the way we want them in the dataset. This could have been
# automated, but keeping the choice of names was preferred.
# `links` and `team` are matched by position: the n-th name belongs to the n-th link.

links = [
    'https://fbref.com/en/squads/b8fd03ef/Manchester-City-Stats'#,
    # 'https://fbref.com/en/squads/19538871/Manchester-United-Stats',
    # 'https://fbref.com/en/squads/822bd0ba/Liverpool-Stats',
    # 'https://fbref.com/en/squads/cff3d9bb/Chelsea-Stats',
    # 'https://fbref.com/en/squads/a2d435b3/Leicester-City-Stats',
    # 'https://fbref.com/en/squads/7c21e445/West-Ham-United-Stats',
    # 'https://fbref.com/en/squads/361ca564/Tottenham-Hotspur-Stats',
    # 'https://fbref.com/en/squads/18bb7c10/Arsenal-Stats',
    # 'https://fbref.com/en/squads/5bfb9659/Leeds-United-Stats',
    # 'https://fbref.com/en/squads/d3fd31cc/Everton-Stats',
    # 'https://fbref.com/en/squads/8602292d/Aston-Villa-Stats',
    # 'https://fbref.com/en/squads/b2b47a98/Newcastle-United-Stats',
    # 'https://fbref.com/en/squads/8cec06e1/Wolverhampton-Wanderers-Stats',
    # 'https://fbref.com/en/squads/47c64c55/Crystal-Palace-Stats',
    # 'https://fbref.com/en/squads/33c895d4/Southampton-Stats',
    # 'https://fbref.com/en/squads/d07537b9/Brighton-and-Hove-Albion-Stats',
    # 'https://fbref.com/en/squads/943e8050/Burnley-Stats',
    # 'https://fbref.com/en/squads/fd962109/Fulham-Stats',
    # 'https://fbref.com/en/squads/60c6b05f/West-Bromwich-Albion-Stats',
    # 'https://fbref.com/en/squads/1df6b87e/Sheffield-United-Stats'
]

team = [
    'Manchester City' #,
    # 'Manchester United',
    # 'Liverpool',
    # 'Chelsea',
    # 'Leicester City',
    # 'West Ham United',
    # 'Tottenham Hotspur',
    # 'Arsenal',
    # 'Leeds United',
    # 'Everton',
    # 'Aston Villa',
    # 'Newcastle United',
    # 'Wolverhampton',
    # 'Crystal Palace',
    # 'Southampton',
    # 'Brighton',
    # 'Burnley',
    # 'Fulham',
    # 'West Brom',
    # 'Sheffield United'

]

# The two lists are maintained by hand, so check that they still line up
# before spending time on network requests.
assert len(links) == len(team), 'links and team must contain one entry per team, in the same order'

# Gets the number of players who played at least 1 match for each team.
threshold = get_player_count(links)

NameError: name 'all_player_counts' is not defined

In [12]:
# Scrape every configured team and distribute its rows over the attribute lists.
# NOTE: the helper functions append to module-level lists, so running this cell
# a second time adds the same players again.

# The three lists are matched by position; a stale or shorter list would
# otherwise only show up as an IndexError in the middle of the run.
assert len(links) == len(team) == len(threshold), \
    'links, team and threshold must contain one entry per team'

count = 0
for link, team_name, player_count in zip(links, team, threshold):
    web_page_data = fetchWebsiteData(link, player_count)
    appendData(team_name, player_count, web_page_data)
    count += 1

# Compare against the configured teams rather than a fixed 20, so that the
# cell also works while only part of the league is listed in `links`.
assert count == len(links), 'Data wasn\'t retrieved for all the configured teams'

# Every attribute list must have grown by the same amount, otherwise the
# DataFrame cannot be built from them.
assert len(names) == len(attempted) == len(passes_comp), \
    'General stats and passing stats are out of step'

IndexError: list index out of range

In [17]:
# Build the final DataFrame from the module-level attribute lists
df = convertDataFrame()

ValueError: All arrays must be of the same length

In [18]:
# Preview the first five rows of the dataset
df.head(5)

NameError: name 'df' is not defined

In [19]:
# Count the null values in each column
df.isna().sum()

NameError: name 'df' is not defined

In [20]:
# Save the dataset as a CSV file, leaving out the DataFrame index
df.to_csv('epl_2020_21_dataset.csv', index=False)

NameError: name 'df' is not defined