# Season table scraper

### To get our data we will get it from fbref.com 

In [1]:
![season_table_screenshot.png](season_table_screenshot.png)

'[season_table_screenshot.png]' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
# Importing libraries

# Web scraping liberaries
from bs4 import BeautifulSoup
import requests
import time

# Data analysis liberaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
def text_extract_from_contents(contents_list):
    '''
    Input : 
    Output :  
    
    '''
    list_ = []
    for i in range(len(contents_list)):
        if contents_list[i] == " " or contents_list[i] == "\n":
            pass
        else:
            list_.append(contents_list[i].text)
    
    return list_

In [5]:
def content_to_list(content_list):
    '''
    Input : Content of tags 
    Output : Return a list without space and tags as element of list.
    
    '''
    list_ = []
    for i in range(len(content_list)):
        if content_list[i] == " " or content_list[i] == "\n":
            pass
        else:
            list_.append(content_list[i])
    return list_

In [6]:
def create_dictionary(list_):
    '''
    Input : List of elements as keys 
    Output : Return a dictionary with 
    '''
    dictionary = {}
    for i in list_:
        dictionary[i] = []
    return dictionary


In [7]:
def final_table(table_headers_list,table_content):
    dictionary = create_dictionary(table_headers_list)
    
    for i in range(len(table_content)):
        detail = text_extract_from_contents(table_content[i].contents)
        for j in range(len(table_headers_list)):
            dictionary[table_headers_list[j]].append(detail[j])
        
    return dictionary

### Understanding the structure and code of the website

In [8]:
URL = "https://fbref.com/en/comps/9/2015-2016/2015-2016-Premier-League-Stats"

In [9]:
page = requests.get(URL)

In [10]:
code = page.status_code
code

200

In [11]:
soup = BeautifulSoup(page.text)

#### Find the table tags from web page

In [12]:
table_tags = soup.find('table')

#### Table head tags 

In [13]:
table_tags.thead

<thead> <tr> <th aria-label="Rank" class="poptip sort_default_asc center" data-stat="rank" data-tip="&lt;strong&gt;Rank&lt;/strong&gt;&lt;br&gt;Squad finish in competition&lt;br&gt;Finish within the league or competition.&lt;br&gt;For knockout competitions may show final round reached.&lt;br&gt;Colors and arrows represent promotion/relegation or qualifiation for continental cups.&lt;br&gt;Trophy indicates team won league whether by playoffs or by leading the table.&lt;br&gt;Star indicates topped table in league USING another means of naming champion." scope="col">Rk</th> <th aria-label="Squad" class="poptip sort_default_asc center" data-stat="team" scope="col">Squad</th> <th aria-label="Matches Played" class="poptip center" data-stat="games" data-tip="&lt;strong&gt;Matches Played&lt;/strong&gt;&lt;br&gt;Matches Played by the player or squad" scope="col">MP</th> <th aria-label="Wins" class="poptip center" data-stat="wins" data-tip="&lt;strong&gt;Wins&lt;/strong&gt;&lt;br&gt;Wins" scope=

#### Table body tags 

In [14]:
table_tags.tbody

<tbody> <tr><th class="right qualifier qualification_indicator1" csk="1" data-stat="rank" scope="row">1</th><td class="left" data-stat="team"><img alt="Club Crest" height="13" itemscope="image" src="https://cdn.ssref.net/req/202303201/tlogo/fb/mini.a2d435b3.png" style="vertical-align:text-top" width="13"/> <a href="/en/squads/a2d435b3/2015-2016/Leicester-City-Stats">Leicester City</a></td><td class="right" data-stat="games">38</td><td class="right" data-stat="wins">23</td><td class="right" data-stat="ties">12</td><td class="right" data-stat="losses">3</td><td class="right" data-stat="goals_for">68</td><td class="right" data-stat="goals_against">36</td><td class="right" data-stat="goal_diff">+32</td><td class="right" data-stat="points">81</td><td class="right" data-stat="points_avg">2.13</td><td class="right" csk="31998" data-stat="attendance_per_g">31,998</td><td class="right" csk="24" data-stat="top_team_scorers"><a href="/en/players/45963054/Jamie-Vardy">Jamie Vardy</a> - <span>24</s

In [15]:
content_to_list(table_tags.thead.contents)

[<tr> <th aria-label="Rank" class="poptip sort_default_asc center" data-stat="rank" data-tip="&lt;strong&gt;Rank&lt;/strong&gt;&lt;br&gt;Squad finish in competition&lt;br&gt;Finish within the league or competition.&lt;br&gt;For knockout competitions may show final round reached.&lt;br&gt;Colors and arrows represent promotion/relegation or qualifiation for continental cups.&lt;br&gt;Trophy indicates team won league whether by playoffs or by leading the table.&lt;br&gt;Star indicates topped table in league USING another means of naming champion." scope="col">Rk</th> <th aria-label="Squad" class="poptip sort_default_asc center" data-stat="team" scope="col">Squad</th> <th aria-label="Matches Played" class="poptip center" data-stat="games" data-tip="&lt;strong&gt;Matches Played&lt;/strong&gt;&lt;br&gt;Matches Played by the player or squad" scope="col">MP</th> <th aria-label="Wins" class="poptip center" data-stat="wins" data-tip="&lt;strong&gt;Wins&lt;/strong&gt;&lt;br&gt;Wins" scope="col">W

In [16]:
table_header_tags = content_to_list(table_tags.thead.contents)

In [17]:
table_header_tags

[<tr> <th aria-label="Rank" class="poptip sort_default_asc center" data-stat="rank" data-tip="&lt;strong&gt;Rank&lt;/strong&gt;&lt;br&gt;Squad finish in competition&lt;br&gt;Finish within the league or competition.&lt;br&gt;For knockout competitions may show final round reached.&lt;br&gt;Colors and arrows represent promotion/relegation or qualifiation for continental cups.&lt;br&gt;Trophy indicates team won league whether by playoffs or by leading the table.&lt;br&gt;Star indicates topped table in league USING another means of naming champion." scope="col">Rk</th> <th aria-label="Squad" class="poptip sort_default_asc center" data-stat="team" scope="col">Squad</th> <th aria-label="Matches Played" class="poptip center" data-stat="games" data-tip="&lt;strong&gt;Matches Played&lt;/strong&gt;&lt;br&gt;Matches Played by the player or squad" scope="col">MP</th> <th aria-label="Wins" class="poptip center" data-stat="wins" data-tip="&lt;strong&gt;Wins&lt;/strong&gt;&lt;br&gt;Wins" scope="col">W

#### Creating a list containing the table headers

In [18]:
table_header_tags = table_header_tags[0]

In [19]:
table_header_tags = content_to_list(table_header_tags.contents)

In [20]:
list_of_table_header = []
for i in table_header_tags:
    list_of_table_header.append(i.text)

In [21]:
list_of_table_header

['Rk',
 'Squad',
 'MP',
 'W',
 'D',
 'L',
 'GF',
 'GA',
 'GD',
 'Pts',
 'Pts/MP',
 'Attendance',
 'Top Team Scorer',
 'Goalkeeper',
 'Notes']

In [22]:
table_body_tags = content_to_list(table_tags.tbody.contents)

In [23]:
header_dictionary = create_dictionary(list_of_table_header)

In [24]:
for i in table_body_tags:
    team_detail = text_extract_from_contents(i.contents)
    for j in range(len(header_dictionary)):
        header_dictionary[list_of_table_header[j]].append(team_detail[j])
        

In [25]:
pd.DataFrame(header_dictionary)

Unnamed: 0,Rk,Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,Attendance,Top Team Scorer,Goalkeeper,Notes
0,1,Leicester City,38,23,12,3,68,36,32,81,2.13,31998,Jamie Vardy - 24,Kasper Schmeichel,→ Champions League via league finish
1,2,Arsenal,38,20,11,7,65,36,29,71,1.87,59944,Olivier Giroud - 16,Petr Čech,→ Champions League via league finish
2,3,Tottenham,38,19,13,6,69,35,34,70,1.84,35776,Harry Kane - 25,Hugo Lloris,→ Champions League via league finish
3,4,Manchester City,38,19,9,10,71,41,30,66,1.74,54041,Sergio Agüero - 24,Joe Hart,→ Champions League via league finish
4,5,Manchester Utd,38,19,9,10,49,35,14,66,1.74,75286,Anthony Martial - 11,David de Gea,→ Europa League via cup win 1
5,6,Southampton,38,18,9,11,59,41,18,63,1.66,30751,"Sadio Mané, Graziano Pellè - 11",Fraser Forster,→ Europa League via league finish 2
6,7,West Ham,38,16,14,8,65,51,14,62,1.63,34910,"Dimitri Payet, Andy Carroll - 9",Adrián,→ Europa League via league finish 3
7,8,Liverpool,38,16,12,10,63,50,13,60,1.58,43910,Roberto Firmino - 10,Simon Mignolet,
8,9,Stoke City,38,14,9,15,41,55,-14,51,1.34,27534,Marko Arnautović - 11,Jack Butland,
9,10,Chelsea,38,12,14,12,59,53,6,50,1.32,41500,Diego Costa - 12,Thibaut Courtois,


## Making  a function for scraping more season table from different seasons

In [4]:
def request_to_soup(url):
    # Requesting web server to get acces to the web page and converting it into text
    page = requests.get(url)

    # Checking status code for URL
    code = page.status_code
    print(code)
    
    # Instantiating BeautifulSoup library
    soup = BeautifulSoup(page.text)
    
    return soup

### Create a scaper funtion get needed table tags and convert them into soutable form.

In [None]:
def scraper(soup_):
    
    # Finding tags which contain the season table    
    table_tags = soup_.find('table')
    
    # Separating the table header tags and table body tags..
    table_header_tags = table_tags.thead
    table_body_tags = table_tags.tbody
    
    # Creating a list of heading from header tags
    
    
    # Creating a list which cont    

### Create final_table function return a dictionary or data from which connect header list to body contents of table

### Finally create a function which returns all season table according to inputs like season year and number of seasons to scrape

In [13]:
URL_to_table_csv(start_year=2015,number_of_seasons=2)

https://fbref.com/en/comps/9/2015-2016/2015-2016-Premier-League-Stats
200


TypeError: final_table() got an unexpected keyword argument 'headers_list'

In [19]:
dummy_years = 2015

In [23]:
base_dummy_url = f"https://fbref.com/en/comps/9/{str(dummy_years)}/{str(dummy_years)}-Premier-League-Stats"
print(base_dummy_url)

        # Getting tags
soup_ = request_to_soup(url= base_dummy_url)

https://fbref.com/en/comps/9/2015/2015-Premier-League-Stats
200


In [21]:
table_header_list = text_extract_from_contents(table_tags.thead.tr.contents)

NameError: name 'table_tags' is not defined

In [24]:
table_tags = soup_.find('table')
    
    # Table header section    
    # Separating tags for scraping the text
content_to_list(table_tags.thead.tr.contents)

    # Scraping the text from tags contents
table_header_list = text_extract_from_contents(table_tags.thead.tr.contents)
    
    # Table body section 
    # Scraping the body tags from table    
table_details = content_to_list(table_tags.tbody.contents)
    

In [None]:
scraper

In [12]:
def URL_to_table_csv(start_year,number_of_seasons):

    for i in range(number_of_seasons):
        # Creating a template of years for url
        dummy_years = f"{start_year+i}-{start_year+i+1}"
        base_dummy_url = f"https://fbref.com/en/comps/9/{str(dummy_years)}/{str(dummy_years)}-Premier-League-Stats"
        print(base_dummy_url)

        # Getting tags
        soup = request_to_soup(url= base_dummy_url)

        # Calling the scraper function
        Squad_stats = scraper(soup_= soup)

        # Creating a template for naming the files 
        file_name =f"Premier_league_season_table_{dummy_years}"

        # Converting the dataframe to csv file
        Season_table.to_csv(file_name)
        if i == 0: 
            print(f"{i+1} season successfully scraped ✔ ")
        elif i >0:
            print(f"{i+1} seasons successfully scraped ✔ ")
        if i == number_of_seasons-1:
            print(f'All seasons scraped')
        
        time.sleep(5)

In [11]:
def scraper(soup_):
    # Getting tags for regular season tabel
    table_tags = soup_.find('table')
    
    # Table header section    
    # Separating tags for scraping the text
    content_to_list(table_tags.thead.tr.contents)

    # Scraping the text from tags contents
    table_header_list = text_extract_from_contents(table_tags.thead.tr.contents)
    
    # Table body section 
    # Scraping the body tags from table    
    table_details = content_to_list(table_tags.tbody.contents)
    
    # Creating a final table in the as dictionary
    table = final_table(headers_list=table_header_list,table_content=table_details)
    
    return pd.DataFrame(table)