# Season table scraper

### To get our data we will get it from fbref.com 

In [1]:
![season_table_screenshot.png](season_table_screenshot.png)

'[season_table_screenshot.png]' is not recognized as an internal or external command,
operable program or batch file.


In [18]:
# Importing libraries

# Web scraping liberaries
from bs4 import BeautifulSoup
import requests
import time

# Data analysis liberaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [19]:
def text_extract_from_contents(contents_list):
    '''
    Input : 
    Output :  
    
    '''
    list_ = []
    for i in range(len(contents_list)):
        if contents_list[i] == " " or contents_list[i] == "\n":
            pass
        else:
            list_.append(contents_list[i].text)
    
    return list_

In [20]:
def request_to_soup(url):
    # Requesting web server to get acces to the web page and converting it into text
    page = requests.get(url)

    # Checking status code for URL
    code = page.status_code
    print(code)
    
    # Instantiating BeautifulSoup library
    soup = BeautifulSoup(page.text)
    
    return soup

In [21]:
def content_to_list(content_list):
    '''
    Input : Content of tags 
    Output : Return a list without space and tags as element of list.
    
    '''
    list_ = []
    for i in range(len(content_list)):
        if content_list[i] == " " or content_list[i] == "\n":
            pass
        else:
            list_.append(content_list[i])
    return list_

In [23]:
def create_dictionary(list_):
    '''
    Input : List of elements as keys 
    Output : Return a dictionary with 
    '''
    dictionary = {}
    for i in list_:
        dictionary[i] = []
    return dictionary


In [22]:
def final_table(table_headers_list,table_content):
    dictionary = create_dictionary(table_headers_list)
    
    for i in range(len(table_content)):
        detail = text_extract_from_contents(table_content[i].contents)
        for j in range(len(table_headers_list)):
            dictionary[table_headers_list[j]].append(detail[j])
        
    return dictionary

### Understanding the structure and code of the website

In [59]:
URL = "https://fbref.com/en/comps/9/2015-2016/2015-2016-Premier-League-Stats"

In [60]:
page = requests.get(URL)

In [61]:
code = page.status_code
code

200

In [62]:
soup = BeautifulSoup(page.text)

In [63]:
soup

<!DOCTYPE html>
<html class="no-js" data-root="/home/fb/deploy/www/base" data-version="klecko-" lang="en">
<head>
<meta charset="utf-8"/>
<meta content="ie=edge" http-equiv="x-ua-compatible"/>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=2.0" name="viewport"/>
<link href="https://cdn.ssref.net/req/202303161" rel="dns-prefetch"/>
<!-- Quantcast Choice. Consent Manager Tag v2.0 (for TCF 2.0) -->
<script async="true" type="text/javascript">
    (function() {
	var host = window.location.hostname;
	var element = document.createElement('script');
	var firstScript = document.getElementsByTagName('script')[0];
	var url = 'https://cmp.quantcast.com'
	    .concat('/choice/', 'XwNYEpNeFfhfr', '/', host, 
		    '/choice.js?tag_version=V2');
	var uspTries = 0;
	var uspTriesLimit = 3;
	element.async = true;
	element.type = 'text/javascript';
	element.src = url;
	
	firstScript.parentNode.insertBefore(element, firstScript);
	
	function makeStub() {
	    var TCF_LOCATOR_NAME = '_

#### Find the table tags from web page

In [64]:
table_tags = soup.find('table')

In [65]:
table_tags

<table class="stats_table sortable min_width force_mobilize" data-cols-to-freeze=",2" id="results2015-201691_overall"> <caption>Regular season Table</caption> <colgroup><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/><col/></colgroup> <thead> <tr> <th aria-label="Rank" class="poptip sort_default_asc center" data-stat="rank" data-tip="&lt;strong&gt;Rank&lt;/strong&gt;&lt;br&gt;Squad finish in competition&lt;br&gt;Finish within the league or competition.&lt;br&gt;For knockout competitions may show final round reached.&lt;br&gt;Colors and arrows represent promotion/relegation or qualifiation for continental cups.&lt;br&gt;Trophy indicates team won league whether by playoffs or by leading the table.&lt;br&gt;Star indicates topped table in league USING another means of naming champion." scope="col">Rk</th> <th aria-label="Squad" class="poptip sort_default_asc center" data-stat="team" scope="col">Squad</th> <th aria-label="Matches Played" class="poptip cen

#### Table head tags 

In [66]:
table_tags.thead

<thead> <tr> <th aria-label="Rank" class="poptip sort_default_asc center" data-stat="rank" data-tip="&lt;strong&gt;Rank&lt;/strong&gt;&lt;br&gt;Squad finish in competition&lt;br&gt;Finish within the league or competition.&lt;br&gt;For knockout competitions may show final round reached.&lt;br&gt;Colors and arrows represent promotion/relegation or qualifiation for continental cups.&lt;br&gt;Trophy indicates team won league whether by playoffs or by leading the table.&lt;br&gt;Star indicates topped table in league USING another means of naming champion." scope="col">Rk</th> <th aria-label="Squad" class="poptip sort_default_asc center" data-stat="team" scope="col">Squad</th> <th aria-label="Matches Played" class="poptip center" data-stat="games" data-tip="&lt;strong&gt;Matches Played&lt;/strong&gt;&lt;br&gt;Matches Played by the player or squad" scope="col">MP</th> <th aria-label="Wins" class="poptip center" data-stat="wins" data-tip="&lt;strong&gt;Wins&lt;/strong&gt;&lt;br&gt;Wins" scope=

#### Table body tags 

In [67]:
table_tags.tbody

<tbody> <tr><th class="right qualifier qualification_indicator1" csk="1" data-stat="rank" scope="row">1</th><td class="left" data-stat="team"><img alt="Club Crest" height="13" itemscope="image" src="https://cdn.ssref.net/req/202303161/tlogo/fb/mini.a2d435b3.png" style="vertical-align:text-top" width="13"/> <a href="/en/squads/a2d435b3/2015-2016/Leicester-City-Stats">Leicester City</a></td><td class="right" data-stat="games">38</td><td class="right" data-stat="wins">23</td><td class="right" data-stat="ties">12</td><td class="right" data-stat="losses">3</td><td class="right" data-stat="goals_for">68</td><td class="right" data-stat="goals_against">36</td><td class="right" data-stat="goal_diff">+32</td><td class="right" data-stat="points">81</td><td class="right" data-stat="points_avg">2.13</td><td class="right" csk="31998" data-stat="attendance_per_g">31,998</td><td class="right" csk="24" data-stat="top_team_scorers"><a href="/en/players/45963054/Jamie-Vardy">Jamie Vardy</a> - <span>24</s

In [68]:
content_to_list(table_tags.thead.contents)

[<tr> <th aria-label="Rank" class="poptip sort_default_asc center" data-stat="rank" data-tip="&lt;strong&gt;Rank&lt;/strong&gt;&lt;br&gt;Squad finish in competition&lt;br&gt;Finish within the league or competition.&lt;br&gt;For knockout competitions may show final round reached.&lt;br&gt;Colors and arrows represent promotion/relegation or qualifiation for continental cups.&lt;br&gt;Trophy indicates team won league whether by playoffs or by leading the table.&lt;br&gt;Star indicates topped table in league USING another means of naming champion." scope="col">Rk</th> <th aria-label="Squad" class="poptip sort_default_asc center" data-stat="team" scope="col">Squad</th> <th aria-label="Matches Played" class="poptip center" data-stat="games" data-tip="&lt;strong&gt;Matches Played&lt;/strong&gt;&lt;br&gt;Matches Played by the player or squad" scope="col">MP</th> <th aria-label="Wins" class="poptip center" data-stat="wins" data-tip="&lt;strong&gt;Wins&lt;/strong&gt;&lt;br&gt;Wins" scope="col">W

In [69]:
table_header_tags = content_to_list(table_tags.thead.contents)

In [70]:
table_header_tags

[<tr> <th aria-label="Rank" class="poptip sort_default_asc center" data-stat="rank" data-tip="&lt;strong&gt;Rank&lt;/strong&gt;&lt;br&gt;Squad finish in competition&lt;br&gt;Finish within the league or competition.&lt;br&gt;For knockout competitions may show final round reached.&lt;br&gt;Colors and arrows represent promotion/relegation or qualifiation for continental cups.&lt;br&gt;Trophy indicates team won league whether by playoffs or by leading the table.&lt;br&gt;Star indicates topped table in league USING another means of naming champion." scope="col">Rk</th> <th aria-label="Squad" class="poptip sort_default_asc center" data-stat="team" scope="col">Squad</th> <th aria-label="Matches Played" class="poptip center" data-stat="games" data-tip="&lt;strong&gt;Matches Played&lt;/strong&gt;&lt;br&gt;Matches Played by the player or squad" scope="col">MP</th> <th aria-label="Wins" class="poptip center" data-stat="wins" data-tip="&lt;strong&gt;Wins&lt;/strong&gt;&lt;br&gt;Wins" scope="col">W

#### Creating a list containing the table headers

In [72]:
table_header_tags = table_header_tags[0]

In [76]:
table_header_tags = content_to_list(table_header_tags.contents)

In [78]:
list_of_table_header = []
for i in table_header_tags:
    list_of_table_header.append(i.text)

In [79]:
list_of_table_header

['Rk',
 'Squad',
 'MP',
 'W',
 'D',
 'L',
 'GF',
 'GA',
 'GD',
 'Pts',
 'Pts/MP',
 'Attendance',
 'Top Team Scorer',
 'Goalkeeper',
 'Notes']

In [84]:
table_body_tags = content_to_list(table_tags.tbody.contents)

In [85]:
for i in table_body_tags:
    o

AttributeError: 'list' object has no attribute 'contents'

In [34]:
text_extract_from_contents(table_tags.thead.contents)

[' Rk Squad MP W D L GF GA GD Pts Pts/MP Attendance Top Team Scorer Goalkeeper Notes ']

In [13]:
URL_to_table_csv(start_year=2015,number_of_seasons=2)

https://fbref.com/en/comps/9/2015-2016/2015-2016-Premier-League-Stats
200


TypeError: final_table() got an unexpected keyword argument 'headers_list'

In [19]:
dummy_years = 2015

In [23]:
base_dummy_url = f"https://fbref.com/en/comps/9/{str(dummy_years)}/{str(dummy_years)}-Premier-League-Stats"
print(base_dummy_url)

        # Getting tags
soup_ = request_to_soup(url= base_dummy_url)

https://fbref.com/en/comps/9/2015/2015-Premier-League-Stats
200


In [21]:
table_header_list = text_extract_from_contents(table_tags.thead.tr.contents)

NameError: name 'table_tags' is not defined

In [24]:
table_tags = soup_.find('table')
    
    # Table header section    
    # Separating tags for scraping the text
content_to_list(table_tags.thead.tr.contents)

    # Scraping the text from tags contents
table_header_list = text_extract_from_contents(table_tags.thead.tr.contents)
    
    # Table body section 
    # Scraping the body tags from table    
table_details = content_to_list(table_tags.tbody.contents)
    

In [None]:
scraper

In [12]:
def URL_to_table_csv(start_year,number_of_seasons):

    for i in range(number_of_seasons):
        # Creating a template of years for url
        dummy_years = f"{start_year+i}-{start_year+i+1}"
        base_dummy_url = f"https://fbref.com/en/comps/9/{str(dummy_years)}/{str(dummy_years)}-Premier-League-Stats"
        print(base_dummy_url)

        # Getting tags
        soup = request_to_soup(url= base_dummy_url)

        # Calling the scraper function
        Squad_stats = scraper(soup_= soup)

        # Creating a template for naming the files 
        file_name =f"Premier_league_season_table_{dummy_years}"

        # Converting the dataframe to csv file
        Season_table.to_csv(file_name)
        if i == 0: 
            print(f"{i+1} season successfully scraped ✔ ")
        elif i >0:
            print(f"{i+1} seasons successfully scraped ✔ ")
        if i == number_of_seasons-1:
            print(f'All seasons scraped')
        
        time.sleep(5)

In [11]:
def scraper(soup_):
    # Getting tags for regular season tabel
    table_tags = soup_.find('table')
    
    # Table header section    
    # Separating tags for scraping the text
    content_to_list(table_tags.thead.tr.contents)

    # Scraping the text from tags contents
    table_header_list = text_extract_from_contents(table_tags.thead.tr.contents)
    
    # Table body section 
    # Scraping the body tags from table    
    table_details = content_to_list(table_tags.tbody.contents)
    
    # Creating a final table in the as dictionary
    table = final_table(headers_list=table_header_list,table_content=table_details)
    
    return pd.DataFrame(table)