# The aim is to build a dataset of club profiles for the teams in the 2019/20 Premier League (PL) season

##### The variables will be:
Club name, Ground, Capacity, Owner, Manager, and position at the end of the season

In [1]:
##Import the necessary modules
import urllib.request
from bs4 import BeautifulSoup as bs

#### For debugging purposes, I'll first do this exercise for one team, say, Arsenal

In [2]:
# Load Arsenal's Wikipedia page. The context manager closes the HTTP
# response as soon as the body has been read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Arsenal_F.C.') as response:
    # Read the raw HTML of the page
    html = response.read()

# Convert into a Beautiful Soup object. The parser is named explicitly so the
# parse does not depend on which optional parsers are installed, and so bs4
# does not warn that no parser was specified.
soup = bs(html, 'html.parser')

# The infobox on the Wikipedia page seems to contain most of the information
# I need, so I'll scrape the info box.
info_box = soup.find(class_='infobox vcard')



In [3]:
# Split the infobox into its table rows: every 'tr' tag found inside it
rows = info_box.find_all('tr')

In [4]:
# The information I need sits between the first row and the last five rows of
# the info box, so collect the header/value pair of every row in that window.

club_info = {}

# max(1, ...) keeps the window empty for very short tables, where a negative
# end index would otherwise wrap around and select rows.
for row in rows[1:max(1, len(rows) - 5)]:
    header, cell = row.find('th'), row.find('td')
    # Skip rows that lack a 'th' or a 'td' instead of failing on None.get_text()
    if header is None or cell is None:
        continue
    # Replace non-breaking spaces with plain spaces in the value text
    club_info[header.get_text()] = cell.get_text().replace('\xa0', ' ')

print(club_info)

{'Full name': 'Arsenal Football Club', 'Nickname(s)': 'The Gunners', 'Founded': 'October 1886; 134 years ago (1886-10) as Dial Square[1]', 'Ground': 'Emirates Stadium', 'Capacity': '60,704[2]', 'Owner': 'Kroenke Sports & Entertainment', 'Manager': 'Mikel Arteta', 'League': 'Premier League', '2019–20': 'Premier League, 8th of 20', 'Website': 'Club website'}


## Now I can loop this over all the other clubs

In [5]:
# Load the 19/20 season Wikipedia page. The context manager closes the HTTP
# response once the body has been read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/2019%E2%80%9320_Premier_League') as responses:
    html1 = responses.read()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup1 = bs(html1, 'html.parser')

In [6]:
# Grab the table that contains links to the clubs: the first element whose
# class attribute is exactly 'wikitable sortable'.
# NOTE(review): the original note calls this the second table on the page;
# confirm it is still the intended one.
links_table = soup1.find(class_='wikitable sortable')

# For every row after the first, take the anchor in the row's first 'td' cell
# and record its relative link and its title.
links, names = [], []
for row in links_table.find_all('tr')[1:]:
    anchor = row.find_all('td')[0].a
    link = anchor['href']
    name = anchor['title']
    links.append(link)
    names.append(name)

In [7]:
# Rebuild BOTH lists from scratch so this cell is idempotent. Resetting only
# `links` would leave `names` accumulating duplicate entries on every run.
links = []
names = []
for row in links_table.find_all('tr')[1:]:
    # Look the anchor up once and read both attributes from it
    anchor = row.find_all('td')[0].a
    link = anchor['href']
    name = anchor['title']
    links.append(link)
    names.append(name)
print(links[0])  # note how the link is relative...later I should prepend the root 'https://en.wikipedia.org'

/wiki/Arsenal_F.C.


In [8]:
# Get club information 
def club(url):
    """Scrape the infobox of a club's Wikipedia page into a dict.

    Parameters
    ----------
    url : str
        Absolute URL of the page to fetch.

    Returns
    -------
    dict
        Maps the text of each row's 'th' cell to the text of its 'td' cell
        (non-breaking spaces replaced by plain spaces). Only the rows between
        the first row and the last five rows of the infobox are considered,
        and rows without both a 'th' and a 'td' are skipped.

    Raises
    ------
    ValueError
        If the page contains no element with class 'infobox vcard'.
    """
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(url) as res:
        html = res.read()

    # Explicit parser: the result must not depend on which optional parsers
    # are installed.
    soup = bs(html, 'html.parser')
    info_box = soup.find(class_='infobox vcard')
    if info_box is None:
        # Fail with a message naming the page instead of an opaque
        # AttributeError on None.find_all.
        raise ValueError("no 'infobox vcard' element found at {}".format(url))
    rows = info_box.find_all('tr')

    club_info = {}
    # max(1, ...) keeps the window empty for very short tables, where a
    # negative end index would otherwise wrap around and select rows.
    for row in rows[1:max(1, len(rows) - 5)]:
        header, cell = row.find('th'), row.find('td')
        if header is None or cell is None:
            continue
        club_info[header.get_text()] = cell.get_text().replace('\xa0', ' ')

    return club_info

In [9]:
# Scrape every club page. Iterating over `links` directly, rather than over a
# hard-coded range(0, 20), avoids an IndexError when fewer links were scraped.
club_list = []
base_path = 'https://en.wikipedia.org'
for relative_path in links:
    # The scraped hrefs are site-relative (e.g. '/wiki/Arsenal_F.C.'), so
    # prefix them with the site root.
    full_path = base_path + relative_path
    club_list.append(club(full_path))
print(club_list)

[{'Full name': 'Arsenal Football Club', 'Nickname(s)': 'The Gunners', 'Founded': 'October 1886; 134 years ago (1886-10) as Dial Square[1]', 'Ground': 'Emirates Stadium', 'Capacity': '60,704[2]', 'Owner': 'Kroenke Sports & Entertainment', 'Manager': 'Mikel Arteta', 'League': 'Premier League', '2019–20': 'Premier League, 8th of 20', 'Website': 'Club website'}, {'Full name': 'Aston Villa Football Club', 'Nickname(s)': 'The VillaThe LionsThe Claret & Blue Army', 'Short name': 'Villa, AVFC', 'Founded': '21 November 1874; 145 years ago (1874-11-21)[1]', 'Ground': 'Villa Park', 'Capacity': '42,749[2]', 'Owner(s)': 'Nassef SawirisWes Edens', 'Chairman': 'Nassef Sawiris[3]', 'Head Coach': 'Dean Smith[4]', 'League': 'Premier League', '2019–20': 'Premier League, 17th of 20', 'Website': 'Club website'}, {'Full name': 'AFC Bournemouth[1]', 'Nickname(s)': 'The Cherries, Boscombe', 'Short name': 'AFCB', 'Founded': '1899; 121 years ago (1899) (as Boscombe)', 'Ground': 'Dean Court', 'Capacity': '11,364

In [12]:
# Spot-check the entry at index 19 of club_list
club_list[19]

{'Full name': 'Wolverhampton Wanderers Football Club',
 'Nickname(s)': 'Wolves, The Wanderers',
 'Founded': "1877; 143 years ago (1877), as St. Luke's F.C.",
 'Ground': 'Molineux Stadium',
 'Capacity': '32,050[1]',
 'Owner': 'Fosun International',
 'Chairman': 'Jeff Shi[2]',
 'Head coach': 'Nuno Espírito Santo[3]',
 'League': 'Premier League',
 '2019–20': 'Premier League, 7th of 20',
 'Website': 'Club website'}

In [11]:
# Save as a json file

import json

def save(title, data):
    """Write `data` as JSON to the file at path `title`.

    The file is opened for writing as UTF-8. Non-ASCII characters are written
    as-is rather than escaped, and the output is indented by two spaces.
    """
    with open(title, 'w', encoding='utf-8') as out_file:
        json.dump(obj=data, fp=out_file, indent=2, ensure_ascii=False)

def load(title):
    """Read the UTF-8 JSON file at path `title` and return the decoded object."""
    with open(title, encoding='utf-8') as in_file:
        raw_text = in_file.read()
    return json.loads(raw_text)
    
# Write the scraped club profiles to 'Club_profile.json'
save('Club_profile.json', club_list)