In [103]:
from io import StringIO

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import requests
import seaborn as sns
from bs4 import BeautifulSoup, Comment

In [104]:
# Scrape the Premier League "standard stats" player table from FBref into `df`.
url = 'https://fbref.com/en/comps/9/stats/Premier-League-Stats#stats_standard'

# Fail fast on a hung connection or an HTTP error page instead of parsing it.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# The table markup is taken from the first HTML comment found after the
# '#all_stats_standard' element, so that comment is extracted and parsed
# as a document of its own.
wrapper = soup.select_one('#all_stats_standard')
if wrapper is None:
    raise ValueError("'#all_stats_standard' not found - the page layout may have changed")

table_comment = wrapper.find_next(string=lambda node: isinstance(node, Comment))
if table_comment is None:
    raise ValueError("no HTML comment found after '#all_stats_standard'")

table = BeautifulSoup(table_comment, 'html.parser')

# Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
df = pd.read_html(StringIO(str(table)))[0]

In [105]:
# Data cleaning

# Flatten the multi-level header by discarding its outermost level (level 0).
df.columns = df.columns.droplevel(0)

In [106]:
# Where a column name occurs more than once, keep only its first occurrence.
is_repeated_name = df.columns.duplicated()
df = df.loc[:, ~is_repeated_name]

In [107]:
# Keep only the columns this analysis uses, selected by name.
# Selecting by name (rather than dropping by position) raises a KeyError if an
# expected column is missing, instead of silently discarding the wrong columns
# when the cell is re-run or the scraped table changes shape.
columns_to_keep = [
    'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born',
    'MP', 'Starts', 'Min', '90s',
    'Gls', 'Ast', 'G-PK', 'PK', 'PKatt',
    'CrdY', 'CrdR',
]
# .copy() gives an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
df = df[columns_to_keep].copy()

In [108]:
# Nation
# Keep only the final three characters of each Nation value.
df['Nation'] = df['Nation'].str.slice(start=-3)

In [109]:
# Position
# Some players are listed with more than one position; keep the first one,
# i.e. the leading two characters.
df['Pos'] = df['Pos'].str.slice(stop=2)

In [110]:
# Age
# Keep only the leading two characters of Age (the player's age in full years).
df['Age'] = df['Age'].str.slice(stop=2)

# The birth-year column is not used, so remove it.
df.drop(columns=['Born'], inplace=True)

In [111]:
# There were 21 unique teams instead of 20; the extra one was named 'Squad',
# which is the column name itself.
# Drop the repeated header rows that were scraped in as data rows.
is_header_row = df['Squad'].eq('Squad')
df.drop(index=df.index[is_header_row], inplace=True)

In [112]:
# .describe() revealed repeated player names; on inspection this is because
# those players appeared for more than one team this season.
# List every row whose Player value occurs more than once.
appears_more_than_once = df['Player'].duplicated(keep=False)
df.loc[appears_more_than_once]

Unnamed: 0,Player,Nation,Pos,Squad,Age,MP,Starts,Min,90s,Gls,Ast,G-PK,PK,PKatt,CrdY,CrdR
44,Ross Barkley,ENG,MF,Chelsea,27,2,0,42,0.5,0,0,0,0,0,0,0
45,Ross Barkley,ENG,MF,Aston Villa,27,18,14,1167,13.0,3,1,3,0,0,0,0
269,Ruben Loftus-Cheek,ENG,FW,Chelsea,25,1,1,60,0.7,0,0,0,0,0,0,0
270,Ruben Loftus-Cheek,ENG,MF,Fulham,25,24,19,1661,18.5,1,0,1,0,0,3,0
286,Ainsley Maitland-Niles,ENG,DF,Arsenal,23,11,5,490,5.4,0,0,0,0,0,0,0
287,Ainsley Maitland-Niles,ENG,MF,West Brom,23,7,7,630,7.0,0,0,0,0,0,1,0
320,Takumi Minamino,JPN,MF,Southampton,26,6,5,410,4.6,2,0,2,0,0,0,0
321,Takumi Minamino,JPN,MF,Liverpool,26,9,2,293,3.3,1,0,1,0,0,0,0
411,Mathew Ryan,AUS,GK,Arsenal,28,1,1,90,1.0,0,0,0,0,0,0,0
412,Mathew Ryan,AUS,GK,Brighton,28,11,11,990,11.0,0,0,0,0,0,1,0


In [113]:
# Convert the statistic columns from text to numeric dtypes.
# The list is defined once so the selection and the assignment cannot drift
# apart; '90s' is included so it does not stay behind as a string column.
numeric_columns = [
    'Age', 'MP', 'Starts', 'Min', '90s',
    'Gls', 'Ast', 'G-PK', 'PK', 'PKatt',
    'CrdY', 'CrdR',
]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric)

In [72]:
# Count rows per (Squad, Nation) pair, then order by Squad and by count,
# both descending.
df_nation = (
    df.groupby(['Squad', 'Nation'])['Nation']
    .count()
    .reset_index(name='count')
    .sort_values(by=['Squad', 'count'], ascending=False)
)

In [94]:
# Display the Squad / Nation / count table.
df_nation

Unnamed: 0,Squad,Nation,count
258,Wolves,POR,9
251,Wolves,ENG,4
248,Wolves,BRA,2
252,Wolves,ESP,2
247,Wolves,BEL,1
249,Wolves,CIV,1
250,Wolves,DEN,1
253,Wolves,FRA,1
254,Wolves,ITA,1
255,Wolves,MAR,1


In [98]:
# Stacked bar chart with one bar per squad, split and coloured by nationality.
# plotly.express is already imported as `px` in the notebook's import cell,
# so it is not imported again here.
fig = px.bar(
    df_nation,
    x="Squad",
    y="count",
    color="Nation",
    hover_data=['Squad'],
    barmode='stack',
    title="EPL Player Nationality distribution By Team",
)

fig.show()

In [118]:
# Count players per (Age, Squad) pair.
df_age = (
    df.groupby(['Age', 'Squad'])['Player']
    .count()
    .reset_index(name='count')
)

In [119]:
# Display the Age / Squad / count table.
df_age

Unnamed: 0,Age,Squad,count
0,17,Manchester Utd,1
1,17,Sheffield Utd,1
2,17,Tottenham,1
3,18,Burnley,1
4,18,Manchester City,1
5,18,Newcastle Utd,1
6,18,Wolves,1
7,19,Arsenal,2
8,19,Aston Villa,1
9,19,Brighton,1


In [121]:
# Stacked bar chart with one bar per age, split and coloured by squad.
fig_age = px.bar(
    df_age,
    x="Age",
    y="count",
    color="Squad",
    hover_data=['Squad'],
    barmode='stack',
    title="EPL Player Age distribution By Team",
)

In [122]:
# Render the age-distribution figure as the cell's output.
fig_age