<a href="https://colab.research.google.com/github/Adrianonsare/WebScrapping/blob/main/EPLWebScrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip3 install datapane
# !pip install pyyaml==5.4.1

In [None]:
!pip install nerodia
!pip install selenium
!apt-get update 
!apt install chromium-chromedriver


# 1. Scraping the EPL Website

In [None]:
#importing libraries
from nerodia.browser import Browser
import pandas as pd
import numpy as np
import time
from functools import reduce
from selenium.webdriver.chrome.options import Options
import plotly.express as px
import datapane as dp 
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import altair as alt

In [None]:


# Chrome launch options for the scraping browser
options = Options()

# Run Chrome headless (no visible window) and without its sandbox.
# NOTE(review): the original note says --no-sandbox avoids crashes on Deepnote;
# presumably hosted/containerised runtimes in general need it — confirm for Colab.
options.add_argument('--headless')
options.add_argument('--no-sandbox') # Remove if running outside a hosted notebook environment

# Single shared browser session; create_df() reuses it for every stats page
browser = Browser('chrome', options=options) # Create Browser


# The analysis focuses on attacking players.
# Each metric has its own data table on the site, identified by a tag
# that is appended to the base url string.
tags = [
    'goals',
    'total_pass',
    'touches',
    'total_scoring_att',
    'big_chance_missed',
    'appearances',
    'total_offside',
    'dispossessed',
    'total_cross',
    'total_through_ball',
]
# Query string appended to every stats url
se = '?se=418'

# One url per tag: base url + tag + query string
urls = [f'https://www.premierleague.com/stats/top/players/{tag}{se}' for tag in tags]

# Give each table's url its own name (same order as `tags`)
(goals,
 total_pass,
 touches,
 total_shots,
 big_chance_missed,
 appearances,
 offsides,
 disposessions,
 total_cross,
 total_through_ball) = urls

In [None]:
goals # Inspect the url built for the goals table

In [None]:
#create a function to read in the various tables into dataframes
def create_df(df,url,statcol):
  """Scrape one paginated stats table into a single DataFrame.

  Loads ``url`` in the module-level ``browser``, reads the first HTML table
  on the page, then repeatedly clicks the "next" pagination button and reads
  the table again until that button is marked inactive.

  Parameters
  ----------
  df : any
      Ignored. Kept only so existing three-argument calls keep working.
  url : str
      Address of the stats page to open.
  statcol : str
      New name for the table's "Stat" column.

  Returns
  -------
  pandas.DataFrame
      Every page that was read, stacked in reading order, with "Stat"
      renamed to ``statcol`` and the last column and the "Rank" column
      removed. Row labels are not renumbered across pages.
  """
  from io import StringIO  # local import: read_html is given a file-like object below

  browser.goto(url)
  time.sleep(4) # time delay for data reload

  # Class lists that identify the "next page" button, active and inactive
  next_btn = ['paginationBtn', 'paginationNextContainer']
  next_btn_inactive = next_btn + ['inactive']

  # Collect each page and concatenate once at the end. DataFrame.append was
  # removed in pandas 2.0, and passing a raw HTML string to read_html is
  # deprecated, hence the StringIO wrapper.
  pages = [pd.read_html(StringIO(browser.html))[0]]

  #table is paginated. At the last page, the "next" button becomes inactive
  # the loop below reads the table from each page until the inactive button is found
  # https://deepnote.com/@danielstpaul/EPL-Web-Scraper-GfUde655TFGQbN7gE42hRA
  while not browser.div(class_name=next_btn_inactive).exists:
    browser.div(class_name=next_btn).fire_event('onClick') # fire onClick event on page next element.
    # NOTE(review): the table is read immediately after the click, with no
    # wait — presumably the page may not have re-rendered yet; confirm.
    pages.append(pd.read_html(StringIO(browser.html))[0])

  table = pd.concat(pages)
  table = table.rename(columns={"Stat": statcol})
  table = table.iloc[: , :-1] # Drop last "nan" column
  table = table.drop(columns='Rank') # Drop "rank" column
  return table


In [None]:
#Apply function to create dataframes
# One entry per stats table: (first argument handed to create_df, page url,
# name given to the table's "Stat" column)
_scrape_jobs=[
    ('goals_df',goals,'Goals'),
    ('total_pass',total_pass,'TotalPasses'),
    ('touches',touches,'Touches'),
    ('total_shots',total_shots,'TotalShots'),
    ('big_chance_missed',big_chance_missed,'BigChancesMissed'),
    ('appearances',appearances,'Appearances'),
    ('offsides',offsides,'Offsides'),
    ('disposessions',disposessions,'Disposessions'),
    ('total_cross',total_cross,'TotalCrosses'),
    ('total_through_ball',total_through_ball,'ThroughBalls'),
]

# Tables are scraped one after another, in the order listed above
(goal_df,
 total_pass_df,
 touches_df,
 total_shots_df,
 big_chance_missed_df,
 appearances_df,
 offsides_df,
 disposessions_df,
 total_cross_df,
 total_through_ball_df)=[create_df(label,url,statcol) for label,url,statcol in _scrape_jobs]


In [None]:
#Merging all dataframes into 1 df

#Creating list of dataframes
dfs=[goal_df, total_pass_df, touches_df,total_shots_df,
     big_chance_missed_df,appearances_df,offsides_df,disposessions_df,
     total_cross_df,total_through_ball_df]
# Remove repeated rows from each table first: a row duplicated in the inputs
# would otherwise be multiplied by every outer merge before being removed again
dfs=[stat_df.drop_duplicates() for stat_df in dfs]
#Use reduce function to merge all the dataframes all at once
df = reduce(lambda  left,right: pd.merge(left,right,on=['Player','Nationality','Club'],
                                            how='outer'), dfs).drop_duplicates()

#Create a short display name ("F.Lastname") to keep plot labels compact.
# Two-word names give first initial + "." + last name. Names with more than
# two words keep everything after the first word ("Kevin De Bruyne" ->
# "K.De Bruyne"), and single-word names are used as they are, so no player
# is left without a label.
name_parts=df['Player'].str.split()
short_name=name_parts.str[0].str[0]+str(".")+name_parts.str[1:].str.join(" ")
df['PlayerName']=short_name.where(name_parts.str.len()>1,df['Player'])
# The full "Player" column is kept; later plots use it as hover data
df.head()

In [None]:
df.info() # Column names, dtypes and non-null counts of the merged table

# 2. Data Preprocessing

In [None]:
df.isnull().sum(axis=0) # Count null values in each column

In [None]:
df.describe().T # Summary statistics, transposed so each variable is one row

In [None]:
# NOTE(review): fillna(0) applies to every column, so missing text values are
# also replaced by the number 0 — confirm that is intended.
df=df.fillna(0) #Fill missing values with 0

In [None]:
df.describe().T # Check that every row now has a value (counts should match across columns)

In [None]:
# Chance conversion = goals per shot. Division by zero shots yields inf/NaN,
# which are turned into NaN and then into 0.
df=(
    df
    .assign(ChanceConversion=df['Goals']/df['TotalShots'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# 3. Exploratory Data Analysis

In [None]:
# Histogram showing how goals scored are distributed across players
hist_fig, hist_ax = plt.subplots(figsize=(12,7))
g = sns.histplot(df['Goals'], ax=hist_ax)
hist_ax.set_title("Distribution of Goals Scored")
plt.show()

# dp.Report(
#     dp.Plot(g)
# ).upload(name="Goals Distribution")

In [None]:
df.columns # List the columns available for plotting

In [None]:
# Box plot of goals scored, one box per club; hovering a point shows the player
fig = px.box(
    df,
    x='Club',
    y='Goals',
    color='Club',
    hover_data=["Player"],
    title='Goal Scoring Distribution Per Club',
    template='simple_white',
)
fig.show()

# dp.Report(
#     dp.Plot(fig)
# ).upload(name="Goal Scoring Distribution Per Club")

In [None]:
# Median goals scored per nationality, keeping the 10 nationalities with the highest median
goalsNationality=df.groupby('Nationality')['Goals'].median().sort_values(ascending=False).reset_index()[:10]

# The title says "Median" to match the aggregation above (it previously said
# "Cumulative", which the plotted values are not)
fig=px.bar(goalsNationality,x='Nationality',y='Goals',title='Median Goals Scored Per Nationality (Top 10)',
           text='Goals',template='simple_white')
fig.show()


In [None]:
#Pairwise relationships of all variables
pairplot=sns.pairplot(df)
# pairplot draws a grid of subplots; plt.title would only label whichever
# subplot is current, so put the title on the figure as a whole instead.
# y slightly above 1 keeps it clear of the top row.
pairplot.fig.suptitle("Pairwise Relationships of Variables", y=1.02)


In [None]:
#Plotting correlation matrix for all numeric variables
plt.figure(figsize=(12,7))
# Correlate numeric columns only: the table also holds text columns
# (Player, Club, Nationality, PlayerName), and recent pandas versions raise
# on those instead of silently leaving them out.
corr=df.select_dtypes(include='number').corr()

# Generate a mask for the upper triangle so each pair is shown once
mask = np.triu(np.ones_like(corr, dtype=bool))


corrPlot=sns.heatmap(corr,mask=mask,cmap='jet',annot=True)
plt.title("Correlation Matrix for Player Stats")
plt.show()


# dp.Report(
#     dp.Plot(corrPlot)
# ).upload(name="Correlation Matrix")

In [None]:
# Scatter of goals against total shots for the first 20 rows of the table,
# each point labelled with the player's short name
fig = px.scatter(
    df.head(20),
    x="TotalShots",
    y="Goals",
    text="PlayerName",
    title='Top 20 Goals Vs Total Shots',
    template='plotly_dark',
    height=800,
    width=1100,
)
fig.update_traces(
    textposition="top center",
    orientation='v',
    textfont_size=8.5,
)
fig.show()

# dp.Report(
#     dp.Plot(fig)
# ).upload(name="Goals Vs Shots")

In [None]:
#Plotting Goals vs Big Chances Missed for the first 20 rows of the table,
# each point labelled with the player's short name

fig = px.scatter(df[:20],x="BigChancesMissed", y="Goals",title='Top 20 Goals Vs Big Chances Missed',
                 height=800,width=1100,text="PlayerName",template='plotly_dark')
fig.update_traces(textposition="top center",orientation='v',textfont_size=8.5,
                  )
fig.show()

# dp.Report(
#     dp.Plot(fig)
# ).upload(name="Goals Vs Big Chances Missed")

In [None]:
# Shots needed per goal is the inverse of the conversion rate. A conversion
# rate of 0 gives 1/0 = inf, which is replaced by NaN so it cannot distort the axis.
df['ShotsPerGoal']=(1/df['ChanceConversion']).replace([np.inf, -np.inf], np.nan)
# Plot the column created above (the x axis previously named a column,
# "ShotEfficiency", that is never created)
fig1 = px.scatter(df[:20],x="ShotsPerGoal", y="Goals",title='Top 20 Goals Vs Shots Per Goal',
                 height=800,width=1100,text="PlayerName",template='plotly_dark')
fig1.update_traces(textposition="top center",orientation='v',textfont_size=8.5)
fig1.show()

# dp.Report(
#     dp.Plot(fig1)
# ).upload(name="Goals Vs Shots Per Goal")

In [None]:

# Scatter of goals against offsides for the first 20 rows of the table,
# each point labelled with the player's short name
fig1 = px.scatter(
    df.head(20),
    x="Offsides",
    y="Goals",
    text="PlayerName",
    title='Top 20 Goals Vs Offsides',
    template='plotly_dark',
    height=800,
    width=1100,
)
fig1.update_traces(
    textposition="top center",
    orientation='v',
    textfont_size=8.5,
)
fig1.show()

# dp.Report(
#     dp.Plot(fig1)
# ).upload(name="Goals Vs Offsides")

In [None]:
# Stacked density of goals scored, one layer per club.
# displot creates its own figure, so a preceding plt.figure(figsize=...) has
# no effect on it and only leaves an empty figure behind. The size is set
# through height/aspect instead (height 10, width 10 * 1.4 = 14).
sns.displot(data=df,x='Goals',hue='Club',kind="kde", multiple="stack",height=10,aspect=1.4)

In [None]:
# Histogram of total shots, coloured by club.
# displot creates its own figure, so a preceding plt.figure(figsize=...) has
# no effect on it and only leaves an empty figure behind. The size is set
# through height/aspect instead (height 8, width 8 * 13/8 = 13).
sns.displot(data=df,x='TotalShots',hue='Club',kind="hist",height=8,aspect=13/8)

### Machine Learning - Prediction of Goal-Scoring

In [None]:

# from sklearn import metrics
# from sklearn.metrics import mean_squared_error, r2_score

# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# from sklearn.model_selection import train_test_split

In [None]:
# df.columns

In [None]:
# df=df.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

In [None]:
# str_cols=df.loc[:, df.dtypes == object]
# str_cols

In [None]:
# #Split data into features and target
# X = str_cols.drop('PlayerName',axis=1)
# cats=['Player', 'Club', 'Nationality']
# X[cats]=X[cats].astype('category')
# X[cats] = X[cats].apply(lambda x: x.cat.codes)
# y= df['Goals']

# #Split data into training and test sets
# #solar
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# #Initiate model
# regressor = RandomForestRegressor(n_estimators=20,max_depth=8, random_state=0)

# #Solar
# regressor.fit(X_train, y_train)
# # y_pred = regressor.predict(X_test)


In [None]:
# y_pred = regressor.predict(X_test)


In [None]:
# plt.scatter(y_test,y_pred)

In [None]:
# regressor.score(X_test, y_test)