<a href="https://colab.research.google.com/github/Adrianonsare/WebScrapping/blob/main/EPLWebScrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install nerodia
# !pip install selenium

In [None]:
# !pip install selenium
# !apt-get update 
# !apt install chromium-chromedriver


In [None]:

# from selenium import webdriver
# from selenium.webdriver.chrome.options import Options
# chrome_options = webdriver.ChromeOptions()

# # Setting chrome options as headless means the browser GUI will not be used

# chrome_options.add_argument('--headless')

# # '--no-sandbox' and '--disable-dev-shm-usage' prevent headless Chrome from crashing
# chrome_options.add_argument('--no-sandbox')
# chrome_options.add_argument('--disable-dev-shm-usage')
# driver = webdriver.Chrome('chromedriver',chrome_options=chrome_options)

In [None]:
from nerodia.browser import Browser
import pandas as pd
import numpy as np
import time
from functools import reduce
from selenium.webdriver.chrome.options import Options

# Configure Chrome through Selenium's Options object:
#   --headless   : run without opening the physical Chrome GUI
#   --no-sandbox : avoids crashes on Deepnote (remove if running outside Deepnote)
options = Options()
for chrome_flag in ('--headless', '--no-sandbox'):
    options.add_argument(chrome_flag)

# Create the Browser instance that the scraping cells drive
browser = Browser('chrome', options=options)


# The analysis focuses on attacking players.
# Each metric has its own data table on the site; the tag of each table is
# spliced into the URL string below.
tags = [
    'goals', 'total_pass', 'touches', 'total_scoring_att',
    'big_chance_missed', 'appearances', 'total_offside', 'dispossessed',
    'total_cross', 'total_through_ball',
]
se = '?se=418'  # query string appended to every URL (presumably a season id - TODO confirm)

# One URL per tag: base address + tag + query string
urls = [f'https://www.premierleague.com/stats/top/players/{tag}{se}' for tag in tags]

# Unpack the URLs into one named variable per table (same order as `tags`)
(goals, total_pass, touches, total_shots, big_chance_missed,
 appearances, offsides, disposessions, total_cross, total_through_ball) = urls

In [None]:
# Display the URL built for the goals table
goals

In [None]:
# Read one paginated stats table from the site into a dataframe
def create_df(df, url, statcol, page_wait=1.0):
  """Scrape every page of the stats table at ``url`` into one dataframe.

  Relies on the module-level ``browser``, ``pd`` and ``time`` names.

  Parameters
  ----------
  df : any
      Ignored; the value is never read. Kept so existing calls, which pass a
      label in this position, continue to work.
  url : str
      Address of the stats page to open.
  statcol : str
      New name for the table's "Stat" column.
  page_wait : float, optional
      Seconds to pause after each click on the "next" button before the table
      is read again, so the previous page is not read a second time while the
      table is still refreshing. Use 0 to read immediately.

  Returns
  -------
  pandas.DataFrame
      All pages stacked with a fresh 0..n-1 index, "Stat" renamed to
      ``statcol``, the last column removed and the "Rank" column dropped.
  """
  from io import StringIO

  def _read_current_table():
    # Wrap the HTML in a file-like object: passing a literal HTML string to
    # read_html is deprecated in recent pandas releases.
    return pd.read_html(StringIO(browser.html))[0]

  browser.goto(url)
  time.sleep(4) # time delay for data reload

  # Collect each page in a list and concatenate once at the end.
  # (DataFrame.append was removed in pandas 2.0.)
  pages = [_read_current_table()]

  # The table is paginated. On the last page the "next" button becomes inactive,
  # so keep clicking "next" and reading until the inactive button is found.
  # https://deepnote.com/@danielstpaul/EPL-Web-Scraper-GfUde655TFGQbN7gE42hRA
  inactive_next = ['paginationBtn', 'paginationNextContainer', 'inactive']
  active_next = ['paginationBtn', 'paginationNextContainer']
  while not browser.div(class_name=inactive_next).exists:
    browser.div(class_name=active_next).fire_event('onClick') # go to the next page
    if page_wait:
      time.sleep(page_wait) # let the table refresh before reading it
    pages.append(_read_current_table())

  #browser.close() # Close Browser
  table = pd.concat(pages, ignore_index=True)
  table = table.rename(columns={"Stat": statcol})
  table = table.iloc[:, :-1] # Drop last "nan" column
  table = table.drop(columns='Rank') # Drop "rank" column
  return table


In [None]:
# Scrape each stats table into its own dataframe.
# `df` receives a text label, `url` the page address and `statcol` the name
# given to that table's "Stat" column.
goal_df              = create_df(df='goals_df',           url=goals,              statcol='Goals')
total_pass_df        = create_df(df='total_pass',         url=total_pass,         statcol='TotalPasses')
touches_df           = create_df(df='touches',            url=touches,            statcol='Touches')
total_shots_df       = create_df(df='total_shots',        url=total_shots,        statcol='TotalShots')
big_chance_missed_df = create_df(df='big_chance_missed',  url=big_chance_missed,  statcol='BigChancesMissed')
appearances_df       = create_df(df='appearances',        url=appearances,        statcol='Appearances')
offsides_df          = create_df(df='offsides',           url=offsides,           statcol='Offsides')
disposessions_df     = create_df(df='disposessions',      url=disposessions,      statcol='Disposessions')
total_cross_df       = create_df(df='total_cross',        url=total_cross,        statcol='TotalCrosses')
total_through_ball_df = create_df(df='total_through_ball', url=total_through_ball, statcol='ThroughBalls')


In [None]:
# Merge all per-metric dataframes into one table

# List of dataframes to combine
dfs=[goal_df, total_pass_df, touches_df,total_shots_df,
     big_chance_missed_df,appearances_df,offsides_df,disposessions_df,
     total_cross_df,total_through_ball_df]
# Use reduce to outer-merge the dataframes pairwise on the player identity
# columns, then drop exact duplicate rows
df = reduce(lambda  left,right: pd.merge(left,right,on=['Player','Nationality','Club'],
                                            how='outer'), dfs).drop_duplicates()

# Create a short PlayerName label to keep the plots readable:
# first initial + "." + the rest of the name (e.g. "Harry Kane" -> "H.Kane").
# Splitting once on whitespace also covers names with more than two words;
# single-word names have no second part and are kept unchanged, so no player
# is left without a label.
name_parts = df['Player'].str.split(n=1)
first_initial = name_parts.str[0].str[0]
rest_of_name = name_parts.str[1]  # NaN when the name is a single word
df['PlayerName'] = (first_initial + "." + rest_of_name).fillna(df['Player'])
df.head()

In [None]:
# Print column names, non-null counts and dtypes of the merged table
df.info()

## Data Preparation and Feature Engineering

In [None]:
# List the column names of the merged table
df.columns

In [None]:
# Fill missing values with 0. This applies to every column of df, not only the numeric stats.
df=df.fillna(0)

In [None]:
# New column for chance conversion: goals per shot.
# The ratio is undefined when TotalShots is 0, so the denominator is masked to
# NaN for those rows; this yields NaN instead of +/-inf when Goals > 0.
df['ChanceConversion'] = df['Goals'] / df['TotalShots'].where(df['TotalShots'] != 0)

In [None]:
# Summary statistics, transposed
df.describe().T

## Exploratory Data Analysis

In [None]:
#Import plotting libraries
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

In [None]:
# Histogram of the Goals column
df['Goals'].hist()

In [None]:
# Pairplot of the variables in df, coloured by Club
sns.pairplot(df,hue='Club')

In [None]:
# Plot the correlation matrix of the numeric variables.
# Numeric columns are selected explicitly: df also holds text columns, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
plt.figure(figsize=(12,7))
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr,cmap='jet',annot=True)
plt.title("Correlation Matrix for Player Data")
plt.show()

In [None]:
# Plot big chances missed against total shots, labelling each point with the player name
# (the title now matches the plotted axes)
fig = px.scatter(df,x="TotalShots", y="BigChancesMissed",title='Big Chances Missed Vs Total Shots',
                 height=800,width=1100,text="PlayerName")
fig.update_traces(textposition="bottom right")
fig.show()

In [None]:

# Plot goals against chance conversion, labelling each point with the player name
# (the title now matches the plotted axes)
fig = px.scatter(df,x="ChanceConversion", y="Goals",title='Goals Vs Chance Conversion',
                 height=800,width=1100,text="PlayerName")
fig.update_traces(textposition="bottom right")
fig.show()

In [None]:

# Goals against total shots; each point carries the player's name as a text
# label placed at the bottom right of the marker.
fig = px.scatter(
    df,
    x="TotalShots",
    y="Goals",
    text="PlayerName",
    title='Goals Vs Total Shots',
    width=950,
    height=800,
).update_traces(textposition="bottom right")
fig.show()

In [None]:
# Stacked KDE of goals per club.
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only produces an extra empty figure.
# The 13x8 size is requested through height/aspect instead.
sns.displot(data=df, x='Goals', hue='Club', kind="kde", multiple="stack",
            height=8, aspect=13/8)

In [None]:
# Histogram of total shots per club.
# displot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only produces an extra empty figure.
# The 13x8 size is requested through height/aspect instead.
sns.displot(data=df, x='TotalShots', hue='Club', kind="hist",
            height=8, aspect=13/8)  # multiple="stack" left disabled, as before

### Machine Learning

In [None]:

from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# List the columns available before building the model inputs
df.columns

In [None]:
# Replace +/-inf with NaN, then drop every column (axis=1) that contains a NaN.
# NOTE(review): this removes whole columns rather than the affected rows - confirm that is intended.
df=df.replace([np.inf, -np.inf], np.nan).dropna(axis=1)

In [None]:
# Keep only the columns whose dtype is object, and display them
is_object_col = df.dtypes.eq(object)
str_cols = df.loc[:, is_object_col]
str_cols

In [None]:
# Features: the object-dtype columns without the PlayerName label, with each
# remaining column converted to its integer category codes.
cats = ['Player', 'Club', 'Nationality']
X = str_cols.drop(columns='PlayerName')
X[cats] = X[cats].astype('category').apply(lambda col: col.cat.codes)

# Target: the Goals column
y = df['Goals']

# Hold out 20% of the rows as a test set (fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2)

# Random forest regressor: 20 trees, maximum depth 8, fixed random_state
regressor = RandomForestRegressor(random_state=0, max_depth=8, n_estimators=20)

# Fit on the training split
regressor.fit(X_train, y_train)


In [None]:
# Predict the target for the held-out test features
y_pred = regressor.predict(X_test)


In [None]:
# Predicted against actual values on the test split, with labelled axes so the
# figure can be read on its own
plt.scatter(y_test,y_pred)
plt.xlabel("Actual goals (y_test)")
plt.ylabel("Predicted goals (y_pred)")
plt.title("Predicted vs Actual Goals")
plt.show()

In [None]:
# Score the fitted regressor on the test split (for scikit-learn regressors, score() is R^2)
regressor.score(X_test, y_test)