# ⚽Premier League Match Analysis🥅
📍 Project Title : Premier League Match Analysis

📍 Aim of the Project : Analyze the PL matches for the season of 2019-20

📍 Dataset : https://www.kaggle.com/datasets/idoyo92/epl-stats-20192020

📍 Libraries Required : Pandas, Numpy, Seaborn, Matplotlib, Sklearn

## Import Required Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# import dataframe_image as dfi

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score

## Data Preprocessing

In [2]:
data = pd.read_csv('epl2020.csv')

In [3]:
data

Unnamed: 0.1,Unnamed: 0,h_a,xG,xGA,npxG,npxGA,deep,deep_allowed,scored,missed,...,AF.x,AC.x,AY.x,AR.x,B365H.x,B365D.x,B365A.x,HtrgPerc,AtrgPerc,matchDay
0,1,h,2.234560,0.842407,2.234560,0.842407,11,5,4,1,...,9,2,2,0,1.14,10.00,19.00,0.466667,0.416667,Fri
1,2,a,0.842407,2.234560,0.842407,2.234560,5,11,1,4,...,9,2,2,0,1.14,10.00,19.00,0.466667,0.416667,Fri
2,3,a,3.183770,1.200300,2.422640,1.200300,9,1,5,0,...,13,1,2,0,12.00,6.50,1.22,0.600000,0.642857,Sat
3,4,h,1.200300,3.183770,1.200300,2.422640,1,9,0,5,...,13,1,2,0,12.00,6.50,1.22,0.600000,0.642857,Sat
4,5,h,1.340990,1.598640,1.340990,1.598640,4,6,1,1,...,19,4,1,0,1.95,3.60,3.60,0.230769,0.375000,Sat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
571,572,a,0.445922,2.028830,0.445922,2.028830,3,9,0,4,...,10,1,2,0,1.80,3.90,4.20,0.647059,0.333333,Sun
572,573,a,0.585469,1.738900,0.585469,1.738900,2,3,0,2,...,9,11,4,0,5.00,4.33,1.60,0.500000,0.285714,Sun
573,574,h,1.738900,0.585469,1.738900,0.585469,3,2,2,0,...,9,11,4,0,5.00,4.33,1.60,0.500000,0.285714,Sun
574,575,a,0.672448,2.412390,0.672448,1.651220,2,8,0,4,...,12,0,1,0,1.36,5.25,8.00,0.466667,0.250000,Mon


In [4]:
data.shape

(576, 45)

### Convert to datetime type

In [5]:
data['date']= pd.to_datetime(data['date'])

In [6]:
data['date']

0     2019-08-09 20:00:00
1     2019-08-09 20:00:00
2     2019-08-10 12:30:00
3     2019-08-10 12:30:00
4     2019-08-10 15:00:00
              ...        
571   2020-03-08 14:00:00
572   2020-03-08 16:30:00
573   2020-03-08 16:30:00
574   2020-03-09 20:00:00
575   2020-03-09 20:00:00
Name: date, Length: 576, dtype: datetime64[ns]

In [7]:
data = data.drop(['Unnamed: 0'], axis = 1)
data.shape

(576, 44)

In [8]:
data.describe()

Unnamed: 0,xG,xGA,npxG,npxGA,deep,deep_allowed,scored,missed,xpts,wins,...,AST.x,AF.x,AC.x,AY.x,AR.x,B365H.x,B365D.x,B365A.x,HtrgPerc,AtrgPerc
count,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,...,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0
mean,1.460088,1.460088,1.371809,1.371809,6.623264,6.623264,1.361111,1.361111,1.392007,0.375,...,4.072917,10.902778,4.916667,1.875,0.0625,2.867257,4.319618,4.77441,0.348448,0.361257
std,0.873678,0.873678,0.800827,0.800827,4.294232,4.294232,1.207865,1.207865,0.869274,0.484544,...,2.521193,3.696898,2.793782,1.196371,0.242272,2.275844,1.513534,4.161811,0.155324,0.166637
min,0.054134,0.054134,0.054134,0.054134,0.0,0.0,0.0,0.0,0.0004,0.0,...,0.0,1.0,0.0,0.0,0.0,1.07,3.1,1.14,0.0,0.0
25%,0.840033,0.840033,0.784056,0.784056,4.0,4.0,0.0,0.0,0.636775,0.0,...,2.0,8.0,3.0,1.0,0.0,1.6075,3.4,2.3,0.2475,0.25
50%,1.27536,1.27536,1.23067,1.23067,6.0,6.0,1.0,1.0,1.32135,0.0,...,4.0,11.0,5.0,2.0,0.0,2.2,3.75,3.35,0.333333,0.333333
75%,1.973553,1.973553,1.869485,1.869485,9.0,9.0,2.0,2.0,2.138725,1.0,...,5.0,13.0,7.0,3.0,0.0,3.1,4.75,5.75,0.444444,0.454545
max,6.63049,6.63049,5.86932,5.86932,31.0,31.0,9.0,9.0,2.9992,1.0,...,15.0,24.0,16.0,6.0,1.0,15.0,13.0,26.0,1.0,1.0


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 576 entries, 0 to 575
Data columns (total 44 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   h_a           576 non-null    object        
 1   xG            576 non-null    float64       
 2   xGA           576 non-null    float64       
 3   npxG          576 non-null    float64       
 4   npxGA         576 non-null    float64       
 5   deep          576 non-null    int64         
 6   deep_allowed  576 non-null    int64         
 7   scored        576 non-null    int64         
 8   missed        576 non-null    int64         
 9   xpts          576 non-null    float64       
 10  result        576 non-null    object        
 11  date          576 non-null    datetime64[ns]
 12  wins          576 non-null    int64         
 13  draws         576 non-null    int64         
 14  loses         576 non-null    int64         
 15  pts           576 non-null    int64     

In [10]:
data.isnull().sum()

h_a             0
xG              0
xGA             0
npxG            0
npxGA           0
deep            0
deep_allowed    0
scored          0
missed          0
xpts            0
result          0
date            0
wins            0
draws           0
loses           0
pts             0
npxGD           0
teamId          0
ppda_cal        0
allowed_ppda    0
matchtime       0
tot_points      0
round           0
tot_goal        0
tot_con         0
Referee.x       0
HS.x            0
HST.x           0
HF.x            0
HC.x            0
HY.x            0
HR.x            0
AS.x            0
AST.x           0
AF.x            0
AC.x            0
AY.x            0
AR.x            0
B365H.x         0
B365D.x         0
B365A.x         0
HtrgPerc        0
AtrgPerc        0
matchDay        0
dtype: int64

### Current Points Table

In [11]:
# Getting all the team names
teams = data['teamId'].unique()

# Creating a list of items with required data for all teams
team_result = []
for team in teams:
    team_data = data[data['teamId'] == team]

    wins = team_data['wins'].sum()
    loss = team_data['loses'].sum()
    draws = team_data['draws'].sum()
    goals_scored = team_data['scored'].sum()
    goals_missed = team_data['missed'].sum()
    goal_difference = goals_scored - goals_missed
    total_matches = wins + loss + draws
    total_points = (3 * wins) + draws

    team_result.append([team, total_matches, wins, loss, draws, goals_scored, goals_missed, goal_difference, total_points])

In [12]:
cols = ['Team', 'Matches Played', 'Wins', 'Losses', 'Draws', 'Goals Scored', 'Goals Missed', 'Goal Difference', 'Points']
points_table = pd.DataFrame(team_result, columns = cols)
points_table = points_table.sort_values(by=['Points', 'Matches Played', 'Goal Difference'], ascending = False, ignore_index = True)
points_table.set_index(np.array(range(1, 21)), inplace=True)
points_table.head(20)

Unnamed: 0,Team,Matches Played,Wins,Losses,Draws,Goals Scored,Goals Missed,Goal Difference,Points
1,Liverpool,29,27,1,1,66,21,45,82
2,Man City,28,18,7,3,68,31,37,57
3,Leicester,29,16,8,5,58,28,30,53
4,Chelsea,29,14,9,6,51,39,12,48
5,Man Utd,29,12,8,9,44,30,14,45
6,Wolves,29,10,6,13,41,34,7,43
7,Sheffield United,28,11,7,10,30,25,5,43
8,Tottenham,29,11,10,8,47,40,7,41
9,Arsenal,28,9,6,13,40,36,4,40
10,Burnley,29,11,12,6,34,40,-6,39


In [13]:
# dfi.export(points_table.style.background_gradient(), "mytable.png")

### Encoding

- Home -> 1
- Away -> 0

In [14]:
print(np.array(range(1, 21)))

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


In [15]:
teams_list = list(teams)
string_equivalent = list(np.array(range(1, 21)))

team_equivalent = zip(teams_list, string_equivalent)
team_equivalent = dict(team_equivalent)
team_equivalent

{'Liverpool': 1,
 'Norwich': 2,
 'Man City': 3,
 'West Ham': 4,
 'Bournemouth': 5,
 'Brighton': 6,
 'Burnley': 7,
 'Crystal Palace': 8,
 'Everton': 9,
 'Sheffield United': 10,
 'Southampton': 11,
 'Watford': 12,
 'Aston Villa': 13,
 'Tottenham': 14,
 'Arsenal': 15,
 'Leicester': 16,
 'Newcastle United': 17,
 'Wolves': 18,
 'Chelsea': 19,
 'Man Utd': 20}

In [16]:
data['h_a'] = data['h_a'].apply(lambda x: 1 if x == 'h' else 0)
data['result'] = data['result'].apply(lambda x: 1 if x == 'w' else (0 if x == 'd' else -1))

### Scaling Value

In [17]:
scaler = MinMaxScaler()

data[['xG', 'xGA', 'HS.x', 'HST.x', 'HF.x', 'HC.x', 'HY.x', 'AS.x', 'AST.x', 'AF.x', 'AC.x', 'AY.x']] = scaler.fit_transform(data[['xG', 'xGA', 'HS.x', 'HST.x', 'HF.x', 'HC.x', 'HY.x', 'AS.x', 'AST.x', 'AF.x', 'AC.x', 'AY.x']])

### Expected Points

In [18]:
team_expected_points = []
for team in teams:
    team_data = data[data['teamId'] == team]

    exp_pts = round(team_data['xpts'].sum(), 2)

    team_expected_points.append([team, exp_pts])

team_expected_points = pd.DataFrame(team_expected_points, columns=['Team', 'Expected Points'])
team_expected_points.sort_values(by = ['Expected Points'], ascending = False, inplace = True, ignore_index = True)
team_expected_points.set_index(np.array(range(1, 21)), inplace=True)
# team_expected_points.head(20)

### Teams Performance

In [19]:
team_performance = []

for team in teams:
    current_pos = points_table[points_table['Team'] == team].index[0]
    expected_pos = team_expected_points[team_expected_points['Team'] == team].index[0]

    overperforming = 'Yes' if current_pos < expected_pos else 'No'

    team_performance.append([team, current_pos, expected_pos, overperforming])

team_performance = pd.DataFrame(team_performance, columns = ['Team', 'Actual Position', 'Expected Position', 'Overperforming'])
team_performance.sort_values(by = ['Actual Position'], ascending = True, inplace = True, ignore_index = True)
team_performance.set_index(np.array(range(1, 21)), inplace=True)
team_performance

Unnamed: 0,Team,Actual Position,Expected Position,Overperforming
1,Liverpool,1,2,Yes
2,Man City,2,1,No
3,Leicester,3,6,Yes
4,Chelsea,4,3,No
5,Man Utd,5,4,No
6,Wolves,6,5,No
7,Sheffield United,7,10,Yes
8,Tottenham,8,12,Yes
9,Arsenal,9,14,Yes
10,Burnley,10,9,No


## Model Building

### Split Train and Test Data

In [22]:
X = data[['h_a', 'scored', 'missed', 'xpts', 'wins', 'draws', 'loses', 'pts', 'tot_points', 'tot_goal', 'tot_con', 'teamId', 'xG', 'xGA', 'HS.x', 'HST.x', 'HF.x', 'HC.x', 'HY.x', 'AS.x', 'AST.x', 'AF.x', 'AC.x', 'AY.x']]
y = data[['result']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_train[:,0:2] *= 2.5
X_train[:,2:8] *= 1.5

### Multi Output Regressor

In [24]:
mor = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=3, weights='distance')).fit(X_train, y_train)

pred_mor = mor.predict(X_test)



In [25]:
r2_score(y_test, pred_mor)

0.13599170702802976

### K Neighbours Regressor

In [26]:
knr = KNeighborsRegressor(n_neighbors=7, weights='uniform', algorithm='kd_tree', leaf_size=30).fit(X_train, y_train)

pred_knr = knr.predict(X_test)



In [27]:
r2_score(y_test, pred_knr)

0.16308721644927548

### Decision Tree Regressor

In [28]:
dtr = DecisionTreeRegressor(criterion='squared_error', max_features='auto').fit(X_train, y_train)

pred_dtr = dtr.predict(X_test)



In [29]:
r2_score(y_test, pred_dtr)

0.6670992571377428