In [1]:

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
from datetime import datetime as dt



# EDA of the player data

I'll look at how player stats progress during the season and compare teams based on the individual FPL player indices.
I am fairly new to data analysis with Python, so comments are appreciated.

In [2]:
# Read the 2019/20 EPL player stats CSV into the notebook's main frame.
data = pd.read_csv(filepath_or_buffer='../input/epl-stats-20192020/players_1920_fin.csv')

In [3]:
### Adding a player performance ratio: points per unit of value ("ppm")
# NOTE: this cell rescales `value` in place, so re-running it without
# reloading `data` divides by 10 a second time.
# presumably `value` is stored in tenths of a price unit - TODO confirm
data['value'] = data['value'] / 10
# Vectorised column division instead of a row-by-row apply; the explicit float
# cast keeps the dtype the later cells expect.
data['ppm'] = (data['total_points'] / data['value']).astype(float)

#### fixing liverpool double game week
# Make `round` a float column first so that writing the fractional round 24.5
# does not depend on pandas silently upcasting an integer column.
data['round'] = data['round'].astype(float)
data.loc[(data['team'] == 'Liverpool') & (data['fixture'] == 237), 'round'] = 24.5

# Parse kickoff times once so the .dt accessors used later work on real datetimes.
data['kickoff_time'] = pd.to_datetime(data['kickoff_time'])

In [4]:
### quick look at our data
# info() prints a summary of the frame; the bare last expression then
# displays the column labels as the cell output.
data.info()
data.columns

In [5]:
# Descriptive statistics of the frame, shown as the cell output.
data.describe()


In [6]:
#### filtering columns for correlation matrix
# Columns are picked by position: 1-5, 8-14 and 24-25 (end-exclusive ranges below).
corr_positions = list(range(1, 6)) + list(range(8, 15)) + list(range(24, 26))
data_corr = data.iloc[:, corr_positions]
data_corr.columns

# Correlation Matrix

In [7]:
### Correlation Matrix
# Restrict to numeric/boolean columns explicitly: recent pandas versions no
# longer drop non-numeric columns silently inside corr() and raise instead.
corr = data_corr.select_dtypes(include=['number', 'bool']).corr()
# Colour the table so strong positive/negative correlations stand out.
corr.style.background_gradient(cmap='coolwarm')


# Scoring, assisting and FPL point leaders over time

In [13]:
#### Goals scored per player
# Running goal tally per player, accumulated in round order; the result is
# aligned back onto `data` through the row index.
goals_in_round_order = data.sort_values(by=['round']).groupby(['full'])['goals_scored']
data['Total scored'] = goals_in_round_order.cumsum()
### retrieving the names of the current top 5 scorers and filtering the data to include their obs
best_tally = data.groupby(['full'])['Total scored'].max()
top_scorers = best_tally.sort_values(ascending=False).head(n=5).index.tolist()
top = data.full.isin(top_scorers)
scorers = data[top]

In [14]:
#### plotting the goal scoring progress for each of the top scorers.
sns.set(style='whitegrid', rc={"grid.linewidth": 0.1})
sns.set_context("paper", font_scale=0.9)
fig, ax = plt.subplots(figsize=(10, 10))
# x and y are passed by keyword: newer seaborn releases reject them as
# positional arguments.
sns.lineplot(x='round', y='Total scored', data=scorers, hue='full', size='full', ax=ax)
ax.set_xlim(1, 38)
ax.set_title("Top 5 - Goals")

In [15]:
#### Points scored per player in FPL
# Running points total per player in round order (used for the progress plot).
data['Total points'] = data.sort_values(by=['round']).groupby(['full'])['total_points'].cumsum()
### retrieving the names of the current top 5 leaders and filtering the data to include their obs
# Rank by the season total rather than the peak of the running total: the two
# differ whenever a player has a negative match score after reaching their peak.
season_points = data.groupby(['full'])['total_points'].sum()
top_pts = season_points.sort_values(ascending=False).head(n=5).index.tolist()
top = data.full.isin(top_pts)
top_points = data[top]

In [16]:
#### Plotting the point scoring progress for each of the leaders.
sns.set(style='whitegrid', rc={"grid.linewidth": 0.1})
sns.set_context("paper", font_scale=0.9)
fig, ax = plt.subplots(figsize=(10, 10))
# x and y are passed by keyword: newer seaborn releases reject them as
# positional arguments.
sns.lineplot(x='round', y='Total points', data=top_points, hue='full', size='full', ax=ax)
ax.set_xlim(1, 38)
ax.set_title("Top 5 - points")



In [17]:
#### Assists per player in FPL
# Running assist tally per player, accumulated in round order; the result is
# aligned back onto `data` through the row index.
assists_in_round_order = data.sort_values(by=['round']).groupby(['full'])['assists']
data['Total assist'] = assists_in_round_order.cumsum()
### retrieving the names of the current top 5 leaders and filtering the data to include their obs
best_assist_tally = data.groupby(['full'])['Total assist'].max()
top_assist = best_assist_tally.sort_values(ascending=False).head(n=5).index.tolist()
top = data.full.isin(top_assist)
top_assists = data[top]

In [18]:
#### Plotting the assist progress for each of the leaders.
sns.set(style='whitegrid', rc={"grid.linewidth": 0.1})
sns.set_context("paper", font_scale=0.9)
fig, ax = plt.subplots(figsize=(10, 10))
# x and y are passed by keyword: newer seaborn releases reject them as
# positional arguments.
sns.lineplot(x='round', y='Total assist', data=top_assists, hue='full', size='full', ax=ax)
ax.set_xlim(1, 38)
ax.set_title("Top 5 - Assist")



In [24]:
## Creating a filtered dataframe of players who played a total greater than the median

# Sort once and reuse: every running total below accumulates in round order and
# is aligned back onto `data` through the row index.
by_round = data.sort_values(by=['round'])

## Total minutes played for each player
data['Total minutes'] = by_round.groupby(['full'])['minutes'].cumsum()
tot_min = pd.DataFrame(data.groupby(['full'])['Total minutes'].max())

### totals as cumsums
data['total_cleans'] = by_round.groupby(['full'])['clean_sheets'].cumsum()
data['total_saves'] = by_round.groupby(['full'])['saves'].cumsum()
data['total_conceded'] = by_round.groupby(['full'])['goals_conceded'].cumsum()


### filtering players that played above the median
above_median = tot_min['Total minutes'] > tot_min['Total minutes'].median()
# reset_index() returns a new frame, so nothing is mutated on a filtered slice.
played = tot_min[above_median].reset_index()
reg_names = played['full'].tolist()

### filtering data to include obs by regular playing players
reg = data.full.isin(reg_names)
# Explicit copy: the columns added below must not be written through a slice
# of `data` (that triggers SettingWithCopyWarning and is not guaranteed to stick).
regulars = data[reg].copy()
### getting match times
regulars['kickoff_time'] = pd.to_datetime(regulars['kickoff_time'])
regulars['day'] = regulars.kickoff_time.dt.weekday
regulars['hour'] = regulars.kickoff_time.dt.hour


# Regulars indices summary by team

Regulars are defined as players that played above the median total minutes this season. 

In [25]:
#### producing a table to summarise FPL indices per team (for regular playing players in terms of minutes across all teams)

# Same three statistics for every index column.
summary_columns = ['ict_index', 'influence', 'creativity', 'threat', 'selected']
team_summary_spec = {column: ['mean', 'median', 'max'] for column in summary_columns}
regulars.groupby(['team']).agg(team_summary_spec)



In [26]:
#### indices distributions
# Set the theme before the axes exist so that it is applied to them.
sns.set(style='whitegrid', rc={"grid.linewidth": 0.1})
sns.set_context("paper", font_scale=1.2)
# Size the subplot figure directly; a separate plt.figure(...) call would only
# open an extra, empty figure and leave these panels at the default size.
fig, ax = plt.subplots(1, 3, figsize=(15, 5))
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is kept
# here so the rendered chart type does not change.
sns.distplot(regulars['ict_index'], ax=ax[0]).set_title("Ict dist")
sns.distplot(regulars['threat'], ax=ax[1]).set_title("Threat dist")
sns.distplot(regulars['influence'], ax=ax[2]).set_title("Influence dist")
# Lay out after drawing so titles and tick labels are taken into account.
fig.tight_layout()


In [27]:
# Distributions of the `selected` and `creativity` columns for regular players.
# Set the theme before the axes exist so that it is applied to them.
sns.set(style='whitegrid', rc={"grid.linewidth": 0.1})
sns.set_context("paper", font_scale=1.2)
# Size the subplot figure directly; a separate plt.figure(...) call would only
# open an extra, empty figure and leave these panels at the default size.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is kept
# here so the rendered chart type does not change.
sns.distplot(regulars['selected'], ax=ax[0]).set_title("Selected dist")
sns.distplot(regulars['creativity'], ax=ax[1]).set_title("Creativity dist")
# Lay out after drawing so titles and tick labels are taken into account.
fig.tight_layout()


# Man City and Liverpool
An interesting point that arises is that Man City seem to perform better in terms of the individual stats measured by FPL. These results are for players who play a substantial amount of the time. So, did Liverpool get ahead thanks to unseen factors? Luck?
Another option is that Liverpool's less regular players provided more than Man City's.

In [31]:
# Rows of players who are NOT in the regulars list. `~` is the boolean
# negation operator; unary minus on a boolean mask is not a supported way to
# invert it.
nonreg = data[~reg]
title_rivals = ['Liverpool', 'Man City']
# Non-regular players of the two clubs only.
nonreglc = nonreg[nonreg['team'].isin(title_rivals)]
# Every row (regular or not) of the two clubs.
livcity = data[data['team'].isin(title_rivals)]

In [32]:
# Side-by-side box plots for the two clubs.
# Set the theme before the axes exist so that it is applied to them.
sns.set(style='whitegrid', rc={"grid.linewidth": 0.1})
sns.set_context("paper", font_scale=1.2)
# Size the subplot figure directly; a separate plt.figure(...) call would only
# open an extra, empty figure and leave these panels at the default size.
fig, ax = plt.subplots(1, 2, figsize=(20, 15))

# NOTE(review): the left panel uses non-regular players only (nonreglc) while
# the right panel uses every Liverpool/Man City row (livcity) - confirm this
# mix is intended.
# x and y are passed by keyword: newer seaborn releases reject them as
# positional arguments.
sns.boxplot(x='team', y='ict_index', data=nonreglc, ax=ax[0]).set_title("Ict boxes")
sns.boxplot(x='team', y='threat', data=livcity, ax=ax[1]).set_title("Threat boxes")
# Lay out after drawing so titles and tick labels are taken into account.
fig.tight_layout(w_pad=5)


In [33]:
# One box plot per index, each on its own figure.
sns.set(style='whitegrid', rc={"grid.linewidth": 0.1})
sns.set_context("paper", font_scale=1.2)

# NOTE(review): creativity is drawn from non-regular players only (nonreglc),
# the other two from every Liverpool/Man City row (livcity) - confirm this mix
# is intended.
box_specs = [
    ('creativity', nonreglc, "creativity boxes"),
    ('selected', livcity, "Selected boxes"),
    ('influence', livcity, "Influence boxes"),
]
for column, frame, title in box_specs:
    fig, ax = plt.subplots(figsize=(10, 7.5))
    # x and y are passed by keyword: newer seaborn releases reject them as
    # positional arguments.
    sns.boxplot(x='team', y=column, data=frame, ax=ax)
    ax.set_title(title)


It seems Liverpool's less regular players do manage to narrow the gap, yet Man City's individual indices are still superior in most cases.

# Best player in FPL

We can measure efficiency by the points-to-value ratio (`ppm` = total points / value).
Let's see the players with the best mean ratio throughout the season.

In [34]:
# Mean points-per-value for every player, keeping the 20 highest as a
# two-column frame (`full`, `ppm`).
best_value = (
    data.groupby(['full'])['ppm']
    .mean()
    .sort_values(ascending=False)
    .head(n=20)
    .reset_index()
)

val_names = best_value['full'].tolist()

### filtering data to include obs by the best-value players
val = data.full.isin(val_names)
best = data[val]

best_value

In [35]:
#### plotting the point scoring progress for each of the best-value players.
sns.set(style='whitegrid', rc={"grid.linewidth": 0.1})
sns.set_context("paper", font_scale=0.9)
fig, ax = plt.subplots(figsize=(10, 10))
# x and y are passed by keyword: newer seaborn releases reject them as
# positional arguments.
sns.lineplot(x='round', y='Total points', data=best, hue='full', size='full', ax=ax)
ax.set_xlim(1, 38)
ax.set_title("Top 20 - points")


In [36]:
 best.groupby(['full']).agg({'ict_index':['mean','median','max'], 
                         'influence':['mean','median','max'], 
                         'creativity':['mean','median','max'], 
                         'threat': ['mean','median','max'],
                         'selected': ['mean','median','max'],
                         'ppm': ['mean','median','max']})



# Predicting performance

I'll create form variables from the FPL indices (and a few other stats) in order to predict the next performance of EPL players.

In [38]:
#### creating form variables
### We get the means of indices over the previous 2 games and previous 4 games
forms = ['total_points','assists','goals_scored','ict_index','threat','creativity','influence','saves','value','minutes','goals_conceded','clean_sheets','ppm']

# Windows must run over consecutive matches, so order rows chronologically
# (stable sort keeps the existing order for identical kickoff times).
chronological = regulars.sort_values(by=['kickoff_time'], kind='mergesort')
# Shift by one match within each player: a row's form must be built from games
# played BEFORE it. Without the shift the window contains the current match,
# so e.g. the rolling mean of total_points would include the very value the
# model is asked to predict.
previous_games = chronological.groupby(['full'])[forms].shift(1)
previous_games['full'] = chronological['full']

# reset_index() exposes the player name as `full` and the original row label
# as `level_1`, which the merges below rely on.
form2_means = previous_games.groupby(['full'])[forms].rolling(2).mean().reset_index()
form4_means = previous_games.groupby(['full'])[forms].rolling(4).mean().reset_index()

### adding forms to regulars DF, dropping NA's will result in losing the first 4 obs for each player. Hence this model is relevant in predicting from the 5th.
result = pd.merge(form2_means, form4_means, how='left', on=['full', 'level_1'],suffixes=('_2','_4'))
# `full` exists on both sides, so the merged frame carries full_x / full_y.
final = pd.merge(regulars,result,how='inner',left_index=True,right_on='level_1')
final_na = final.dropna()

In [39]:
#### creating Y and X datasets
# Target: the points scored in the match, kept as a one-column frame.
y = final_na[['total_points']]

# Features: drop the target, then keep everything from column position 14 on.
X = final_na.drop(columns=['total_points']).iloc[:, 14:]
# Remove the contiguous run of columns from "own_goals" through "was_home" (inclusive).
span_to_drop = X.columns.to_series()["own_goals":"was_home"]
X = X.drop(columns=span_to_drop)
X = X.drop(columns=['yellow_cards', 'level_1', 'full_y', 'ppm'])

# Mark the three label-like columns as categoricals.
categorical_columns = ['opponent_team', 'full_x', 'team']
X = X.astype({column: 'category' for column in categorical_columns})


In [40]:
#### These are the features used to predict total points for each player
# (the bare expression displays the column labels as the cell output)
X.columns

In [41]:
#### encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
# Encode the first three columns. Each column is replaced by label rather than
# written through .iloc: an in-place positional write of integer codes into a
# categorical column is rejected by recent pandas versions.
for position in range(3):
    column_label = X.columns[position]
    X[column_label] = labelencoder_X.fit_transform(X.iloc[:, position])

## splitting to train-test
# Split BEFORE scaling so the scalers only ever see training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
# Fit on the training part only, then apply the same transform to the test part;
# fitting on all rows would leak test-set statistics into the model inputs.
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
y_train = sc_y.fit_transform(y_train)
y_test = sc_y.transform(y_test)

# X and y are still rebound to scaled arrays, as before, for any later code
# that expects them in that form.
X = sc_X.transform(X)
y = sc_y.transform(y)


In [42]:
#### Fitting SVR, the parameters are a result of tuning not shown here due to CPU
from sklearn.svm import SVR
# Hyper-parameters gathered in one place; passed through unchanged.
svr_settings = dict(kernel='linear', verbose=True, degree=3, C=10, gamma=10, epsilon=0.05)
regressor = SVR(**svr_settings)
# The estimator is given a flat 1-D target, hence the ravel().
regressor.fit(X_train, y_train.ravel())


In [43]:
from sklearn import metrics as met
# Predictions for the held-out rows; the scores below compare them with
# y_test, which has been through the target scaler, so they are in scaled units.
pred = regressor.predict(X_test)

mae = met.mean_absolute_error(y_test, pred)
mse = met.mean_squared_error(y_test, pred)
r2 = met.r2_score(y_test, pred)

# One line per metric, in the order MAE, MSE, R2.
report_lines = (("MAE: {}", mae), ("MSE: {}", mse), ("R2: {}", r2))
for template, score in report_lines:
    print(template.format(score))


In [44]:
#### Finally we can inverse the scale to get the actual predicted points
# The scaler was fitted on a single-column 2-D target, and current scikit-learn
# rejects 1-D input to inverse_transform: reshape the predictions to one
# column, undo the scaling, then flatten back to 1-D.
sc_y.inverse_transform(pred.reshape(-1, 1)).ravel()
 

Players' performances are well estimated based on the data we have; we can add more features to increase the accuracy.