In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


%matplotlib inline

In [2]:
# I did two models, one for the goalkeepers, and defenders and the other is for the rest of the players
# but i had an issue with the other model 

In [3]:
df_rest_players = pd.read_csv('rest_players.csv')

#### 1. Feature engineering steps:

1. Feature scaling: a method used to standardize the range of independent variables or features of data.
2. Aggregation: a process in data processing where summary statistics are calculated from data. 
3. One hot coding: a process used to convert categorical data variables so they can be provided to machine learning algorithms to improve predictions. 

In [4]:
df_rest_players.head()

Unnamed: 0.1,Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,red cards,goals conceded,minutes played,days_injured,award,current_value
0,16,Manchester United,midfield-CentralMidfield,182.0,31.0,55,0.067214,0.313667,0.089619,0.0,0.0,4017,0,10,25000000
1,17,Manchester United,midfield-CentralMidfield,177.0,29.0,74,0.144046,0.086428,0.230474,0.0,0.0,3124,280,8,20000000
2,18,Manchester United,midfield-CentralMidfield,169.0,30.0,92,0.184843,0.221811,0.332717,0.0,0.0,4869,45,11,20000000
3,19,Manchester United,midfield-CentralMidfield,184.0,26.0,31,0.155575,0.0,0.155575,0.0,0.0,1157,378,6,17000000
4,20,Manchester United,midfield-CentralMidfield,181.0,20.0,39,0.129125,0.064562,0.225968,0.0,0.0,2788,0,1,800000


In [5]:
df_rest_players.drop('Unnamed: 0', axis=1, inplace=True)

In [8]:
# first do one hot coding:

In [7]:
df_rest_players.columns

Index(['team', 'position', 'height', 'age', 'appearance', 'goals', 'assists',
       'yellow cards', 'red cards', 'goals conceded', 'minutes played',
       'days_injured', 'award', 'current_value'],
      dtype='object')

In [8]:
# dataset 2: df_rest_players
rest_players_categorical = ['team', 'position']
rest_players_numerical = ['height', 'age', 'appearance', 'goals', 'assists', 'yellow cards',
                              'red cards','goals conceded', 'minutes played', 'days_injured', 'games_injured',
                             'award', 'current_value']


In [9]:
rest_players_categorical

['team', 'position']

In [10]:
rest_players_numerical

['height',
 'age',
 'appearance',
 'goals',
 'assists',
 'yellow cards',
 'red cards',
 'goals conceded',
 'minutes played',
 'days_injured',
 'games_injured',
 'award',
 'current_value']

In [11]:
# get_dummies function: a powerful tool for converting categorical variable(s) into dummy/indicator variables, which is essentially implementing one-hot encoding
df_rest_players = pd.get_dummies(df_rest_players, columns= rest_players_categorical)

In [12]:
# it cretated dummy varieables for categorical feature values 
df_rest_players.shape

(4136, 398)

In [13]:
# now we are going to correlate our data 
correlation = df_rest_players.corr()
print(correlation['current_value'].sort_values(ascending=False))

current_value                          1.000000
appearance                             0.358491
minutes played                         0.345816
award                                  0.273992
team_Bayern Munich                     0.232843
                                         ...   
position_midfield-RightMidfield       -0.036713
position_midfield-DefensiveMidfield   -0.041710
yellow cards                          -0.066831
age                                   -0.090431
goals conceded                              NaN
Name: current_value, Length: 398, dtype: float64


In [16]:
# threeshold: a specific cutoff value used to determine the strength 
# of the relationship between two variables, 
# as measured by a correlation coefficient. 

threshold = 0.15

# We use `abs()` for absolute value to consider both strong positive and negative correlations
selected_features = correlation[abs(correlation['current_value']) > threshold]['current_value'].index
selected_features

Index(['appearance', 'assists', 'minutes played', 'days_injured', 'award',
       'current_value', 'team_Arsenal FC', 'team_Bayern Munich',
       'team_Chelsea FC', 'team_FC Barcelona', 'team_Liverpool FC',
       'team_Manchester City', 'team_SSC Napoli'],
      dtype='object')

In [17]:
selected_features = ['appearance', 'assists', 'minutes played', 'days_injured', 'award',
       'current_value', 'team_Arsenal FC', 'team_Bayern Munich',
       'team_Chelsea FC', 'team_FC Barcelona', 'team_Liverpool FC',
       'team_Manchester City', 'team_SSC Napoli'] 

In [18]:

df_rest_players = df_rest_players[selected_features]
df_rest_players.head()

Unnamed: 0,appearance,assists,minutes played,days_injured,award,current_value,team_Arsenal FC,team_Bayern Munich,team_Chelsea FC,team_FC Barcelona,team_Liverpool FC,team_Manchester City,team_SSC Napoli
0,55,0.313667,4017,0,10,25000000,False,False,False,False,False,False,False
1,74,0.086428,3124,280,8,20000000,False,False,False,False,False,False,False
2,92,0.221811,4869,45,11,20000000,False,False,False,False,False,False,False
3,31,0.0,1157,378,6,17000000,False,False,False,False,False,False,False
4,39,0.064562,2788,0,1,800000,False,False,False,False,False,False,False


In [19]:
# preparing the data
x = df_rest_players.drop(['current_value'], axis=1)
y = df_rest_players['current_value']

In [20]:
# splitting the data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                   test_size=0.25,
                                                   shuffle=True,
                                                   random_state=42)
# RS : will ensure that the same rows are selected for training and testing each time the code is executed.

# scaling data
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [21]:
x.shape

(4136, 12)

In [22]:
#### Building the model ####

model = LinearRegression()

In [23]:
#### Training the model ####

## fit the model on the training data
model.fit(x_train_scaled, y_train)

In [25]:
y_pred_train = model.predict(x_train_scaled)
y_pred_test = model.predict(x_test_scaled)


In [26]:
model.coef_

array([ 715828.69872084,  629703.51551412, 1521240.85582339,
        493680.53291939, 1087964.30793605, 1098740.79742569,
        755459.80722843, 1911446.48527044, 1552096.4521879 ,
       1457655.33519595,  938779.58604745, 1444746.66683554])

In [27]:
coeff_rest_players = pd.DataFrame(model.coef_,x.columns, columns=['Coefficient'])
coeff_rest_players

Unnamed: 0,Coefficient
appearance,715828.7
assists,629703.5
minutes played,1521241.0
days_injured,493680.5
award,1087964.0
team_Arsenal FC,1098741.0
team_Bayern Munich,755459.8
team_Chelsea FC,1911446.0
team_FC Barcelona,1552096.0
team_Liverpool FC,1457655.0


## Evaluating the model:

In [None]:
### 1. MSE ###

In [28]:
y_base = [y_train.mean()] * len(y_test)

In [29]:
mse_base = mean_squared_error(y_test, y_base )
print(f'Mean Squared Error of base model: {mse_base}')

Mean Squared Error of base model: 91852440259991.19


In [30]:
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
print(f'Mean Squared Error for training: {mse_train}')
print(f'Mean Squared Error for testing: {mse_test}')

Mean Squared Error for training: 39908174462858.34
Mean Squared Error for testing: 56925247007049.234


In [None]:
### 2. MAE ###

In [31]:
mse_base = mean_absolute_error(y_test, y_base )
print(f'Mean Absolute  Error of base model: {mse_base}')

Mean Absolute  Error of base model: 4667312.659081869


In [32]:
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
print(f'Mean Absolute Error for training: {mae_train}')
print(f'Mean Absolute Error for testing: {mae_test}')

Mean Absolute Error for training: 3476132.586485709
Mean Absolute Error for testing: 3916530.9184202934


In [None]:
### 3. RMSE ###

In [33]:
print(f'Root Mean Squared  Error of base model: {np.sqrt(mse_base)}')

Root Mean Squared  Error of base model: 2160.396412485882


In [34]:
print(f'Root Mean Squared Error for training: {np.sqrt(mse_train)}')
print(f'Root Mean Squared Error for testing: {np.sqrt(mse_test)}')

Root Mean Squared Error for training: 6317291.703163496
Root Mean Squared Error for testing: 7544882.173172039


In [None]:
### 4. R^ ###

In [35]:
r2_score_train = r2_score(y_train, y_pred_train)
r2_score_test = r2_score(y_test, y_pred_test)
print(f'R Square for training: {r2_score_train}')
print(f'R Square for testing: {r2_score_test}')

R Square for training: 0.37597251056828107
R Square for testing: 0.37823635601263317


In [None]:
### 5. Cross Validation ###

In [36]:
# the values looks close to each other 
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5 folds, data shuffled before splitting

scores = cross_val_score(model, x, y, cv=kf, scoring='neg_mean_squared_error')

mse_scores = -scores
mse_scores

array([6.33684604e+13, 4.36854803e+13, 4.45616353e+13, 3.44540609e+13,
       5.04667203e+13])

In [None]:
#### Regulization ####