In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

%matplotlib inline

In [None]:
# I did two models, one for the goalkeepers, and defenders and the other is for the rest of the players
# but i had an issue with the other model 

In [2]:
df_gkeeper_defender = pd.read_csv('gkeeper_defender.csv')
df_rest_players = pd.read_csv('rest_players.csv')

#### 1. Feature engineering steps:

1. Feature scaling: a method used to standardize the range of independent variables or features of data.
2. Aggregation: a process in data processing where summary statistics are calculated from data. 
3. One hot coding: a process used to convert categorical data variables so they can be provided to machine learning algorithms to improve predictions. 

In [3]:
#### Dataset 1: df_gkeeper_defender modeling #####

In [4]:
df_gkeeper_defender.head()

Unnamed: 0.1,Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,award,current_value
0,0,Manchester United,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,0.0,1.217252,0.335463,9390,42,13,15000000
1,1,Manchester United,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,0.0,1.242331,0.207055,1304,510,1,1500000
2,2,Manchester United,Goalkeeper,188.0,37.0,4,0.0,0.0,0.0,0.0,0.616438,0.924658,292,697,4,600000
3,3,Manchester United,Defender Centre-Back,175.0,25.0,82,0.02809,0.05618,0.224719,0.0,0.0,0.0,6408,175,9,50000000
4,4,Manchester United,Defender Centre-Back,191.0,30.0,63,0.017889,0.017889,0.053667,0.0,0.0,0.0,5031,238,21,40000000


In [5]:
df_rest_players.head()

Unnamed: 0.1,Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,red cards,goals conceded,minutes played,days_injured,award,current_value
0,16,Manchester United,midfield-CentralMidfield,182.0,31.0,55,0.067214,0.313667,0.089619,0.0,0.0,4017,0,10,25000000
1,17,Manchester United,midfield-CentralMidfield,177.0,29.0,74,0.144046,0.086428,0.230474,0.0,0.0,3124,280,8,20000000
2,18,Manchester United,midfield-CentralMidfield,169.0,30.0,92,0.184843,0.221811,0.332717,0.0,0.0,4869,45,11,20000000
3,19,Manchester United,midfield-CentralMidfield,184.0,26.0,31,0.155575,0.0,0.155575,0.0,0.0,1157,378,6,17000000
4,20,Manchester United,midfield-CentralMidfield,181.0,20.0,39,0.129125,0.064562,0.225968,0.0,0.0,2788,0,1,800000


In [6]:
df_gkeeper_defender.drop('Unnamed: 0', axis=1, inplace=True)
df_rest_players.drop('Unnamed: 0', axis=1, inplace=True)

In [7]:
# first do one hot coding:

In [8]:
# dataset 1: df_gkeeper_defender
gkeeper_defender_categorical = ['team', 'position']
gkeeper_defender_numerical = ['height', 'age', 'appearance', 'goals', 'assists', 'yellow cards',
                              'red cards','goals conceded', 'clean sheets', 'minutes played', 'days_injured', 'games_injured',
                             'award', 'current_value']


In [10]:
gkeeper_defender_categorical

['team', 'position']

In [11]:
gkeeper_defender_numerical

['height',
 'age',
 'appearance',
 'goals',
 'assists',
 'yellow cards',
 'red cards',
 'goals conceded',
 'clean sheets',
 'minutes played',
 'days_injured',
 'games_injured',
 'award',
 'current_value']

In [12]:
# get_dummies function: a powerful tool for converting categorical variable(s) into dummy/indicator variables, which is essentially implementing one-hot encoding
df_gkeeper_defender = pd.get_dummies(df_gkeeper_defender, columns=gkeeper_defender_categorical)

In [13]:
# it cretated dummy varieables for categorical feature values 
df_gkeeper_defender.shape

(4698, 390)

In [14]:
# now we are going to correlate our data 
correlation = df_gkeeper_defender.corr()
print(correlation['current_value'].sort_values(ascending=False))

current_value                1.000000
appearance                   0.437273
minutes played               0.436572
award                        0.267146
team_Bayern Munich           0.205238
                               ...   
team_Daegu FC               -0.023182
team_Daejeon Hana Citizen   -0.024394
age                         -0.065629
goals conceded              -0.081208
position_Goalkeeper         -0.103531
Name: current_value, Length: 390, dtype: float64


In [15]:
# threeshold: a specific cutoff value used to determine the strength 
# of the relationship between two variables, 
# as measured by a correlation coefficient. 

threshold = 0.2

# We use `abs()` for absolute value to consider both strong positive and negative correlations
selected_features = correlation[abs(correlation['current_value']) > threshold]['current_value'].index
selected_features

Index(['appearance', 'minutes played', 'award', 'current_value',
       'team_Bayern Munich'],
      dtype='object')

In [16]:
selected_features = ['appearance', 'minutes played', 'award', 'current_value',
       'team_Bayern Munich'] 

In [17]:

df_gkeeper_defender = df_gkeeper_defender[selected_features]
df_gkeeper_defender.head()

Unnamed: 0,appearance,minutes played,award,current_value,team_Bayern Munich
0,104,9390,13,15000000,False
1,15,1304,1,1500000,False
2,4,292,4,600000,False
3,82,6408,9,50000000,False
4,63,5031,21,40000000,False


In [18]:
# preparing the data
x = df_gkeeper_defender.drop(['current_value'], axis=1)
y = df_gkeeper_defender['current_value']

In [19]:
# splitting the data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                   test_size=0.25,
                                                   shuffle=True,
                                                   random_state=42)
# RS : will ensure that the same rows are selected for training and testing each time the code is executed.

# scaling data
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [20]:
x.shape

(4698, 4)

In [21]:
#### Building the model ####

model = LinearRegression()

In [22]:
#### Training the model ####

## fit the model on the training data
model.fit(x_train_scaled, y_train)

In [35]:
y_pred_train = model.predict(x_train_scaled)
y_pred_gk_dd = model.predict(x_test_scaled)

In [24]:
model.coef_

array([1612004.00536328, 1095397.34477944, 1051092.13089036,
        863075.04269504])

In [36]:
coeff_df_gk_dd = pd.DataFrame(model.coef_,x.columns, columns=['Coefficient'])
coeff_df_gk_dd

Unnamed: 0,Coefficient
appearance,1612004.0
minutes played,1095397.0
award,1051092.0
team_Bayern Munich,863075.0


## Evaluating the model:

In [None]:
### 1. MSE ###

In [37]:
y_base = [y_train.mean()] * len(y_test)

In [38]:
mse_base = mean_squared_error(y_test, y_base )
print(f'Mean Squared Error of base model: {mse_base}')

Mean Squared Error of base model: 60321746992098.63


In [39]:
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_gk_dd)
print(f'Mean Squared Error for training: {mse_train}')
print(f'Mean Squared Error for testing: {mse_test}')

Mean Squared Error for training: 35463826700902.86
Mean Squared Error for testing: 43526922670036.76


In [None]:
### 2. MAE ###

In [40]:
mse_base = mean_absolute_error(y_test, y_base )
print(f'Mean Absolute  Error of base model: {mse_base}')

Mean Absolute  Error of base model: 3623120.361635695


In [41]:
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_gk_dd)
print(f'Mean Absolute Error for training: {mae_train}')
print(f'Mean Absolute Error for testing: {mae_test}')

Mean Absolute Error for training: 3089816.2953119003
Mean Absolute Error for testing: 3188346.756520911


In [None]:
### 3. RMSE ###

In [42]:
print(f'Root Mean Squared  Error of base model: {np.sqrt(mse_base)}')

Root Mean Squared  Error of base model: 1903.4495952443015


In [43]:
print(f'Root Mean Squared Error for training: {np.sqrt(mse_train)}')
print(f'Root Mean Squared Error for testing: {np.sqrt(mse_test)}')

Root Mean Squared Error for training: 5955151.274392856
Root Mean Squared Error for testing: 6597493.665782238


In [None]:
### 4. R^ ###

In [44]:
r2_score_train = r2_score(y_train, y_pred_train)
r2_score_test = r2_score(y_test, y_pred_gk_dd)
print(f'R Square for training: {r2_score_train}')
print(f'R Square for testing: {r2_score_test}')

R Square for training: 0.23521305316489938
R Square for testing: 0.27744762532913003


In [None]:
### 5. Cross Validation ###

In [None]:
#### Dataset 2: df_rest_players modeling #####

In [45]:
df_rest_players.head(2)

Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,red cards,goals conceded,minutes played,days_injured,award,current_value
0,Manchester United,midfield-CentralMidfield,182.0,31.0,55,0.067214,0.313667,0.089619,0.0,0.0,4017,0,10,25000000
1,Manchester United,midfield-CentralMidfield,177.0,29.0,74,0.144046,0.086428,0.230474,0.0,0.0,3124,280,8,20000000


In [48]:
# dataset 2: rest_players
rest_players_categorical = ['team', 'position']
rest_players_numerical = ['height', 'age', 'appearance', 'goals', 'assists', 'yellow cards', 
                              'red cards','goals conceded', 'minutes played', 'days_injured', 'games_injured',
                             'award', 'current_value']


In [49]:
rest_players_categorical

['team', 'position']

In [50]:
rest_players_numerical

['height',
 'age',
 'appearance',
 'goals',
 'assists',
 'yellow cards',
 'red cards',
 'goals conceded',
 'minutes played',
 'days_injured',
 'games_injured',
 'award',
 'current_value']

In [53]:
# get_dummies function: a powerful tool for converting categorical variable(s) into dummy/indicator variables, which is essentially implementing one-hot encoding
df_rest_players = pd.get_dummies(df_rest_players, columns=rest_players_categorical)

KeyError: "None of [Index(['team', 'position'], dtype='object')] are in the [columns]"

In [145]:
# it cretated dummy varieables for categorical feature values 
df_rest_players.shape

(4136, 398)

In [148]:
# now we are going to correlate our data 
correlation2 = df_rest_players.corr()
print(correlation['current_value'].sort_values(ascending=False))

current_value                          1.000000
appearance                             0.358491
minutes played                         0.345816
award                                  0.273992
team_Bayern Munich                     0.232843
                                         ...   
position_midfield-RightMidfield       -0.036713
position_midfield-DefensiveMidfield   -0.041710
yellow cards                          -0.066831
age                                   -0.090431
goals conceded                              NaN
Name: current_value, Length: 398, dtype: float64


In [149]:
# threeshold: a specific cutoff value used to determine the strength 
# of the relationship between two variables, 
# as measured by a correlation coefficient. 

threshold2 = 0.2

# We use `abs()` for absolute value to consider both strong positive and negative correlations
selected_features2 = correlation2[abs(correlation2['current_value']) > threshold2]['current_value'].index
selected_features2

Index(['appearance', 'minutes played', 'award', 'current_value',
       'team_Bayern Munich', 'team_Chelsea FC'],
      dtype='object')

In [151]:
selected_features2 = ['appearance', 'minutes played', 'award', 'current_value',
       'team_Bayern Munich', 'team_Chelsea FC'] 

In [152]:

df_rest_players = df_rest_players[selected_features]
df_rest_players.head()

Unnamed: 0,appearance,minutes played,award,current_value,team_Bayern Munich,team_Chelsea FC
0,55,4017,10,25000000,False,False
1,74,3124,8,20000000,False,False
2,92,4869,11,20000000,False,False
3,31,1157,6,17000000,False,False
4,39,2788,1,800000,False,False


In [153]:
# preparing the data
x = df_rest_players.drop(['current_value'], axis=1)
y = df_rest_players['current_value']

In [160]:
# splitting the data into training and testing
x_train2, x_test2, y_train2, y_test2 = train_test_split(x, y, 
                                                   test_size=0.25,
                                                   shuffle=True,
                                                   random_state=42)
# RS : will ensure that the same rows are selected for training and testing each time the code is executed.

# scaling data
scaler2 = StandardScaler()
scaler2.fit(x_train)
x_train_scaled2 = scaler.transform(x_train)
x_test_scaled2 = scaler.transform(x_test)

In [161]:
x.shape

(4136, 5)

In [156]:
#### Building the model ####

model2 = LinearRegression()

In [162]:
#### Training the model ####

## fit the model on the training data
model2.fit(x_train_scaled2, y_train2) # it should not give an error because it's trained on the scaled data

ValueError: Found input variables with inconsistent numbers of samples: [3523, 3102]

In [159]:
y_pred_train2 = model2.predict(x_train_scaled2)
y_rest_players = model2.predict(x_test_scaled2)

NotFittedError: This LinearRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.