In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, Ridge



%matplotlib inline

In [3]:
# I did two models, one for the goalkeepers, and defenders and the other is for the rest of the players
# but i had an issue with the other model 

In [4]:
df_gkeeper_defender = pd.read_csv('gkeeper_defender.csv')

#### 1. Feature engineering steps:

1. Feature scaling: a method used to standardize the range of independent variables or features of data.
2. Aggregation: a process in data processing where summary statistics are calculated from data. 
3. One hot coding: a process used to convert categorical data variables so they can be provided to machine learning algorithms to improve predictions. 

In [5]:
#### Dataset 1: df_gkeeper_defender modeling #####

In [6]:
df_gkeeper_defender.head()

Unnamed: 0.1,Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,award,current_value
0,0,Manchester United,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,0.0,1.217252,0.335463,9390,42,13,15000000
1,1,Manchester United,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,0.0,1.242331,0.207055,1304,510,1,1500000
2,2,Manchester United,Goalkeeper,188.0,37.0,4,0.0,0.0,0.0,0.0,0.616438,0.924658,292,697,4,600000
3,3,Manchester United,Defender Centre-Back,175.0,25.0,82,0.02809,0.05618,0.224719,0.0,0.0,0.0,6408,175,9,50000000
4,4,Manchester United,Defender Centre-Back,191.0,30.0,63,0.017889,0.017889,0.053667,0.0,0.0,0.0,5031,238,21,40000000


In [7]:
df_gkeeper_defender.drop('Unnamed: 0', axis=1, inplace=True)

In [8]:
# first do one hot coding:

In [9]:
# dataset 1: df_gkeeper_defender
gkeeper_defender_categorical = ['team', 'position']
gkeeper_defender_numerical = ['height', 'age', 'appearance', 'goals', 'assists', 'yellow cards',
                              'red cards','goals conceded', 'clean sheets', 'minutes played', 'days_injured',
                             'award', 'current_value']


In [10]:
gkeeper_defender_categorical

['team', 'position']

In [11]:
gkeeper_defender_numerical

['height',
 'age',
 'appearance',
 'goals',
 'assists',
 'yellow cards',
 'red cards',
 'goals conceded',
 'clean sheets',
 'minutes played',
 'days_injured',
 'award',
 'current_value']

In [12]:
# get_dummies function: a powerful tool for converting categorical variable(s) into dummy/indicator variables, which is essentially implementing one-hot encoding
df_gkeeper_defender = pd.get_dummies(df_gkeeper_defender, columns=gkeeper_defender_categorical)

In [13]:
# it cretated dummy varieables for categorical feature values 
df_gkeeper_defender.shape

(4698, 390)

In [14]:
# now we are going to correlate our data to find the closest relationship between the values with current value 
correlation = df_gkeeper_defender.corr()
print(correlation['current_value'].sort_values(ascending=False))

current_value                1.000000
appearance                   0.437273
minutes played               0.436572
award                        0.267146
team_Bayern Munich           0.205238
                               ...   
team_Daegu FC               -0.023182
team_Daejeon Hana Citizen   -0.024394
age                         -0.065629
goals conceded              -0.081208
position_Goalkeeper         -0.103531
Name: current_value, Length: 390, dtype: float64


In [15]:
# threeshold: a specific cutoff value used to determine the strength 
# of the relationship between two variables, 
# as measured by a correlation coefficient. 

threshold = 0.2

# We use `abs()` for absolute value to consider both strong positive and negative correlations
selected_features = correlation[abs(correlation['current_value']) > threshold]['current_value'].index
selected_features

Index(['appearance', 'minutes played', 'award', 'current_value',
       'team_Bayern Munich'],
      dtype='object')

In [16]:
selected_features = ['appearance', 'minutes played', 'award', 'current_value',
       'team_Bayern Munich'] 

In [17]:

df_gkeeper_defender = df_gkeeper_defender[selected_features]
df_gkeeper_defender.head()

Unnamed: 0,appearance,minutes played,award,current_value,team_Bayern Munich
0,104,9390,13,15000000,False
1,15,1304,1,1500000,False
2,4,292,4,600000,False
3,82,6408,9,50000000,False
4,63,5031,21,40000000,False


In [18]:
# preparing the data
x = df_gkeeper_defender.drop(['current_value'], axis=1)
y = df_gkeeper_defender['current_value']

In [19]:
# splitting the data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                   test_size=0.2,
                                                   shuffle=True,
                                                   random_state=42)
# RS : will ensure that the same rows are selected for training and testing each time the code is executed.

# scaling data
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [20]:
x.shape

(4698, 4)

In [21]:
#### Building the model ####

model = LinearRegression()

In [22]:
#### Training the model ####

## fit the model on the training data
model.fit(x_train_scaled, y_train)

In [23]:
y_pred_train = model.predict(x_train_scaled)
y_pred_test = model.predict(x_test_scaled)


In [24]:
model.coef_

array([1656090.39792957, 1034373.97176721,  995380.96983273,
        848064.48058079])

In [25]:
coeff_df_gk_dd = pd.DataFrame(model.coef_,x.columns, columns=['Coefficient'])
coeff_df_gk_dd

Unnamed: 0,Coefficient
appearance,1656090.0
minutes played,1034374.0
award,995381.0
team_Bayern Munich,848064.5


## Evaluating the model:

Overfitting
Definition: Overfitting occurs when a model learns the training data too well, including the noise and outliers. As a result, the model performs extremely well on the training data but poorly on new, unseen data (the test set).

Characteristics:

1. The model has very low error on the training data.
2. The model has high error on the test data.
3. The model is too complex, with too many parameters relative to the amount of training data.
4. It captures the noise in the training data as if it were a part of the true underlying pattern, leading to poor generalization.

Underfitting
Definition: Underfitting occurs when a model is too simple to capture the underlying patterns in the data. As a result, it performs poorly on both the training data and new data.

Characteristics:

1. The model has high error on both the training data and the test data.
2. The model fails to capture the underlying trends in the data, leading to a lack of flexibility.
3. The model is too simple, with too few parameters to model the complexity of the data.

In [26]:
### 1. MSE ### 

In [27]:
y_base = [y_train.mean()] * len(y_test)

In [28]:
mse_base = mean_squared_error(y_test, y_base )
print(f'Mean Squared Error of base model: {mse_base}')

Mean Squared Error of base model: 69422907951560.72


In [29]:
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
print(f'Mean Squared Error for training: {mse_train}')
print(f'Mean Squared Error for testing: {mse_test}')

Mean Squared Error for training: 34459917695993.91
Mean Squared Error for testing: 49668823993506.875


In [30]:
### 2. MAE ###

In [31]:
mse_base = mean_absolute_error(y_test, y_base )
print(f'Mean Absolute  Error of base model: {mse_base}')

Mean Absolute  Error of base model: 3772914.2906480357


In [32]:
mae_train = mean_absolute_error(y_train, y_pred_train)
mae_test = mean_absolute_error(y_test, y_pred_test)
print(f'Mean Absolute Error for training: {mae_train}')
print(f'Mean Absolute Error for testing: {mae_test}')

Mean Absolute Error for training: 3040941.301852911
Mean Absolute Error for testing: 3329514.711677085


In [33]:
### 3. RMSE ###

In [34]:
print(f'Root Mean Squared  Error of base model: {np.sqrt(mse_base)}')

Root Mean Squared  Error of base model: 1942.3991069417314


In [35]:
# it seems like we have an overfitting here, must back to selecting features or splitted data
print(f'Root Mean Squared Error for training: {np.sqrt(mse_train)}')
print(f'Root Mean Squared Error for testing: {np.sqrt(mse_test)}')

Root Mean Squared Error for training: 5870257.038324123
Root Mean Squared Error for testing: 7047611.226047225


In [36]:
### 4. R^ ###

In [37]:
r2_score_train = r2_score(y_train, y_pred_train)
r2_score_test = r2_score(y_test, y_pred_test)
print(f'R Square for training: {r2_score_train}')
print(f'R Square for testing: {r2_score_test}')

R Square for training: 0.23377463202310045
R Square for testing: 0.2818170323457623


In [38]:
### 5. Cross Validation ###

In [39]:
# the values looks close to each other 
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5 folds, data shuffled before splitting

scores = cross_val_score(model, x, y, cv=kf, scoring='neg_mean_squared_error')

mse_scores = -scores
mse_scores

array([4.96688240e+13, 3.71983345e+13, 3.71865786e+13, 2.94035167e+13,
       3.52313297e+13])

In [40]:
#### Regulization ####

In [41]:
# Applying L1 Regularization (Lasso)
lasso_model = Lasso(alpha=0.1) # alpha is the regulization strength 
lasso_model.fit(x_train_scaled, y_train)

# make predictions
y_pred_lasso_train = lasso_model.predict(x_train_scaled)
y_pred_lasso_test = lasso_model.predict(x_test_scaled)


# evaluation of the model lasso 
lasso_mse_train = mean_absolute_error(y_train, y_pred_lasso_train)
lasso_mse_test = mean_squared_error(y_test, y_pred_lasso_test)
lasso_r2_train = r2_score(y_train, y_pred_lasso_train)
lasso_r2_test = r2_score(y_test, y_pred_lasso_test)

In [42]:
lasso_mse_train

3040941.357161565

In [43]:
lasso_mse_test # high

49668825117856.83

In [44]:
lasso_r2_train

0.2337746320230807

In [45]:
lasso_r2_test

0.281817016088301

In [46]:
# Applying L2 Regularization (Ridge)
ridge_model = Ridge(alpha=0.1)  # Alpha is the regularization strength
ridge_model.fit(x_train_scaled, y_train)

# make predictions
y_pred_ridge_train = ridge_model.predict(x_train_scaled)
y_pred_ridge_test = ridge_model.predict(x_test_scaled)


# evaluation of the ridge Model
ridge_mse_train = mean_squared_error(y_train, y_pred_ridge_train)
ridge_mse_test = mean_squared_error(y_test, y_pred_ridge_test)  # Use y_test here
ridge_r2_train = r2_score(y_train, y_pred_ridge_train)
ridge_r2_test = r2_score(y_test, y_pred_ridge_test)

In [47]:
ridge_mse_train

34459917703889.01

In [48]:
ridge_mse_test  

49668879565559.555

In [49]:
ridge_r2_train

0.23377463184755076

In [50]:
ridge_r2_test

0.28181622880546275