In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three input tables; later cells join them on the 'Player' column.
df = pd.read_csv('./data/all_stats.csv')  # per-player statistics
player_val = pd.read_csv('./data/player_val_cleaned.csv')  # contains the 'Market value' column
player_wages = pd.read_csv('./data/player_wages.csv')  # wage data merged in further below

In [3]:
# Sanity check before merging: flag which valued players also have a stats row,
# then display the ones that do not. An empty table means every valued player
# can be matched by name.
check = player_val['Player'].isin(df['Player'])
player_val.loc[~check]

Unnamed: 0.1,Unnamed: 0,#,Player,Nat.,Age,Club,Highest value in career,Last update,Market value


In this notebook, my aim is to merge the original dataset with player valuation data. To achieve this, I'll start by merging the tables. Reflecting on the previous notebook where I attained a model with an RMSE of 15, I believe there is room for improvement. Therefore, I'm considering incorporating additional data on player wages to enhance the valuation process further.

### Merge and Clean the Data

In [4]:
# Attach the market value to each player's stats. The right join keeps every
# player listed in player_val (only its 'Player' and 'Market value' columns
# are brought in).
df_merge = df.merge(
    player_val[['Player', 'Market value']],
    on=['Player'],
    how='right',
)

In [5]:
# Preprocess the Market value column: remove the '€' and 'm' characters and
# convert the remaining text to float.
# The dtype guard makes this cell safe to re-run: once the column is numeric the
# `.str` accessor would raise, so the conversion is skipped instead.
# NOTE(review): assumes every value uses the 'm' suffix; a value quoted with a
# different suffix would fail in astype(float) - confirm against the raw data.
if not pd.api.types.is_numeric_dtype(df_merge['Market value']):
    df_merge['Market value'] = (
        df_merge['Market value']
        .str.replace('€', '', regex=False)  # literal match, not a regex
        .str.replace('m', '', regex=False)
        .astype(float)
    )

In [6]:
# Inspect the merged table: player stats plus the now-numeric 'Market value'.
df_merge

Unnamed: 0,Player,Tkl,TklW,Tkl.1,Tkl%,Lost,Blocks,Pass,Int,Tkl+Int,...,Age,90s,Def 3rd,Mid 3rd,Att 3rd,TotDist,PrgDist,1/3,Country,Market value
0,Erling Haaland,5,2,2,100.0,0,9,8,1,6,...,23,21.4,0,3,2,2353,437,13,Norway,180.0
1,Phil Foden,24,14,12,50.0,12,21,21,13,37,...,23,25.7,10,8,6,16235,3052,51,United Kingdom,130.0
2,Bukayo Saka,49,26,20,44.4,25,37,35,10,59,...,22,25.1,18,15,16,13843,3481,29,United Kingdom,130.0
3,Declan Rice,55,34,24,58.5,17,31,21,35,90,...,25,26.6,28,21,6,28518,7674,204,United Kingdom,110.0
4,Rodri,54,33,26,54.2,22,25,18,20,74,...,27,24.1,17,27,10,41853,11449,274,Spain,110.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Harvey Barnes,3,1,0,0.0,1,0,0,2,5,...,26,4.0,1,2,0,1376,331,5,United Kingdom,35.0
96,Mykhailo Mudryk,18,11,8,34.8,15,10,9,4,22,...,23,10.1,8,5,5,2995,994,16,Ukraine,35.0
97,Diogo Dalot,56,36,35,72.9,13,34,22,27,83,...,25,25.3,41,8,7,18394,5755,77,Portugal,35.0
98,Dominic Solanke,17,9,7,41.2,10,20,16,3,20,...,26,27.7,1,9,7,3712,919,20,United Kingdom,35.0


In [7]:
# Add the wage data. The inner join keeps only players present in both tables,
# so any player without a wages row is dropped at this step.
df_merge = df_merge.merge(player_wages, how='inner', on='Player')

In [8]:
# Inspect the result of the wages merge.
df_merge

Unnamed: 0,Player,Tkl,TklW,Tkl.1,Tkl%,Lost,Blocks,Pass,Int,Tkl+Int,...,90s,Def 3rd,Mid 3rd,Att 3rd,TotDist,PrgDist,1/3,Country,Market value,Weekly Wages
0,Erling Haaland,5,2,2,100.0,0,9,8,1,6,...,21.4,0,3,2,2353,437,13,Norway,180.0,375000
1,Phil Foden,24,14,12,50.0,12,21,21,13,37,...,25.7,10,8,6,16235,3052,51,United Kingdom,130.0,225000
2,Bukayo Saka,49,26,20,44.4,25,37,35,10,59,...,25.1,18,15,16,13843,3481,29,United Kingdom,130.0,195000
3,Declan Rice,55,34,24,58.5,17,31,21,35,90,...,26.6,28,21,6,28518,7674,204,United Kingdom,110.0,240000
4,Rodri,54,33,26,54.2,22,25,18,20,74,...,24.1,17,27,10,41853,11449,274,Spain,110.0,220000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,Harvey Barnes,3,1,0,0.0,1,0,0,2,5,...,4.0,1,2,0,1376,331,5,United Kingdom,35.0,80000
95,Mykhailo Mudryk,18,11,8,34.8,15,10,9,4,22,...,10.1,8,5,5,2995,994,16,Ukraine,35.0,100000
96,Diogo Dalot,56,36,35,72.9,13,34,22,27,83,...,25.3,41,8,7,18394,5755,77,Portugal,35.0,85000
97,Dominic Solanke,17,9,7,41.2,10,20,16,3,20,...,27.7,1,9,7,3712,919,20,United Kingdom,35.0,50000


In [9]:
# List the column dtypes to see which columns are numeric and usable as features.
df_merge.dtypes

Player           object
Tkl               int64
TklW              int64
Tkl.1             int64
Tkl%            float64
                 ...   
PrgDist           int64
1/3               int64
Country          object
Market value    float64
Weekly Wages      int64
Length: 93, dtype: object

In [10]:
# Select only the numeric columns (float64 / int64). Object-typed columns such
# as 'Player' and 'Country' are dropped; the 'Market value' target is kept and
# separated from the features in the train/test split cell.
df_numeric = df_merge.select_dtypes(include=['float64', 'int64'])

In [11]:
# Inspect the numeric-only table that feeds the models.
df_numeric

Unnamed: 0,Tkl,TklW,Tkl.1,Tkl%,Lost,Blocks,Pass,Int,Tkl+Int,Clr,...,Age,90s,Def 3rd,Mid 3rd,Att 3rd,TotDist,PrgDist,1/3,Market value,Weekly Wages
0,5,2,2,100.0,0,9,8,1,6,13,...,23,21.4,0,3,2,2353,437,13,180.0,375000
1,24,14,12,50.0,12,21,21,13,37,15,...,23,25.7,10,8,6,16235,3052,51,130.0,225000
2,49,26,20,44.4,25,37,35,10,59,14,...,22,25.1,18,15,16,13843,3481,29,130.0,195000
3,55,34,24,58.5,17,31,21,35,90,43,...,25,26.6,28,21,6,28518,7674,204,110.0,240000
4,54,33,26,54.2,22,25,18,20,74,35,...,27,24.1,17,27,10,41853,11449,274,110.0,220000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,3,1,0,0.0,1,0,0,2,5,1,...,26,4.0,1,2,0,1376,331,5,35.0,80000
95,18,11,8,34.8,15,10,9,4,22,6,...,23,10.1,8,5,5,2995,994,16,35.0,100000
96,56,36,35,72.9,13,34,22,27,83,84,...,25,25.3,41,8,7,18394,5755,77,35.0,85000
97,17,9,7,41.2,10,20,16,3,20,34,...,26,27.7,1,9,7,3712,919,20,35.0,50000


### Train the Data

In this project, I will use Root Mean Squared Error (RMSE) as the evaluation metric. The reasons are listed below:
- Interpretability: It's expressed in the same units as the target variable.
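- Sensitivity to large errors: Because errors are squared before averaging, RMSE penalises large misses more heavily than metrics such as MAE, which also makes it more (not less) sensitive to outliers.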
- Model Differentiation: Lower RMSE values indicate better model performance.
- Gradient-based Optimization: It aligns well with optimization algorithms.
- Error Magnitude: It quantifies the typical deviation of predictions from actual values.

In [12]:
from sklearn.model_selection import train_test_split

# Target: the numeric market value. Features: every other numeric column.
y = df_numeric['Market value']
X = df_numeric.drop('Market value', axis=1)

# Hold out 20% of the players for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=41,
)

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: ordinary least squares on the raw (unscaled) features.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model performance using Root Mean Squared Error (RMSE).
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 105.04954660258328


In [14]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, so that nothing
# from the test split influences the transformation.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same training-derived scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

After incorporating the wages column, the initial (unscaled) linear regression model above has improved significantly, achieving an RMSE of about 105 compared to the previous RMSE of 310.

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Same linear regression as before, this time fitted on the standardised features.
model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Evaluate the model performance using Root Mean Squared Error (RMSE).
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)


Root Mean Squared Error: 113.01718902838414


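Interestingly, even though the wages column contains values in the hundred-thousands range, scaling the X dataset led to a deterioration in performance, yielding an RMSE of 113 (versus 105 unscaled). Ordinary least squares is normally unaffected by feature scaling, so this gap most likely reflects an ill-conditioned fit (there are about 90 features for fewer than 80 training rows) rather than a genuine effect of scaling. Consequently, we will proceed to train the models without scaling the data.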

#### Ridge Regression

In [16]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Define the parameter grid (candidate regularisation strengths).
# NOTE(review): the recorded run selected alpha=10.0, the largest value in this
# grid, so larger values may be worth adding.
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]
}

# 5-fold cross-validated grid search over alpha, scored by negated MSE.
ridge_model = Ridge(random_state=42)
grid_search = GridSearchCV(estimator=ridge_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so best_estimator_ is already trained; no second fit is needed.
best_model = grid_search.best_estimator_

# Predict the target variable and evaluate. The square root is taken explicitly
# because mean_squared_error's `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6.
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Root Mean Squared Error:", rmse)


Best Hyperparameters: {'alpha': 10.0}
Root Mean Squared Error: 26.958620156692277


#### Decision Tree Regressor

In [17]:
from sklearn.tree import DecisionTreeRegressor

# Define the parameter grid for Decision Tree Regressor
param_grid_dt = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated grid search, scored by negated MSE. The fixed
# random_state keeps the fitted trees reproducible.
dt_model = DecisionTreeRegressor(random_state=42)
grid_search_dt = GridSearchCV(estimator=dt_model, param_grid=param_grid_dt, scoring='neg_mean_squared_error', cv=5)
grid_search_dt.fit(X_train, y_train)

# Get the best hyperparameters for Decision Tree Regressor
best_params_dt = grid_search_dt.best_params_
print("Best Hyperparameters for Decision Tree Regressor:", best_params_dt)

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so best_estimator_ is already trained; no second fit is needed.
best_dt_model = grid_search_dt.best_estimator_

# Predict the target variable and evaluate. The square root is taken explicitly
# because mean_squared_error's `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6.
y_pred = best_dt_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error for Decision Tree Regressor:", rmse)


Best Hyperparameters for Decision Tree Regressor: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Root Mean Squared Error for Decision Tree Regressor: 39.08212666936128


#### XGBoost Regressor

In [18]:
import xgboost as xgb

# Define the parameter grid for XGBoost (3^5 = 243 combinations, each evaluated
# with 5-fold cross-validation).
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0]
}

# Initialize the XGBoost Regressor model and GridSearch (scored by negated MSE)
xgb_model = xgb.XGBRegressor(random_state=42)
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, scoring='neg_mean_squared_error', cv=5)
grid_search_xgb.fit(X_train, y_train)

# Get the best hyperparameters for XGBoost
best_params_xgb = grid_search_xgb.best_params_
print("Best Hyperparameters for XGBoost:", best_params_xgb)

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so best_estimator_ is already trained; no second fit is needed.
best_xgb_model = grid_search_xgb.best_estimator_

# Predict the target variable and evaluate. The square root is taken explicitly
# because mean_squared_error's `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6.
y_pred_xgb = best_xgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

print("Root Mean Squared Error for XGBoost:", rmse)


Best Hyperparameters for XGBoost: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.5}
Root Mean Squared Error for XGBoost: 32.27426505064325


Despite the significant improvement observed after adding the weekly wage column to the base model, the overall performance remains considerably worse than the RMSE of 15 reached in the previous notebook. The best model here, Ridge regression, reaches an RMSE of about 27, and even with XGBoost the model still exhibits an RMSE of 32. In the end, I will use the XGBoost model from the Player Valuation notebook as my final model.

### Test the Model

In [21]:
def predict_valuation(df,name,model):
    """Print the model's predicted valuation for a single player.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Player' column plus the feature columns the model expects.
    name : str
        Exact value to look up in the 'Player' column.
    model : object
        Fitted estimator exposing ``predict``. When it also exposes
        ``feature_names_in_``, exactly those columns are passed to it, in that
        order; otherwise all float64/int64 columns of ``df`` are used.

    Raises
    ------
    ValueError
        If no row matches ``name``, or if ``df`` lacks a column listed in
        ``model.feature_names_in_``.
    """
    player_rows = df[df['Player'] == name]
    if player_rows.empty:
        # Fail early with a readable message instead of an opaque estimator error.
        raise ValueError(f"Player {name!r} not found in the 'Player' column")

    trained_columns = getattr(model, 'feature_names_in_', None)
    if trained_columns is not None:
        # Align with the columns seen during fit so that extra numeric columns
        # in df (for example the target itself) are not fed to the model.
        trained_columns = list(trained_columns)
        missing = [col for col in trained_columns if col not in player_rows.columns]
        if missing:
            raise ValueError(f"df is missing feature columns required by the model: {missing}")
        features = player_rows[trained_columns]
    else:
        # Fallback: same dtype-based selection used to build the training table.
        features = player_rows.select_dtypes(include=['float64', 'int64'])

    result = model.predict(features)
    # If several rows share the name, only the first prediction is reported.
    print(name,'valuation: ', result[0])

In [26]:
# Add the wage data to the stats table so a single player's row can be passed
# to predict_valuation. This rebinds `df` to the merged result; the inner join
# keeps only players that also appear in player_wages.
df = df.merge(player_wages, how='inner', on=['Player'])

In [27]:
# Example: predict one player's valuation with `best_model`, the tuned Ridge
# estimator selected by the grid search above.
predict_valuation(df,'Lewis Dunk',best_model)

Lewis Dunk valuation:  59.80017224963453
