In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the per-player statistics and the cleaned market-valuation table.
stats_csv = './data/all_stats.csv'
valuation_csv = './data/player_val_cleaned.csv'
df = pd.read_csv(stats_csv)
player_val = pd.read_csv(valuation_csv)

In this notebook, I'm going to combine the original player statistics with the player valuation data, so the first step is to merge the two tables.

### Merge and Clean the Data

In [3]:
# Attach each player's market value to the stats table. The inner join keeps
# only players whose name appears in both tables.
valuation_cols = player_val[['Player', 'Market value']]
df_merge = df.merge(valuation_cols, on='Player', how='inner')

In [4]:
# Preview the first five rows of the merged table.
df_merge.head(n=5)

Unnamed: 0,Player,Tkl,TklW,Tkl.1,Tkl%,Lost,Blocks,Pass,Int,Tkl+Int,...,Age,90s,Def 3rd,Mid 3rd,Att 3rd,TotDist,PrgDist,1/3,Country,Market value
0,William Saliba,28,17,13,86.7,2,23,12,21,49,...,23,28.0,17,10,1,34755,12635,138,France,€80.00m
1,Declan Rice,55,34,24,58.5,17,31,21,35,90,...,25,26.6,28,21,6,28518,7674,204,United Kingdom,€110.00m
2,Bukayo Saka,49,26,20,44.4,25,37,35,10,59,...,22,25.1,18,15,16,13843,3481,29,United Kingdom,€130.00m
3,Martin Ødegaard,34,13,11,28.9,27,18,17,7,41,...,25,24.5,8,19,7,19337,5570,121,Norway,€95.00m
4,Ben White,31,19,15,48.4,16,30,14,24,55,...,26,23.7,13,15,3,21584,7206,128,United Kingdom,€55.00m


In [5]:
# Convert 'Market value' from text such as '€80.00m' to a float by stripping the
# currency sign and the 'm' suffix.
# The dtype guard keeps this cell idempotent: once the column is numeric, the
# `.str` accessor would raise if the cell were run a second time.
if not pd.api.types.is_numeric_dtype(df_merge['Market value']):
    df_merge['Market value'] = (
        df_merge['Market value']
        .str.replace('€', '', regex=False)
        .str.replace('m', '', regex=False)
        .astype(float)
    )

In [6]:
# Check that 'Market value' now shows plain numbers.
df_merge.head(n=5)

Unnamed: 0,Player,Tkl,TklW,Tkl.1,Tkl%,Lost,Blocks,Pass,Int,Tkl+Int,...,Age,90s,Def 3rd,Mid 3rd,Att 3rd,TotDist,PrgDist,1/3,Country,Market value
0,William Saliba,28,17,13,86.7,2,23,12,21,49,...,23,28.0,17,10,1,34755,12635,138,France,80.0
1,Declan Rice,55,34,24,58.5,17,31,21,35,90,...,25,26.6,28,21,6,28518,7674,204,United Kingdom,110.0
2,Bukayo Saka,49,26,20,44.4,25,37,35,10,59,...,22,25.1,18,15,16,13843,3481,29,United Kingdom,130.0
3,Martin Ødegaard,34,13,11,28.9,27,18,17,7,41,...,25,24.5,8,19,7,19337,5570,121,Norway,95.0
4,Ben White,31,19,15,48.4,16,30,14,24,55,...,26,23.7,13,15,3,21584,7206,128,United Kingdom,55.0


In [7]:
# Keep only the float64/int64 columns; every other dtype (e.g. text) is dropped.
numeric_dtypes = ['float64', 'int64']
df_numeric = df_merge.select_dtypes(include=numeric_dtypes)

### Train the Data

In this project, I will use Root Mean Squared Error (RMSE) as the evaluation metric. The reasons are listed below:
- Interpretability: It's expressed in the same units as the target variable.
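- Sensitivity to large errors: Because errors are squared before averaging, RMSE penalizes large misses more heavily than metrics such as MAE; note that this makes it more sensitive to outliers, not less.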
- Model Differentiation: Lower RMSE values indicate better model performance.
- Gradient-based Optimization: It aligns well with optimization algorithms.
- Error Magnitude: It quantifies the typical deviation of predictions from actual values.

In [8]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictors.
target_col = 'Market value'
y = df_numeric[target_col]
X = df_numeric.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: ordinary least squares on the raw (unscaled) features.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate with Root Mean Squared Error (RMSE). The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 310.38495345207554


In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same fitted transformation to both splits so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

The initial model yielded unsatisfactory results with an RMSE of 310, likely due to the absence of data preprocessing. To address this, let's attempt to scale the data and reevaluate the model's performance.

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV  # used by the grid-search cells below
from sklearn.metrics import mean_squared_error

# Same linear regression as before, now fitted on the standardized features.
model = LinearRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Evaluate with Root Mean Squared Error (RMSE). The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", rmse)


Root Mean Squared Error: 310.3849534533404


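After scaling the data, the outcome is unchanged. This is expected: ordinary least squares has no regularization term, so standardizing the features only rescales the coefficients and leaves the predictions the same. Let's explore alternative models such as Ridge Regression to introduce regularization, as well as Decision Trees and XGBoost, to assess whether different modeling approaches yield improved results.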

#### Ridge Regression

In [13]:
from sklearn.linear_model import Ridge

# Candidate regularization strengths to search over
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]
}

# Initialize the Ridge regression model and run a 5-fold grid search.
# NOTE(review): the Ridge penalty depends on feature scale, yet the search is run
# on the unscaled X_train. Switching to scaled features would also require
# predict_valuation to apply the same scaler before predicting — confirm intent.
ridge_model = Ridge(random_state=42)
grid_search = GridSearchCV(estimator=ridge_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# GridSearchCV (refit=True by default) has already refitted the best estimator
# on the full training set, so no additional fit call is needed.
best_model = grid_search.best_estimator_

# Predict the target variable and evaluate. The square root is taken explicitly
# because `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Root Mean Squared Error:", rmse)


Best Hyperparameters: {'alpha': 10.0}
Root Mean Squared Error: 33.146085326315315


#### Decision Tree Regressor

In [14]:
from sklearn.tree import DecisionTreeRegressor

# Define the parameter grid for Decision Tree Regressor
param_grid_dt = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Decision Tree Regressor model and run a 5-fold grid search
dt_model = DecisionTreeRegressor(random_state=42)
grid_search_dt = GridSearchCV(estimator=dt_model, param_grid=param_grid_dt, scoring='neg_mean_squared_error', cv=5)
grid_search_dt.fit(X_train, y_train)

# Get the best hyperparameters for Decision Tree Regressor
best_params_dt = grid_search_dt.best_params_
print("Best Hyperparameters for Decision Tree Regressor:", best_params_dt)

# GridSearchCV (refit=True by default) has already refitted the best estimator
# on the full training set, so no additional fit call is needed.
best_dt_model = grid_search_dt.best_estimator_

# Predict the target variable and evaluate. The square root is taken explicitly
# because `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred = best_dt_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error for Decision Tree Regressor:", rmse)


Best Hyperparameters for Decision Tree Regressor: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2}
Root Mean Squared Error for Decision Tree Regressor: 25.336138287921294


#### XGBoost Regressor

In [15]:
import xgboost as xgb

# Define the parameter grid for XGBoost
# (3^5 = 243 combinations, each evaluated with 5-fold CV, so this cell is slow)
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0]
}

# Initialize the XGBoost Regressor model and run a 5-fold grid search
xgb_model = xgb.XGBRegressor(random_state=42)
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, scoring='neg_mean_squared_error', cv=5)
grid_search_xgb.fit(X_train, y_train)

# Get the best hyperparameters for XGBoost
best_params_xgb = grid_search_xgb.best_params_
print("Best Hyperparameters for XGBoost:", best_params_xgb)

# GridSearchCV (refit=True by default) has already refitted the best estimator
# on the full training set, so no additional fit call is needed.
best_xgb_model = grid_search_xgb.best_estimator_

# Predict the target variable and evaluate. The square root is taken explicitly
# because `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
y_pred_xgb = best_xgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_xgb))

print("Root Mean Squared Error for XGBoost:", rmse)


Best Hyperparameters for XGBoost: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 3, 'subsample': 0.5}
Root Mean Squared Error for XGBoost: 17.88607116615562


The XGBoost regression model outperformed the Decision Tree regressor and Ridge Regression on the held-out test set, yielding the lowest RMSE of 17.9 compared to 25.3 and 33.1, respectively. On this test split, therefore, the XGBoost regressor performs better than the Decision Tree regressor, which in turn performs better than Ridge Regression.

### Test the Model

In [16]:
def predict_valuation(df, name, model):
    """Print the model's predicted valuation for a single player.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'Player' column. Its float64/int64 columns are
        passed to the model as the feature matrix.
    name : str
        Value to look up (exact match) in the 'Player' column.
    model : object
        Fitted estimator exposing a ``predict`` method.

    Raises
    ------
    ValueError
        If no row of ``df`` has ``Player == name``.
    """
    player_rows = df[df['Player'] == name]
    if player_rows.empty:
        # Fail early with a clear message instead of an opaque error from
        # predicting on an empty frame or indexing an empty result.
        raise ValueError(f"Player {name!r} not found in the 'Player' column")
    features = player_rows.select_dtypes(include=['float64', 'int64'])
    result = model.predict(features)
    # Only the first matching row is reported if the name occurs more than once.
    print(name, 'valuation: ', result[0])

In [17]:
# NOTE(review): best_model is the tuned Ridge estimator from the Ridge section,
# not the XGBoost model that scored best — confirm this is the intended model.
predict_valuation(df, name='Lewis Dunk', model=best_model)

Lewis Dunk valuation:  29.159814103202052
