In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read Fbref_2023_1.csv (latin-1 encoded) into `df`, then trim leading and
# trailing whitespace from every column header.
df = pd.read_csv("Fbref_2023_1.csv", encoding='latin1')
df.columns = [column_name.strip() for column_name in df.columns]

In [3]:
# Normalise 'Market Value Euro': remove the commas, parse the text as numbers,
# and store anything unparseable or missing as 0 in an integer column.
mv_text = df['Market Value Euro'].str.replace(',', '')
mv_numeric = pd.to_numeric(mv_text, errors='coerce')
df['Market Value Euro'] = mv_numeric.fillna(0).astype(int)

In [4]:
# Impute zero-valued market values with the median of the strictly positive
# ones. Zero is the placeholder the cleaning step writes for missing or
# unparseable entries, so genuine zeros (if any) are imputed as well.
positive_mask = df['Market Value Euro'] > 0
market_value_median = df.loc[positive_mask, 'Market Value Euro'].median()
df['Market Value Euro'] = df['Market Value Euro'].replace(0, market_value_median)


In [5]:
# Encode categorical variables as integer codes in new '<column>_Cat' columns.
# Each source column gets its own LabelEncoder, kept in `label_encoders`, so
# the codes can be mapped back to the original labels later. Refitting a
# single shared encoder would only retain the mapping of the last column.
label_encoders = {}
for source_col in ['Pos', 'Nation', 'Squad', 'Comp']:
    le = LabelEncoder()
    df[f'{source_col}_Cat'] = le.fit_transform(df[source_col])
    label_encoders[source_col] = le
# After the loop `le` is the encoder fitted on 'Comp'.

In [6]:
# Scale every numeric column except the target with RobustScaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test-set statistics influence the scaling.
numeric_df = df.select_dtypes(include=[np.number])
features_to_scale = numeric_df.drop(columns=['Market Value Euro'])
scaler = RobustScaler()
scaled_features = scaler.fit_transform(features_to_scale)
# Carry the source index over: later cells assign columns from `df` into this
# frame, and pandas aligns those assignments on index labels. A fresh
# RangeIndex would only line up if `df` itself still had its default index.
scaled_numeric_df = pd.DataFrame(
    scaled_features,
    columns=features_to_scale.columns,
    index=features_to_scale.index,
)


In [7]:
# Copy the target and the label-encoded categorical codes over from `df`
# without scaling. The *_Cat columns are numeric and were therefore scaled with
# the other features; this assignment overwrites them with the raw codes.
for passthrough_col in ('Market Value Euro', 'Pos_Cat', 'Nation_Cat', 'Squad_Cat', 'Comp_Cat'):
    scaled_numeric_df[passthrough_col] = df[passthrough_col]

In [8]:
# Remove the 'Rk' column when present; a missing column is silently ignored.
scaled_numeric_df = scaled_numeric_df.drop(columns=['Rk'], errors='ignore')


In [9]:
# Drop one feature out of every pair whose absolute correlation exceeds
# `threshold`.
threshold = 0.95
target_col = 'Market Value Euro'
corr_matrix = scaled_numeric_df.corr().abs()
# Keep the target out of the pruning. It is part of `corr_matrix`, and if it
# were a candidate, a single feature correlated above the threshold with it
# would remove the target column itself and break the modelling cells.
feature_corr = corr_matrix.drop(index=target_col, columns=target_col, errors='ignore')
# Only the strict upper triangle is inspected, so each pair is considered once
# and the later column of a correlated pair is the one that gets dropped.
upper = feature_corr.where(np.triu(np.ones(feature_corr.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]
scaled_numeric_df = scaled_numeric_df.drop(columns=to_drop)


## Machine Learning

In [10]:
# Build the regression inputs: X holds every column except the market value,
# y_mv holds the market value transformed with log1p.
target_col = 'Market Value Euro'
X = scaled_numeric_df.drop(columns=[target_col])
y_mv = np.log1p(scaled_numeric_df[target_col])


In [11]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
split_parts = train_test_split(X, y_mv, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Random Forest

In [12]:
# Baseline random forest with default hyperparameters and a fixed seed,
# fitted on the training split (log1p target).
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X=X_train, y=y_train)

In [13]:
# Score the baseline forest on the test split. Targets and predictions are
# mapped back from log1p to euros with expm1 before computing the metrics.
y_pred_rf = rf_regressor.predict(X_test)
actual_eur = np.expm1(y_test)
predicted_eur = np.expm1(y_pred_rf)
print(
    "Random Forest Regressor:",
    f"Mean Squared Error: {mean_squared_error(actual_eur, predicted_eur)}",
    f"R^2 Score: {r2_score(actual_eur, predicted_eur)}",
    sep="\n",
)


Random Forest Regressor:
Mean Squared Error: 127645585687722.38
R^2 Score: 0.3699045747078502


In [14]:
# Tabulate the baseline forest's feature importances, most important first.
# The frame is built with a default integer index in X.columns order, so after
# sorting each row's index label is still that feature's position in X.
rf_feature_importance = rf_regressor.feature_importances_
rf_feature_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': rf_feature_importance}
)
rf_feature_importance_df = rf_feature_importance_df.sort_values('Importance', ascending=False)

In [15]:
# Names of the 50 most important features, in descending importance order.
top_50_features = rf_feature_importance_df['Feature'].iloc[:50].values
print("Top 50 Features:", top_50_features, sep="\n")

Top 50 Features:
['Born' '90s' 'MP' 'Comp_Cat' 'Rec' 'Squad_Cat' 'Goals' 'TouAttPen' 'SCA'
 'PasShoCmp' 'Nation_Cat' 'SoT' 'Pas3rd' 'CarProg' 'ToSuc' 'PasTotCmp'
 'Fld' 'GcaPassLive' 'CarTotDist' 'TouAtt3rd' 'AerLost' 'TouMid3rd'
 'PasMedCmp' 'PasDead' 'ScaPassLive' 'Clr' 'PasTotDist' 'Recov' 'Fls'
 'Shots' 'CarPrgDist' 'PasBlocks' 'TouDef3rd' 'CarMis' 'ShoDist' 'RecProg'
 'AerWon' 'TouDefPen' 'Carries' 'TklDriAtt' 'PPA' 'PasLonAtt'
 'PasTotPrgDist' 'PasOff' 'PasProg' 'Car3rd' 'GCA' 'Int' 'TI' 'ScaSh']


In [16]:
# Index labels of the 50 most important rows. Because the importance table was
# created with a default integer index in X.columns order and then sorted,
# these labels are the column positions of those features in X.
top_50_feature_indices = rf_feature_importance_df.index[:50]

In [17]:
# Restrict the training and testing sets to the top 50 features, in importance
# order. Columns are selected by name, not by position, so the subset does not
# depend on the importance table's index labels matching X's column positions.
X_train_top_50 = X_train.loc[:, top_50_features]
X_test_top_50 = X_test.loc[:, top_50_features]

In [18]:
# Second random forest (same seed, default hyperparameters) trained only on
# the 50 selected feature columns.
rf_regressor_top_50 = RandomForestRegressor(random_state=42)
rf_regressor_top_50.fit(X=X_train_top_50, y=y_train)

In [19]:
# Score the top-50 forest on the test split, in euros: targets and predictions
# are mapped back from log1p with expm1 before computing the metrics.
y_pred_rf_top_50 = rf_regressor_top_50.predict(X_test_top_50)
actual_eur = np.expm1(y_test)
predicted_eur_top_50 = np.expm1(y_pred_rf_top_50)
print(
    "Random Forest Regressor with Top 50 Features:",
    f"Mean Squared Error: {mean_squared_error(actual_eur, predicted_eur_top_50)}",
    f"R^2 Score: {r2_score(actual_eur, predicted_eur_top_50)}",
    sep="\n",
)


Random Forest Regressor with Top 50 Features:
Mean Squared Error: 126677491740206.3
R^2 Score: 0.37468336564132765


In [20]:
# Importances of the forest trained on the 50 selected features. The model's
# columns are in `top_50_features` order, so the two arrays line up row by row
# before the table is sorted by importance.
rf_feature_importance_top_50 = rf_regressor_top_50.feature_importances_
rf_feature_importance_top_50_df = pd.DataFrame(
    {'Feature': top_50_features, 'Importance': rf_feature_importance_top_50}
)
rf_feature_importance_top_50_df = rf_feature_importance_top_50_df.sort_values(
    'Importance', ascending=False
)

print(
    "Feature Importance for RandomForestRegressor with Top 50 Features:",
    rf_feature_importance_top_50_df,
    sep="\n",
)

Feature Importance for RandomForestRegressor with Top 50 Features:
          Feature  Importance
0            Born    0.150161
1             90s    0.102019
2              MP    0.090259
3        Comp_Cat    0.069208
4             Rec    0.057146
5       Squad_Cat    0.032880
6           Goals    0.026049
7       TouAttPen    0.022714
8             SCA    0.018482
10     Nation_Cat    0.016521
11            SoT    0.016210
9       PasShoCmp    0.015879
12         Pas3rd    0.014337
13        CarProg    0.013949
14          ToSuc    0.013363
15      PasTotCmp    0.012720
16            Fld    0.012442
27          Recov    0.012109
19      TouAtt3rd    0.011656
20        AerLost    0.011545
17    GcaPassLive    0.011490
23        PasDead    0.011481
28            Fls    0.011424
39      TklDriAtt    0.011017
18     CarTotDist    0.011004
25            Clr    0.010724
21      TouMid3rd    0.010299
30     CarPrgDist    0.010257
34        ShoDist    0.010191
26     PasTotDist    0.010116
24 

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Candidate hyperparameter values sampled by the random forest search.
rf_params = dict(
    n_estimators=list(range(100, 501, 100)),   # 100, 200, ..., 500 trees
    max_depth=[None, *range(10, 41, 10)],      # unlimited, then 10..40
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

In [22]:
# Initialize the RandomForestRegressor
# Unfitted, seeded estimator that the randomized search below uses as its base.
# NOTE: this rebinds `rf_regressor`, so the baseline forest fitted earlier in
# the notebook is no longer reachable under this name once this cell has run.
rf_regressor = RandomForestRegressor(random_state=42)

### Randomized search for RF

In [23]:
# Randomized hyperparameter search over `rf_params`: 100 sampled candidates,
# each scored with 3-fold cross-validation on the training split, using all
# available cores.
search_settings = dict(n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
rf_random_search = RandomizedSearchCV(
    estimator=rf_regressor,
    param_distributions=rf_params,
    **search_settings,
)
rf_random_search.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [24]:
# Get the best estimator
# Taken from the fitted randomized-search object; used below for test-set
# predictions.
best_rf = rf_random_search.best_estimator_

In [25]:
# Evaluate the tuned random forest on the test split.
y_pred_rf = best_rf.predict(X_test)

# Metrics on the log1p scale the model was trained on. These are comparable
# with the other models scored directly against y_test.
mse = mean_squared_error(y_test, y_pred_rf)
r2 = r2_score(y_test, y_pred_rf)

# The baseline random forest cells report their metrics in euros (after
# expm1), so the log-scale numbers above cannot be compared with them. Compute
# the euro-scale metrics as well so tuned vs. baseline is like-for-like.
y_test_eur = np.expm1(y_test)
y_pred_rf_eur = np.expm1(y_pred_rf)
mse_eur = mean_squared_error(y_test_eur, y_pred_rf_eur)
r2_eur = r2_score(y_test_eur, y_pred_rf_eur)

print("Best Random Forest Regressor after Tuning:")
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")
print(f"Mean Squared Error (euro scale): {mse_eur}")
print(f"R^2 Score (euro scale): {r2_eur}")

Best Random Forest Regressor after Tuning:
Mean Squared Error: 0.8613060296814602
R^2 Score: 0.5413319415895133


### Ensemble and other ML models

In [26]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyperparameters and a fixed seed, fitted on
# the training split.
gb_regressor = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# Score on the test split. y_test is the log1p-transformed market value, so
# both metrics are on the log scale, not in euros.
y_pred_gb = gb_regressor.predict(X_test)
mse_gb, r2_gb = mean_squared_error(y_test, y_pred_gb), r2_score(y_test, y_pred_gb)

print(
    "Gradient Boosting Regressor:",
    f"Mean Squared Error: {mse_gb}",
    f"R^2 Score: {r2_gb}",
    sep="\n",
)

Gradient Boosting Regressor:
Mean Squared Error: 0.786941101966585
R^2 Score: 0.5809332166687469


In [27]:
from sklearn.svm import SVR

# Support vector regressor with default hyperparameters, fitted on the
# training split.
svr_regressor = SVR().fit(X_train, y_train)

# Score on the test split. y_test is the log1p-transformed market value, so
# both metrics are on the log scale, not in euros.
y_pred_svr = svr_regressor.predict(X_test)
mse_svr, r2_svr = mean_squared_error(y_test, y_pred_svr), r2_score(y_test, y_pred_svr)

print(
    "Support Vector Regressor:",
    f"Mean Squared Error: {mse_svr}",
    f"R^2 Score: {r2_svr}",
    sep="\n",
)

Support Vector Regressor:
Mean Squared Error: 1.3954260665300149
R^2 Score: 0.25689900855865144


In [28]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron with default hyperparameters and a fixed seed, fitted
# on the training split.
mlp_regressor = MLPRegressor(random_state=42).fit(X_train, y_train)

# Score on the test split. y_test is the log1p-transformed market value, so
# both metrics are on the log scale, not in euros.
y_pred_mlp = mlp_regressor.predict(X_test)
mse_mlp, r2_mlp = mean_squared_error(y_test, y_pred_mlp), r2_score(y_test, y_pred_mlp)

print(
    "Multi-layer Perceptron Regressor:",
    f"Mean Squared Error: {mse_mlp}",
    f"R^2 Score: {r2_mlp}",
    sep="\n",
)

Multi-layer Perceptron Regressor:
Mean Squared Error: 3.9810860205727554
R^2 Score: -1.1200327554847744




In [29]:
from sklearn.neighbors import KNeighborsRegressor

# K-nearest-neighbours regressor with default hyperparameters, fitted on the
# training split.
knn_regressor = KNeighborsRegressor().fit(X_train, y_train)

# Score on the test split. y_test is the log1p-transformed market value, so
# both metrics are on the log scale, not in euros.
y_pred_knn = knn_regressor.predict(X_test)
mse_knn, r2_knn = mean_squared_error(y_test, y_pred_knn), r2_score(y_test, y_pred_knn)

print(
    "K-Nearest Neighbors Regressor:",
    f"Mean Squared Error: {mse_knn}",
    f"R^2 Score: {r2_knn}",
    sep="\n",
)

K-Nearest Neighbors Regressor:
Mean Squared Error: 1.7228722411581672
R^2 Score: 0.08252532954681313


In [30]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost regressor with default hyperparameters and a fixed seed, fitted on
# the training split.
ada_regressor = AdaBoostRegressor(random_state=42).fit(X_train, y_train)

# Score on the test split. y_test is the log1p-transformed market value, so
# both metrics are on the log scale, not in euros.
y_pred_ada = ada_regressor.predict(X_test)
mse_ada, r2_ada = mean_squared_error(y_test, y_pred_ada), r2_score(y_test, y_pred_ada)

print(
    "AdaBoost Regressor:",
    f"Mean Squared Error: {mse_ada}",
    f"R^2 Score: {r2_ada}",
    sep="\n",
)

AdaBoost Regressor:
Mean Squared Error: 1.1274324260552322
R^2 Score: 0.39961265331089657


In [31]:
from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees regressor with default hyperparameters and a fixed seed, fitted
# on the training split.
et_regressor = ExtraTreesRegressor(random_state=42).fit(X_train, y_train)

# Score on the test split. y_test is the log1p-transformed market value, so
# both metrics are on the log scale, not in euros.
y_pred_et = et_regressor.predict(X_test)
mse_et, r2_et = mean_squared_error(y_test, y_pred_et), r2_score(y_test, y_pred_et)

print(
    "Extra Trees Regressor:",
    f"Mean Squared Error: {mse_et}",
    f"R^2 Score: {r2_et}",
    sep="\n",
)

Extra Trees Regressor:
Mean Squared Error: 0.8833058632031092
R^2 Score: 0.5296164530418939


In [32]:
from sklearn.linear_model import Ridge

# Ridge regression with default regularisation and a fixed seed, fitted on the
# training split.
ridge_regressor = Ridge(random_state=42).fit(X_train, y_train)

# Score on the test split. y_test is the log1p-transformed market value, so
# both metrics are on the log scale, not in euros.
y_pred_ridge = ridge_regressor.predict(X_test)
mse_ridge, r2_ridge = mean_squared_error(y_test, y_pred_ridge), r2_score(y_test, y_pred_ridge)

print(
    "Ridge Regression:",
    f"Mean Squared Error: {mse_ridge}",
    f"R^2 Score: {r2_ridge}",
    sep="\n",
)

Ridge Regression:
Mean Squared Error: 1.377904405198307
R^2 Score: 0.26622975292382534


In [33]:
from sklearn.linear_model import Lasso

# Lasso regression with default regularisation and a fixed seed, fitted on the
# training split.
lasso_regressor = Lasso(random_state=42).fit(X_train, y_train)

# Score on the test split. y_test is the log1p-transformed market value, so
# both metrics are on the log scale, not in euros.
y_pred_lasso = lasso_regressor.predict(X_test)
mse_lasso, r2_lasso = mean_squared_error(y_test, y_pred_lasso), r2_score(y_test, y_pred_lasso)

print(
    "Lasso Regression:",
    f"Mean Squared Error: {mse_lasso}",
    f"R^2 Score: {r2_lasso}",
    sep="\n",
)

Lasso Regression:
Mean Squared Error: 1.8802834174410827
R^2 Score: -0.0013002517329294694
