In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three input tables (stats, valuations, wages) in one pass.
df, player_val, player_wages = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/all_stats.csv',
        './data/player_val_cleaned.csv',
        './data/player_wages.csv',
    )
)

In [3]:
# Replace every missing value in the stats table with 0
# (rebinding the name instead of mutating the frame in place).
df = df.fillna(0)

In [7]:
# Attach the 'Market value' column to the stats table; the inner join keeps
# only players present in both frames.
df_merge = df.merge(
    player_val[['Player', 'Market value']],
    how='inner',
    on='Player',
)

In [8]:
# Strip the '€' and 'm' characters from the market value text and convert the
# remainder to float.
# NOTE(review): presumably the raw values look like '€12.5m' (millions of
# euros) -- confirm against player_val_cleaned.csv.
df_merge['Market value'] = (
    df_merge['Market value']
    .astype(str)                        # keeps the cell re-runnable once the column is already float
    .str.replace('€', '', regex=False)  # literal replacement, independent of the pandas regex default
    .str.replace('m', '', regex=False)
    .astype(float)
)

In [9]:
# Join the wage table onto the merged frame by player name (inner join, the
# pandas default).
# NOTE: df_merge is rebound from itself, so re-running this cell joins the
# wage columns a second time.
df_merge = df_merge.merge(player_wages, on='Player', how='inner')

In [10]:
# Keep only the float64 / int64 columns; every other dtype is dropped.
df_numeric = df_merge.select_dtypes(
    include=[
        'float64',
        'int64',
    ]
)

In [11]:
from sklearn.model_selection import train_test_split

# The market value is the regression target; all remaining numeric columns
# are used as features.
y = df_numeric['Market value']
X = df_numeric.drop('Market value', axis=1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, so the test rows do not
# influence the scaling statistics.
scaler = StandardScaler().fit(X_train)

# Apply the training-set scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: ordinary least squares fitted on the raw (unscaled) features.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows with mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 5378.653385869333


In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# Candidate regularization strengths for the Ridge penalty.
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]
}

# Initialize the Ridge regression model
ridge_model = Ridge()

# 5-fold cross-validated search scored by negative MSE. With refit=True (the
# default, spelled out here) GridSearchCV retrains the winning configuration
# on the whole training set once the search finishes.
# NOTE(review): the search runs on the unscaled X_train; the Ridge penalty is
# sensitive to feature scale, so consider X_train_scaled or a scaler+Ridge
# pipeline -- confirm intent.
grid_search = GridSearchCV(
    estimator=ridge_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    refit=True,
)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# best_estimator_ is already fitted on all of X_train by the refit step, so no
# additional .fit() call is needed.
best_model = grid_search.best_estimator_

# Predict the target variable on the test data
y_pred = best_model.predict(X_test)

# Evaluate the model performance using Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Best Hyperparameters: {'alpha': 10.0}
Mean Squared Error: 1254.1159984344213


In [15]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Define the parameter grid for Decision Tree Regressor
param_grid_dt = {
    'max_depth': [None, 5, 10, 15],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required at each leaf node
}

# Seed the tree so the search result and the test score are reproducible
# across runs (same seed value as the train/test split).
dt_model = DecisionTreeRegressor(random_state=42)

# 5-fold cross-validated search scored by negative MSE. With refit=True (the
# default, spelled out here) GridSearchCV retrains the winning configuration
# on the whole training set once the search finishes.
grid_search_dt = GridSearchCV(
    estimator=dt_model,
    param_grid=param_grid_dt,
    scoring='neg_mean_squared_error',
    cv=5,
    refit=True,
)
grid_search_dt.fit(X_train, y_train)

# Get the best hyperparameters for Decision Tree Regressor
best_params_dt = grid_search_dt.best_params_
print("Best Hyperparameters for Decision Tree Regressor:", best_params_dt)

# best_estimator_ is already fitted on all of X_train by the refit step, so no
# additional .fit() call is needed.
best_dt_model = grid_search_dt.best_estimator_

# Predict the target variable on the test data using Decision Tree Regressor
y_pred_dt = best_dt_model.predict(X_test)

# Evaluate the Decision Tree Regressor model performance using Mean Squared Error (MSE)
mse_dt = mean_squared_error(y_test, y_pred_dt)
print("Mean Squared Error for Decision Tree Regressor:", mse_dt)


Best Hyperparameters for Decision Tree Regressor: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Mean Squared Error for Decision Tree Regressor: 1007.5179178004535


In [16]:
import xgboost as xgb

# Define the parameter grid for XGBoost (3^5 = 243 candidate combinations)
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    'max_depth': [3, 5, 7],  # Maximum depth of the trees
    'min_child_weight': [1, 3, 5],  # Minimum sum of instance weight needed in a child
    'subsample': [0.5, 0.7, 1.0],  # Subsample ratio of the training instances
    'colsample_bytree': [0.5, 0.7, 1.0]  # Subsample ratio of columns when constructing each tree
}

# Initialize the XGBoost Regressor model
xgb_model = xgb.XGBRegressor()

# 5-fold cross-validated search scored by negative MSE. With refit=True (the
# default, spelled out here) GridSearchCV retrains the winning configuration
# on the whole training set once the search finishes.
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=5,
    refit=True,
)
grid_search_xgb.fit(X_train, y_train)

# Get the best hyperparameters for XGBoost
best_params_xgb = grid_search_xgb.best_params_
print("Best Hyperparameters for XGBoost:", best_params_xgb)

# best_estimator_ is already fitted on all of X_train by the refit step, so no
# additional .fit() call is needed.
best_xgb_model = grid_search_xgb.best_estimator_

# Predict the target variable on the test data using XGBoost
y_pred_xgb = best_xgb_model.predict(X_test)

# Evaluate the XGBoost model performance using Mean Squared Error (MSE)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print("Mean Squared Error for XGBoost:", mse_xgb)


Best Hyperparameters for XGBoost: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.7}
Mean Squared Error for XGBoost: 911.2571831749323


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Same ordinary-least-squares model as before, this time fitted on the
# standardized features. This rebinds `model`, `y_pred` and `mse`.
# NOTE(review): the recorded MSE below is much larger than for the unscaled
# fit; that may point to collinear or constant feature columns -- worth
# checking.
model = LinearRegression().fit(X_train_scaled, y_train)

# Score the held-out rows (scaled with the training statistics).
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 26498.510549818464


In [18]:
def predict_valuation(df, name, model):
    """Print and return the model's predicted valuation for one player.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing a 'Player' column plus the feature columns.
    name : str
        Value to look up (exact match) in the 'Player' column.
    model : object
        Fitted estimator exposing ``predict``. When it also exposes
        ``feature_names_in_``, exactly those columns are passed to it, in
        that order; otherwise all float64/int64 columns of ``df`` are used.

    Returns
    -------
    The prediction for the first row matching ``name``.

    Raises
    ------
    ValueError
        If no row matches ``name``, or if ``df`` lacks a column listed in
        the model's ``feature_names_in_``.
    """
    player_rows = df[df['Player'] == name]
    if player_rows.empty:
        raise ValueError(f"Player {name!r} not found in the supplied DataFrame")

    expected_cols = getattr(model, 'feature_names_in_', None)
    if expected_cols is not None:
        # Align with the training columns so extra numeric columns (for
        # example the target itself) or a different column order cannot
        # break or skew the prediction.
        expected_cols = list(expected_cols)
        missing = [col for col in expected_cols if col not in player_rows.columns]
        if missing:
            raise ValueError(f"DataFrame is missing feature columns required by the model: {missing}")
        features = player_rows[expected_cols]
    else:
        features = player_rows.select_dtypes(include=['float64', 'int64'])

    result = model.predict(features)
    print(name, 'valuation: ', result[0])
    return result[0]

In [19]:
# Example lookup: valuation of one player according to the tuned Ridge model.
predict_valuation(df, name='Bruno Fernandes', model=best_model)

Bruno Fernandes valuation:  73.13600660857524


### Save Model

In [21]:
import pickle

# Serialize the tuned XGBoost model to 'model.pkl' in the working directory.
# NOTE(review): pickles are tied to the library versions that wrote them --
# confirm this is acceptable for however the file is consumed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(best_xgb_model, model_file)
