In [3]:
import numpy as np
print(np.__version__)

1.26.4


In [4]:
%pip install xgboost

import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load dataset
df = pd.read_csv("C:/Users/KIIT/Desktop/Major-Pro/DATASET/Processed/cleaned_final_data.csv")
df.head()

Note: you may need to restart the kernel to use updated packages.


Unnamed: 0,team,position,height,age,appearance,goals,assists,yellow cards,second yellow cards,red cards,goals conceded,clean sheets,minutes played,days_injured,games_injured,award,current_value,position_encoded,winger
0,Manchester United,Goalkeeper,189.0,32.0,104,0.0,0.0,0.009585,0.0,0.0,1.217252,0.335463,9390,42,5,13,15000000,1,0
1,Manchester United,Goalkeeper,196.0,30.0,15,0.0,0.0,0.069018,0.0,0.0,1.242331,0.207055,1304,510,58,1,1500000,1,0
2,Manchester United,Goalkeeper,188.0,37.0,4,0.0,0.0,0.0,0.0,0.0,0.616438,0.924658,292,697,84,4,600000,1,0
3,Manchester United,Defender Centre-Back,175.0,25.0,82,0.02809,0.05618,0.224719,0.0,0.0,0.0,0.0,6408,175,22,9,50000000,2,0
4,Manchester United,Defender Centre-Back,191.0,30.0,63,0.017889,0.017889,0.053667,0.0,0.0,0.0,0.0,5031,238,51,21,40000000,2,0


In [5]:
# Define features (X) and target (y)
X = df.drop(columns=["current_value", "team", "position"])
y = df["current_value"]

X.shape, y.shape


((10754, 16), (10754,))

In [6]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


((8603, 16), (2151, 16))

In [7]:
# Initialize and train XGBoost model
xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

xgb_model.fit(X_train, y_train)


In [8]:
# Make predictions and evaluate
y_pred = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

rmse, r2


(6168604.879234607, 0.5792357325553894)

In [9]:
# One-hot encode team and position, keep all other features
df_encoded = pd.get_dummies(df, columns=["team", "position"], drop_first=True)

# Define features and target again
X = df_encoded.drop(columns=["current_value"])
y = df_encoded["current_value"]

X.shape


(10754, 404)

In [10]:
# Train-test split again with new encoded data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


((8603, 404), (2151, 404))

In [11]:
# Train a tuned XGBoost model
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

xgb_model.fit(X_train, y_train)


In [12]:
from sklearn.metrics import mean_absolute_error

# Make predictions and evaluate performance
y_pred = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

rmse, mae, r2


(4829514.09755381, 1964501.8203968313, 0.7420878410339355)

In [13]:
# Add ratio-based features
df_encoded["goals_per_appearance"] = df["goals"] / (df["appearance"] + 1e-6)
df_encoded["assists_per_appearance"] = df["assists"] / (df["appearance"] + 1e-6)
df_encoded["minutes_per_game"] = df["minutes played"] / (df["appearance"] + 1e-6)

# Redefine features and target
X = df_encoded.drop(columns=["current_value"])
y = df_encoded["current_value"]

X.shape


(10754, 407)

In [14]:
from sklearn.model_selection import cross_val_score, KFold

xgb_base = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

# 5-fold cross-validation (scoring with R²)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb_base, X, y, cv=cv, scoring="r2")

cv_scores.mean(), cv_scores.std()


(0.6811986684799194, 0.038464272207317046)

In [15]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define parameter grid
param_grid = {
    "n_estimators": [500, 1000, 1500],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [4, 6, 8, 10],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0]
}

xgb_model = XGBRegressor(random_state=42, n_jobs=-1)

# Randomized search with 5-fold CV
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=20,  # number of random combinations
    scoring="r2",
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X, y)

random_search.best_params_, random_search.best_score_


Fitting 3 folds for each of 20 candidates, totalling 60 fits


({'subsample': 0.6,
  'n_estimators': 1500,
  'max_depth': 4,
  'learning_rate': 0.1,
  'colsample_bytree': 0.8},
 -0.17438435554504395)

In [16]:
# Retrain with the best parameters from RandomizedSearchCV
xgb_best = XGBRegressor(
    n_estimators=1500,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.6,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

xgb_best.fit(X_train, y_train)

# Evaluate
y_pred_best = xgb_best.predict(X_test)

mse = mean_squared_error(y_test, y_pred_best)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred_best)
r2 = r2_score(y_test, y_pred_best)

rmse, mae, r2


(4721188.993290649, 2021342.4071254898, 0.7535279393196106)

In [17]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "n_estimators": [1000, 1500, 2000],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [4, 6, 8],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0]
}

xgb_model = XGBRegressor(random_state=42, n_jobs=-1)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring="r2",
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X, y)

grid_search.best_params_, grid_search.best_score_


Fitting 3 folds for each of 243 candidates, totalling 729 fits


({'colsample_bytree': 0.8,
  'learning_rate': 0.1,
  'max_depth': 4,
  'n_estimators': 2000,
  'subsample': 0.8},
 -0.11745754877726237)

In [18]:
xgb_best = XGBRegressor(
    n_estimators=1500,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.6,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

xgb_best.fit(X_train, y_train)

y_pred_best = xgb_best.predict(X_test)

mse = mean_squared_error(y_test, y_pred_best)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred_best)
r2 = r2_score(y_test, y_pred_best)

rmse, mae, r2


(4721188.993290649, 2021342.4071254898, 0.7535279393196106)

In [20]:
import joblib
joblib.dump(xgb_best, "player_value_model.pkl")
print(" Model saved as player_value_model.pkl")


 Model saved as player_value_model.pkl
