In [0]:

%pip install xgboost
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.preprocessing import TargetEncoder


df = pd.read_csv("datasets/base.csv")


df['market_value_in_eur'] = df['market_value_in_eur'].fillna(0)
df = df.loc[df['market_value_in_eur']*0.4 < df['transfer_fee']].reset_index(drop=True)
df['Pos'] = df['Pos'].str.split(',').str[0]


df['transfer_season'] = df['transfer_season'].astype('category')
df['from_club_name'] = df['from_club_name'].astype('category')
df['to_club_name'] = df['to_club_name'].astype('category')
df['player_name'] = df['player_name'].astype('category')
df['transfer_date'] = df['transfer_date'].astype('category')
df['Nation'] = df['Nation'].astype('category')
df['Pos'] = df['Pos'].astype('category')
df['Squad'] = df['Squad'].astype('category')
df['Comp'] = df['Comp'].astype('category')


y = df['transfer_fee']
exclude_cols = ['transfer_fee','transfer_date','player_name','from_club_name',
                'Born','to_club_name','previous_season','transfer_season']
X = df.drop(columns=exclude_cols)


X_dummies = pd.get_dummies(X[['Pos','Comp','season_year']], drop_first=True)
encoder = TargetEncoder()
X_target_encoded = pd.DataFrame(encoder.fit_transform(X[['Nation','Squad']], y),
                                columns=['Nation','Squad'])
X = pd.concat([X.drop(columns=['Nation','Pos','Comp','Squad','season_year']),
               X_dummies, X_target_encoded], axis=1)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


mi = mutual_info_regression(X_train, y_train, random_state=42)
mi_df = pd.Series(mi, index=X_train.columns, name='mutual_info').sort_values(ascending=False)
selected_features = mi_df[mi_df > 0.01].index
X_train = X_train[selected_features]
X_test = X_test[selected_features]

## MLFLOW
mlflow.set_experiment("/Users/arielon88@gmail.com/transfer_fee_prediction")

def log_model_run(model_name, model, X_train, X_test, y_train, y_test, params=None):
    with mlflow.start_run(run_name=model_name):
        if params:
            mlflow.log_params(params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mape", mape)
        mlflow.sklearn.log_model(model, artifact_path=model_name)
        print(f"{model_name} — R2: {r2:.4f}, MAPE: {mape:.4f}")
        return model, r2, mape


# Linear Regression
lr_model, lr_r2, lr_mape = log_model_run("LinearRegression",
    LinearRegression(), X_train, X_test, y_train, y_test)

# Random Forest
rf_params = {"n_estimators": 300, "max_depth": 10, "random_state": 42}
rf_model, rf_r2, rf_mape = log_model_run("RandomForestRegressor",
    RandomForestRegressor(**rf_params), X_train, X_test, y_train, y_test, rf_params)

# XGBoost
xgb_params = {"max_depth": 5, "n_estimators": 1000, "learning_rate": 0.2, "n_jobs": -1}
xgb_model, xgb_r2, xgb_mape = log_model_run("XGBoostRegressor",
    xgb.XGBRegressor(**xgb_params), X_train, X_test, y_train, y_test, xgb_params)

# SVM
scaler = StandardScaler().fit(X_train)
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)
svm_params = {"kernel": "linear", "C": 500}
svm_model, svm_r2, svm_mape = log_model_run("SVR",
    SVR(**svm_params), X_train_std, X_test_std, y_train, y_test, svm_params)


best_model_name = max(
    [("LinearRegression", lr_r2),
     ("RandomForest", rf_r2),
     ("XGBoost", xgb_r2),
     ("SVM", svm_r2)],
    key=lambda x: x[1]
)[0]

print(f"Best model: {best_model_name}")


[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m




LinearRegression — R2: 0.8320, MAPE: 0.6559




RandomForestRegressor — R2: 0.7846, MAPE: 0.5255




XGBoostRegressor — R2: 0.7311, MAPE: 0.7065




SVR — R2: 0.8372, MAPE: 0.4874
Best model: SVM
