# Baseline Modeling — Competition vs. Cash (FBref + Transfermarkt)

This notebook builds quick baselines to answer:

> **Do players moving from top-5 European leagues to other European leagues experience a performance decline?**

We use the merged dataset built by `src/data/data_preprocess.py`, run cross-validation with leakage-safe grouping, and record both **classification** (decline flag) and **regression** (delta in GA/90) KPIs.


In [2]:
!ls

baseline.ipynb


In [12]:
# Basic
import os
import json
import numpy as np
import pandas as pd

# Modeling
from sklearn.model_selection import GroupKFold, StratifiedKFold, cross_validate
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import make_scorer, f1_score, accuracy_score, roc_auc_score, average_precision_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Display
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 160)

# Paths (from data_preprocess.py outputs)
PATH_MERGED          = "../data/processed/merged/players_transfer_outcomes.csv"
PATH_CLEAN           = "../data/processed/merged_clean.csv"
PATH_MODEL_READY     = "../data/processed/merged_model_ready.csv"
PATH_PIPELINE        = "../artifacts/preprocessor.pkl"

RANDOM_STATE = 42
N_SPLITS     = 5

# Ensure files exist
for p in [PATH_MERGED, PATH_CLEAN, PATH_MODEL_READY]:
    if not os.path.exists(p):
        raise FileNotFoundError(f"Missing required file: {p} — Run src/data/data_preprocess.py first.")


In [13]:
merged = pd.read_csv(PATH_MERGED)
clean  = pd.read_csv(PATH_CLEAN)
X_all  = pd.read_csv(PATH_MODEL_READY)

print("Shapes:", {"merged": merged.shape, "clean": clean.shape, "X_all(model_ready)": X_all.shape})
display(merged.head(3))
display(merged[['player_name','from_league','to_league','transfer_date','GA90_pre','GA90_post','PerfChange_1y','DeclineFlag_1y']].head(10))


Shapes: {'merged': (2056, 21), 'clean': (2056, 28), 'X_all(model_ready)': (2056, 5385)}


Unnamed: 0,player_name,player_name_norm,player_id,transfer_date,from_club_id,to_club_id,from_league,to_league,GA90_pre,GA90_post,PerfChange_1y,DeclineFlag_1y,PreMinutes,PostMinutes,InsufficientPreData,InsufficientPostData,UsedPostWindowDays,fbref_name,Position,FBref_Percentile_Mean,FBref_AttrVec_Mean
0,Rui Silva,rui silva,234811,2025-07-01,150,336,ES1,PO1,0.0,,,,2700.0,0.0,0,1,,,,,
1,Felix Uduokhai,felix uduokhai,278343,2025-07-01,167,114,L1,TR1,0.0,,,,2440.0,0.0,0,1,,,,,
2,Sofyan Amrabat,sofyan amrabat,287579,2025-07-01,430,36,IT1,TR1,0.157729,,,,2853.0,0.0,0,1,,,,,


Unnamed: 0,player_name,from_league,to_league,transfer_date,GA90_pre,GA90_post,PerfChange_1y,DeclineFlag_1y
0,Rui Silva,ES1,PO1,2025-07-01,0.0,,,
1,Felix Uduokhai,L1,TR1,2025-07-01,0.0,,,
2,Sofyan Amrabat,IT1,TR1,2025-07-01,0.157729,,,
3,Kieran Tierney,GB1,SC1,2025-07-01,0.0,,,
4,Ismail Jakobs,FR1,TR1,2025-07-01,0.069659,,,
5,Borna Barisic,ES1,TR1,2025-06-30,0.233161,,,
6,Miha Zajc,FR1,TR1,2025-06-30,0.0,,,
7,Chuba Akpom,FR1,NL1,2025-06-30,0.521327,,,
8,Jakob Busk,L1,DK1,2025-06-30,0.0,,,
9,Denis Vavro,L1,DK1,2025-06-30,0.057711,,,


In [14]:
merged["DeclineFlag_1y"].isna().sum()

np.int64(1170)

In [15]:
# Classification target: did performance decline >= 20%? (already computed in preprocessor)
y_cls = merged['DeclineFlag_1y'].copy()

# Regression target: absolute change in GA/90 (post - pre)
y_reg = merged['PerfChange_1y'].copy()

# Group key to avoid leakage: group by player across folds
groups = merged['player_name'].fillna(merged.get('player_name_norm', 'unknown')).astype(str)

# Align X with y after dropping NaNs in targets
def align_xy(X, y):
    mask = y.notna()
    return X.loc[mask].reset_index(drop=True), y.loc[mask].reset_index(drop=True), groups.loc[mask].reset_index(drop=True)

X_cls, y_cls, g_cls = align_xy(X_all, y_cls)
X_reg, y_reg, g_reg = align_xy(X_all, y_reg)

print("Post-filter shapes:", {"X_cls": X_cls.shape, "y_cls": y_cls.shape, "X_reg": X_reg.shape, "y_reg": y_reg.shape})
y_cls.value_counts(dropna=False)


Post-filter shapes: {'X_cls': (886, 5385), 'y_cls': (886,), 'X_reg': (1378, 5385), 'y_reg': (1378,)}


DeclineFlag_1y
0.0    586
1.0    300
Name: count, dtype: int64

In [23]:
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score, average_precision_score


def evaluate_classifier(name, estimator, X, y, groups, n_splits=5):
    # Use GroupKFold to avoid player leakage
    gkf = GroupKFold(n_splits=n_splits)
    scoring = {
        "accuracy": "accuracy",
        "f1": "f1",
        "roc_auc": "roc_auc",
        "pr_auc": "average_precision"
    }
    cv = cross_validate(
        estimator, X, y, 
        scoring=scoring, 
        cv=gkf.split(X, y, groups=groups),
        return_train_score=False
    )
    row = {
        "model": name,
        "acc_mean": np.mean(cv['test_accuracy']),
        "acc_std":  np.std(cv['test_accuracy']),
        "f1_mean":  np.mean(cv['test_f1']),
        "f1_std":   np.std(cv['test_f1']),
        "roc_mean": np.mean(cv['test_roc_auc']),
        "roc_std":  np.std(cv['test_roc_auc']),
        "prauc_mean": np.mean(cv['test_pr_auc']),
        "prauc_std":  np.std(cv['test_pr_auc']),
    }
    return row

def evaluate_regressor(name, estimator, X, y, groups, n_splits=5):
    gkf = GroupKFold(n_splits=n_splits)
    rows = []
    for train_idx, test_idx in gkf.split(X, y, groups=groups):
        Xtr, Xte = X.iloc[train_idx], X.iloc[test_idx]
        ytr, yte = y.iloc[train_idx], y.iloc[test_idx]
        est = estimator
        est.fit(Xtr, ytr)
        pred = est.predict(Xte)
        rows.append({
            "mae": mean_absolute_error(yte, pred),
            "rmse": np.sqrt(mean_squared_error(yte, pred)),
            "r2": r2_score(yte, pred)
        })
    df = pd.DataFrame(rows)
    row = {
        "model": name,
        "mae_mean": df["mae"].mean(),   "mae_std": df["mae"].std(),
        "rmse_mean": df["rmse"].mean(), "rmse_std": df["rmse"].std(),
        "r2_mean": df["r2"].mean(),     "r2_std": df["r2"].std(),
    }
    return row


In [24]:
cls_results = []

# 1) Trivial baselines
cls_results.append(
    evaluate_classifier(
        "Dummy(most_frequent)", 
        DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE), 
        X_cls, y_cls, g_cls, n_splits=N_SPLITS
    )
)

cls_results.append(
    evaluate_classifier(
        "Dummy(stratified)", 
        DummyClassifier(strategy="stratified", random_state=RANDOM_STATE), 
        X_cls, y_cls, g_cls, n_splits=N_SPLITS
    )
)

# 2) Linear baseline
cls_results.append(
    evaluate_classifier(
        "LogisticRegression(l2)", 
        LogisticRegression(max_iter=200, class_weight="balanced", random_state=RANDOM_STATE, n_jobs=None), 
        X_cls, y_cls, g_cls, n_splits=N_SPLITS
    )
)

# 3) Tree baseline
cls_results.append(
    evaluate_classifier(
        "RandomForestClassifier", 
        RandomForestClassifier(
            n_estimators=300, max_depth=None, min_samples_leaf=2, n_jobs=-1, random_state=RANDOM_STATE, class_weight="balanced_subsample"
        ), 
        X_cls, y_cls, g_cls, n_splits=N_SPLITS
    )
)

cls_results_df = pd.DataFrame(cls_results).sort_values("f1_mean", ascending=False)
display(cls_results_df)


Unnamed: 0,model,acc_mean,acc_std,f1_mean,f1_std,roc_mean,roc_std,prauc_mean,prauc_std
2,LogisticRegression(l2),1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,RandomForestClassifier,0.99548,0.006589,0.992885,0.010492,1.0,0.0,1.0,0.0
1,Dummy(stratified),0.560922,0.024509,0.319266,0.05594,0.498042,0.034489,0.340422,0.041234
0,Dummy(most_frequent),0.661436,0.028097,0.0,0.0,0.5,0.0,0.338564,0.028097


The dummy baselines behave normally: ~66% accuracy for “always no-decline” and ~56% with random guessing, showing the true difficulty of the task.
But logistic regression and random forest scoring 1.0 across all metrics is a clear sign of target leakage.
You need to restrict X to only pre-transfer features and keep all post-transfer info (GA90_post, PerfChange, DeclineFlag) strictly as the target.

In [25]:
reg_results = []

# 1) Trivial baseline
reg_results.append(
    evaluate_regressor(
        "Dummy(mean)", 
        DummyRegressor(strategy="mean"), 
        X_reg, y_reg, g_reg, n_splits=N_SPLITS
    )
)

# 2) Linear baseline
reg_results.append(
    evaluate_regressor(
        "LinearRegression", 
        LinearRegression(n_jobs=None), 
        X_reg, y_reg, g_reg, n_splits=N_SPLITS
    )
)

# 3) Tree baseline
reg_results.append(
    evaluate_regressor(
        "RandomForestRegressor", 
        RandomForestRegressor(n_estimators=400, max_depth=None, min_samples_leaf=2, n_jobs=-1, random_state=RANDOM_STATE), 
        X_reg, y_reg, g_reg, n_splits=N_SPLITS
    )
)

reg_results_df = pd.DataFrame(reg_results).sort_values("rmse_mean", ascending=True)
display(reg_results_df)


Unnamed: 0,model,mae_mean,mae_std,rmse_mean,rmse_std,r2_mean,r2_std
2,RandomForestRegressor,0.017411,0.008598,0.105616,0.070444,0.923668,0.068299
1,LinearRegression,0.141541,0.02941,0.31612,0.069959,0.307688,0.191527
0,Dummy(mean),0.21446,0.019124,0.38823,0.093905,-0.004674,0.004162


In [26]:
# Choose primary KPIs:
#  - Classification: F1 (primary), ROC-AUC & PR-AUC (secondary)
#  - Regression: RMSE (primary), MAE & R^2 (secondary)

kpis = {
    "classification": cls_results_df.to_dict(orient="records"),
    "regression": reg_results_df.to_dict(orient="records")
}

os.makedirs("logs", exist_ok=True)
with open("logs/baseline_kpis.json", "w") as f:
    json.dump(kpis, f, indent=2)

print("Saved KPIs → logs/baseline_kpis.json")
display(pd.DataFrame(kpis["classification"]))
display(pd.DataFrame(kpis["regression"]))


Saved KPIs → logs/baseline_kpis.json


Unnamed: 0,model,acc_mean,acc_std,f1_mean,f1_std,roc_mean,roc_std,prauc_mean,prauc_std
0,LogisticRegression(l2),1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,RandomForestClassifier,0.99548,0.006589,0.992885,0.010492,1.0,0.0,1.0,0.0
2,Dummy(stratified),0.560922,0.024509,0.319266,0.05594,0.498042,0.034489,0.340422,0.041234
3,Dummy(most_frequent),0.661436,0.028097,0.0,0.0,0.5,0.0,0.338564,0.028097


Unnamed: 0,model,mae_mean,mae_std,rmse_mean,rmse_std,r2_mean,r2_std
0,RandomForestRegressor,0.017411,0.008598,0.105616,0.070444,0.923668,0.068299
1,LinearRegression,0.141541,0.02941,0.31612,0.069959,0.307688,0.191527
2,Dummy(mean),0.21446,0.019124,0.38823,0.093905,-0.004674,0.004162


In [27]:
print("Class balance (DeclineFlag_1y):")
display(y_cls.value_counts(normalize=True).rename("proportion").to_frame())

print("\nGA/90 change summary:")
display(y_reg.describe(percentiles=[0.1, 0.5, 0.9]))

# Optional: look at minutes gating indirectly via NaNs filtered earlier
missing_targets = merged[['GA90_pre','GA90_post','PerfChange_1y','DeclineFlag_1y']].isna().mean().rename("missing_rate")
display(missing_targets.to_frame())


Class balance (DeclineFlag_1y):


Unnamed: 0_level_0,proportion
DeclineFlag_1y,Unnamed: 1_level_1
0.0,0.6614
1.0,0.3386



GA/90 change summary:


count    1378.000000
mean        0.058586
std         0.397197
min        -5.312500
10%        -0.222011
50%         0.038869
90%         0.420568
max         3.333333
Name: PerfChange_1y, dtype: float64

Unnamed: 0,missing_rate
GA90_pre,0.294261
GA90_post,0.06858
PerfChange_1y,0.329767
DeclineFlag_1y,0.569066


In [28]:
# This is illustrative — not for final causal claims.
rf_cls = RandomForestClassifier(
    n_estimators=400, max_depth=None, min_samples_leaf=2, n_jobs=-1, random_state=RANDOM_STATE, class_weight="balanced_subsample"
)
rf_cls.fit(X_cls, y_cls)

feat_imp = pd.DataFrame({
    "feature": X_cls.columns,
    "importance": rf_cls.feature_importances_
}).sort_values("importance", ascending=False)

display(feat_imp.head(25))


Unnamed: 0,feature,importance
3,num__DeclineFlag_1y,0.288321
2,num__PerfChange_1y,0.271034
1,num__GA90_post,0.126579
5,num__PostMinutes,0.027643
0,num__GA90_pre,0.025334
4,num__PreMinutes,0.013501
5152,from_club_id_1082,0.003536
5157,from_club_id_1123,0.003167
5264,to_club_id_720,0.002988
5204,from_club_id_23826,0.002492


In [29]:
# One compact table with top model from each task
top_cls = cls_results_df.iloc[0][["model","f1_mean","roc_mean","prauc_mean","acc_mean"]]
top_reg = reg_results_df.iloc[0][["model","rmse_mean","mae_mean","r2_mean"]]

summary = pd.DataFrame({
    "Task": ["Classification","Regression"],
    "Model": [top_cls["model"], top_reg["model"]],
    "Primary KPI": ["F1", "RMSE"],
    "KPI Value": [round(top_cls["f1_mean"],4), round(top_reg["rmse_mean"],4)],
    "Secondary": [f"ROC={top_cls['roc_mean']:.3f}, PR-AUC={top_cls['prauc_mean']:.3f}, ACC={top_cls['acc_mean']:.3f}",
                  f"MAE={top_reg['mae_mean']:.3f}, R2={top_reg['r2_mean']:.3f}"]
})
display(summary)

os.makedirs("artifacts", exist_ok=True)
summary.to_csv("artifacts/baseline_summary.csv", index=False)
print("Saved summary → artifacts/baseline_summary.csv")


Unnamed: 0,Task,Model,Primary KPI,KPI Value,Secondary
0,Classification,LogisticRegression(l2),F1,1.0,"ROC=1.000, PR-AUC=1.000, ACC=1.000"
1,Regression,RandomForestRegressor,RMSE,0.1056,"MAE=0.017, R2=0.924"


Saved summary → artifacts/baseline_summary.csv
