# Baseline Modeling — Competition vs. Cash (FBref + Transfermarkt)

This notebook builds quick baselines to answer:

> **Do players moving from top-5 European leagues to other European leagues experience a performance decline?**

We use the merged dataset built by `src/data/data_preprocess.py`, run cross-validation with leakage-safe grouping, and record both **classification** (decline flag) and **regression** (delta in GA/90) KPIs.


In [35]:
%ls

 Volume in drive C has no label.
 Volume Serial Number is 144C-C915

 Directory of c:\Users\Pouya\Desktop\DSBootcampP1\notebooks

10/01/2025  03:10 PM    <DIR>          .
09/30/2025  04:11 PM    <DIR>          ..
09/24/2025  04:28 PM             6,148 .DS_Store
10/01/2025  03:10 PM            66,833 baseline.ipynb
09/30/2025  03:22 PM         1,131,471 eda.ipynb
09/30/2025  03:52 PM           186,359 feature_selection.ipynb
09/30/2025  04:40 PM             6,866 pipeline_demo.ipynb
               5 File(s)      1,397,677 bytes
               2 Dir(s)  57,551,659,008 bytes free


In [36]:
# Basic
import os
import json
import numpy as np
import pandas as pd

# Modeling
from sklearn.model_selection import GroupKFold, StratifiedKFold, cross_validate
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import make_scorer, f1_score, accuracy_score, roc_auc_score, average_precision_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Display
pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 160)

# Paths (from data_preprocess.py outputs)
PATH_MERGED          = "../data/processed/merged/players_transfer_outcomes.csv"

In [37]:
# Load merged dataset
merged = pd.read_csv(PATH_MERGED)
print("Merged dataset shape:", merged.shape)
merged.head()

Merged dataset shape: (4829, 50)


Unnamed: 0.1,player_id,transfer_date,transfer_season,from_club_id,to_club_id,from_club_name,to_club_name,transfer_fee,market_value_in_eur,player_name,player_name_norm,PreMinutes,GA_pre,PostMinutes,GA_post,GA90_pre,GA90_post,PerfChange,DeclineFlag,from_league,to_league,from_league_name,to_league_name,first_name,last_name,name,last_season,current_club_id,player_code,country_of_birth,city_of_birth,country_of_citizenship,date_of_birth,sub_position,position,foot,height_in_cm,contract_expiration_date,agent_name,image_url,url,current_club_domestic_competition_id,current_club_name,market_value_in_eur_player,highest_market_value_in_eur,Unnamed: 0,Name,Attribute Vector,Percentiles,Position
0,182581,2025-02-07,24/25,265,11282,Panathinaikos Athlitikos Omilos,Alanyaspor,0.0,2000000.0,Tonny Vilhena,tonny vilhena,2935.0,3.0,540.0,3.0,0.091993,0.5,0.408007,0.0,GR1,TR1,super-league-1,super-lig,Tonny,Vilhena,Tonny Vilhena,2024,11282,tonny-vilhena,Netherlands,Maassluis,Netherlands,1995-01-03 00:00:00,Central Midfield,Midfield,left,175.0,2025-06-30 00:00:00,Team of Future,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/tonny-vilhena/...,TR1,Alanyaspor,1500000.0,14000000.0,473.0,Tonny Vilhena,"[0.16, 0.1, 1.23, 0.04, 0.07, 0.16, 2.11, 34.3...","[81, 66, 67, 26, 39, 49, 40, 14, 18, 12, 42, 4...",Midfielder
1,339340,2025-02-04,24/25,985,383,Manchester United Football Club,Eindhovense Voetbalvereniging Philips Sport Ve...,0.0,15000000.0,Tyrell Malacia,tyrell malacia,1276.0,0.0,639.0,0.0,0.0,0.0,0.0,1.0,GB1,NL1,premier-league,eredivisie,Tyrell,Malacia,Tyrell Malacia,2024,383,tyrell-malacia,Netherlands,Rotterdam,Netherlands,1999-08-17 00:00:00,Left-Back,Defender,left,169.0,2025-06-30 00:00:00,Darren Dein,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/tyrell-malacia...,NL1,Eindhovense Voetbalvereniging Philips Sport Ve...,12000000.0,22000000.0,175.0,Tyrell Malacia,"[0.0, 0.03, 0.44, 0.0, 0.06, 0.09, 1.97, 56.68...","[20, 33, 32, 11, 26, 26, 47, 66, 95, 62, 36, 4...",FullBack
2,124555,2025-02-03,24/25,430,416,Associazione Calcio Fiorentina,Torino Calcio,0.0,2000000.0,Cristiano Biraghi,cristiano biraghi,4680.0,15.0,560.0,0.0,0.288462,0.0,-0.288462,1.0,IT1,IT1,serie-a,serie-a,Cristiano,Biraghi,Cristiano Biraghi,2024,416,cristiano-biraghi,Italy,Cernusco sul Naviglio,Italy,1992-09-01 00:00:00,Left-Back,Defender,left,185.0,2025-06-30 00:00:00,,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/cristiano-bira...,IT1,Torino Calcio,2000000.0,12000000.0,40.0,Cristiano Biraghi,"[0.07, 0.04, 0.85, 0.14, 0.19, 0.23, 4.66, 70....","[73, 47, 68, 78, 93, 84, 99, 91, 21, 89, 65, 8...",FullBack
3,126719,2025-02-03,24/25,1050,3709,Villarreal Club de Fútbol S.A.D.,Getafe Club de Fútbol S.A.D. Team Dubai,0.0,2500000.0,Juan Bernat,juan bernat,1308.0,4.0,500.0,1.0,0.275229,0.18,-0.095229,1.0,ES1,ES1,laliga,laliga,Juan,Bernat,Juan Bernat,2024,3709,juan-bernat,Spain,Cullera,Spain,1993-03-01 00:00:00,Left-Back,Defender,left,170.0,2025-06-30 00:00:00,InterStarDeporte,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/juan-bernat/pr...,ES1,Getafe Club de Fútbol S.A.D. Team Dubai,2500000.0,20000000.0,38.0,Juan Bernat,"[0.05, 0.04, 0.33, 0.19, 0.15, 0.19, 1.92, 58....","[60, 50, 19, 86, 78, 67, 45, 73, 99, 6, 12, 71...",FullBack
4,157506,2025-02-03,24/25,873,371,Crystal Palace Football Club,The Celtic Football Club,0.0,6000000.0,Jeffrey Schlupp,jeffrey schlupp,2845.0,9.0,807.0,2.0,0.28471,0.223048,-0.061662,1.0,GB1,SC1,premier-league,scottish-premiership,Jeffrey,Schlupp,Jeffrey Schlupp,2024,371,jeffrey-schlupp,Germany,Hamburg,Ghana,1992-12-23 00:00:00,Left Midfield,Midfield,left,178.0,2025-05-31 00:00:00,Unique Sports Group,https://img.a.transfermarkt.technology/portrai...,https://www.transfermarkt.co.uk/jeffrey-schlup...,SC1,The Celtic Football Club,5000000.0,12000000.0,404.0,Jeffrey Schlupp,"[0.11, 0.11, 1.0, 0.04, 0.05, 0.16, 1.56, 32.0...","[66, 74, 47, 22, 23, 50, 15, 8, 11, 18, 40, 71...",Midfielder


In [38]:
merged["DeclineFlag"].isna().sum()

0

In [39]:
# Load merged dataset
merged = pd.read_csv(PATH_MERGED)

# Define target
target = "DeclineFlag"
y = merged[target].copy()

# Define features (drop target + leakage columns)
leakage_cols = [
    target, "PerfChange", "GA90_pre", "GA90_post",
    "PreMinutes", "PostMinutes", "GA_pre", "GA_post"
]

# Drop any leakage or ID columns you don’t want as features
id_cols = ["player_id", "player_name", "player_name_norm", "transfer_date"]

X_all = merged.drop(columns=[c for c in leakage_cols + id_cols if c in merged.columns])

print("Initial feature matrix shape:", X_all.shape)

# Group key to avoid leakage: group by player
groups = merged['player_name'].fillna(merged.get('player_name_norm', 'unknown')).astype(str)

# Align X with y after dropping NaNs
def align_xy(X, y):
    mask = y.notna()
    return (
        X.loc[mask].reset_index(drop=True),
        y.loc[mask].reset_index(drop=True),
        groups.loc[mask].reset_index(drop=True)
    )

X_cls, y_cls, g_cls = align_xy(X_all, y)

print("Post-filter shapes:", {"X_cls": X_cls.shape, "y_cls": y_cls.shape})

# Check distribution of classes
print("\n=== Distribution of target ===")
print(y_cls.value_counts())
print("\n=== Class balance (%) ===")
print((y_cls.value_counts(normalize=True) * 100).round(2))

Initial feature matrix shape: (4829, 38)
Post-filter shapes: {'X_cls': (4829, 38), 'y_cls': (4829,)}

=== Distribution of target ===
DeclineFlag
0.0    2937
1.0    1892
Name: count, dtype: int64

=== Class balance (%) ===
DeclineFlag
0.0    60.82
1.0    39.18
Name: proportion, dtype: float64


In [40]:
X_cls.isna().sum()

transfer_season                            0
from_club_id                               0
to_club_id                                 0
from_club_name                           212
to_club_name                             165
transfer_fee                             123
market_value_in_eur                        8
from_league                              212
to_league                                165
from_league_name                         212
to_league_name                           165
first_name                               300
last_name                                  0
name                                       0
last_season                                0
current_club_id                            0
player_code                                0
country_of_birth                           1
city_of_birth                              1
country_of_citizenship                     0
date_of_birth                              0
sub_position                               0
position  

In [41]:
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score, average_precision_score

RANDOM_STATE = 42
N_SPLITS = 5

def evaluate_classifier(name, estimator, X, y, groups, n_splits=5):
    # Use GroupKFold to avoid player leakage
    gkf = GroupKFold(n_splits=n_splits)
    scoring = {
        "accuracy": "accuracy",
        "f1": "f1",
        "roc_auc": "roc_auc",
        "pr_auc": "average_precision"
    }
    cv = cross_validate(
        estimator, X, y, 
        scoring=scoring, 
        cv=gkf.split(X, y, groups=groups),
        return_train_score=False
    )
    row = {
        "model": name,
        "acc_mean": np.mean(cv['test_accuracy']),
        "acc_std":  np.std(cv['test_accuracy']),
        "f1_mean":  np.mean(cv['test_f1']),
        "f1_std":   np.std(cv['test_f1']),
        "roc_mean": np.mean(cv['test_roc_auc']),
        "roc_std":  np.std(cv['test_roc_auc']),
        "prauc_mean": np.mean(cv['test_pr_auc']),
        "prauc_std":  np.std(cv['test_pr_auc']),
    }
    return row

def evaluate_regressor(name, estimator, X, y, groups, n_splits=5):
    gkf = GroupKFold(n_splits=n_splits)
    rows = []
    for train_idx, test_idx in gkf.split(X, y, groups=groups):
        Xtr, Xte = X.iloc[train_idx], X.iloc[test_idx]
        ytr, yte = y.iloc[train_idx], y.iloc[test_idx]
        est = estimator
        est.fit(Xtr, ytr)
        pred = est.predict(Xte)
        rows.append({
            "mae": mean_absolute_error(yte, pred),
            "rmse": np.sqrt(mean_squared_error(yte, pred)),
            "r2": r2_score(yte, pred)
        })
    df = pd.DataFrame(rows)
    row = {
        "model": name,
        "mae_mean": df["mae"].mean(),   "mae_std": df["mae"].std(),
        "rmse_mean": df["rmse"].mean(), "rmse_std": df["rmse"].std(),
        "r2_mean": df["r2"].mean(),     "r2_std": df["r2"].std(),
    }
    return row


In [42]:
# --- Select features ---
drop_cols = [
    "player_id", "player_name", "player_name_norm", "last_name", "first_name",
    "name", "url", "image_url", "player_code",
    "transfer_date", "transfer_season",   # non-numeric
    "from_club_name", "to_club_name", "from_league_name", "to_league_name",
    "current_club_name"
]

# Keep only numeric + categorical
X_raw = merged.drop(columns=[c for c in drop_cols if c in merged.columns])

# Separate categorical and numeric
categorical_cols = [c for c in X_raw.columns if X_raw[c].dtype == "object"]
numeric_cols     = [c for c in X_raw.columns if X_raw[c].dtype != "object"]

print("Categorical:", categorical_cols[:10])
print("Numeric:", numeric_cols[:10])

# --- Encode categoricals ---
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), numeric_cols),
        
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore"))
        ]), categorical_cols)
    ]
)

# Now redefine classifiers with preprocessing wrapped
from sklearn.pipeline import make_pipeline

log_reg = make_pipeline(preprocessor, LogisticRegression(max_iter=200, class_weight="balanced", random_state=RANDOM_STATE))
rf      = make_pipeline(preprocessor, RandomForestClassifier(
    n_estimators=300, min_samples_leaf=2, class_weight="balanced_subsample", random_state=RANDOM_STATE, n_jobs=-1
))

# Update evaluate calls
cls_results = []
cls_results.append(evaluate_classifier("Dummy(most_frequent)", DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE), X_raw, y_cls, g_cls))
cls_results.append(evaluate_classifier("Dummy(stratified)", DummyClassifier(strategy="stratified", random_state=RANDOM_STATE), X_raw, y_cls, g_cls))
cls_results.append(evaluate_classifier("LogisticRegression(L2)", log_reg, X_raw, y_cls, g_cls))
cls_results.append(evaluate_classifier("RandomForest", rf, X_raw, y_cls, g_cls))

cls_results_df = pd.DataFrame(cls_results).sort_values("f1_mean", ascending=False)
display(cls_results_df)

Categorical: ['from_league', 'to_league', 'country_of_birth', 'city_of_birth', 'country_of_citizenship', 'date_of_birth', 'sub_position', 'position', 'foot', 'contract_expiration_date']
Numeric: ['from_club_id', 'to_club_id', 'transfer_fee', 'market_value_in_eur', 'PreMinutes', 'GA_pre', 'PostMinutes', 'GA_post', 'GA90_pre', 'GA90_post']


Unnamed: 0,model,acc_mean,acc_std,f1_mean,f1_std,roc_mean,roc_std,prauc_mean,prauc_std
2,LogisticRegression(L2),1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,RandomForest,1.0,0.0,1.0,0.0,1.0,0.0,1.0,4.9650680000000004e-17
1,Dummy(stratified),0.521019,0.009797,0.378658,0.027199,0.494967,0.014257,0.389909,0.03573619
0,Dummy(most_frequent),0.608203,0.030641,0.0,0.0,0.5,0.0,0.391797,0.03064128


This result is a clear red flag, that strongly suggests data leakage: both LogisticRegression and RandomForest achieved perfect accuracy (1.0) across all folds, with F1, ROC AUC, and PR AUC also equal to 1.0, which is virtually impossible in a real-world football prediction setting with noisy transfer data. The reason is that some features directly encode the target or are derived from it, allowing the models to “cheat.” For example, columns such as PerfChange, GA90_post, GA90_pre, and their strict/extended/relaxed variants are literally how DeclineFlag is computed, meaning the model has access to future information. To fix this and make the classification task valid, all post-transfer performance columns must be removed (PerfChange, PerfChange_*, GA90_post, GA_post, PostMinutes, and any _extended, _strict, or _relaxed versions), since they leak future outcomes. Instead, the model should only keep pre-transfer features and static player or market information such as PreMinutes, GA_pre, GA90_pre, transfer_fee, market_value_in_eur, age (from date of birth), position, foot, height_in_cm, and club/league categorical features. Re-running with this cleaned feature set will likely reduce performance (e.g., F1 around 0.55–0.65), but the results will be realistic and trustworthy.

In [43]:
# --- Select features: drop IDs, text fields, target, and leakage columns ---
drop_cols = [
    # IDs / direct identifiers
    "player_id", "player_name", "player_name_norm", "last_name", "first_name",
    "name", "url", "image_url", "player_code",

    # Time/season identifiers
    "transfer_date", "transfer_season",

    # Club/league names
    "from_club_name", "to_club_name", "from_league_name", "to_league_name",
    "current_club_name",

    # Target itself
    "DeclineFlag",

    # Post-transfer leakage columns
    "PerfChange", "PerfChange_strict", "PerfChange_relaxed", "PerfChange_extended",
    "GA90_post", "GA90_post_strict", "GA90_post_relaxed", "GA90_post_extended",
    "GA_post", "GA_post_x", "GA_post_y",
    "PostMinutes", "PostMinutes_x", "PostMinutes_y"
]

# Filter dataset
X_raw = merged.drop(columns=[c for c in drop_cols if c in merged.columns])
y_cls = merged["DeclineFlag"]

# Keep only rows where target is not null
mask = y_cls.notnull()
X_raw = X_raw.loc[mask].reset_index(drop=True)
y_cls = y_cls.loc[mask].reset_index(drop=True)
g_cls = merged.loc[mask, "player_id"].astype(str)  # grouping key

# --- Separate categorical and numeric features ---
categorical_cols = [c for c in X_raw.columns if X_raw[c].dtype == "object"]
numeric_cols     = [c for c in X_raw.columns if X_raw[c].dtype != "object"]

print("Categorical sample:", categorical_cols[:10])
print("Numeric sample:", numeric_cols[:10])
print("X_raw shape:", X_raw.shape, " Target dist:", y_cls.value_counts().to_dict())

# --- Preprocessing pipeline ---
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), numeric_cols),
        
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore"))
        ]), categorical_cols)
    ]
)

# --- Models with preprocessing wrapped ---
log_reg = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=200, class_weight="balanced", random_state=RANDOM_STATE)
)

rf = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        n_estimators=300,
        min_samples_leaf=2,
        class_weight="balanced_subsample",
        random_state=RANDOM_STATE,
        n_jobs=-1
    )
)

Categorical sample: ['from_league', 'to_league', 'country_of_birth', 'city_of_birth', 'country_of_citizenship', 'date_of_birth', 'sub_position', 'position', 'foot', 'contract_expiration_date']
Numeric sample: ['from_club_id', 'to_club_id', 'transfer_fee', 'market_value_in_eur', 'PreMinutes', 'GA_pre', 'GA90_pre', 'last_season', 'current_club_id', 'height_in_cm']
X_raw shape: (4829, 29)  Target dist: {0.0: 2937, 1.0: 1892}


In [44]:
cls_results = []

cls_results.append(evaluate_classifier(
    "Dummy(most_frequent)",
    DummyClassifier(strategy="most_frequent", random_state=RANDOM_STATE),
    X_raw, y_cls, g_cls
))

cls_results.append(evaluate_classifier(
    "Dummy(stratified)",
    DummyClassifier(strategy="stratified", random_state=RANDOM_STATE),
    X_raw, y_cls, g_cls
))

cls_results.append(evaluate_classifier(
    "LogisticRegression(L2)",
    log_reg, X_raw, y_cls, g_cls
))

cls_results.append(evaluate_classifier(
    "RandomForest",
    rf, X_raw, y_cls, g_cls
))

cls_results_df = pd.DataFrame(cls_results).sort_values("f1_mean", ascending=False)
display(cls_results_df)

Unnamed: 0,model,acc_mean,acc_std,f1_mean,f1_std,roc_mean,roc_std,prauc_mean,prauc_std
2,LogisticRegression(L2),0.713603,0.013946,0.648096,0.014744,0.776579,0.016283,0.668957,0.041946
3,RandomForest,0.691033,0.016273,0.54882,0.020368,0.740875,0.014363,0.648236,0.035078
1,Dummy(stratified),0.521018,0.01331,0.379076,0.030593,0.494764,0.018313,0.390041,0.029807
0,Dummy(most_frequent),0.608199,0.022352,0.0,0.0,0.5,0.0,0.391801,0.022352


In [45]:
# Choose primary KPIs:
#  - Classification: F1 (primary), ROC-AUC & PR-AUC (secondary)

kpis = {
    "classification": cls_results_df.to_dict(orient="records")
}

os.makedirs("../logs", exist_ok=True)
with open("../logs/baseline_kpis.json", "w") as f:
    json.dump(kpis, f, indent=2)

print("Saved KPIs → logs/baseline_kpis.json")
display(pd.DataFrame(kpis["classification"]))

Saved KPIs → logs/baseline_kpis.json


Unnamed: 0,model,acc_mean,acc_std,f1_mean,f1_std,roc_mean,roc_std,prauc_mean,prauc_std
0,LogisticRegression(L2),0.713603,0.013946,0.648096,0.014744,0.776579,0.016283,0.668957,0.041946
1,RandomForest,0.691033,0.016273,0.54882,0.020368,0.740875,0.014363,0.648236,0.035078
2,Dummy(stratified),0.521018,0.01331,0.379076,0.030593,0.494764,0.018313,0.390041,0.029807
3,Dummy(most_frequent),0.608199,0.022352,0.0,0.0,0.5,0.0,0.391801,0.022352


In [46]:
print("Class balance (DeclineFlag):")
display(y_cls.value_counts(normalize=True).rename("proportion").to_frame())

Class balance (DeclineFlag):


Unnamed: 0_level_0,proportion
DeclineFlag,Unnamed: 1_level_1
0.0,0.6082
1.0,0.3918


In [47]:
from sklearn.inspection import permutation_importance

# --- Fit Random Forest inside pipeline ---
rf_cls = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=RANDOM_STATE,
    class_weight="balanced_subsample"
)

# Wrap with your preprocessor
rf_pipeline = make_pipeline(preprocessor, rf_cls)
rf_pipeline.fit(X_raw, y_cls)

# --- Get feature names from pipeline ---
# Numeric features (unchanged names)
num_features = numeric_cols

# Categorical features (expanded by one-hot encoder)
cat_encoder = rf_pipeline.named_steps["columntransformer"].named_transformers_["cat"].named_steps["encoder"]
cat_features = cat_encoder.get_feature_names_out(categorical_cols)

# Combine into full feature list
all_features = list(num_features) + list(cat_features)

# --- Extract feature importances ---
importances = rf_pipeline.named_steps["randomforestclassifier"].feature_importances_
feat_imp = pd.DataFrame({
    "feature": all_features,
    "importance": importances
}).sort_values("importance", ascending=False)

print("\n=== Top 25 Important Features (RandomForest) ===")
display(feat_imp.head(25))



=== Top 25 Important Features (RandomForest) ===


Unnamed: 0,feature,importance
6,GA90_pre,0.066366
5,GA_pre,0.044636
4,PreMinutes,0.034315
3,market_value_in_eur,0.027241
2117,position_Goalkeeper,0.022564
1,to_club_id,0.022416
0,from_club_id,0.022068
11,highest_market_value_in_eur,0.021658
6048,Position_GoalKeeper,0.019944
2107,sub_position_Goalkeeper,0.019241


In [48]:
# If you want one top model summary row instead of full table, uncomment below:

# # One compact summary table: top classification model
# top_cls = cls_results_df.iloc[0][["model","f1_mean","roc_mean","prauc_mean","acc_mean"]]

# summary = pd.DataFrame({
#     "Task": ["Classification"],
#     "Model": [top_cls["model"]],
#     "Primary KPI": ["F1"],
#     "KPI Value": [round(top_cls["f1_mean"], 4)],
#     "Secondary": [f"ROC={top_cls['roc_mean']:.3f}, PR-AUC={top_cls['prauc_mean']:.3f}, ACC={top_cls['acc_mean']:.3f}"]
# })

# If you want the full table with all models, use below:

# Full summary table with all classification models
summary = cls_results_df[["model","f1_mean","roc_mean","prauc_mean","acc_mean"]].copy()

display(summary)

# Save
os.makedirs("../artifacts", exist_ok=True)
summary.to_csv("../artifacts/baseline_summary.csv", index=False)
print("Saved summary → artifacts/baseline_summary.csv")

Unnamed: 0,model,f1_mean,roc_mean,prauc_mean,acc_mean
2,LogisticRegression(L2),0.648096,0.776579,0.668957,0.713603
3,RandomForest,0.54882,0.740875,0.648236,0.691033
1,Dummy(stratified),0.379076,0.494764,0.390041,0.521018
0,Dummy(most_frequent),0.0,0.5,0.391801,0.608199


Saved summary → artifacts/baseline_summary.csv
