In [None]:
%pip install lightgbm optuna scikit-learn pandas matplotlib seaborn
%pip install -U scikit-learn

In [None]:
#BLOCK 1
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb
import optuna

In [10]:
# Block 2 Load dataset
df = pd.read_csv(r"C:\Users\aruna\OneDrive\Desktop\Major-Pro\DATASET\Processed\cleaned_final_data.csv")

# Remove missing target rows
df = df.dropna(subset=["current_value"])

# Log-transform target to stabilize outliers
df["current_value"] = np.log1p(df["current_value"])

# Clip extreme outliers at 99th percentile
clip_val = df["current_value"].quantile(0.99)
df["current_value"] = np.clip(df["current_value"], None, clip_val)

# Example engineered feature: Age Bucket
df["age_bucket"] = pd.cut(df["age"], bins=[0,20,25,30,35,100],
                        labels=["<20","20-25","25-30","30-35","35+"])



In [11]:
# block 3 Label encode categorical variables
cat_cols = ["team", "position", "age_bucket"]
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col].astype(str))

# Features & target
X = df.drop(columns=["current_value"])
y = df["current_value"]


In [12]:
# block 4
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [13]:
# BLOCK 5
model = lgb.LGBMRegressor(random_state=42)

scores = cross_val_score(model, X, y, cv=kf, scoring="r2")
print("Baseline CV R²:", scores.mean())


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002001 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2892
[LightGBM] [Info] Number of data points in the train set: 8603, number of used features: 19
[LightGBM] [Info] Start training from score 13.522265
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2899
[LightGBM] [Info] Number of data points in the train set: 8603, number of used features: 19
[LightGBM] [Info] Start training from score 13.494408
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

In [14]:
# BLOCK 6 — Hyperparameter tuning with Optuna (using callbacks)

def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "n_estimators": 5000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 512),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
    }
    
    cv_scores = cross_val_score(
        lgb.LGBMRegressor(**params),
        X, y, cv=kf, scoring="r2"
    )
    return cv_scores.mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
print("Best Params:", study.best_params)
print("Best CV R²:", study.best_value)


[I 2025-09-13 20:22:30,701] A new study created in memory with name: no-name-ebc60c5d-7e27-4130-85f5-9fded4443c4e
[I 2025-09-13 20:24:43,600] Trial 0 finished with value: 0.47090773006629344 and parameters: {'learning_rate': 0.01141387591739505, 'num_leaves': 371, 'max_depth': 14, 'min_child_samples': 67, 'subsample': 0.5582720729683084, 'colsample_bytree': 0.7166958871297815, 'lambda_l1': 3.5249018983427676, 'lambda_l2': 0.08600419816041646}. Best is trial 0 with value: 0.47090773006629344.
[I 2025-09-13 20:24:43,600] Trial 0 finished with value: 0.47090773006629344 and parameters: {'learning_rate': 0.01141387591739505, 'num_leaves': 371, 'max_depth': 14, 'min_child_samples': 67, 'subsample': 0.5582720729683084, 'colsample_bytree': 0.7166958871297815, 'lambda_l1': 3.5249018983427676, 'lambda_l2': 0.08600419816041646}. Best is trial 0 with value: 0.47090773006629344.
[I 2025-09-13 20:25:45,484] Trial 1 finished with value: 0.47863436514891544 and parameters: {'learning_rate': 0.0146655

Best Params: {'learning_rate': 0.020985435025234467, 'num_leaves': 280, 'max_depth': 5, 'min_child_samples': 35, 'subsample': 0.6468500139267441, 'colsample_bytree': 0.652167494030897, 'lambda_l1': 7.724505697345126e-07, 'lambda_l2': 1.535513333019838e-06}
Best CV R²: 0.48991169312820926


In [19]:
# BLOCK 7 Train with validation set and early stopping
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

final_model = lgb.LGBMRegressor(**best_params, n_estimators=5000, random_state=42)

final_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="rmse",
    early_stopping_rounds=100,
    verbose=False
)

print("Best iteration (epochs):", final_model.best_iteration_)


TypeError: lightgbm.sklearn.LGBMRegressor() got multiple values for keyword argument 'n_estimators'

In [None]:
# Evaluate on the test set
y_pred = final_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R²:", r2)

In [None]:
# Undo the log1p transform
y_test_exp = np.expm1(y_test)
y_pred_exp = np.expm1(y_pred)

mae = mean_absolute_error(y_test_exp, y_pred_exp)
rmse = mean_squared_error(y_test_exp, y_pred_exp, squared=False)
r2 = r2_score(y_test_exp, y_pred_exp)

print("MAE (original scale):", mae)
print("RMSE (original scale):", rmse)
print("R²:", r2)
