In [None]:
# BLOCK 1: Imports
import pandas as pd
import numpy as np
import os # CHANGED: Added 'os' for creating directories and managing file paths
import joblib # CHANGED: Added 'joblib' for saving the model and encoders
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import optuna

print("--- Data Loading and Preprocessing ---")

In [2]:
# Block 2: Load and preprocess dataset
# Make sure to update the file path to where your dataset is located.
try:
    df = pd.read_csv(r"C:\Users\aruna\OneDrive\Desktop\Major-Pro\DATASET\Processed\cleaned_final_data.csv")
except FileNotFoundError:
    print("Error: Dataset file not found. Please update the path in Block 2.")
    exit()

# Remove missing target rows
df = df.dropna(subset=["current_value"])

# Log-transform target to stabilize outliers
df["current_value"] = np.log1p(df["current_value"])

# Clip extreme outliers at 99th percentile
clip_val = df["current_value"].quantile(0.99)
df["current_value"] = np.clip(df["current_value"], None, clip_val)

# Example engineered feature: Age Bucket
df["age_bucket"] = pd.cut(df["age"], bins=[0,20,25,30,35,100],
                        labels=["<20","20-25","25-30","30-35","35+"])

print("Dataset loaded and initial processing complete.")



Dataset loaded and initial processing complete.


In [3]:
# Block 3: Label encode categorical variables
# CHANGED: Storing encoders in a dictionary is a major change.
# This is crucial for correctly processing new data in your Flask app.
cat_cols = ["team", "position", "age_bucket"]
encoders = {} # CHANGED: Initialized a dictionary to hold the fitted encoders
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le # CHANGED: Saved the fitted encoder for each column
    print(f"Label encoded '{col}' column.")

# Features & target
X = df.drop(columns=["current_value"])
y = df["current_value"]

Label encoded 'team' column.
Label encoded 'position' column.
Label encoded 'age_bucket' column.


In [4]:
# Block 4: KFold for cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [5]:
# Block 5: Baseline model score
print("\n--- Baseline Model Evaluation ---")
model = lgb.LGBMRegressor(random_state=42)
scores = cross_val_score(model, X, y, cv=kf, scoring="r2")
print(f"Baseline CV R²: {scores.mean()}")



--- Baseline Model Evaluation ---
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2892
[LightGBM] [Info] Number of data points in the train set: 8603, number of used features: 19
[LightGBM] [Info] Start training from score 13.522265
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2899
[LightGBM] [Info] Number of data points in the train set: 8603, number of used features: 19
[LightGBM] [Info] Start training from score 13.494408
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2897
[LightGBM] [Info] Num

In [6]:
# BLOCK 6: Hyperparameter tuning with Optuna
print("\n--- Hyperparameter Tuning with Optuna ---")
# CHANGED: The objective function was slightly modified to be more robust for a script
def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "random_state": 42,
        "n_estimators": 1000, # A fixed high number, early stopping will find the best
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 256),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
    }
    
    callbacks = [lgb.early_stopping(100, verbose=False)]
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric="rmse", callbacks=callbacks)
    
    preds = model.predict(X_valid)
    r2 = r2_score(y_valid, preds)
    return r2

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials= 200, show_progress_bar= True) # Using 10 trials for speed, you can increase this
print(f"Best Params from Optuna: {study.best_params}")
print(f"Best CV R² from Optuna: {study.best_value}")

[I 2025-11-04 13:59:43,709] A new study created in memory with name: no-name-fb2cf151-5aa5-4463-bf6f-2c23a074c0df



--- Hyperparameter Tuning with Optuna ---


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2025-11-04 13:59:44,156] Trial 0 finished with value: 0.4977453208699696 and parameters: {'learning_rate': 0.024204110096468914, 'num_leaves': 228, 'max_depth': 3, 'min_child_samples': 21, 'subsample': 0.7854341165406918, 'colsample_bytree': 0.5847879011277439}. Best is trial 0 with value: 0.4977453208699696.
[I 2025-11-04 13:59:45,355] Trial 1 finished with value: 0.5115195810148596 and parameters: {'learning_rate': 0.02829768746105188, 'num_leaves': 130, 'max_depth': 11, 'min_child_samples': 13, 'subsample': 0.6380924866432969, 'colsample_bytree': 0.7345700632433416}. Best is trial 1 with value: 0.5115195810148596.
[I 2025-11-04 13:59:46,214] Trial 2 finished with value: 0.5096324676790075 and parameters: {'learning_rate': 0.025171599408820373, 'num_leaves': 222, 'max_depth': 15, 'min_child_samples': 55, 'subsample': 0.6701472670940104, 'colsample_bytree': 0.931108672936467}. Best is trial 1 with value: 0.5115195810148596.
[I 2025-11-04 13:59:48,232] Trial 3 finished with value: 0

In [7]:
# BLOCK 7: Train the final model with best parameters
print("\n--- Training Final Model ---")
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# CHANGED: Fixed the TypeError from the notebook by correctly combining parameters.
final_params = study.best_params
final_params['random_state'] = 42
final_params['n_estimators'] = 5000 # High number for early stopping
final_params['objective'] = 'regression'
final_params['metric'] = 'rmse'

final_model = lgb.LGBMRegressor(**final_params)

final_model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric="rmse",
    callbacks=[lgb.early_stopping(100, verbose=True)]
)

print(f"Final model trained. Best iteration: {final_model.best_iteration_}")

y_pred = final_model.predict(X_valid)
r2 = r2_score(y_valid, y_pred)
print(f"Final Model R² on Validation Set: {r2}")



--- Training Final Model ---
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[528]	valid_0's rmse: 1.7082
Final model trained. Best iteration: 528
Final Model R² on Validation Set: 0.528412694658182


In [8]:
# BLOCK 8: Save the model and encoders
# CHANGED: This entire block is new. It handles saving the necessary files.
print("\n--- Saving Model and Encoders ---")

if not os.path.exists('models'):
    os.makedirs('models')
    print("Created 'models' directory.")

model_path = os.path.join('models', 'lgbm_model.pkl')
encoders_path = os.path.join('models', 'label_encoders.pkl')

joblib.dump(final_model, model_path)
print(f"Model saved to: {model_path}")

joblib.dump(encoders, encoders_path)
print(f"Encoders saved to: {encoders_path}")


--- Saving Model and Encoders ---
Model saved to: models\lgbm_model.pkl
Encoders saved to: models\label_encoders.pkl
