# In [None]:
# --- 1. Import Libraries ---
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import warnings
import sys # Using sys to exit cleanly if data is missing

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

# --- 2. Load Processed Data ---
print("Loading preprocessed data...")
# Paths are relative to the 'notebooks' folder, so go one level up (../) to reach the data folder.
try:
    # Keep the try body limited to the two reads that can raise FileNotFoundError.
    X_train = pd.read_csv('../data/processed/X_train_processed.csv')
    # Flatten the label table into a 1-D array (presumably a single label column; verify
    # against the pipeline output).
    y_train = pd.read_csv('../data/processed/y_train.csv').values.ravel()
except FileNotFoundError:
    print("Error: Processed data not found.")
    print("Please ensure you have run the 'run_pipeline.py' script first from the root directory.")
    # Stop with a non-zero status: a bare sys.exit() reports success (status 0),
    # which would hide the failure from whatever launched this script.
    sys.exit(1)
else:
    print("Data loaded successfully.")

# --- 3. Define the Hyperparameter Tuning Objective Function ---
def objective(trial, X, y, n_splits=5, early_stopping_rounds=100, random_state=42):
    """
    Optuna objective: mean cross-validated F1 for one sampled LightGBM configuration.

    A 'trial' is a single run with a specific set of hyperparameters. The
    hyperparameters are sampled from the trial, a fresh LGBMClassifier is
    trained on each stratified fold with early stopping against that fold's
    validation split, and the positive-class F1 of the hard predictions is
    averaged over the folds.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature table. Rows are selected positionally with ``.iloc``, so
            this must be a pandas DataFrame (or expose the same accessor).
        y: Labels aligned row-for-row with ``X``. Any array-like is accepted;
            it is converted to a flat NumPy array before indexing.
        n_splits: Number of stratified folds.
        early_stopping_rounds: Patience (in boosting rounds) passed to
            ``lgb.early_stopping``.
        random_state: Seed for the fold shuffling.

    Returns:
        The arithmetic mean of the per-fold F1 scores (pos_label=1).
    """
    # Local import keeps this function self-contained; numpy is always
    # available because pandas depends on it.
    import numpy as np

    # Normalise the labels so the fold indices below are always positional.
    # Indexing a pandas Series with an integer array would select by *label*
    # and silently pick the wrong rows when the index is not 0..n-1.
    labels = np.asarray(y).ravel()

    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        # Upper bound on boosting rounds; early stopping may end training sooner.
        'n_estimators': 1000,
        'class_weight': 'balanced',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    f1_scores = []

    for train_index, val_index in skf.split(X, labels):
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = labels[train_index], labels[val_index]

        model = lgb.LGBMClassifier(**params)
        # NOTE: LightGBM has no built-in metric named 'f1', so none is requested
        # here; early stopping monitors the 'binary_logloss' metric configured
        # in params on the validation fold. F1 is computed afterwards from the
        # hard predictions. A new callback is created per fold so no stopping
        # state is shared between fits.
        model.fit(
            X_train_fold,
            y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            callbacks=[lgb.early_stopping(early_stopping_rounds, verbose=False)],
        )

        preds = model.predict(X_val_fold)
        f1_scores.append(f1_score(y_val_fold, preds, pos_label=1))

    return sum(f1_scores) / len(f1_scores)

# --- 4. Run the Optimization ---
# This section will only run if the data was loaded successfully.
print("Starting hyperparameter tuning with Optuna...")

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# between runs, matching the fixed seed already used for the CV splits.
# TPESampler is the sampler Optuna uses by default for single-objective
# studies, so only the seeding changes, not the search algorithm.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Use a lambda function to pass X_train and y_train to our objective function
study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=50)

# --- 5. Display the Results ---
print("\nOptimization finished.")
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
# The trial with the highest objective value (the study direction is 'maximize').
trial = study.best_trial

print("  Value (F1-score): ", trial.value)
print("  Params: ")
# Only the sampled hyperparameters are listed; the fixed settings inside
# objective() are not part of trial.params.
for key, value in trial.params.items():
    print(f"    {key}: {value}")