In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# --- 0. Load the Data ---
try:
    train_df = pd.read_csv('/kaggle/input/playground-series-s5e8/train.csv')
    test_df = pd.read_csv('/kaggle/input/playground-series-s5e8/test.csv')
    print("Files loaded successfully!")
except FileNotFoundError as e:
    print(f"File not found. Please ensure the data files are in the correct directory. Error: {e}")
    exit()

# --- 1. Data Preparation (Same as before) ---
X = train_df.drop('y', axis=1)
y = train_df['y']
test_ids = test_df['id']
combined_df = pd.concat([X, test_df], ignore_index=True)
categorical_features = combined_df.select_dtypes(include=['object']).columns
combined_df = pd.get_dummies(combined_df, columns=categorical_features, drop_first=True)
X_processed = combined_df.iloc[:len(train_df)].drop('id', axis=1)
X_test_processed = combined_df.iloc[len(train_df):].drop('id', axis=1)

# --- 2. Hyperparameter Tuning Setup ---

# Handle class imbalance for LightGBM
scale_pos_weight = y.value_counts()[0] / y.value_counts()[1]

# Initialize the LightGBM model
lgbm = lgb.LGBMClassifier(objective='binary',
                          random_state=42,
                          n_jobs=-1,
                          scale_pos_weight=scale_pos_weight)

# Define a smaller grid of parameters to search.
# A larger grid would give better results but take much longer.
param_grid = {
    'n_estimators': [400, 600],
    'learning_rate': [0.05, 0.1],
    'num_leaves': [31, 40],
}

# Set up the Grid Search with 3-fold cross-validation
# The model will be evaluated using the ROC AUC score
grid_search = GridSearchCV(estimator=lgbm,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=3,       # 3-fold cross-validation
                           verbose=2)  # Shows progress

# --- 3. Model Training ---
print("Starting Hyperparameter Tuning with GridSearchCV... This will take a significant amount of time.")
grid_search.fit(X_processed, y)
print("Tuning complete.")

# Print the best parameters found
print("\nBest parameters found by Grid Search:")
print(grid_search.best_params_)
print(f"Best ROC AUC score on validation data: {grid_search.best_score_:.5f}")


# --- 4. Prediction and Submission ---
print("\nMaking predictions using the best model...")
# The grid_search object automatically retains the best model found
best_model = grid_search.best_estimator_
test_probabilities = best_model.predict_proba(X_test_processed)[:, 1]

# Create and save the new submission file
submission_df = pd.DataFrame({'id': test_ids, 'y': test_probabilities})
submission_df.to_csv('submission_tuned.csv', index=False)

print("\nNew submission file 'submission_tuned.csv' created successfully!")
print(submission_df.head())

Files loaded successfully!
Starting Hyperparameter Tuning with GridSearchCV... This will take a significant amount of time.
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1023
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[CV] END learning_rate=0.05, n_estimators=400, num_leaves=31; total time=  12.0s
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032056 seconds.

In [3]:
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.model_selection import StratifiedKFold

# --- 0. Load the Data ---
try:
    train_df = pd.read_csv('/kaggle/input/playground-series-s5e8/train.csv')
    test_df = pd.read_csv('/kaggle/input/playground-series-s5e8/test.csv')
    print("Files loaded successfully!")
except FileNotFoundError as e:
    print(f"File not found. Please ensure the data files are in the correct directory. Error: {e}")
    exit()

# --- 1. Data Preparation ---
X = train_df.drop('y', axis=1)
y = train_df['y']
test_ids = test_df['id']

# Combine train and test for consistent encoding
combined_df = pd.concat([X, test_df], ignore_index=True)

# Identify categorical features
categorical_features = combined_df.select_dtypes(include=['object']).columns.tolist()

# One-hot encode categorical features
combined_df = pd.get_dummies(combined_df, columns=categorical_features, drop_first=True)

# Split back into train and test
X_processed = combined_df.iloc[:len(train_df)].drop('id', axis=1)
X_test_processed = combined_df.iloc[len(train_df):].drop('id', axis=1)

# --- 2. Optuna Hyperparameter Optimization ---
def objective(trial):
    # Define the search space for hyperparameters
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 20, 80),
        "max_depth": trial.suggest_int("max_depth", 4, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
        "scale_pos_weight": y.value_counts()[0] / y.value_counts()[1],
        "n_jobs": -1,
        "random_state": 42
    }
    
    # Using 3-fold cross-validation to save time
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    auc_scores = []
    
    for train_idx, valid_idx in cv.split(X_processed, y):
        X_train, X_valid = X_processed.iloc[train_idx], X_processed.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            eval_metric="auc",
            callbacks=[lgb.early_stopping(100, verbose=False)] # Make early stopping less verbose
        )
        
        # Get the best score from the early stopping
        auc_scores.append(model.best_score_["valid_0"]["auc"])
    
    # Return the average AUC score across the folds
    return sum(auc_scores) / len(auc_scores)

# --- 3. Run Optuna Study ---
# We'll run for 15 trials to get a good result in a reasonable time
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=15)

print("\n--- Optuna Study Complete ---")
print("Best Parameters:", study.best_params)
print(f"Best CV AUC: {study.best_value:.5f}")

# --- 4. Train Final Model with Best Parameters ---
print("\nTraining final model with the best parameters found...")
# Get the best hyperparameters from the study
best_params = study.best_params
# Add back the fixed parameters
best_params.update({
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "n_jobs": -1,
    "random_state": 42,
    "scale_pos_weight": y.value_counts()[0] / y.value_counts()[1]
})

final_model = lgb.LGBMClassifier(**best_params)

# Train on the entire dataset. No early stopping here.
final_model.fit(X_processed, y)
print("Final model training complete.")

# --- 5. Predictions & Submission ---
print("\nGenerating predictions for submission...")
test_probabilities = final_model.predict_proba(X_test_processed)[:, 1]
submission_df = pd.DataFrame({'id': test_ids, 'y': test_probabilities})
submission_df.to_csv('submission_optuna_optimized.csv', index=False)

print("\nSubmission file 'submission_optuna_optimized.csv' created successfully!")
print(submission_df.head())


Files loaded successfully!


[I 2025-08-09 12:10:16,485] A new study created in memory with name: no-name-3a685579-9a31-495c-89f7-81d47075ed34


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033044 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:14:51,501] Trial 0 finished with value: 0.9664692069978825 and parameters: {'n_estimators': 1896, 'learning_rate': 0.01156703251864712, 'num_leaves': 46, 'max_depth': 12, 'min_child_samples': 45, 'subsample': 0.8620539849397684, 'colsample_bytree': 0.9767477946538993, 'reg_alpha': 2.780746754074367, 'reg_lambda': 4.572777053292283}. Best is trial 0 with value: 0.9664692069978825.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029876 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030254 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:17:17,461] Trial 1 finished with value: 0.967182835983849 and parameters: {'n_estimators': 1249, 'learning_rate': 0.08566203820554803, 'num_leaves': 22, 'max_depth': 7, 'min_child_samples': 93, 'subsample': 0.6781235460331773, 'colsample_bytree': 0.8360610887563331, 'reg_alpha': 1.7128913409462458, 'reg_lambda': 0.4945182067045151}. Best is trial 1 with value: 0.967182835983849.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029936 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:18:53,569] Trial 2 finished with value: 0.9653279734042993 and parameters: {'n_estimators': 703, 'learning_rate': 0.04382767230637909, 'num_leaves': 76, 'max_depth': 5, 'min_child_samples': 28, 'subsample': 0.7330247243153039, 'colsample_bytree': 0.7089755034346759, 'reg_alpha': 0.7905499286128626, 'reg_lambda': 0.6570970097823392}. Best is trial 1 with value: 0.967182835983849.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041817 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033092 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:22:02,380] Trial 3 finished with value: 0.9673257786253094 and parameters: {'n_estimators': 1434, 'learning_rate': 0.03895769313965465, 'num_leaves': 45, 'max_depth': 12, 'min_child_samples': 36, 'subsample': 0.7455826827318893, 'colsample_bytree': 0.9379878239086542, 'reg_alpha': 3.043807548029553, 'reg_lambda': 2.596930999170338}. Best is trial 3 with value: 0.9673257786253094.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.127642 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:23:10,400] Trial 4 finished with value: 0.9650446849899431 and parameters: {'n_estimators': 559, 'learning_rate': 0.08613924223862085, 'num_leaves': 57, 'max_depth': 4, 'min_child_samples': 82, 'subsample': 0.6668936715013294, 'colsample_bytree': 0.8116214614717411, 'reg_alpha': 3.6491432329672406, 'reg_lambda': 2.6093744332616535}. Best is trial 3 with value: 0.9673257786253094.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:25:39,651] Trial 5 finished with value: 0.9636299373392575 and parameters: {'n_estimators': 1056, 'learning_rate': 0.01687947868386261, 'num_leaves': 48, 'max_depth': 5, 'min_child_samples': 73, 'subsample': 0.9781250486210614, 'colsample_bytree': 0.6908286763861108, 'reg_alpha': 2.657440793297155, 'reg_lambda': 0.42734232332995725}. Best is trial 3 with value: 0.9673257786253094.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029619 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:28:43,814] Trial 6 finished with value: 0.9673963648070786 and parameters: {'n_estimators': 1413, 'learning_rate': 0.08774337462567865, 'num_leaves': 46, 'max_depth': 6, 'min_child_samples': 100, 'subsample': 0.8025531485483209, 'colsample_bytree': 0.8959645194386967, 'reg_alpha': 2.0593367592280427, 'reg_lambda': 2.08899548446257}. Best is trial 6 with value: 0.9673963648070786.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032841 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:31:08,298] Trial 7 finished with value: 0.9672741385276123 and parameters: {'n_estimators': 1877, 'learning_rate': 0.08810838319091162, 'num_leaves': 68, 'max_depth': 7, 'min_child_samples': 31, 'subsample': 0.9452704295539116, 'colsample_bytree': 0.9334281507798825, 'reg_alpha': 2.276438717020817, 'reg_lambda': 3.8180986190487016}. Best is trial 6 with value: 0.9673963648070786.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030447 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029923 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:33:52,628] Trial 8 finished with value: 0.9678661440158285 and parameters: {'n_estimators': 1400, 'learning_rate': 0.06605690093239386, 'num_leaves': 73, 'max_depth': 11, 'min_child_samples': 44, 'subsample': 0.6981900731526091, 'colsample_bytree': 0.6635929664744389, 'reg_alpha': 2.291649849209423, 'reg_lambda': 4.414417571894578}. Best is trial 8 with value: 0.9678661440158285.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029953 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029932 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:36:02,332] Trial 9 finished with value: 0.9662098651482404 and parameters: {'n_estimators': 1000, 'learning_rate': 0.04597050861786051, 'num_leaves': 58, 'max_depth': 5, 'min_child_samples': 24, 'subsample': 0.848232763104704, 'colsample_bytree': 0.7457461306250155, 'reg_alpha': 1.8385843247999212, 'reg_lambda': 0.5427222589120456}. Best is trial 8 with value: 0.9678661440158285.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:39:09,410] Trial 10 finished with value: 0.9679968837990561 and parameters: {'n_estimators': 1625, 'learning_rate': 0.06658663448083295, 'num_leaves': 76, 'max_depth': 10, 'min_child_samples': 10, 'subsample': 0.609759839080195, 'colsample_bytree': 0.6040452754747877, 'reg_alpha': 4.328629523429931, 'reg_lambda': 4.834446316820131}. Best is trial 10 with value: 0.9679968837990561.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031598 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:42:16,070] Trial 11 finished with value: 0.9680823761837827 and parameters: {'n_estimators': 1660, 'learning_rate': 0.0642117598814213, 'num_leaves': 80, 'max_depth': 10, 'min_child_samples': 15, 'subsample': 0.603818943476667, 'colsample_bytree': 0.6036110745459654, 'reg_alpha': 4.618086055925335, 'reg_lambda': 4.673068316934098}. Best is trial 11 with value: 0.9680823761837827.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033613 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032337 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:45:08,238] Trial 12 finished with value: 0.9680552543292652 and parameters: {'n_estimators': 1688, 'learning_rate': 0.062587317511998, 'num_leaves': 80, 'max_depth': 10, 'min_child_samples': 14, 'subsample': 0.6005661163510833, 'colsample_bytree': 0.6013202633415391, 'reg_alpha': 4.756310604257763, 'reg_lambda': 4.915897074655953}. Best is trial 11 with value: 0.9680823761837827.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029118 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029932 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:48:20,758] Trial 13 finished with value: 0.9679834652924111 and parameters: {'n_estimators': 1676, 'learning_rate': 0.06611100570498453, 'num_leaves': 80, 'max_depth': 9, 'min_child_samples': 10, 'subsample': 0.6029693557098815, 'colsample_bytree': 0.6014761904040524, 'reg_alpha': 4.944703292078138, 'reg_lambda': 3.6141362561541213}. Best is trial 11 with value: 0.9680823761837827.


[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030017 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1026
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [In

[I 2025-08-09 12:51:36,275] Trial 14 finished with value: 0.9679055884421103 and parameters: {'n_estimators': 1684, 'learning_rate': 0.05622044920393819, 'num_leaves': 64, 'max_depth': 9, 'min_child_samples': 63, 'subsample': 0.6349173698142061, 'colsample_bytree': 0.6520467058566942, 'reg_alpha': 4.968794516121531, 'reg_lambda': 3.507759660673453}. Best is trial 11 with value: 0.9680823761837827.



--- Optuna Study Complete ---
Best Parameters: {'n_estimators': 1660, 'learning_rate': 0.0642117598814213, 'num_leaves': 80, 'max_depth': 10, 'min_child_samples': 15, 'subsample': 0.603818943476667, 'colsample_bytree': 0.6036110745459654, 'reg_alpha': 4.618086055925335, 'reg_lambda': 4.673068316934098}
Best CV AUC: 0.96808

Training final model with the best parameters found...
[LightGBM] [Info] Number of positive: 90488, number of negative: 659512
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1027
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120651 -> initscore=-1.986283
[LightGBM] [Info] Start training from score -1.986283
Final model training complete.

Generating predictions for submission...

Submission file 'submission_optuna_optimized.csv