In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# --- 0. Load the Data ---
# Read the competition's train and test CSVs from the Kaggle input directory.
try:
    train_df = pd.read_csv('/kaggle/input/playground-series-s5e8/train.csv')
    test_df = pd.read_csv('/kaggle/input/playground-series-s5e8/test.csv')
except FileNotFoundError as e:
    print(f"File not found. Please ensure the data files are in the correct directory. Error: {e}")
    # Re-raise rather than calling exit(): exit() is an interactive-shell helper that is
    # not meant for programs, and it would end the run without a traceback or a failure
    # status. Propagating the error stops execution here with the original exception.
    raise
else:
    # Only reached when both files were read without error.
    print("Files loaded successfully!")

# --- 1. Data Preparation ---
# Separate the target column from the training features and keep the test ids,
# which are needed later to build the submission file.
y = train_df['y']
X = train_df.drop(columns='y')
test_ids = test_df['id']

# Stack train features on top of the test rows and one-hot encode them together,
# so both parts end up with exactly the same dummy columns.
combined_df = pd.concat([X, test_df], ignore_index=True)
categorical_features = combined_df.select_dtypes(include=['object']).columns
combined_df = pd.get_dummies(
    combined_df,
    columns=categorical_features,
    drop_first=True,
)

# The first len(train_df) rows are the training rows, the remainder is the test set.
# The 'id' column is dropped from both because it is not used as a feature.
X_processed = combined_df.iloc[:len(train_df)].drop(columns='id')
X_test_processed = combined_df.iloc[len(train_df):].drop(columns='id')

# --- 2. Hyperparameter Tuning Setup ---

# Class-imbalance weight: count of label-0 rows divided by count of label-1 rows.
label_counts = y.value_counts()
scale_pos_weight = label_counts[0] / label_counts[1]

# Base binary classifier. The grid below varies n_estimators, learning_rate and
# num_leaves; everything set here stays fixed for every candidate.
lgbm = lgb.LGBMClassifier(
    objective='binary',
    random_state=42,
    n_jobs=-1,
    scale_pos_weight=scale_pos_weight,
)

# Deliberately small grid (2 x 2 x 2 = 8 candidates) to keep the search time bounded;
# a wider grid could find better settings at a much higher cost.
param_grid = dict(
    n_estimators=[400, 600],
    learning_rate=[0.05, 0.1],
    num_leaves=[31, 40],
)

# Exhaustive search over param_grid, each candidate scored by ROC AUC
# using 3-fold cross-validation.
grid_search = GridSearchCV(
    lgbm,
    param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=2,  # log progress for every individual fit
)

# --- 3. Model Training ---
# Run the full grid search on the encoded training features and the target.
print("Starting Hyperparameter Tuning with GridSearchCV... This will take a significant amount of time.")
grid_search.fit(X_processed, y)
print("Tuning complete.")

# Report the winning parameter combination together with its cross-validated ROC AUC.
# A single print with a newline separator emits the same three lines as separate prints.
print(
    "\nBest parameters found by Grid Search:",
    grid_search.best_params_,
    f"Best ROC AUC score on validation data: {grid_search.best_score_:.5f}",
    sep="\n",
)


# --- 4. Prediction and Submission ---
print("\nMaking predictions using the best model...")

# After fitting, the search object exposes the best model it found as best_estimator_.
best_model = grid_search.best_estimator_

# predict_proba returns one column per class; the column at index 1 is kept as the
# predicted score (presumably the label-1 probability, given 0/1 labels).
class_probabilities = best_model.predict_proba(X_test_processed)
test_probabilities = class_probabilities[:, 1]

# Pair each test id with its predicted score and write the submission CSV
# without the DataFrame index.
submission_df = pd.DataFrame(dict(id=test_ids, y=test_probabilities))
submission_df.to_csv('submission_tuned.csv', index=False)

print("\nNew submission file 'submission_tuned.csv' created successfully!")
print(submission_df.head())

Files loaded successfully!
Starting Hyperparameter Tuning with GridSearchCV... This will take a significant amount of time.
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[LightGBM] [Info] Number of positive: 60326, number of negative: 439674
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1023
[LightGBM] [Info] Number of data points in the train set: 500000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986270
[LightGBM] [Info] Start training from score -1.986270
[CV] END learning_rate=0.05, n_estimators=400, num_leaves=31; total time=  12.0s
[LightGBM] [Info] Number of positive: 60325, number of negative: 439675
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032056 seconds.