In [3]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.


# New Section

# New Section

In [4]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

playground_series_s5e8_path = kagglehub.competition_download('playground-series-s5e8')

print('Data source import complete.')


Downloading from https://www.kaggle.com/api/v1/competitions/data/download-all/playground-series-s5e8...


100%|██████████| 14.7M/14.7M [00:00<00:00, 169MB/s]

Extracting files...





Data source import complete.


In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping

# --- 0. Load the Data ---
try:
    # Corrected file paths for Kaggle environment
    train_df = pd.read_csv('/content/train.csv')
    test_df = pd.read_csv('/content/test.csv')
    print("Files loaded successfully!")
except FileNotFoundError as e:
    print(f"File not found. Please ensure the data files are in the correct directory. Error: {e}")
    exit()

# --- 1. Data Preparation ---
# Separate target and features
X = train_df.drop('y', axis=1)
y = train_df['y']
test_ids = test_df['id']

# Combine for consistent processing
combined_df = pd.concat([X, test_df], ignore_index=True)

# One-Hot Encode Categorical Features
categorical_features = combined_df.select_dtypes(include=['object']).columns
combined_df = pd.get_dummies(combined_df, columns=categorical_features, drop_first=True)

# Separate back into training and testing sets
X_processed = combined_df.iloc[:len(train_df)].drop('id', axis=1)
X_test_processed = combined_df.iloc[len(train_df):].drop('id', axis=1)

# --- 2. Feature Scaling (Crucial for Neural Networks) ---
# Identify numerical columns to scale (all columns are now numeric)
numerical_cols = X_processed.columns

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both train and test data
X_processed_scaled = scaler.fit_transform(X_processed)
X_test_processed_scaled = scaler.transform(X_test_processed)


# --- 3. Build the Neural Network Model ---
# Set random seed for reproducibility
tf.random.set_seed(42)

# Define the model architecture
model = tf.keras.Sequential([
    # Input layer - specify the input shape
    tf.keras.layers.Input(shape=(X_processed_scaled.shape[1],)),

    # First hidden layer
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3), # Dropout for regularization

    # Second hidden layer
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),

    # Third hidden layer
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),

    # Output layer - sigmoid for binary classification probability
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')] # Use AUC as a metric
)

# Print model summary
model.summary()


# --- 4. Handle Class Imbalance ---
# Calculate class weights
neg, pos = np.bincount(y)
total = neg + pos
class_weight = {0: (1 / neg) * (total / 2.0),
                1: (1 / pos) * (total / 2.0)}

print(f"\nClass weights: {class_weight}")


# --- 5. Train the Model ---
print("\nStarting Neural Network training with Early Stopping...")
# Define the Early Stopping callback
early_stopping = EarlyStopping(
    monitor='val_auc', # Monitor validation AUC
    patience=5,       # Number of epochs with no improvement after which training will be stopped.
    mode='max',       # Stop when the monitored quantity is maximized
    restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored quantity.
)

history = model.fit(
    X_processed_scaled,
    y,
    epochs=50, # Set a higher number of epochs, as early stopping will stop training when appropriate
    batch_size=512,
    validation_split=0.2, # Use 20% of data for validation
    class_weight=class_weight,
    callbacks=[early_stopping], # Add the early stopping callback
    verbose=1
)
print("Training complete.")


# --- 6. Prediction and Submission ---
print("\nMaking predictions with the trained Neural Network...")
# Predict probabilities on the scaled test set
test_probabilities_nn = model.predict(X_test_processed_scaled).flatten() # flatten to get a 1D array

# Create and save the new submission file
submission_df_nn = pd.DataFrame({'id': test_ids, 'y': test_probabilities_nn})
submission_df_nn.to_csv('submission_nn.csv', index=False)

print("\nNew submission file 'submission_nn.csv' created successfully!")
print(submission_df_nn.head())

Files loaded successfully!



Class weights: {0: np.float64(0.5686022392314317), 1: np.float64(4.1441959154805055)}

Starting Neural Network training with Early Stopping...
Epoch 1/50
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 8ms/step - auc: 0.9037 - loss: 0.3775 - val_auc: 0.9514 - val_loss: 0.2795
Epoch 2/50
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - auc: 0.9467 - loss: 0.2825 - val_auc: 0.9547 - val_loss: 0.2692
Epoch 3/50
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - auc: 0.9505 - loss: 0.2724 - val_auc: 0.9569 - val_loss: 0.2583
Epoch 4/50
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - auc: 0.9528 - loss: 0.2664 - val_auc: 0.9588 - val_loss: 0.2605
Epoch 5/50
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 8ms/step - auc: 0.9548 - loss: 0.2615 - val_auc: 0.9596 - val_loss: 0.2541
Epoch 6/50
[1m1172/1172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7m

# Task
Enhance the provided Python code for a neural network model by implementing hyperparameter tuning, cross-validation, early stopping, exploring alternative models, feature engineering, further data preprocessing, and considering additional evaluation metrics.

## Hyperparameter tuning

### Subtask:
Experiment with different hyperparameters such as the number of layers, neurons per layer, activation functions, learning rate, and dropout rates to optimize model performance.


**Reasoning**:
The first step is to import the necessary libraries for hyperparameter tuning and define a function to build the Keras model with hyperparameters as arguments. We will use Keras Tuner for hyperparameter tuning.



**Reasoning**:
The previous command failed because Keras Tuner is not installed. I need to install it before proceeding.



In [7]:
!pip install keras-tuner


Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


**Reasoning**:
Keras Tuner is now installed. I can proceed with importing the necessary libraries and defining the model building function as planned in the first step.



In [8]:
import keras_tuner as kt
from tensorflow import keras

def build_model(hp):
    model = keras.Sequential()
    model.add(keras.layers.Input(shape=(X_processed_scaled.shape[1],)))

    # Tune the number of layers.
    for i in range(hp.Int('num_layers', 1, 3)):
        model.add(keras.layers.Dense(units=hp.Int('units_' + str(i),
                                            min_value=32,
                                            max_value=128,
                                            step=32),
                                      activation=hp.Choice('activation_' + str(i), ['relu', 'tanh'])))
        model.add(keras.layers.Dropout(rate=hp.Float('dropout_' + str(i),
                                               min_value=0.0,
                                               max_value=0.5,
                                               step=0.1)))

    model.add(keras.layers.Dense(1, activation='sigmoid'))

    # Tune the learning rate for the Adam optimizer.
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss='binary_crossentropy',
                  metrics=[tf.keras.metrics.AUC(name='auc')])
    return model


**Reasoning**:
Now that the model building function is defined, I need to set up and run the hyperparameter search using Keras Tuner. I will use the Hyperband tuner for this purpose, specifying the objective as maximizing the AUC, the maximum number of epochs, and a directory to save the tuning results. I will then fit the tuner to the scaled training data, including the class weights to handle the imbalance.



In [10]:
tuner = kt.Hyperband(build_model,
                     objective='auc',
                     max_epochs=5, # Reduced max_epochs
                     directory='my_dir',
                     project_name='intro_to_kt')

# Split data for validation during tuning
X_train_tune, X_val_tune, y_train_tune, y_val_tune = train_test_split(
    X_processed_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Start the hyperparameter search
print("Starting hyperparameter tuning...")
tuner.search(X_train_tune, y_train_tune, epochs=5, validation_data=(X_val_tune, y_val_tune), class_weight=class_weight)
print("Hyperparameter tuning complete.")

Reloading Tuner from my_dir/intro_to_kt/tuner0.json
Starting hyperparameter tuning...
Hyperparameter tuning complete.


## Cross-Validation

### Subtask:
Implement k-fold cross-validation to get a more robust estimate of the model's performance.

**Reasoning**:
Now that we have the best hyperparameters from the tuning process, we will use k-fold cross-validation to train the model on different subsets of the data and evaluate its performance. This will give us a more reliable measure of how well the model is likely to perform on unseen data compared to a single train-validation split.

In [14]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the model with the best hyperparameters - this is not strictly needed for CV,
# but kept for consistency if needed later
# best_model = build_model(best_hps)

# Define the number of folds for cross-validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

auc_scores = []
fold = 1

print(f"\nStarting {n_splits}-fold cross-validation...")

# Prepare data for cross-validation (using the full training data)
X_cv = X_processed_scaled
y_cv = y

for train_index, val_index in skf.split(X_cv, y_cv):
    print(f"\n--- Fold {fold}/{n_splits} ---")
    X_train_fold, X_val_fold = X_cv[train_index], X_cv[val_index]
    y_train_fold, y_val_fold = y_cv[train_index], y_cv[val_index]

    # Build a fresh model for each fold to avoid data leakage
    fold_model = build_model(best_hps)

    # Calculate class weights for the current fold's training data
    neg_fold, pos_fold = np.bincount(y_train_fold)
    total_fold = neg_fold + pos_fold
    class_weight_fold = {0: (1 / neg_fold) * (total_fold / 2.0),
                         1: (1 / pos_fold) * (total_fold / 2.0)}


    # Train the model for the current fold
    history_fold = fold_model.fit(
        X_train_fold,
        y_train_fold,
        epochs=5, # Use a fixed number of epochs for CV, or get it from tuner results if available and intended
        batch_size=512, # Use a default batch size
        validation_data=(X_val_fold, y_val_fold),
        class_weight=class_weight_fold,
        verbose=0 # Set to 1 to see training progress per epoch
    )

    # Evaluate the model on the validation fold
    scores = fold_model.evaluate(X_val_fold, y_val_fold, verbose=0)
    print(f"Fold {fold} - AUC: {scores[1]:.4f}")
    auc_scores.append(scores[1])

    fold += 1

print(f"\nCross-validation complete. Average AUC: {np.mean(auc_scores):.4f}")
print(f"AUC scores per fold: {auc_scores}")


Starting 5-fold cross-validation...

--- Fold 1/5 ---
Fold 1 - AUC: 0.9563

--- Fold 2/5 ---
Fold 2 - AUC: 0.9557

--- Fold 3/5 ---
Fold 3 - AUC: 0.9567

--- Fold 4/5 ---
Fold 4 - AUC: 0.9565

--- Fold 5/5 ---
Fold 5 - AUC: 0.9566

Cross-validation complete. Average AUC: 0.9564
AUC scores per fold: [0.956321120262146, 0.9557490348815918, 0.9566560983657837, 0.9564590454101562, 0.9566218256950378]


## Alternative Models

### Subtask:
Explore alternative models to potentially achieve better results.

**Reasoning**:
Exploring alternative models is crucial to determine if a different algorithm might be better suited for the dataset. LightGBM is a gradient boosting framework that is known for its speed and efficiency, and it often performs well on tabular data. We will train a LightGBM model and compare its performance to the neural network.

In [16]:
import lightgbm as lgb
from sklearn.model_selection import cross_val_score

# Initialize LightGBM Classifier
lgbm = lgb.LGBMClassifier(random_state=42)

print("Starting LightGBM training with cross-validation...")

# Perform cross-validation
# Using the same StratifiedKFold as before for consistency
lgbm_auc_scores = cross_val_score(lgbm, X_processed_scaled, y, cv=skf, scoring='roc_auc')

print(f"\nLightGBM Cross-validation complete. Average AUC: {np.mean(lgbm_auc_scores):.4f}")
print(f"LightGBM AUC scores per fold: {lgbm_auc_scores}")

# Train the LightGBM model on the full training data
lgbm.fit(X_processed_scaled, y)

# Make predictions on the test set
test_probabilities_lgbm = lgbm.predict_proba(X_test_processed_scaled)[:, 1]

# Create and save the submission file
submission_df_lgbm = pd.DataFrame({'id': test_ids, 'y': test_probabilities_lgbm})
submission_df_lgbm.to_csv('submission_lgbm.csv', index=False)

print("\nNew submission file 'submission_lgbm.csv' created successfully!")
print(submission_df_lgbm.head())

Starting LightGBM training with cross-validation...
[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.204857 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1064
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273




[LightGBM] [Info] Number of positive: 72391, number of negative: 527609
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.204270 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1062
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120652 -> initscore=-1.986273
[LightGBM] [Info] Start training from score -1.986273




[LightGBM] [Info] Number of positive: 72390, number of negative: 527610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.205292 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1060
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120650 -> initscore=-1.986289
[LightGBM] [Info] Start training from score -1.986289




[LightGBM] [Info] Number of positive: 72390, number of negative: 527610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.206738 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1057
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120650 -> initscore=-1.986289
[LightGBM] [Info] Start training from score -1.986289




[LightGBM] [Info] Number of positive: 72390, number of negative: 527610
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.201793 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1061
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120650 -> initscore=-1.986289
[LightGBM] [Info] Start training from score -1.986289





LightGBM Cross-validation complete. Average AUC: 0.9650
LightGBM AUC scores per fold: [0.96568322 0.96449104 0.96457037 0.96544599 0.96468736]
[LightGBM] [Info] Number of positive: 90488, number of negative: 659512
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.257824 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1063
[LightGBM] [Info] Number of data points in the train set: 750000, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120651 -> initscore=-1.986283
[LightGBM] [Info] Start training from score -1.986283





New submission file 'submission_lgbm.csv' created successfully!
       id         y
0  750000  0.003172
1  750001  0.123749
2  750002  0.001596
3  750003  0.000669
4  750004  0.022046
