In [11]:
import os
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.utils.class_weight import compute_class_weight

import tensorflow as tf
from tensorflow import keras

try:
    
    from kerastuner.tuners import BayesianOptimization
    from kerastuner.engine.hyperparameters import HyperParameters
    KT_BACKEND = 'kerastuner'
except Exception:
    import keras_tuner as kt
    from keras_tuner.tuners import BayesianOptimization
    from keras_tuner.engine.hyperparameters import HyperParameters
    KT_BACKEND = 'keras_tuner'

warnings.filterwarnings("ignore")
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # silence TF logs

In [12]:
train = pd.read_csv("../train.csv")
test = pd.read_csv("../test.csv")


dup_count = train.duplicated().sum()
print(f"\nNumber of duplicate rows in train: {dup_count}")

if dup_count > 0:
    train.drop_duplicates(inplace=True)
    print("Duplicates removed. New shape:", train.shape)


Number of duplicate rows in train: 13
Duplicates removed. New shape: (59598, 24)


In [13]:
log_transform_cols = [
    'monthly_revenue_generated', 'funding_rounds_led',
    'num_dependents', 'years_with_startup'
]

def feature_engineer(df):
    df = df.copy()
    if 'years_with_startup' in df.columns and 'years_since_founding' in df.columns:
        df['experience_ratio'] = df['years_with_startup'] / (df['years_since_founding'] + np.finfo(float).eps)
    if 'founder_age' in df.columns and 'years_with_startup' in df.columns:
        df['founder_join_age'] = df['founder_age'] - df['years_with_startup']
    if 'monthly_revenue_generated' in df.columns and 'funding_rounds_led' in df.columns:
        df['revenue_per_round'] = df['monthly_revenue_generated'] / (df['funding_rounds_led'] + 1)
    for col in log_transform_cols:
        if col in df.columns:
            df[f'log_{col}'] = np.log1p(df[col])
            df.drop(col, axis=1, inplace=True)
    return df

train_fe = feature_engineer(train.drop(columns=['founder_id']))
test_fe = feature_engineer(test.drop(columns=['founder_id']))

X = train_fe.drop(columns=['retention_status'])
y = train_fe['retention_status'].map({'Stayed': 1, 'Left': 0}).astype(int)
X_test_final = test_fe.copy()

In [14]:
numerical_cols = [
    'years_since_founding', 'founder_age', 'distance_from_investor_hub',
    'experience_ratio', 'founder_join_age', 'revenue_per_round',
    'log_monthly_revenue_generated', 'log_funding_rounds_led',
    'log_num_dependents', 'log_years_with_startup'
]

binary_cols = ['working_overtime', 'remote_operations', 'leadership_scope', 'innovation_support']

ordinal_cols = {
    'work_life_balance_rating': ['Poor', 'Fair', 'Good', 'Excellent'],
    'venture_satisfaction': ['Low', 'Medium', 'High', 'Very High'],
    'startup_performance_rating': ['Low', 'Below Average', 'Average', 'High'],
    'startup_reputation': ['Poor', 'Fair', 'Good', 'Excellent'],
    'founder_visibility': ['Low', 'Medium', 'High', 'Very High'],
    'startup_stage': ['Entry', 'Mid', 'Senior'],
    'team_size_category': ['Small', 'Medium', 'Large']
}
ordinal_feature_names = list(ordinal_cols.keys())
ordinal_categories = list(ordinal_cols.values())

nominal_cols = ['founder_gender', 'founder_role', 'education_background', 'personal_status']

In [15]:
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

binary_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('mapper', OrdinalEncoder(categories=[['No', 'Yes']] * len(binary_cols),
                              handle_unknown='use_encoded_value', unknown_value=-1))
])

ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1))
])

nominal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('bin', binary_pipeline, binary_cols),
        ('ord', ordinal_pipeline, ordinal_feature_names),
        ('nom', nominal_pipeline, nominal_cols)
    ],
    remainder='drop'
)

In [16]:
X_train_df, X_val_df, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Raw train/val sizes: {X_train_df.shape}, {X_val_df.shape}")

X_train_processed = preprocessor.fit_transform(X_train_df)
X_val_processed = preprocessor.transform(X_val_df)
X_test_processed = preprocessor.transform(X_test_final)

print("Processed shapes (train/val/test):", X_train_processed.shape, X_val_processed.shape, X_test_processed.shape)


Raw train/val sizes: (47678, 25), (11920, 25)
Processed shapes (train/val/test): (47678, 32) (11920, 32) (14900, 32)


In [17]:
def build_keras_model(hp, input_dim):
    model = keras.Sequential()
    model.add(keras.layers.InputLayer(input_shape=(input_dim,)))
    # first dense
    model.add(keras.layers.Dense(units=hp.Int('units_1', 64, 512, step=64), activation='relu'))
    # additional layers (0-2)
    for i in range(hp.Int('num_layers', 0, 2)):
        model.add(keras.layers.Dense(units=hp.Int(f'units_{i+2}', 32, 256, step=32), activation='relu'))
        if hp.Boolean(f'dropout_{i}', default=False):
            model.add(keras.layers.Dropout(rate=hp.Float(f'dropout_rate_{i}', 0.1, 0.5, step=0.1)))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    # important: name the AUC metric 'auc' so tuner sees 'val_auc'
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(name='auc'), 'accuracy']
    )
    return model

In [18]:
def get_bayesian_tuner(input_dim, project_name, max_trials=8, executions_per_trial=1):
    def model_builder(hp):
        return build_keras_model(hp, input_dim=input_dim)
    # For kerastuner compatibility, objective can be 'val_auc'
    tuner = BayesianOptimization(
        model_builder,
        objective='val_auc',
        max_trials=max_trials,
        executions_per_trial=executions_per_trial,
        directory='kt_dir',
        project_name=project_name,
        seed=42
    )
    return tuner


In [19]:
BATCH_SIZE = 128
EPOCHS = 15
PATIENCE = 3
early_stop = keras.callbacks.EarlyStopping(monitor='val_auc', patience=PATIENCE, mode='max', restore_best_weights=True, verbose=1)

def probs_to_labels(probs, thresh=0.5):
    return np.where(probs.ravel() >= thresh, 'Stayed', 'Left')

# ------------------ MODEL A: No Sampling ------------------
print("\n===== MODEL A (No Sampling) =====")
input_dim = X_train_processed.shape[1]
tuner_a = get_bayesian_tuner(input_dim=input_dim, project_name='nn_nosample', max_trials=8, executions_per_trial=1)

# compute class weights for imbalance
classes = np.unique(y_train)
class_weights_vals = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
cw = {int(c): float(w) for c, w in zip(classes, class_weights_vals)}

# search
tuner_a.search(
    x=X_train_processed,
    y=y_train,
    validation_data=(X_val_processed, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=2
)

best_hps_a = tuner_a.get_best_hyperparameters(num_trials=1)[0]
print("Best hyperparameters (no-sample):", best_hps_a.values)

# build best model and final fit on combined train+val
best_model_a = tuner_a.hypermodel.build(best_hps_a)
X_full = np.vstack([X_train_processed, X_val_processed])
y_full = np.concatenate([y_train.values, y_val.values])

best_model_a.fit(
    X_full, y_full,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    validation_split=0.1,
    class_weight=cw,
    verbose=2
)

probs_a = best_model_a.predict(X_test_processed, batch_size=BATCH_SIZE, verbose=0)
labels_a = probs_to_labels(probs_a)
pd.DataFrame({'founder_id': test['founder_id'], 'retention_status': labels_a}).to_csv('submission_nn_nosample.csv', index=False)
print("Saved submission_nn_nosample.csv")


Trial 8 Complete [00h 00m 22s]
val_auc: 0.8328278064727783

Best val_auc So Far: 0.8356896638870239
Total elapsed time: 00h 14m 20s
Best hyperparameters (no-sample): {'units_1': 320, 'num_layers': 0, 'learning_rate': 0.001, 'units_2': 64, 'dropout_0': True, 'units_3': 160, 'dropout_1': True}
Epoch 1/15
420/420 - 3s - 6ms/step - accuracy: 0.7295 - auc: 0.8115 - loss: 0.5252 - val_accuracy: 0.7369 - val_auc: 0.8264 - val_loss: 0.5148
Epoch 2/15
420/420 - 1s - 3ms/step - accuracy: 0.7404 - auc: 0.8280 - loss: 0.5049 - val_accuracy: 0.7424 - val_auc: 0.8286 - val_loss: 0.5097
Epoch 3/15
420/420 - 1s - 2ms/step - accuracy: 0.7440 - auc: 0.8314 - loss: 0.5005 - val_accuracy: 0.7409 - val_auc: 0.8306 - val_loss: 0.5024
Epoch 4/15
420/420 - 1s - 2ms/step - accuracy: 0.7453 - auc: 0.8333 - loss: 0.4978 - val_accuracy: 0.7406 - val_auc: 0.8322 - val_loss: 0.4990
Epoch 5/15
420/420 - 1s - 3ms/step - accuracy: 0.7477 - auc: 0.8359 - loss: 0.4944 - val_accuracy: 0.7448 - val_auc: 0.8339 - val_loss:

In [20]:
print("\n===== MODEL B (SMOTE) =====")
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(X_train_processed, y_train)
print("After SMOTE:", X_sm.shape, y_sm.shape)

tuner_b = get_bayesian_tuner(input_dim=input_dim, project_name='nn_smote', max_trials=8, executions_per_trial=1)

tuner_b.search(
    x=X_sm,
    y=y_sm,
    validation_data=(X_val_processed, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    verbose=2
)

best_hps_b = tuner_b.get_best_hyperparameters(num_trials=1)[0]
print("Best hyperparameters (SMOTE):", best_hps_b.values)

best_model_b = tuner_b.hypermodel.build(best_hps_b)

# final fit on SMOTE train + val combined
X_combined = np.vstack([X_sm, X_val_processed])
y_combined = np.concatenate([y_sm, y_val.values])

best_model_b.fit(
    X_combined, y_combined,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
    validation_split=0.1,
    verbose=2
)

probs_b = best_model_b.predict(X_test_processed, batch_size=BATCH_SIZE, verbose=0)
labels_b = probs_to_labels(probs_b)
pd.DataFrame({'founder_id': test['founder_id'], 'retention_status': labels_b}).to_csv('submission_nn_smote.csv', index=False)
print("Saved submission_nn_smote.csv")

print("\n===== ALL DONE =====")
print("Files created: submission_nn_nosample.csv, submission_nn_smote.csv")

Trial 8 Complete [00h 00m 09s]
val_auc: 0.8304513692855835

Best val_auc So Far: 0.831977367401123
Total elapsed time: 00h 00m 55s
Best hyperparameters (SMOTE): {'units_1': 512, 'num_layers': 1, 'learning_rate': 0.001, 'units_2': 128, 'dropout_0': False, 'units_3': 32, 'dropout_1': True}
Epoch 1/15
436/436 - 3s - 7ms/step - accuracy: 0.7310 - auc: 0.8163 - loss: 0.5195 - val_accuracy: 0.7389 - val_auc: 0.8287 - val_loss: 0.5035
Epoch 2/15
436/436 - 2s - 3ms/step - accuracy: 0.7438 - auc: 0.8308 - loss: 0.5010 - val_accuracy: 0.7370 - val_auc: 0.8308 - val_loss: 0.5040
Epoch 3/15
436/436 - 2s - 4ms/step - accuracy: 0.7481 - auc: 0.8360 - loss: 0.4930 - val_accuracy: 0.7438 - val_auc: 0.8317 - val_loss: 0.4993
Epoch 3: early stopping
Restoring model weights from the end of the best epoch: 1.
Saved submission_nn_smote.csv

===== ALL DONE =====
Files created: submission_nn_nosample.csv, submission_nn_smote.csv
