In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Sklearn imports
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import xgboost as xgb


# Clustering
from sklearn.cluster import MiniBatchKMeans
from sklearn.mixture import GaussianMixture

# Boosting
import lightgbm as lgb
from xgboost import XGBClassifier
import xgboost.callback as xgb_callback
from catboost import CatBoostClassifier

# TensorFlow / Keras
import tensorflow as tf
from tensorflow import keras
from keras import layers, regularizers, callbacks

In [14]:
train = pd.read_csv("../train.csv")
test = pd.read_csv("../test.csv")

# Constants
TARGET = "retention_status"
IDCOL = "founder_id"

# Remove duplicates (on entire train DF)
dup_count = train.duplicated().sum()
print(f"\nNumber of duplicate rows in train: {dup_count}")
if dup_count > 0:
    train = train.drop_duplicates().reset_index(drop=True)
    print("Duplicates removed. New shape:", train.shape)


Number of duplicate rows in train: 13
Duplicates removed. New shape: (59598, 24)


In [15]:
log_cols = [
    'monthly_revenue_generated', 'funding_rounds_led',
    'num_dependents', 'years_with_startup'
]

def feature_engineer(df):
    df = df.copy()
    if 'years_with_startup' in df and 'years_since_founding' in df:
        df['experience_ratio'] = df['years_with_startup'] / (df['years_since_founding'] + 1e-9)
    if 'founder_age' in df and 'years_with_startup' in df:
        df['founder_join_age'] = df['founder_age'] - df['years_with_startup']
    if 'monthly_revenue_generated' in df and 'funding_rounds_led' in df:
        df['revenue_per_round'] = df['monthly_revenue_generated'] / (df['funding_rounds_led'] + 1)

    for c in log_cols:
        if c in df:
            df[f"log_{c}"] = np.log1p(df[c])
            df.drop(c, axis=1, inplace=True)
    return df

# Apply feature engineering but keep founder_id and target in train_fe for alignment
train_fe = feature_engineer(train.copy())
test_fe = feature_engineer(test.copy())

# Ensure IDCOL exists in test_fe (for submission). If not, try to copy from test.
if IDCOL not in test_fe.columns and IDCOL in test.columns:
    test_fe[IDCOL] = test[IDCOL]


In [16]:
X = train_fe.drop(columns=[TARGET, IDCOL])
y = train_fe[TARGET].map({"Stayed": 1, "Left": 0}).astype(int)
X_test_final = test_fe.drop(columns=[IDCOL], errors="ignore")

print("Raw feature shapes:", X.shape, X_test_final.shape, y.shape)

Raw feature shapes: (59598, 25) (14900, 25) (59598,)


In [17]:
numerical_cols = [
    'years_since_founding', 'founder_age', 'distance_from_investor_hub',
    'experience_ratio', 'founder_join_age', 'revenue_per_round',
    'log_monthly_revenue_generated', 'log_funding_rounds_led',
    'log_num_dependents', 'log_years_with_startup'
]

binary_cols = [
    'working_overtime', 'remote_operations',
    'leadership_scope', 'innovation_support'
]

ordinal_cols = {
    'work_life_balance_rating': ['Poor', 'Fair', 'Good', 'Excellent'],
    'venture_satisfaction': ['Low', 'Medium', 'High', 'Very High'],
    'startup_performance_rating': ['Low', 'Below Average', 'Average', 'High'],
    'startup_reputation': ['Poor', 'Fair', 'Good', 'Excellent'],
    'founder_visibility': ['Low', 'Medium', 'High', 'Very High'],
    'startup_stage': ['Entry', 'Mid', 'Senior'],
    'team_size_category': ['Small', 'Medium', 'Large']
}

ordinal_feature_names = list(ordinal_cols.keys())
ordinal_categories = list(ordinal_cols.values())
nominal_cols = ['founder_gender', 'founder_role', 'education_background', 'personal_status']

# Safety: keep only columns that actually exist in X
numerical_cols = [c for c in numerical_cols if c in X.columns]
binary_cols = [c for c in binary_cols if c in X.columns]
ordinal_feature_names = [c for c in ordinal_feature_names if c in X.columns]
nominal_cols = [c for c in nominal_cols if c in X.columns]


In [18]:
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
]) if numerical_cols else ("num", "passthrough", [])

binary_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("binmap", OrdinalEncoder(categories=[["No","Yes"]] * len(binary_cols)))
]) if binary_cols else ("bin", "passthrough", [])

ordinal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordenc", OrdinalEncoder(categories=ordinal_categories))
]) if ordinal_feature_names else ("ord", "passthrough", [])

nominal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first", sparse_output=False))
]) if nominal_cols else ("nom", "passthrough", [])

# Create ColumnTransformer only with available transformers
transformers = []
if numerical_cols:
    transformers.append(("num", numerical_pipeline, numerical_cols))
if binary_cols:
    transformers.append(("bin", binary_pipeline, binary_cols))
if ordinal_feature_names:
    transformers.append(("ord", ordinal_pipeline, ordinal_feature_names))
if nominal_cols:
    transformers.append(("nom", nominal_pipeline, nominal_cols))

preprocessor = ColumnTransformer(transformers=transformers, remainder="drop")

In [19]:
X_processed = preprocessor.fit_transform(X)            # numpy array, len = len(X)
X_test = preprocessor.transform(X_test_final)          # test processed


feature_names = preprocessor.get_feature_names_out()
X_cluster = pd.DataFrame(X_processed, index=X.index, columns=feature_names)
X_test_cluster = pd.DataFrame(X_test, index=X_test_final.index, columns=feature_names)

print("Processed shapes (full):", X_cluster.shape, X_test_cluster.shape)

X_train_idx, X_val_idx, y_train, y_val = train_test_split(
    X.index, y, test_size=0.2, random_state=42, stratify=y
)

X_train = X_cluster.loc[X_train_idx].reset_index(drop=True)
X_val = X_cluster.loc[X_val_idx].reset_index(drop=True)
y_train = y.loc[X_train_idx].reset_index(drop=True)
y_val = y.loc[X_val_idx].reset_index(drop=True)

print("Train shape (after split):", X_train.shape)
print("Val shape (after split):", X_val.shape)

Processed shapes (full): (59598, 32) (14900, 32)
Train shape (after split): (47678, 32)
Val shape (after split): (11920, 32)


In [20]:
print("Adding clustering features...")

# Ensure column names are strings (fix for GMM)
X_cluster.columns = X_cluster.columns.astype(str)
X_test_cluster.columns = X_test_cluster.columns.astype(str)

K = 12
kmeans = MiniBatchKMeans(n_clusters=K, batch_size=4096, random_state=42)
kmeans.fit(X_cluster)
X_cluster["kmeans_label"] = kmeans.labels_
X_test_cluster["kmeans_label"] = kmeans.predict(X_test_cluster)

G = 6
gmm = GaussianMixture(n_components=G, covariance_type="diag", random_state=42)
gmm.fit(X_cluster)
gmm_train_proba = gmm.predict_proba(X_cluster)
gmm_test_proba = gmm.predict_proba(X_test_cluster)

for i in range(G):
    X_cluster[f"gmm_prob_{i}"] = gmm_train_proba[:, i]
    X_test_cluster[f"gmm_prob_{i}"] = gmm_test_proba[:, i]

print("New shapes (clustered):", X_cluster.shape, X_test_cluster.shape)



Adding clustering features...
New shapes (clustered): (59598, 39) (14900, 39)


In [21]:
RND = 42
frac = 0.20

# keep IDCOL in train_fe for sampling (we used train.copy earlier so index aligns)
if IDCOL not in train_fe.columns:
    train_fe[IDCOL] = train[IDCOL].values

nn_sample = train.sample(frac=frac, random_state=RND)  # sample rows (index preserved)
nn_ids = nn_sample[IDCOL].tolist()

# Create boolean mask aligned with X_cluster index (same as train_fe index)
mask = train_fe[IDCOL].isin(nn_ids)

# Apply mask to X_cluster and y (both indexed by train_fe index)
X_nn = X_cluster[mask].reset_index(drop=True)
y_nn = y[mask].reset_index(drop=True)

print("NN sample shapes:", X_nn.shape, y_nn.shape)   # must match

# If too small, fallback to using a random subset of X_cluster with same size
if len(X_nn) < 10:
    raise RuntimeError("NN sample too small — check sampling fraction and data.")

# Train/validation split for NN (within sampled set)
X_nn_train, X_nn_val, y_nn_train, y_nn_val = train_test_split(
    X_nn, y_nn, test_size=0.2, random_state=RND, stratify=y_nn
)

scaler_nn = StandardScaler()
X_nn_train_s = scaler_nn.fit_transform(X_nn_train)
X_nn_val_s   = scaler_nn.transform(X_nn_val)
X_test_s_nn  = scaler_nn.transform(X_test_cluster)

print("NN shapes (scaled):", X_nn_train_s.shape, X_nn_val_s.shape)

# ------------------------------
# Neural network model
# ------------------------------
def build_nn(input_dim, lr=1e-3, l2=1e-5, dropout=0.3):
    inputs = layers.Input(shape=(input_dim,))
    x = layers.BatchNormalization()(inputs)
    x = layers.Dense(256, activation="relu", kernel_regularizer=regularizers.l2(l2))(x)
    x = layers.Dropout(dropout)(x)
    x = layers.Dense(128, activation="relu", kernel_regularizer=regularizers.l2(l2))(x)
    x = layers.Dropout(dropout/2)(x)
    x = layers.Dense(64, activation="relu")(x)
    x = layers.Dropout(dropout/3)(x)
    out = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, out)
    model.compile(optimizer=keras.optimizers.Adam(lr),
                  loss="binary_crossentropy",
                  metrics=[keras.metrics.AUC(name="auc")])
    return model

nn_model = build_nn(X_nn_train_s.shape[1])

early = callbacks.EarlyStopping(monitor="val_auc", patience=6, mode="max", restore_best_weights=True)
reduce_lr = callbacks.ReduceLROnPlateau(monitor="val_auc", factor=0.5, patience=3, mode="max")

history = nn_model.fit(
    X_nn_train_s, y_nn_train,
    validation_data=(X_nn_val_s, y_nn_val),
    epochs=50,
    batch_size=1024,
    callbacks=[early, reduce_lr],
    verbose=2
)

nn_val_probs = nn_model.predict(X_nn_val_s).ravel()
nn_val_auc = roc_auc_score(y_nn_val, nn_val_probs)
print("NN Validation AUC:", nn_val_auc)

nn_test_probs = nn_model.predict(X_test_s_nn).ravel()

NN sample shapes: (11920, 39) (11920,)
NN shapes (scaled): (9536, 39) (2384, 39)
Epoch 1/50
10/10 - 2s - 222ms/step - auc: 0.5935 - loss: 0.6845 - val_auc: 0.7504 - val_loss: 0.6234 - learning_rate: 0.0010
Epoch 2/50
10/10 - 0s - 21ms/step - auc: 0.7501 - loss: 0.6004 - val_auc: 0.7763 - val_loss: 0.5672 - learning_rate: 0.0010
Epoch 3/50
10/10 - 0s - 19ms/step - auc: 0.7865 - loss: 0.5573 - val_auc: 0.7895 - val_loss: 0.5577 - learning_rate: 0.0010
Epoch 4/50
10/10 - 0s - 20ms/step - auc: 0.7996 - loss: 0.5448 - val_auc: 0.7925 - val_loss: 0.5513 - learning_rate: 0.0010
Epoch 5/50
10/10 - 0s - 22ms/step - auc: 0.8024 - loss: 0.5397 - val_auc: 0.7924 - val_loss: 0.5503 - learning_rate: 0.0010
Epoch 6/50
10/10 - 0s - 23ms/step - auc: 0.8063 - loss: 0.5346 - val_auc: 0.7924 - val_loss: 0.5502 - learning_rate: 0.0010
Epoch 7/50
10/10 - 0s - 26ms/step - auc: 0.8108 - loss: 0.5300 - val_auc: 0.7925 - val_loss: 0.5520 - learning_rate: 0.0010
Epoch 8/50
10/10 - 0s - 29ms/step - auc: 0.8130 - 

In [22]:
# Logistic Regression tuning
print("\nTuning Logistic Regression (FAST global search)...")

log_reg = LogisticRegression(max_iter=2000, random_state=42)
params = {
    'C': [0.01, 0.1, 1, 10, 50, 100],
    'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga']
}

# Use X_cluster (full processed + clustering features) and y (aligned)
grid = GridSearchCV(log_reg, params, cv=5, n_jobs=-1, scoring='roc_auc')
grid.fit(X_cluster, y)
best_lr = grid.best_estimator_
print("Best LR:", grid.best_params_)


Tuning Logistic Regression (FAST global search)...
Best LR: {'C': 0.1, 'solver': 'newton-cg'}


In [38]:

NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=RND)

# storage
oof_lgb = np.zeros(len(X_cluster))
oof_xgb = np.zeros(len(X_cluster))
oof_cat = np.zeros(len(X_cluster))
oof_lr  = np.zeros(len(X_cluster))

test_lgb = np.zeros(len(X_test_cluster))
test_xgb = np.zeros(len(X_test_cluster))
test_cat = np.zeros(len(X_test_cluster))
test_lr  = np.zeros(len(X_test_cluster))

# Logistic Regression
lr_model = LogisticRegression(
    solver="lbfgs",
    max_iter=2000,
    n_jobs=-1
)

# -------------------------
# MODEL PARAMETERS
# -------------------------
lgb_params = dict(
    n_estimators=500, learning_rate=0.05,
    num_leaves=64, subsample=0.8,
    colsample_bytree=0.8, random_state=42
)

xgb_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric="auc",
    random_state=42
)

cat_params = dict(
    iterations=500, learning_rate=0.05,
    depth=6, random_seed=42, verbose=0
)

# -------------------------
# K-FOLD TRAINING
# -------------------------
for fold, (tr_idx, val_idx) in enumerate(kf.split(X_cluster, y), 1):
    print(f"\n--- FOLD {fold} ---")
    X_tr, X_val = X_cluster.iloc[tr_idx], X_cluster.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    # -------------------------
    # LIGHTGBM
    # -------------------------
    lgbm = lgb.LGBMClassifier(**lgb_params)
    lgbm.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(50, verbose=False)]
    )
    oof_lgb[val_idx] = lgbm.predict_proba(X_val)[:,1]
    test_lgb += lgbm.predict_proba(X_test_cluster)[:,1] / NFOLDS
    print("LGB AUC:", roc_auc_score(y_val, oof_lgb[val_idx]))

    # -------------------------
    # XGBOOST (train full n_estimators)
    # -------------------------
    xgbm = XGBClassifier(**xgb_params)
    xgbm.fit(X_tr, y_tr)  # no early stopping
    oof_xgb[val_idx] = xgbm.predict_proba(X_val)[:,1]
    test_xgb += xgbm.predict_proba(X_test_cluster)[:,1] / NFOLDS
    print("XGB AUC:", roc_auc_score(y_val, oof_xgb[val_idx]))

    # -------------------------
    # CATBOOST
    # -------------------------
    cat = CatBoostClassifier(**cat_params)
    cat.fit(X_tr, y_tr, eval_set=(X_val, y_val), verbose=False)
    oof_cat[val_idx] = cat.predict_proba(X_val)[:,1]
    test_cat += cat.predict_proba(X_test_cluster)[:,1] / NFOLDS
    print("CAT AUC:", roc_auc_score(y_val, oof_cat[val_idx]))

    # -------------------------
    # LOGISTIC REGRESSION
    # -------------------------
    lr_model.fit(X_tr, y_tr)
    oof_lr[val_idx] = lr_model.predict_proba(X_val)[:,1]
    test_lr += lr_model.predict_proba(X_test_cluster)[:,1] / NFOLDS
    print("LR AUC:", roc_auc_score(y_val, oof_lr[val_idx]))



--- FOLD 1 ---
[LightGBM] [Info] Number of positive: 25007, number of negative: 22671
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002327 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2747
[LightGBM] [Info] Number of data points in the train set: 47678, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524498 -> initscore=0.098069
[LightGBM] [Info] Start training from score 0.098069
LGB AUC: 0.8203084224178006
XGB AUC: 0.8177149681405087
CAT AUC: 0.8235702130314664
LR AUC: 0.8087863581612819

--- FOLD 2 ---
[LightGBM] [Info] Number of positive: 25028, number of negative: 22650
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004519 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[Lig

In [39]:
# Compute OOF AUC
auc_lgb = roc_auc_score(y, oof_lgb)
auc_xgb = roc_auc_score(y, oof_xgb)
auc_cat = roc_auc_score(y, oof_cat)
auc_lr  = roc_auc_score(y, oof_lr)

print("\nOOF LGB:", auc_lgb)
print("OOF XGB:", auc_xgb)
print("OOF CAT:", auc_cat)
print("OOF LR :", auc_lr)
print("NN VAL :", nn_val_auc)


OOF LGB: 0.8404795775536034
OOF XGB: 0.8381304392362247
OOF CAT: 0.844350728368235
OOF LR : 0.8275043803330665
NN VAL : 0.7932894637511526


In [42]:
# Blend weights
raw = np.array([auc_lgb, auc_xgb, auc_cat, auc_lr, nn_val_auc])
# guard in case any AUC is NaN
raw = np.nan_to_num(raw, nan=0.0)
if raw.sum() <= 0:
    # fallback to equal weights
    weights = np.ones(len(raw)) / len(raw)
else:
    weights = raw / raw.sum()
print("\nBlend Weights:", weights)

# Blended test predictions
test_blend = (
    weights[0]*test_lgb +
    weights[1]*test_xgb +
    weights[2]*test_cat +
    weights[3]*test_lr +
    weights[4]*nn_test_probs
)

# Save submission
submission = pd.DataFrame({
    IDCOL: test[IDCOL] if IDCOL in test.columns else np.arange(len(test)),
    TARGET: np.where(test_blend >= 0.5, "Stayed", "Left")
})
submission.to_csv("submission.csv", index=False)
print("\nSubmission saved as submission.csv")



Blend Weights: [0.20283044 0.20226353 0.20376466 0.19969918 0.19144219]

Submission saved as submission.csv
