In [16]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.cluster import MiniBatchKMeans

import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import torch
import torch.nn as nn
import torch.optim as optim

In [17]:
train = pd.read_csv("../train.csv")
test = pd.read_csv("../test.csv")

TARGET = "retention_status"
IDCOL = "founder_id"

# Remove duplicates
train = train.drop_duplicates().reset_index(drop=True)

In [18]:
log_cols = ['monthly_revenue_generated', 'funding_rounds_led', 'num_dependents', 'years_with_startup']

def feature_engineer(df):
    df = df.copy()
    if 'years_with_startup' in df and 'years_since_founding' in df:
        df['experience_ratio'] = df['years_with_startup'] / (df['years_since_founding'] + 1e-9)
    if 'founder_age' in df and 'years_with_startup' in df:
        df['founder_join_age'] = df['founder_age'] - df['years_with_startup']
    if 'monthly_revenue_generated' in df and 'funding_rounds_led' in df:
        df['revenue_per_round'] = df['monthly_revenue_generated'] / (df['funding_rounds_led'] + 1)
    for c in log_cols:
        if c in df:
            df[f"log_{c}"] = np.log1p(df[c])
            df.drop(c, axis=1, inplace=True)
    return df

train_fe = feature_engineer(train)
test_fe = feature_engineer(test)

if IDCOL not in test_fe.columns and IDCOL in test.columns:
    test_fe[IDCOL] = test[IDCOL]

In [19]:
X = train_fe.drop(columns=[TARGET, IDCOL])
y = train_fe[TARGET].map({"Stayed": 1, "Left": 0}).astype(int)
X_test = test_fe.drop(columns=[IDCOL], errors="ignore")

In [20]:
# Feature columns
numerical_cols = [c for c in [
    'years_since_founding', 'founder_age', 'distance_from_investor_hub',
    'experience_ratio', 'founder_join_age', 'revenue_per_round',
    'log_monthly_revenue_generated', 'log_funding_rounds_led',
    'log_num_dependents', 'log_years_with_startup'
] if c in X.columns]

binary_cols = [c for c in [
    'working_overtime', 'remote_operations',
    'leadership_scope', 'innovation_support'
] if c in X.columns]

ordinal_cols = {
    'work_life_balance_rating': ['Poor', 'Fair', 'Good', 'Excellent'],
    'venture_satisfaction': ['Low', 'Medium', 'High', 'Very High'],
    'startup_performance_rating': ['Low', 'Below Average', 'Average', 'High'],
    'startup_reputation': ['Poor', 'Fair', 'Good', 'Excellent'],
    'founder_visibility': ['Low', 'Medium', 'High', 'Very High'],
    'startup_stage': ['Entry', 'Mid', 'Senior'],
    'team_size_category': ['Small', 'Medium', 'Large']
}
ordinal_feature_names = [c for c in ordinal_cols.keys() if c in X.columns]
ordinal_categories = [ordinal_cols[c] for c in ordinal_feature_names]

nominal_cols = [c for c in ['founder_gender', 'founder_role', 'education_background', 'personal_status'] if c in X.columns]

# Pipelines
transformers = []

if numerical_cols:
    num_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', RobustScaler())
    ])
    transformers.append(('num', num_pipe, numerical_cols))

if binary_cols:
    bin_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories=[['No','Yes']]*len(binary_cols)))
    ])
    transformers.append(('bin', bin_pipe, binary_cols))

if ordinal_feature_names:
    ord_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder(categories=ordinal_categories))
    ])
    transformers.append(('ord', ord_pipe, ordinal_feature_names))

if nominal_cols:
    nom_pipe = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False))
    ])
    transformers.append(('nom', nom_pipe, nominal_cols))

preprocessor = ColumnTransformer(transformers=transformers, remainder='drop')

X = pd.DataFrame(preprocessor.fit_transform(X), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

print("Processed X shape:", X.shape, "X_test shape:", X_test.shape)

Processed X shape: (59598, 32) X_test shape: (14900, 32)


In [21]:
K = 12
kmeans = MiniBatchKMeans(n_clusters=K, batch_size=4096, random_state=42)
kmeans.fit(X)
X["cluster_label"] = kmeans.labels_
X_test["cluster_label"] = kmeans.predict(X_test)

print("Cluster labels added!")

Cluster labels added!


In [22]:
# KFold setup
NFOLDS = 5
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=42)

oof_lr = np.zeros(len(X))
oof_lgb = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))
oof_mlp = np.zeros(len(X))

test_lr = np.zeros(len(X_test))
test_lgb = np.zeros(len(X_test))
test_xgb = np.zeros(len(X_test))
test_cat = np.zeros(len(X_test))
test_mlp = np.zeros(len(X_test))

In [23]:
lr_model_template = LogisticRegression(max_iter=2000, solver='lbfgs', class_weight='balanced', random_state=42, n_jobs=-1)
lgb_params = dict(n_estimators=800, learning_rate=0.05, num_leaves=64, max_depth=-1, subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1, verbose=-1)
xgb_params = dict(n_estimators=800, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)
cat_params = dict(iterations=800, learning_rate=0.05, depth=6, random_seed=42, verbose=0)

scaler = RobustScaler()

In [24]:
# MLP model
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(128, 64), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(64, 1), nn.Sigmoid()
        )
    def forward(self, x):
        return self.model(x)

def train_mlp(X_tr, y_tr, X_val):
    X_tr = torch.tensor(X_tr.values, dtype=torch.float32)
    y_tr = torch.tensor(y_tr.values.reshape(-1,1), dtype=torch.float32)
    X_val = torch.tensor(X_val.values, dtype=torch.float32)
    model = MLP(X_tr.shape[1])
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.BCELoss()
    model.train()
    for epoch in range(15):
        optimizer.zero_grad()
        preds = model(X_tr)
        loss = loss_fn(preds, y_tr)
        loss.backward()
        optimizer.step()
    model.eval()
    with torch.no_grad():
        val_preds = model(X_val).numpy().flatten()
    return model, val_preds


In [25]:
# Training loop
for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y), 1):
    print(f"\n=== FOLD {fold} ===")
    X_tr, X_val = X.iloc[train_idx], X.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    # Logistic Regression
    X_tr_s = scaler.fit_transform(X_tr)
    X_val_s = scaler.transform(X_val)
    lr = LogisticRegression(**lr_model_template.get_params())
    lr.fit(X_tr_s, y_tr)
    oof_lr[valid_idx] = lr.predict_proba(X_val_s)[:,1]
    test_lr += lr.predict_proba(scaler.transform(X_test))[:,1] / NFOLDS
    print("LR AUC:", roc_auc_score(y_val, oof_lr[valid_idx]))

    # LightGBM
    lgbm = lgb.LGBMClassifier(**lgb_params)
    lgbm.fit(X_tr, y_tr)
    oof_lgb[valid_idx] = lgbm.predict_proba(X_val)[:,1]
    test_lgb += lgbm.predict_proba(X_test)[:,1] / NFOLDS
    print("LGB AUC:", roc_auc_score(y_val, oof_lgb[valid_idx]))

    # XGBoost
    xgb = XGBClassifier(**xgb_params)
    xgb.fit(X_tr, y_tr)
    oof_xgb[valid_idx] = xgb.predict_proba(X_val)[:,1]
    test_xgb += xgb.predict_proba(X_test)[:,1] / NFOLDS
    print("XGB AUC:", roc_auc_score(y_val, oof_xgb[valid_idx]))

    # CatBoost
    cat = CatBoostClassifier(**cat_params)
    cat.fit(X_tr, y_tr)
    oof_cat[valid_idx] = cat.predict_proba(X_val)[:,1]
    test_cat += cat.predict_proba(X_test)[:,1] / NFOLDS
    print("CAT AUC:", roc_auc_score(y_val, oof_cat[valid_idx]))

    # MLP
    mlp_model, val_mlp = train_mlp(X_tr, y_tr, X_val)
    oof_mlp[valid_idx] = val_mlp
    with torch.no_grad():
        test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
        test_mlp += mlp_model(test_tensor).numpy().flatten() / NFOLDS
    print("MLP AUC:", roc_auc_score(y_val, oof_mlp[valid_idx]))



=== FOLD 1 ===
LR AUC: 0.8088208431084188
LGB AUC: 0.8152529630316006
XGB AUC: 0.8144807062223685
CAT AUC: 0.8218376971889209
MLP AUC: 0.7348986761985093

=== FOLD 2 ===
LR AUC: 0.8333668757865127
LGB AUC: 0.8409727751508028
XGB AUC: 0.842338635692736
CAT AUC: 0.8489692508517357
MLP AUC: 0.7679057175523454

=== FOLD 3 ===
LR AUC: 0.8329919166366488
LGB AUC: 0.8442913052274872
XGB AUC: 0.8433421315039106
CAT AUC: 0.8495660272493316
MLP AUC: 0.7826053369015227

=== FOLD 4 ===
LR AUC: 0.8354715737117034
LGB AUC: 0.8450837135258378
XGB AUC: 0.8448763743107475
CAT AUC: 0.8514871653043457
MLP AUC: 0.7209838498539842

=== FOLD 5 ===
LR AUC: 0.8296824971385737
LGB AUC: 0.8394296173710358
XGB AUC: 0.8401831295931623
CAT AUC: 0.8477735292082591
MLP AUC: 0.7517334347847149


In [27]:
# Stack predictions
stack_train = pd.DataFrame({
    "lr": oof_lr,
    "lgb": oof_lgb,
    "xgb": oof_xgb,
    "cat": oof_cat,
    "mlp": oof_mlp
})
stack_test = pd.DataFrame({
    "lr": test_lr,
    "lgb": test_lgb,
    "xgb": test_xgb,
    "cat": test_cat,
    "mlp": test_mlp
})

# Meta-model
meta = LogisticRegression(max_iter=2000, solver='lbfgs', random_state=42)
meta.fit(stack_train, y)
meta_oof = meta.predict_proba(stack_train)[:,1]
print("META OOF AUC:", roc_auc_score(y, meta_oof))

# Final predictions
final_probs = meta.predict_proba(stack_test)[:,1]
final_labels = np.where(final_probs >= 0.5, "Stayed", "Left")
submission = pd.DataFrame({IDCOL: test[IDCOL] if IDCOL in test.columns else np.arange(len(test)), TARGET: final_labels})
submission.to_csv("submission_safe_stack.csv", index=False)
print("Submission saved!")

META OOF AUC: 0.8424858407649873
Submission saved!
