In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Sklearn imports
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

# Clustering
from sklearn.mixture import GaussianMixture

# Boosting
from xgboost import XGBClassifier

In [2]:
# Column names referenced throughout the notebook.
TARGET = "retention_status"
IDCOL = "founder_id"

# Load both splits. Exact duplicate rows are removed from the training data
# only, and its index is renumbered so it stays contiguous from 0.
train = pd.read_csv("../train.csv").drop_duplicates(ignore_index=True)
test = pd.read_csv("../test.csv")

In [3]:
# Columns that are replaced by their log1p transform in feature_engineer().
log_cols = [
    'monthly_revenue_generated',
    'funding_rounds_led',
    'num_dependents',
    'years_with_startup',
]


def feature_engineer(df):
    """Return a copy of ``df`` with ratio features added and log columns swapped in.

    Derived columns are only created when all of their source columns are
    present:

    * ``experience_ratio``  = years_with_startup / (years_since_founding + 1e-9)
    * ``founder_join_age``  = founder_age - years_with_startup
    * ``revenue_per_round`` = monthly_revenue_generated / (funding_rounds_led + 1)

    Every column listed in ``log_cols`` that exists in the frame is replaced by
    a ``log_<name>`` column holding ``np.log1p`` of the original values.
    The input frame itself is never modified.
    """
    out = df.copy()

    def has_all(*names):
        # True when every named column is available in the working frame.
        return all(name in out for name in names)

    if has_all('years_with_startup', 'years_since_founding'):
        # The tiny epsilon avoids a division by exactly zero.
        denominator = out['years_since_founding'] + 1e-9
        out['experience_ratio'] = out['years_with_startup'] / denominator

    if has_all('founder_age', 'years_with_startup'):
        out['founder_join_age'] = out['founder_age'] - out['years_with_startup']

    if has_all('monthly_revenue_generated', 'funding_rounds_led'):
        rounds_plus_one = out['funding_rounds_led'] + 1
        out['revenue_per_round'] = out['monthly_revenue_generated'] / rounds_plus_one

    # Append the log columns first, then remove the raw ones in a single drop;
    # the resulting column order matches adding/dropping them one at a time.
    raw_present = [name for name in log_cols if name in out]
    for name in raw_present:
        out[f"log_{name}"] = np.log1p(out[name])
    return out.drop(columns=raw_present)

# Run the same feature engineering on independent copies of both splits.
train_fe, test_fe = (feature_engineer(split.copy()) for split in (train, test))

# Re-attach the ID column to the engineered test frame if it is missing there
# but present in the raw test data.
if IDCOL in test.columns and IDCOL not in test_fe.columns:
    test_fe[IDCOL] = test[IDCOL]

In [4]:
# Binary target: "Stayed" -> 1, "Left" -> 0.
y = train_fe[TARGET].map({"Stayed": 1, "Left": 0}).astype(int)

# Feature frames: strip the label and the identifier. The test frame may not
# carry the ID column, so its absence is tolerated there.
X = train_fe.drop([TARGET, IDCOL], axis=1)
X_test = test_fe.drop([IDCOL], axis=1, errors="ignore")

In [5]:
# Continuous features (raw and engineered).
numerical_cols = [
    'years_since_founding', 'founder_age', 'distance_from_investor_hub',
    'experience_ratio', 'founder_join_age', 'revenue_per_round',
    'log_monthly_revenue_generated', 'log_funding_rounds_led',
    'log_num_dependents', 'log_years_with_startup'
]

# Two-level flags (encoded downstream with the category order ["No", "Yes"]).
binary_cols = [
    'working_overtime', 'remote_operations',
    'leadership_scope', 'innovation_support'
]

# Ordered categoricals. Each list gives the levels in encoding order: the
# position of a level in its list becomes its integer code.
ordinal_cols = {
    'work_life_balance_rating': ['Poor', 'Fair', 'Good', 'Excellent'],
    'venture_satisfaction': ['Low', 'Medium', 'High', 'Very High'],
    'startup_performance_rating': ['Low', 'Below Average', 'Average', 'High'],
    'startup_reputation': ['Poor', 'Fair', 'Good', 'Excellent'],
    'founder_visibility': ['Low', 'Medium', 'High', 'Very High'],
    'startup_stage': ['Entry', 'Mid', 'Senior'],
    'team_size_category': ['Small', 'Medium', 'Large']
}

# Unordered categoricals.
nominal_cols = ['founder_gender', 'founder_role', 'education_background', 'personal_status']

# Keep only columns that exist in X.
numerical_cols = [c for c in numerical_cols if c in X.columns]
binary_cols = [c for c in binary_cols if c in X.columns]
nominal_cols = [c for c in nominal_cols if c in X.columns]
ordinal_feature_names = [c for c in ordinal_cols if c in X.columns]

# Build the category lists from the *filtered* names so that
# ordinal_categories[i] always describes ordinal_feature_names[i]. Taking
# list(ordinal_cols.values()) before filtering would leave the two lists out
# of step as soon as one ordinal column is absent from X.
ordinal_categories = [ordinal_cols[c] for c in ordinal_feature_names]


In [6]:
# One preprocessing pipeline per feature type. Each is always a real Pipeline;
# whether it is used at all is decided when the transformer list is assembled.

# Numeric: median imputation, then outlier-robust scaling.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler()),
])

# Binary flags: mode imputation, then "No" -> 0 / "Yes" -> 1.
binary_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("binmap", OrdinalEncoder(categories=[["No", "Yes"]] * len(binary_cols))),
])

# Ordered categoricals: mode imputation, then integer codes in the declared order.
ordinal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordenc", OrdinalEncoder(categories=ordinal_categories)),
])

# Unordered categoricals: mode imputation, then one-hot with the first level dropped.
nominal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first", sparse_output=False)),
])

# Register a pipeline only when it actually has columns to work on.
candidate_transformers = [
    ("num", numerical_pipeline, numerical_cols),
    ("bin", binary_pipeline, binary_cols),
    ("ord", ordinal_pipeline, ordinal_feature_names),
    ("nom", nominal_pipeline, nominal_cols),
]
transformers = [spec for spec in candidate_transformers if spec[2]]

# Fit on the training features only; the test features are transformed with
# the statistics learned from train. Unlisted columns are dropped.
preprocessor = ColumnTransformer(transformers=transformers, remainder="drop")
X_processed = preprocessor.fit_transform(X)
X_test_processed = preprocessor.transform(X_test)

# Convert back to DataFrame.
# NOTE: X and X_test are rebound to the transformed frames here, so this cell
# cannot be re-run on its own without first re-running the cell that builds X.
feature_names = preprocessor.get_feature_names_out()
X = pd.DataFrame(X_processed, index=X.index, columns=feature_names)
X_test = pd.DataFrame(X_test_processed, index=X_test.index, columns=feature_names)


In [7]:
# Fit a Gaussian mixture on the preprocessed training features.
BEST_K = 8
gmm = GaussianMixture(
    n_components=BEST_K,
    covariance_type="full",
    random_state=42,
).fit(X)

# Hard cluster assignments, stored on the engineered frames.
train_fe["Cluster"] = gmm.predict(X)
test_fe["Cluster"] = gmm.predict(X_test)

# Soft assignments. Both matrices are computed before any column is appended
# below, so the mixture only ever sees the original preprocessed features.
cluster_probs_train = gmm.predict_proba(X)
cluster_probs_test = gmm.predict_proba(X_test)

# One ClusterProb_<k> column per mixture component, on train and test alike.
for k, (train_col, test_col) in enumerate(zip(cluster_probs_train.T, cluster_probs_test.T)):
    X[f"ClusterProb_{k}"] = train_col
    X_test[f"ClusterProb_{k}"] = test_col

# ------------------------------
# 4. TRAIN XGBOOST
# ------------------------------
# Hyper-parameters shared by every fold model.
xgb_params = dict(
    n_estimators=600,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(X))  # out-of-fold class predictions, filled fold by fold
models = []             # one independently fitted model per fold

for train_index, val_index in kf.split(X, y):
    X_tr, X_val = X.iloc[train_index], X.iloc[val_index]
    y_tr, y_val = y.iloc[train_index], y.iloc[val_index]

    # Build a fresh estimator for each fold. Re-fitting one shared instance
    # would make every entry of `models` reference the same object, i.e. the
    # model from the last fold repeated, and the test-time "ensemble" would
    # silently collapse to a single model.
    xgb_model = XGBClassifier(**xgb_params)
    xgb_model.fit(X_tr, y_tr)
    pred = xgb_model.predict(X_val)

    oof[val_index] = pred
    models.append(xgb_model)

print("OOF F1 Score:", f1_score(y, oof))


OOF F1 Score: 0.7635091977516607


In [11]:
# Average the fold models' probability of class 1 ("Stayed") on the test set.
test_probs = np.zeros(len(X_test))
for fold_model in models:
    stay_prob = fold_model.predict_proba(X_test)[:, 1]
    test_probs = test_probs + stay_prob
test_probs = test_probs / len(models)

# Map probabilities to discrete labels
threshold = 0.5
is_stayed = test_probs >= threshold
test_labels = np.where(is_stayed, "Stayed", "Left")

# ------------------------------
# 6. SAVE SUBMISSION
# ------------------------------
# Use the real IDs when the test file provides them; otherwise fall back to
# plain row numbers.
if IDCOL in test.columns:
    submission_ids = test[IDCOL]
else:
    submission_ids = np.arange(len(test))

submission = pd.DataFrame({IDCOL: submission_ids, TARGET: test_labels})
submission.to_csv("submission_xgb_gmm_labels.csv", index=False)
print("Submission saved as submission_xgb_gmm_labels.csv")

Submission saved as submission_xgb_gmm_labels.csv
