In [1]:
# IMPORTS 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy.stats import loguniform
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV
from imblearn.over_sampling import SMOTE

warnings.filterwarnings("ignore")

In [2]:
# LOAD DATA
train = pd.read_csv("../train.csv")
test = pd.read_csv("../test.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)


dup_count = train.duplicated().sum()
print(f"\nNumber of duplicate rows in train: {dup_count}")

if dup_count > 0:
    train.drop_duplicates(inplace=True)
    print("Duplicates removed. New shape:", train.shape)


Train shape: (59611, 24)
Test shape: (14900, 23)

Number of duplicate rows in train: 13
Duplicates removed. New shape: (59598, 24)


In [3]:
# FEATURE ENGINEERING 
log_transform_cols = [
    'monthly_revenue_generated', 'funding_rounds_led',
    'num_dependents', 'years_with_startup'
]

def feature_engineer(df):
    df = df.copy()

    if 'years_with_startup' in df.columns and 'years_since_founding' in df.columns:
        df['experience_ratio'] = df['years_with_startup'] / (df['years_since_founding'] + np.finfo(float).eps)
    
    if 'founder_age' in df.columns and 'years_with_startup' in df.columns:
        df['founder_join_age'] = df['founder_age'] - df['years_with_startup']
    
    if 'monthly_revenue_generated' in df.columns and 'funding_rounds_led' in df.columns:
        df['revenue_per_round'] = df['monthly_revenue_generated'] / (df['funding_rounds_led'] + 1)

    for col in log_transform_cols:
        if col in df.columns:
            df[f'log_{col}'] = np.log1p(df[col])
            df.drop(col, axis=1, inplace=True)

    return df


train_fe = feature_engineer(train.drop(columns=['founder_id']))
test_fe = feature_engineer(test.drop(columns=['founder_id']))

X = train_fe.drop(columns=['retention_status'])
y = train_fe['retention_status'].map({'Stayed': 1, 'Left': 0})
X_test_final = test_fe

X_train, X_test_val, y_train, y_test_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [4]:
# COLUMN DEFINITIONS 
numerical_cols = [
    'years_since_founding', 'founder_age', 'distance_from_investor_hub',
    'experience_ratio', 'founder_join_age', 'revenue_per_round',
    'log_monthly_revenue_generated', 'log_funding_rounds_led',
    'log_num_dependents', 'log_years_with_startup'
]

binary_cols = ['working_overtime', 'remote_operations', 'leadership_scope', 'innovation_support']

ordinal_cols = {
    'work_life_balance_rating': ['Poor', 'Fair', 'Good', 'Excellent'],
    'venture_satisfaction': ['Low', 'Medium', 'High', 'Very High'],
    'startup_performance_rating': ['Low', 'Below Average', 'Average', 'High'],
    'startup_reputation': ['Poor', 'Fair', 'Good', 'Excellent'],
    'founder_visibility': ['Low', 'Medium', 'High', 'Very High'],
    'startup_stage': ['Entry', 'Mid', 'Senior'],
    'team_size_category': ['Small', 'Medium', 'Large']
}

ordinal_feature_names = list(ordinal_cols.keys())
ordinal_categories = list(ordinal_cols.values())

nominal_cols = ['founder_gender', 'founder_role', 'education_background', 'personal_status']

In [5]:
#  PREPROCESSING
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

binary_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('mapper', OrdinalEncoder(categories=[['No', 'Yes']] * len(binary_cols),
                              handle_unknown='use_encoded_value', unknown_value=-1))
])

ordinal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=ordinal_categories,
                                       handle_unknown='use_encoded_value', unknown_value=-1))
])

nominal_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('bin', binary_pipeline, binary_cols),
        ('ord', ordinal_pipeline, ordinal_feature_names),
        ('nom', nominal_pipeline, nominal_cols)
    ]
)


In [6]:
# TRANSFORM DATA 
X_train_processed = preprocessor.fit_transform(X_train)
X_val_processed = preprocessor.transform(X_test_val)
X_test_processed = preprocessor.transform(X_test_final)

In [7]:
#  SVM CONFIG 
svc = SVC(
    random_state=42,
    class_weight='balanced',
    probability=False,
    cache_size=1500,
    max_iter=100000
)

param_dist = {
    'C': loguniform(1e-2, 1e3),
    'gamma': loguniform(1e-4, 1e1),
    'kernel': ['rbf']
}

halving_search = HalvingRandomSearchCV(
    estimator=svc,
    param_distributions=param_dist,
    n_candidates=28,
    factor=3,
    resource='n_samples',
    scoring='roc_auc',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

In [11]:

#  MODEL 1: TRAIN WITHOUT ANY SAMPLING

print("\n================= Training WITHOUT SAMPLING =================")
halving_search.fit(X_train_processed, y_train)

best_model_no_sample = halving_search.best_estimator_

val_scores = best_model_no_sample.decision_function(X_val_processed)
auc_no_sample = roc_auc_score(y_test_val, val_scores)

print("AUC Without Sampling:", auc_no_sample)

# Final training
X_full_proc = np.vstack([X_train_processed, X_val_processed])
y_full = np.concatenate([y_train.values, y_test_val.values])

best_model_no_sample.fit(X_full_proc, y_full)

# Save CSV
df_pred = pd.DataFrame({
    'founder_id': test['founder_id'],
    'retention_status': np.where(
        best_model_no_sample.predict(X_test_processed) == 1, 'Stayed', 'Left'
    )
})
df_pred.to_csv("submission_svm_nosample.csv", index=False)
print("Saved: submission_svm_nosample.csv")



n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 8
min_resources_: 12
max_resources_: 47678
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 28
n_resources: 12
Fitting 3 folds for each of 28 candidates, totalling 84 fits
----------
iter: 1
n_candidates: 10
n_resources: 36
Fitting 3 folds for each of 10 candidates, totalling 30 fits
----------
iter: 2
n_candidates: 4
n_resources: 108
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 3
n_candidates: 2
n_resources: 324
Fitting 3 folds for each of 2 candidates, totalling 6 fits
AUC Without Sampling: 0.8268994570996278
Saved: submission_svm_nosample.csv


In [9]:

#  MODEL 2: TRAIN WITH SMOTE 

print("\n================= Training WITH SMOTE =================")
smote = SMOTE(random_state=42)
X_sm, y_sm = smote.fit_resample(X_train_processed, y_train)

halving_search.fit(X_sm, y_sm)
best_model_smote = halving_search.best_estimator_

val_scores_smote = best_model_smote.decision_function(X_val_processed)
auc_smote = roc_auc_score(y_test_val, val_scores_smote)

print("AUC With SMOTE:", auc_smote)

best_model_smote.fit(X_full_proc, y_full)

df_pred2 = pd.DataFrame({
    'founder_id': test['founder_id'],
    'retention_status': np.where(
        best_model_smote.predict(X_test_processed) == 1, 'Stayed', 'Left'
    )
})
df_pred2.to_csv("submission_svm_smote.csv", index=False)
print("Saved: submission_svm_smote.csv")


n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 8
min_resources_: 12
max_resources_: 50016
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 28
n_resources: 12
Fitting 3 folds for each of 28 candidates, totalling 84 fits
----------
iter: 1
n_candidates: 10
n_resources: 36
Fitting 3 folds for each of 10 candidates, totalling 30 fits
----------
iter: 2
n_candidates: 4
n_resources: 108
Fitting 3 folds for each of 4 candidates, totalling 12 fits
----------
iter: 3
n_candidates: 2
n_resources: 324
Fitting 3 folds for each of 2 candidates, totalling 6 fits
AUC With SMOTE: 0.8221561901885117
Saved: submission_svm_smote.csv


In [10]:

print("\n================ FINAL SUMMARY ================")
print(f"AUC (No Sampling): {auc_no_sample:.4f}")
print(f"AUC (SMOTE): {auc_smote:.4f}")
print("Generated:")
print("- submission_svm_nosample.csv")
print("- submission_svm_smote.csv")


AUC (No Sampling): 0.8269
AUC (SMOTE): 0.8222
Generated:
- submission_svm_nosample.csv
- submission_svm_smote.csv
