In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE

warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv("../train.csv")
test = pd.read_csv("../test.csv")

dup_count = train.duplicated().sum()
print(f"\nNumber of duplicate rows in train: {dup_count}")

if dup_count > 0:
    train.drop_duplicates(inplace=True)
    print("Duplicates removed. New shape:", train.shape)


Number of duplicate rows in train: 13
Duplicates removed. New shape: (59598, 24)


In [3]:
log_cols = [
    'monthly_revenue_generated', 'funding_rounds_led',
    'num_dependents', 'years_with_startup'
]

def feature_engineer(df):
    df = df.copy()
    if 'years_with_startup' in df and 'years_since_founding' in df:
        df['experience_ratio'] = df['years_with_startup'] / (df['years_since_founding'] + 1e-9)
    if 'founder_age' in df and 'years_with_startup' in df:
        df['founder_join_age'] = df['founder_age'] - df['years_with_startup']
    if 'monthly_revenue_generated' in df and 'funding_rounds_led' in df:
        df['revenue_per_round'] = df['monthly_revenue_generated'] / (df['funding_rounds_led'] + 1)

    for c in log_cols:
        if c in df:
            df[f"log_{c}"] = np.log1p(df[c])
            df.drop(c, axis=1, inplace=True)

    return df


train_fe = feature_engineer(train.drop(columns=['founder_id']))
test_fe = feature_engineer(test.drop(columns=['founder_id']))

X = train_fe.drop(columns=['retention_status'])
y = train_fe['retention_status'].map({"Stayed": 1, "Left": 0}).astype(int)
X_test_final = test_fe.copy()

In [4]:
numerical_cols = [
    'years_since_founding', 'founder_age', 'distance_from_investor_hub',
    'experience_ratio', 'founder_join_age', 'revenue_per_round',
    'log_monthly_revenue_generated', 'log_funding_rounds_led',
    'log_num_dependents', 'log_years_with_startup'
]

binary_cols = ['working_overtime', 'remote_operations', 'leadership_scope', 'innovation_support']

ordinal_cols = {
    'work_life_balance_rating': ['Poor', 'Fair', 'Good', 'Excellent'],
    'venture_satisfaction': ['Low', 'Medium', 'High', 'Very High'],
    'startup_performance_rating': ['Low', 'Below Average', 'Average', 'High'],
    'startup_reputation': ['Poor', 'Fair', 'Good', 'Excellent'],
    'founder_visibility': ['Low', 'Medium', 'High', 'Very High'],
    'startup_stage': ['Entry', 'Mid', 'Senior'],
    'team_size_category': ['Small', 'Medium', 'Large']
}

ordinal_feature_names = list(ordinal_cols.keys())
ordinal_categories = list(ordinal_cols.values())

nominal_cols = ['founder_gender', 'founder_role', 'education_background', 'personal_status']

In [5]:
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

binary_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("binmap", OrdinalEncoder(categories=[["No", "Yes"]] * len(binary_cols)))
])

ordinal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordenc", OrdinalEncoder(categories=ordinal_categories))
])

nominal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("bin", binary_pipeline, binary_cols),
        ("ord", ordinal_pipeline, ordinal_feature_names),
        ("nom", nominal_pipeline, nominal_cols),
    ]
)

In [6]:
X_train_df, X_val_df, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train = preprocessor.fit_transform(X_train_df)
X_val = preprocessor.transform(X_val_df)
X_test = preprocessor.transform(X_test_final)

print("Train shape:", X_train.shape)
print("Val shape:", X_val.shape)

Train shape: (47678, 32)
Val shape: (11920, 32)


In [7]:
param_grid = {
    "C": [0.01, 0.1, 1, 10, 50],
    "solver": ["liblinear", "lbfgs", "saga", "newton-cg"]
}

In [8]:
print("\n===== MODEL A (NO SAMPLING) =====")

# class weights (important)
cw_vals = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weights = {0: cw_vals[0], 1: cw_vals[1]}

logreg = LogisticRegression(max_iter=2000, n_jobs=-1, class_weight=class_weights)

grid_a = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring="accuracy"
)

grid_a.fit(X_train, y_train)
print("Best params:", grid_a.best_params_)
print("Training accuracy:", grid_a.best_score_)
print("Validation accuracy:", grid_a.score(X_val, y_val))

# predict test
pred_a = grid_a.predict(X_test)
pred_a_labels = np.where(pred_a == 1, "Stayed", "Left")

pd.DataFrame({
    "founder_id": test["founder_id"],
    "retention_status": pred_a_labels
}).to_csv("submission_logreg_nosample.csv", index=False)

print("Saved → submission_logreg_nosample.csv")


===== MODEL A (NO SAMPLING) =====
Best params: {'C': 0.1, 'solver': 'saga'}
Training accuracy: 0.7403414697880996
Validation accuracy: 0.7400167785234899
Saved → submission_logreg_nosample.csv


In [9]:
print("\n===== MODEL B (SMOTE) =====")

sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X_train, y_train)

print("After SMOTE:", X_sm.shape)

logreg2 = LogisticRegression(max_iter=2000, n_jobs=-1)

grid_b = GridSearchCV(
    estimator=logreg2,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring="accuracy"
)

grid_b.fit(X_sm, y_sm)
print("Best params:", grid_b.best_params_)
print("Training accuracy:", grid_b.best_score_)
print("Validation accuracy:", grid_b.score(X_val, y_val))

# test predictions
pred_b = grid_b.predict(X_test)
pred_b_labels = np.where(pred_b == 1, "Stayed", "Left")

pd.DataFrame({
    "founder_id": test["founder_id"],
    "retention_status": pred_b_labels
}).to_csv("submission_logreg_smote.csv", index=False)

print("Saved → submission_logreg_smote.csv")

print("\n===== ALL DONE =====")



===== MODEL B (SMOTE) =====
After SMOTE: (50016, 32)
Best params: {'C': 10, 'solver': 'liblinear'}
Training accuracy: 0.742242453213656
Validation accuracy: 0.7396812080536913
Saved → submission_logreg_smote.csv

===== ALL DONE =====
