In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

warnings.filterwarnings("ignore")

In [3]:
train = pd.read_csv("../train.csv")
test = pd.read_csv("../test.csv")

dup_count = train.duplicated().sum()
print(f"\nNumber of duplicate rows in train: {dup_count}")

if dup_count > 0:
    train.drop_duplicates(inplace=True)
    print("Duplicates removed. New shape:", train.shape)


Number of duplicate rows in train: 13
Duplicates removed. New shape: (59598, 24)


In [4]:
log_cols = [
    'monthly_revenue_generated', 'funding_rounds_led',
    'num_dependents', 'years_with_startup'
]

def feature_engineer(df):
    df = df.copy()
    if 'years_with_startup' in df and 'years_since_founding' in df:
        df['experience_ratio'] = df['years_with_startup'] / (df['years_since_founding'] + 1e-9)
    if 'founder_age' in df and 'years_with_startup' in df:
        df['founder_join_age'] = df['founder_age'] - df['years_with_startup']
    if 'monthly_revenue_generated' in df and 'funding_rounds_led' in df:
        df['revenue_per_round'] = df['monthly_revenue_generated'] / (df['funding_rounds_led'] + 1)

    for c in log_cols:
        if c in df:
            df[f"log_{c}"] = np.log1p(df[c])
            df.drop(c, axis=1, inplace=True)

    return df


train_fe = feature_engineer(train.drop(columns=['founder_id']))
test_fe = feature_engineer(test.drop(columns=['founder_id']))

In [5]:
X = train_fe.drop(columns=['retention_status'])
y = train_fe['retention_status'].map({"Stayed": 1, "Left": 0}).astype(int)
X_test_final = test_fe.copy()

In [6]:
numerical_cols = [
    'years_since_founding', 'founder_age', 'distance_from_investor_hub',
    'experience_ratio', 'founder_join_age', 'revenue_per_round',
    'log_monthly_revenue_generated', 'log_funding_rounds_led',
    'log_num_dependents', 'log_years_with_startup'
]

binary_cols = [
    'working_overtime', 'remote_operations',
    'leadership_scope', 'innovation_support'
]

ordinal_cols = {
    'work_life_balance_rating': ['Poor', 'Fair', 'Good', 'Excellent'],
    'venture_satisfaction': ['Low', 'Medium', 'High', 'Very High'],
    'startup_performance_rating': ['Low', 'Below Average', 'Average', 'High'],
    'startup_reputation': ['Poor', 'Fair', 'Good', 'Excellent'],
    'founder_visibility': ['Low', 'Medium', 'High', 'Very High'],
    'startup_stage': ['Entry', 'Mid', 'Senior'],
    'team_size_category': ['Small', 'Medium', 'Large']
}

ordinal_feature_names = list(ordinal_cols.keys())
ordinal_categories = list(ordinal_cols.values())

nominal_cols = ['founder_gender', 'founder_role',
                'education_background', 'personal_status']

In [7]:
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

binary_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("binmap", OrdinalEncoder(categories=[["No", "Yes"]] * len(binary_cols)))
])

ordinal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ordenc", OrdinalEncoder(categories=ordinal_categories))
])

nominal_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(drop="first", sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_cols),
        ("bin", binary_pipeline, binary_cols),
        ("ord", ordinal_pipeline, ordinal_feature_names),
        ("nom", nominal_pipeline, nominal_cols),
    ]
)

In [8]:
X_train_df, X_val_df, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train = preprocessor.fit_transform(X_train_df)
X_val = preprocessor.transform(X_val_df)
X_test = preprocessor.transform(X_test_final)

print("Train shape:", X_train.shape)
print("Val shape:", X_val.shape)

Train shape: (47678, 32)
Val shape: (11920, 32)


In [9]:
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)

cb_model = CatBoostClassifier(
    iterations=1200,
    depth=10,
    learning_rate=0.03,
    l2_leaf_reg=5,
    border_count=128,
    random_state=42,
    loss_function="Logloss",
    eval_metric="F1",
    class_weights={0: class_weights[0], 1: class_weights[1]},
    boosting_type="Plain",
    bootstrap_type="Bayesian",
    verbose=200
)

cb_model.fit(X_train, y_train, eval_set=(X_val, y_val))

# Threshold tuning
val_proba = cb_model.predict_proba(X_val)[:, 1]
best_t = 0.5
best_f1 = 0

for t in np.arange(0.1, 0.9, 0.01):
    f1 = f1_score(y_val, (val_proba >= t).astype(int))
    if f1 > best_f1:
        best_f1 = f1
        best_t = t

print("BEST THRESHOLD:", best_t)
print("BEST F1:", best_f1)

# Predict test
test_proba = cb_model.predict_proba(X_test)[:, 1]
test_pred = (test_proba >= best_t).astype(int)
test_labels = np.where(test_pred == 1, "Stayed", "Left")

pd.DataFrame({
    "founder_id": test["founder_id"],
    "retention_status": test_labels
}).to_csv("submission_catboost_nosample.csv", index=False)

print("Saved → submission_catboost_nosample.csv")


0:	learn: 0.7347099	test: 0.7290383	best: 0.7290383 (0)	total: 111ms	remaining: 2m 12s
200:	learn: 0.7973328	test: 0.7492855	best: 0.7521380 (124)	total: 5.52s	remaining: 27.4s
400:	learn: 0.8392482	test: 0.7486305	best: 0.7521380 (124)	total: 19.8s	remaining: 39.5s
600:	learn: 0.8760561	test: 0.7478523	best: 0.7521380 (124)	total: 32.1s	remaining: 32s
800:	learn: 0.9079429	test: 0.7474240	best: 0.7521380 (124)	total: 43.2s	remaining: 21.5s
1000:	learn: 0.9325114	test: 0.7478617	best: 0.7521380 (124)	total: 51.4s	remaining: 10.2s
1199:	learn: 0.9502232	test: 0.7483689	best: 0.7521380 (124)	total: 58.8s	remaining: 0us

bestTest = 0.752137952
bestIteration = 124

Shrink model to first 125 iterations.
BEST THRESHOLD: 0.3699999999999999
BEST F1: 0.7750035466023549
Saved → submission_catboost_nosample.csv
