In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


print("Loading Data...")
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Tag every row with its origin so the two sets can be told apart after they
# are stacked; the test rows get a placeholder 'Unknown' label so both frames
# carry a retention_status column.
train = train.assign(is_train=1)
test = test.assign(is_train=0, retention_status='Unknown')

# Stack train on top of test (row-wise) so preprocessing is applied to both.
combined = pd.concat([train, test])


print("Applying Advanced Feature Engineering...")

# A. CLEANING
# These columns are removed before any feature is built.
cols_to_drop = ['founder_id', 'founder_visibility', 'innovation_support']
combined = combined.drop(columns=cols_to_drop)

# B. IMPUTE
# Fill values are computed from the training rows only, so no statistic of the
# test set leaks into the training features (the scaler further down in this
# script is likewise fitted on training rows only).
_train_rows = combined['is_train'] == 1
for col in ['monthly_revenue_generated', 'years_since_founding']:
    combined[col] = combined[col].fillna(combined.loc[_train_rows, col].median())
combined['num_dependents'] = combined['num_dependents'].fillna(
    combined.loc[_train_rows, 'num_dependents'].mode()[0]
)
# Missing ratings become the explicit label 'Unknown', which sat_map below
# scores the same as the middle categories.
combined['work_life_balance_rating'] = combined['work_life_balance_rating'].fillna('Unknown')
combined['venture_satisfaction'] = combined['venture_satisfaction'].fillna('Unknown')

# C. DERIVED FEATURES
# 1. Revenue Efficiency: log1p of monthly revenue divided by (years + 1); the
#    +1 keeps the denominator non-zero when years_since_founding is 0.
combined['revenue_efficiency'] = (
    np.log1p(combined['monthly_revenue_generated'])
    / (combined['years_since_founding'] + 1)
)

# 2. Founder Experience Gap
combined['prior_experience'] = combined['founder_age'] - combined['years_with_startup']

# Yes/No flags -> 1/0. NOTE(review): any other value (including missing)
# becomes NaN here and flows into burnout_index -- confirm the raw columns
# only contain 'Yes'/'No'.
binary_map = {'No': 0, 'Yes': 1}
combined['working_overtime'] = combined['working_overtime'].map(binary_map)
combined['remote_operations'] = combined['remote_operations'].map(binary_map)

# Shared ordinal scale (0 = worst ... 4 = best) for all rating-like columns.
sat_map = {'Unknown': 2, 'Low': 0, 'Poor': 0, 'Below Average': 1, 'Fair': 2, 'Medium': 2, 'Average': 2, 'Good': 3, 'High': 3, 'Very High': 4, 'Excellent': 4}
# Score used for any label that is missing from sat_map.
sat_default = sat_map['Unknown']

# 3. Satisfaction score / burnout index. Unmapped labels fall back to the
#    neutral score, exactly as in the ordinal encoding below; without the
#    fallback they would turn into NaN and propagate into burnout_index.
combined['satisfaction_score'] = combined['venture_satisfaction'].map(sat_map).fillna(sat_default)

# Overtime weighted down by satisfaction; the +1 avoids dividing by a score of 0.
combined['burnout_index'] = combined['working_overtime'] / (combined['satisfaction_score'] + 1)

# D. ORDINAL ENCODING
for col in ['work_life_balance_rating', 'venture_satisfaction', 'startup_performance_rating', 'startup_reputation']:
    combined[col] = combined[col].map(sat_map).fillna(sat_default)

# Stage labels outside stage_map (or missing) are treated as the lowest stage.
stage_map = {'Entry': 1, 'Mid': 2, 'Senior': 3, 'Growth': 3, 'Established': 4}
combined['startup_stage'] = combined['startup_stage'].map(stage_map).fillna(1)

# E. ONE-HOT ENCODING
# drop_first=True drops one dummy per categorical column.
combined = pd.get_dummies(
    combined,
    columns=['founder_gender', 'education_background', 'personal_status', 'founder_role', 'team_size_category', 'leadership_scope'],
    drop_first=True,
)


print("Scaling Data...")

# Split the combined frame back into its two origins using the is_train flag,
# dropping the flag itself (and, for the test rows, the placeholder label).
train_final = combined.loc[combined['is_train'].eq(1)].drop(columns=['is_train'])
test_final = combined.loc[combined['is_train'].eq(0)].drop(columns=['is_train', 'retention_status'])

# Features are everything except the label; the label is encoded as
# 'Stayed' -> 0, 'Left' -> 1.
X = train_final.drop(columns=['retention_status'])
y = train_final['retention_status'].map({'Stayed': 0, 'Left': 1})

# Select the submission features in exactly the training column order.
X_submit = test_final[X.columns]

# Learn the scaling parameters from the training features only, then apply the
# same transformation to both sets.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_submit_scaled = scaler.transform(X_submit)

print("Training data shape:", X_scaled.shape)
print("Submission data shape:", X_submit_scaled.shape)