In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# A. LOAD DATA
# Read the raw train / test splits from disk.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Tag every row with its origin so the two splits can be separated again
# after the shared preprocessing. The test split has its target column set
# to the placeholder 'Unknown'.
train = train.assign(is_train=1)
test = test.assign(is_train=0, retention_status='Unknown')

# Stack the two frames row-wise (original index labels are kept as-is).
combined = pd.concat([train, test])




# B. DROP UNUSED COLUMNS
# Columns removed from the combined frame before any further processing.
cols_to_drop = [
    'founder_id',
    'founder_role',
    'leadership_scope',
    'founder_visibility',
    'innovation_support',
    'team_size_category',
]
combined = combined.drop(cols_to_drop, axis=1)


# Imputation statistics are computed from the training rows only, so that no
# information from the test split leaks into the engineered features. This
# matches the StandardScaler later in this script, which is likewise fit on
# the training data alone and merely applied to the submission data.
_impute_cols = ['monthly_revenue_generated', 'years_since_founding', 'num_dependents']
_train_stats_src = combined.loc[(combined['is_train'] == 1).to_numpy(), _impute_cols]

# Fill missing revenue with the training median, then compress the scale
# with log(1 + x).
combined['monthly_revenue_generated'] = np.log1p(
    combined['monthly_revenue_generated'].fillna(
        _train_stats_src['monthly_revenue_generated'].median()
    )
)

# C. IMPUTE MISSING VALUES
# Numeric column: training median.
combined['years_since_founding'] = combined['years_since_founding'].fillna(
    _train_stats_src['years_since_founding'].median()
)
# Count-like column: most frequent training value (first mode on ties).
combined['num_dependents'] = combined['num_dependents'].fillna(
    _train_stats_src['num_dependents'].mode()[0]
)
# Rating columns: missing entries get the explicit label 'Unknown', which the
# ordinal rating map later in this script encodes as 0.
combined['work_life_balance_rating'] = combined['work_life_balance_rating'].fillna('Unknown')
combined['venture_satisfaction'] = combined['venture_satisfaction'].fillna('Unknown')

# D. FEATURE ENGINEERING
# Floor founder_age at 18, then derive start_age as the (floored) age minus
# the years spent with the startup. `assign` evaluates keyword arguments in
# order, so the lambda sees the already-clipped founder_age.
combined = combined.assign(
    founder_age=combined['founder_age'].clip(lower=18),
    start_age=lambda df: df['founder_age'] - df['years_with_startup'],
)

# E. ORDINAL ENCODING
# Rating labels grouped by the integer score they are encoded as; labels that
# share a score are treated as synonyms.
rating_levels = {
    0: ['Unknown'],
    1: ['Low', 'Poor'],
    2: ['Below Average'],
    3: ['Fair', 'Medium', 'Average'],
    4: ['Good', 'High'],
    5: ['Very High', 'Excellent'],
}
rating_map = {
    label: score
    for score, labels in rating_levels.items()
    for label in labels
}

# Any value not present in rating_map (including missing values) maps to NaN
# and is then encoded as 0.
for col in (
    'work_life_balance_rating',
    'venture_satisfaction',
    'startup_performance_rating',
    'startup_reputation',
):
    combined[col] = combined[col].map(rating_map).fillna(0)

# Startup stage: 'Senior' and 'Growth' share level 3; values not in the map
# fall back to level 1.
stage_map = {
    'Entry': 1,
    'Mid': 2,
    'Senior': 3,
    'Growth': 3,
    'Established': 4,
}
combined['startup_stage'] = combined['startup_stage'].map(stage_map).fillna(1)

# Yes/No flags become 1/0. Values outside the map (including missing values)
# are left as NaN -- no fallback is applied here.
binary_map = {'No': 0, 'Yes': 1}
for col in ('working_overtime', 'remote_operations'):
    combined[col] = combined[col].map(binary_map)

# F. ONE-HOT ENCODING
# Replace each nominal column with indicator columns; drop_first=True omits
# the indicator for the first level of every encoded column.
one_hot_cols = ['founder_gender', 'education_background', 'personal_status']
combined = pd.get_dummies(combined, columns=one_hot_cols, drop_first=True)


# G. SPLIT BACK INTO TRAIN / TEST
# Separate the rows again using the is_train marker and discard the marker.
# The test rows also lose the placeholder target column.
train_final = combined.loc[combined['is_train'].eq(1)].drop('is_train', axis=1)
test_final = combined.loc[combined['is_train'].eq(0)].drop(
    ['is_train', 'retention_status'], axis=1
)

# Target Mapping
# 'Stayed' -> 0, 'Left' -> 1; any other label maps to NaN.
target_map = {'Stayed': 0, 'Left': 1}
X = train_final.drop('retention_status', axis=1)
y = train_final['retention_status'].map(target_map)

# Select the submission features in exactly the training column order.
X_submit = test_final[X.columns]

# Standardization
# Learn the scaling parameters from the training features only, then apply
# the same fitted transform to both the training and the submission features.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_submit_scaled = scaler.transform(X_submit)

# Report the resulting array shapes.
print("Training data shape:", X_scaled.shape)
print("Submission data shape:", X_submit_scaled.shape)
