In [None]:
import pandas as pd
import numpy as np
import time
import os
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler



# Directory that receives every generated submission CSV; created up front so
# later cells can write into it without checking for its existence.
OUTPUT_DIR = '../output'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- 1. LOAD DATA ---
print("Loading Data...")
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Combine for consistent processing.
# `is_train` records which file each row came from so the combined frame can be
# split back into its train and test parts after cleaning.
train['is_train'] = 1
test['is_train'] = 0
# Placeholder target so both frames expose the same columns.
test['retention_status'] = 'Unknown'
# ignore_index=True gives every combined row a unique label. Without it the
# row labels of `train` and `test` (both numbered from 0 by read_csv) would be
# duplicated in `combined`, which makes label-based selection ambiguous.
combined = pd.concat([train, test], axis=0, ignore_index=True)


print("Applying Forensic Cleaning Pipeline...")

# Imputation statistics below are computed from the training rows only, so that
# nothing about the test set's feature distribution leaks into the values the
# models are trained on. A plain boolean array is used so the row selection is
# positional and does not depend on the frame's index labels.
train_rows = (combined['is_train'] == 1).to_numpy()

# Columns excluded from modelling.
cols_to_drop = ['founder_id', 'founder_role', 'leadership_scope',
                'founder_visibility', 'innovation_support', 'team_size_category']
combined = combined.drop(columns=cols_to_drop)

#  FIX SKEW
# Missing revenue is filled with the training median, then log1p-compressed.
revenue_median = combined.loc[train_rows, 'monthly_revenue_generated'].median()
combined['monthly_revenue_generated'] = np.log1p(
    combined['monthly_revenue_generated'].fillna(revenue_median)
)

#  IMPUTE MISSING VALUES
# Numeric gaps: training median / training mode. Rating gaps: explicit 'Unknown'
# label, which the ordinal map below encodes as 0.
founding_median = combined.loc[train_rows, 'years_since_founding'].median()
combined['years_since_founding'] = combined['years_since_founding'].fillna(founding_median)
dependents_mode = combined.loc[train_rows, 'num_dependents'].mode()[0]
combined['num_dependents'] = combined['num_dependents'].fillna(dependents_mode)
combined['work_life_balance_rating'] = combined['work_life_balance_rating'].fillna('Unknown')
combined['venture_satisfaction'] = combined['venture_satisfaction'].fillna('Unknown')

# FEATURE ENGINEERING
# Ages below 18 are raised to 18 before deriving the age at which the founder
# joined the startup.
combined['founder_age'] = combined['founder_age'].clip(lower=18)
combined['start_age'] = combined['founder_age'] - combined['years_with_startup']

#  ORDINAL ENCODING
# Several vocabularies share one 0-5 scale; any label missing from the map
# (or a missing value) falls back to 0, the same code as 'Unknown'.
rating_map = {'Unknown': 0, 'Low': 1, 'Poor': 1, 'Below Average': 2,
              'Fair': 3, 'Medium': 3, 'Average': 3,
              'Good': 4, 'High': 4, 'Very High': 5, 'Excellent': 5}
for col in ['work_life_balance_rating', 'venture_satisfaction', 'startup_performance_rating', 'startup_reputation']:
    combined[col] = combined[col].map(rating_map).fillna(0)

# Stage labels outside the map (or missing) default to 1, the 'Entry' code.
stage_map = {'Entry': 1, 'Mid': 2, 'Senior': 3, 'Growth': 3, 'Established': 4}
combined['startup_stage'] = combined['startup_stage'].map(stage_map).fillna(1)

# NOTE: no fallback here - any value other than 'No'/'Yes' (including a missing
# value) becomes NaN and is passed on unchanged.
binary_map = {'No': 0, 'Yes': 1}
for col in ['working_overtime', 'remote_operations']:
    combined[col] = combined[col].map(binary_map)

#  ONE-HOT ENCODING
# drop_first=True removes one indicator per feature to avoid a redundant column.
combined = pd.get_dummies(combined, columns=['founder_gender', 'education_background', 'personal_status'], drop_first=True)


print("Scaling Data...")

# Split the combined frame back into its two origins using the `is_train` flag;
# the flag itself (and, for test rows, the placeholder target) is discarded.
train_final = combined.loc[combined['is_train'].eq(1)].drop(columns=['is_train'])
test_final = combined.loc[combined['is_train'].eq(0)].drop(columns=['is_train', 'retention_status'])

# Target Mapping
target_map = {'Stayed': 0, 'Left': 1}
X = train_final.drop(columns=['retention_status'])
y = train_final['retention_status'].map(target_map)
# Select the test columns in exactly the order of the training features.
X_submit = test_final.loc[:, X.columns]

# Scale (Critical for SVM/NN)
# The scaler is fitted on the training features and then applied to both sets.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_submit_scaled = scaler.transform(X_submit)

# Reverse lookup (numeric code -> label) used when writing predictions out.
inverse_map = {code: label for label, code in target_map.items()}



# Gradient Boosting, Random Forest, SVM, and Neural Network Models

In [None]:


# Registry of candidate models. Each entry pairs an unfitted estimator
# ("estimator") with the hyperparameter candidates to search over ("params").
# Every estimator is seeded with random_state=42.
models_config = dict(
    GradientBoosting=dict(
        estimator=GradientBoostingClassifier(random_state=42),
        params=dict(
            n_estimators=[100, 200, 300],
            learning_rate=[0.01, 0.05, 0.1],
            max_depth=[3, 4, 5],
            subsample=[0.8, 0.9, 1.0],
        ),
    ),
    RandomForest=dict(
        estimator=RandomForestClassifier(random_state=42),
        params=dict(
            n_estimators=[100, 200, 300],
            max_depth=[10, 20, None],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 4],
        ),
    ),
    SVM=dict(
        estimator=SVC(random_state=42),
        params=dict(
            C=[0.1, 1, 10],
            kernel=['rbf'],
            gamma=['scale', 'auto'],
        ),
    ),
    NeuralNetwork=dict(
        estimator=MLPClassifier(max_iter=500, random_state=42),
        params=dict(
            hidden_layer_sizes=[(50,), (100,), (50, 50), (100, 50)],
            activation=['relu', 'tanh'],
            alpha=[0.0001, 0.001],
            learning_rate_init=[0.001, 0.01],
        ),
    ),
)



print("\n" + "="*50)
print("   STARTING MULTI-MODEL GENERATION")
print("="*50)

# For every configured model: tune it with a randomized search, predict the
# test set with the best estimator found, and write one submission CSV.
for name, config in models_config.items():
    print(f"\nProcessing Model: {name}...")
    start = time.time()

    # RandomizedSearchCV warns when n_iter is larger than the number of distinct
    # combinations in an all-list search space (the SVM space has only 6), so
    # the number of draws is capped at the size of the space.
    candidate_lists = list(config['params'].values())
    n_iter = 10
    if all(isinstance(values, (list, tuple)) for values in candidate_lists):
        space_size = int(np.prod([len(values) for values in candidate_lists]))
        n_iter = min(n_iter, space_size)

    # 1. Hyperparameter Tuning (Random Search)
    search = RandomizedSearchCV(
        estimator=config['estimator'],
        param_distributions=config['params'],
        n_iter=n_iter,
        cv=3,
        scoring='accuracy',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    search.fit(X_scaled, y)
    best_model = search.best_estimator_

    print(f"  -> Best Params: {search.best_params_}")
    print(f"  -> Best CV Score: {search.best_score_:.4%}")

    # 2. Predict on Test Set using the BEST model
    preds = best_model.predict(X_submit_scaled)

    # 3. Save CSV to OUTPUT_DIR
    # os.path.join keeps path construction consistent with the ensemble cell,
    # which locates these files the same way.
    filename = os.path.join(OUTPUT_DIR, f"submission_{name}.csv")
    submission = pd.DataFrame({
        # Ids come from the untouched `test` frame; `founder_id` was dropped
        # from the modelling features.
        'founder_id': test['founder_id'],
        # Translate numeric class codes back into their label strings.
        'retention_status': [inverse_map[p] for p in preds]
    })
    submission.to_csv(filename, index=False)

    elapsed = time.time() - start
    print(f"  -> Saved to '{filename}' (Time: {elapsed:.1f}s)")

print("\n" + "="*50)
print(f"   ALL FILES GENERATED IN {OUTPUT_DIR}/")
print("="*50)

# Ensembling the generated CSV predictions to improve accuracy

In [None]:
import pandas as pd
import numpy as np
import os
from scipy.stats import mode

# --- CONFIGURATION ---
OUTPUT_DIR = '../output'
files = [
    'submission_GradientBoosting.csv',
    'submission_RandomForest.csv',
    'submission_SVM.csv',
    'submission_NeuralNetwork.csv'
]
label_map = {'Stayed': 0, 'Left': 1}
inverse_map = {0: 'Stayed', 1: 'Left'}

print("Loading predictions...")
dfs = []                    # one integer vote array per successfully loaded model
submission_ensemble = None  # template frame: the first submission that actually loads
for f in files:
    path = os.path.join(OUTPUT_DIR, f)
    if not os.path.exists(path):
        print(f"Warning: {f} not found. Skipping.")
        continue

    df = pd.read_csv(path)
    # Convert to numbers for math (Stayed=0, Left=1)
    df['retention_numeric'] = df['retention_status'].map(label_map)
    if df['retention_numeric'].isna().any():
        # An unmapped label would otherwise surface later as an opaque KeyError.
        raise ValueError(f"{f} contains retention_status values other than 'Stayed'/'Left'.")

    if submission_ensemble is None:
        # Use the first file that loads as the template, so a missing first
        # entry in `files` no longer breaks the ensemble.
        submission_ensemble = df.drop(columns=['retention_numeric'])
    else:
        # Votes are combined position by position, so every file must describe
        # the same rows in the same order.
        if len(df) != len(submission_ensemble):
            raise ValueError(f"{f} has {len(df)} rows, expected {len(submission_ensemble)}.")
        if ('founder_id' in df.columns and 'founder_id' in submission_ensemble.columns
                and not np.array_equal(df['founder_id'].to_numpy(),
                                       submission_ensemble['founder_id'].to_numpy())):
            raise ValueError(f"{f} lists founder_id values in a different order than the other files.")

    dfs.append(df['retention_numeric'].to_numpy(dtype=int))
    print(f"Loaded: {f}")

if not dfs:
    # Raise instead of calling exit(): exit() would shut down the notebook
    # kernel, whereas an exception just stops this cell.
    raise FileNotFoundError(f"No files found! None of the expected submissions exist in {OUTPUT_DIR}.")

# --- HARD VOTING ---
# We stack the predictions (one row per model) and take the "Mode" (Majority Vote)
# of each column.
# NOTE(review): with an even number of voters a tie is possible; scipy's mode
# returns a single value in that case (the smallest, i.e. 0 = 'Stayed', in the
# versions checked - confirm for the installed version).
stacked_preds = np.array(dfs)

final_votes, _ = mode(stacked_preds, axis=0)

# Flatten the array (older scipy versions return the modes with an extra axis).
final_votes = final_votes.ravel()

submission_ensemble['retention_status'] = [inverse_map[p] for p in final_votes]

output_path = os.path.join(OUTPUT_DIR, 'submission_Ensemble_Voting.csv')
submission_ensemble.to_csv(output_path, index=False)

print(f"\n✅ Created Ensemble Submission: {output_path}")
print("This file combines the 'wisdom' of all your models.")