In [3]:

# Load the data (assuming you are in the same session)
# df = pd.read_csv('data/train.csv') # Uncomment if you need to reload
import pandas as pd
import numpy as np

# Load Data
df = pd.read_csv('../data/train.csv')

def clean_data(df):
    # 1. Drop Duplicates
    df = df.drop_duplicates()
    
    # 2. Anomaly Removal
    # Keep only founders 18 or older
    df = df[df['founder_age'] >= 18]
    
    # Keep only valid tenures (started at age 16+)
    df = df[(df['founder_age'] - df['years_with_startup']) >= 16]
    
    # 3. Missing Value Imputation
    # Numerical: Median (robust to outliers)
    num_cols = ['monthly_revenue_generated', 'years_since_founding', 'num_dependents']
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())
        
    # Categorical: Treat "Missing" as "Unknown" for psychological surveys
    # This captures the "silence" signal
    df['work_life_balance_rating'] = df['work_life_balance_rating'].fillna('Unknown')
    df['venture_satisfaction'] = df['venture_satisfaction'].fillna('Unknown')
    
    # For other structural columns, use Mode
    df['team_size_category'] = df['team_size_category'].fillna(df['team_size_category'].mode()[0])
    
    # 4. Feature Engineering (Simplification)
    # Create a "Start Age" feature to capture the age they started
    df['start_age'] = df['founder_age'] - df['years_with_startup']
    
    # 5. Encoding Target
    # Stayed -> 0, Left -> 1 (Standard for Churn/Exit prediction)
    target_map = {'Stayed': 0, 'Left': 1}
    df['target'] = df['retention_status'].map(target_map)
    
    # Drop unused columns
    cols_to_drop = ['founder_id', 'retention_status']
    df = df.drop(columns=cols_to_drop)
    
    return df

# Apply the cleaning
df_clean = clean_data(df)

print("Data Cleaned Successfully!")
print(f"New Shape: {df_clean.shape}")
print(df_clean.head())

Data Cleaned Successfully!
New Shape: (40543, 24)
   founder_age founder_gender  years_with_startup founder_role  \
1           59         Female                   4        Media   
3           36         Female                   7    Education   
5           38         Female                   3   Technology   
6           47           Male                  23    Education   
7           48           Male                  16      Finance   

   monthly_revenue_generated work_life_balance_rating venture_satisfaction  \
1                     5534.0                     Poor                 High   
3                     3989.0                     Good                 High   
5                     9977.0                     Fair                 High   
6                     3681.0                  Unknown              Unknown   
7                    11223.0                Excellent            Very High   

  startup_performance_rating  funding_rounds_led working_overtime  ...  \
1         

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def encode_and_split(df):
    # --- 1. ORDINAL ENCODING (Preserving Order) ---
    # We manually map these so the model knows 'Excellent' is better than 'Poor'
    
    rating_map = {'Unknown': 0, 'Low': 1, 'Poor': 1, 'Below Average': 2, 
                  'Fair': 3, 'Medium': 3, 'Average': 3, 
                  'Good': 4, 'High': 4, 'Very High': 5, 'Excellent': 5}
    
    # Apply to all rating-like columns
    ord_cols = ['work_life_balance_rating', 'venture_satisfaction', 
                'startup_performance_rating', 'startup_reputation', 'founder_visibility']
    
    for col in ord_cols:
        # Map and fill any unexpected values with 0 (Unknown)
        df[col] = df[col].map(rating_map).fillna(0)

    # Map Binary Columns (Yes/No)
    binary_map = {'No': 0, 'Yes': 1}
    bin_cols = ['working_overtime', 'remote_operations', 'innovation_support', 'leadership_scope']
    for col in bin_cols:
        df[col] = df[col].map(binary_map)

    # Map Startup Stage (Roughly ordered)
    stage_map = {'Entry': 1, 'Mid': 2, 'Senior': 3, 'Growth': 3, 'Established': 4}
    df['startup_stage'] = df['startup_stage'].map(stage_map).fillna(1)

    # --- 2. ONE-HOT ENCODING (Nominal Data) ---
    # For Gender, Role, Education, etc.
    # drop_first=True helps avoid multicollinearity (redundancy)
    df = pd.get_dummies(df, columns=['founder_gender', 'founder_role', 
                                     'education_background', 'personal_status', 
                                     'team_size_category'], drop_first=True)

    # --- 3. SPLITTING ---
    X = df.drop('target', axis=1)
    y = df['target']
    
    # 80% Train, 20% Test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # --- 4. SCALING ---
    # Scale numerical features so age (e.g., 40) doesn't get overpowered by revenue (e.g., 5000)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Convert back to dataframe for readability (optional but helpful)
    X_train_final = pd.DataFrame(X_train_scaled, columns=X.columns)
    X_test_final = pd.DataFrame(X_test_scaled, columns=X.columns)
    
    return X_train_final, X_test_final, y_train, y_test

# Execute
X_train, X_test, y_train, y_test = encode_and_split(df_clean)

print("Data Transformation Complete!")
print(f"Training Features Shape: {X_train.shape}")
print(f"Testing Features Shape: {X_test.shape}")
print("\nFirst 5 rows of processed data (All Numbers Now!):")
print(X_train.head())

Data Transformation Complete!
Training Features Shape: (32434, 31)
Testing Features Shape: (8109, 31)

First 5 rows of processed data (All Numbers Now!):
   founder_age  years_with_startup  monthly_revenue_generated  \
0     1.136680            1.229897                  -1.294496   
1    -0.727069           -1.239229                   1.443614   
2     0.870430           -1.136349                  -0.693043   
3     1.314180            0.921256                  -0.074537   
4     1.580430            0.303975                  -2.125930   

   work_life_balance_rating  venture_satisfaction  startup_performance_rating  \
0                 -1.683026             -2.061083                    0.067999   
1                  1.253636              1.091734                    0.067999   
2                 -1.095694             -0.169393                    1.421057   
3                 -1.095694             -1.430520                   -2.638117   
4                 -1.683026             -2.061083 

In [5]:
# --- FINAL SUBMISSION PIPELINE ---

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# 1. Import preprocessing functions from earlier cells


# 2. Load train & test
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')
sample_sub = pd.read_csv('../data/sample_submission.csv')

# 3. Apply EXACT same cleaning to train
train_clean = clean_data(train)

# 4. Encode & scale using the SAME pipeline
X_train, X_test_dummy, y_train, y_test_dummy = encode_and_split(train_clean)

# 5. Prepare TEST data (must go through SAME cleaning)
test['retention_status'] = 'Stayed'   # dummy—needed for clean_data
test_clean = clean_data(test)

# Drop target column we artificially added
test_clean = test_clean.drop(columns=['target'])

# 6. Apply ONE-HOT + ORDINAL encoding matching train
# We need to align columns manually:
test_encoded = pd.get_dummies(test_clean)
train_encoded = pd.get_dummies(train_clean.drop(columns=['target']))

# Align column sets
test_encoded = test_encoded.reindex(columns=train_encoded.columns, fill_value=0)

# 7. Scale test data using *TRAIN SCALER*
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_encoded)
X_test_submit_scaled = scaler.transform(test_encoded)

# 8. Train final model on full training data
final_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)

final_model.fit(X_scaled, y_train)

# 9. Predict on processed test
predictions = final_model.predict(X_test_submit_scaled)

# 10. Map back to labels
inverse_map = {0: 'Stayed', 1: 'Left'}

submission_df = pd.DataFrame({
    'founder_id': test['founder_id'],
    'retention_status': [inverse_map[p] for p in predictions]
})

submission_df.to_csv('submission.csv', index=False)

print("Success! submission.csv created:")
print(submission_df.head())

ValueError: Found input variables with inconsistent numbers of samples: [40543, 32434]