# PHASE 3: DATA PREPROCESSING & FEATURE ENGINEERING
## Shark Tank India - Creating 35+ Engineered Features

**Objectives:**
- Clean data for modeling
- Create 35+ engineered features
- Handle missing values strategically
- Prepare train/test splits

**Feature Categories:**
1. Financial Health Indicators (10 features)
2. Deal Structure Indicators (8 features)
3. Team Composition Features (7 features)
4. Innovation Indicators (4 features)
5. Shark Affinity Scores (7 features)
6. Industry Context Features (5 features)
7. Geographic Features (4 features)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
import os

# Notebook-wide setup: suppress warning output and apply seaborn's
# 'whitegrid' theme to plots drawn later in the notebook.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

# Echo the library versions so a run can be reproduced later.
print("‚úÖ Libraries imported successfully")
for _label, _module in (("Pandas", pd), ("NumPy", np)):
    print(f"{_label} version: {_module.__version__}")

## 1. DATA LOADING

In [None]:
# Read the raw CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/raw/Shark Tank India.csv')

n_rows, n_cols = df.shape
print(f"üìä Dataset loaded: {df.shape}")
print(f"   Rows: {n_rows:,}")
print(f"   Columns: {n_cols}")

# All cleaning and feature engineering happens on `df_fe`; `df` stays
# untouched so the summary cell can diff the two column sets.
df_fe = df.copy()

print("\n‚úÖ Data loaded and copied for feature engineering")

## 2. DATA CLEANING

### 2.1 Missing Value Treatment

In [None]:
print("="*80)
print("üßπ MISSING VALUE TREATMENT")
print("="*80)

# Per-shark columns: a missing value is filled with 0 (no investment).
shark_names = ['Namita', 'Vineeta', 'Anupam', 'Aman', 'Peyush', 'Ritesh', 'Amit', 'Guest']
shark_cols = []

for shark in shark_names:
    for suffix in ['Investment Amount', 'Investment Equity', 'Debt Amount']:
        col = f'{shark} {suffix}'
        if col in df_fe.columns:
            df_fe[col] = df_fe[col].fillna(0)
            shark_cols.append(col)

print(f"‚úÖ Filled {len(shark_cols)} shark-specific columns with 0")

# Financial metrics: impute with the median of the startup's industry.
# Only the gaps are filled (`fillna` with an aligned Series), so rows whose
# 'Industry' is itself missing keep their observed values. When an industry
# has no observed value at all, fall back to the overall median instead of
# leaving NaN behind.
financial_cols = ['Yearly Revenue', 'Monthly Sales', 'Gross Margin', 'Net Margin', 'EBITDA']
for col in financial_cols:
    if col in df_fe.columns:
        industry_median = df_fe.groupby('Industry')[col].transform('median')
        overall_median = df_fe[col].median()
        df_fe[col] = df_fe[col].fillna(industry_median).fillna(overall_median)
        print(f"‚úÖ Imputed {col} with industry median")

# SKUs: impute with the overall median.
if 'SKUs' in df_fe.columns:
    df_fe['SKUs'] = df_fe['SKUs'].fillna(df_fe['SKUs'].median())
    print(f"‚úÖ Imputed SKUs with median")

# Deal-level columns: a missing value is filled with 0.
deal_cols = ['Total Deal Amount', 'Total Deal Equity', 'Total Deal Debt', 'Debt Interest',
             'Royalty Percentage', 'Royalty Recouped Amount', 'Advisory Shares Equity']
present_deal_cols = [col for col in deal_cols if col in df_fe.columns]
for col in present_deal_cols:
    df_fe[col] = df_fe[col].fillna(0)

print(f"‚úÖ Filled {len(present_deal_cols)} deal columns with 0")

print("\n‚úÖ Missing value treatment completed")

### 2.2 Data Type Conversions

In [None]:
print("="*80)
print("üîÑ DATA TYPE CONVERSIONS")
print("="*80)

# Convert yes/no flags to 1/0.
# Values are compared after trimming whitespace and lower-casing, and the
# tokens 1/0 are accepted as well, so re-running this cell on an
# already-converted column keeps its values instead of zeroing them out.
# Anything unrecognised (including missing values) becomes 0.
binary_cols = ['Cash Burn', 'Has Patents', 'Bootstrapped']
binary_tokens = {
    'yes': 1, 'y': 1, 'true': 1, '1': 1, '1.0': 1,
    'no': 0, 'n': 0, 'false': 0, '0': 0, '0.0': 0,
}
for col in binary_cols:
    if col in df_fe.columns:
        normalized = df_fe[col].astype(str).str.strip().str.lower()
        df_fe[col] = normalized.map(binary_tokens).fillna(0)
        print(f"‚úÖ Converted {col} to binary (1/0)")

# Force the core numeric columns to a numeric dtype; values that cannot be
# parsed become NaN and are then replaced with 0.
numeric_cols = ['Yearly Revenue', 'Monthly Sales', 'Gross Margin', 'Net Margin',
                'Original Ask Amount', 'Original Offered Equity', 'Valuation Requested']
for col in numeric_cols:
    if col in df_fe.columns:
        df_fe[col] = pd.to_numeric(df_fe[col], errors='coerce').fillna(0)

print("\n‚úÖ Data type conversions completed")

### 2.3 Outlier Detection (Flag, Don't Remove)

In [None]:
print("="*80)
print("üîç OUTLIER DETECTION")
print("="*80)

# Valuation outliers: anything above the upper Tukey fence (Q3 + 1.5 * IQR).
if 'Valuation Requested' in df_fe.columns:
    requested = df_fe['Valuation Requested']
    lower_quartile = requested.quantile(0.25)
    upper_quartile = requested.quantile(0.75)
    upper_bound = upper_quartile + 1.5 * (upper_quartile - lower_quartile)
    valuation_flags = (requested > upper_bound).astype(int)
    df_fe['is_outlier_valuation'] = valuation_flags
    print(f"‚úÖ Created is_outlier_valuation: {valuation_flags.sum()} outliers flagged")

# High revenue: fixed threshold of 1000 (in the units of 'Yearly Revenue').
if 'Yearly Revenue' in df_fe.columns:
    revenue_flags = (df_fe['Yearly Revenue'] > 1000).astype(int)
    df_fe['is_high_revenue'] = revenue_flags
    print(f"‚úÖ Created is_high_revenue: {revenue_flags.sum()} high revenue startups")

print("\n‚úÖ Outlier detection completed (flagged, not removed)")

## 3. FEATURE ENGINEERING - 35+ NEW FEATURES

### 3.1 Financial Health Indicators (10 features)

In [None]:
print("="*80)
print("üí∞ FINANCIAL HEALTH INDICATORS (10 features)")
print("="*80)

# One entry per feature: (column name, builder, suffix for the log line).
# Builders are evaluated lazily and in order against the live frame, so a
# later builder can read a column created by an earlier one
# (runway_months reads burn_rate). Denominators carry "+ 1" to avoid
# division by zero.
financial_feature_specs = [
    # 1. Revenue per SKU
    ('revenue_per_sku',
     lambda d: d['Yearly Revenue'] / (d['SKUs'] + 1),
     ''),
    # 2. Annualised monthly sales relative to yearly revenue, capped at 10
    ('monthly_to_yearly_ratio',
     lambda d: ((d['Monthly Sales'] * 12) / (d['Yearly Revenue'] + 1)).clip(0, 10),
     ''),
    # 3. Gap between gross and net margin
    ('profit_margin_gap',
     lambda d: d['Gross Margin'] - d['Net Margin'],
     ''),
    # 4. Net margin scaled by revenue
    ('profitability_score',
     lambda d: (d['Net Margin'] * d['Yearly Revenue']) / 1000,
     ''),
    # 5. EBITDA relative to revenue
    ('ebitda_margin',
     lambda d: d['EBITDA'] / (d['Yearly Revenue'] + 1),
     ''),
    # 6. Monthly sales, kept only where the cash-burn flag is set
    ('burn_rate',
     lambda d: d['Cash Burn'] * d['Monthly Sales'],
     ''),
    # 7. Revenue over burn rate, capped at 100
    ('runway_months',
     lambda d: (d['Yearly Revenue'] / (d['burn_rate'] + 1)).clip(0, 100),
     ''),
    # 8. Flag for zero yearly revenue
    ('is_pre_revenue',
     lambda d: (d['Yearly Revenue'] == 0).astype(int),
     ''),
    # 9. Revenue bucket, ordinal 0-4
    ('revenue_category',
     lambda d: pd.cut(d['Yearly Revenue'],
                      bins=[-np.inf, 0, 100, 1000, 10000, np.inf],
                      labels=[0, 1, 2, 3, 4]).astype(int),
     ' (ordinal 0-4)'),
    # 10. Composite: 2 pts for any revenue, 2 pts for net margin > 10,
    #     1 pt for no cash burn
    ('financial_health_score',
     lambda d: ((d['Yearly Revenue'] > 0).astype(int) * 2 +
                (d['Net Margin'] > 10).astype(int) * 2 +
                (d['Cash Burn'] == 0).astype(int) * 1),
     ' (0-5 scale)'),
]

feature_count = 0
for column, build, note in financial_feature_specs:
    df_fe[column] = build(df_fe)
    feature_count += 1
    print(f"{feature_count}. {column} created{note}")

print(f"\n‚úÖ Created {feature_count} Financial Health Indicators")

### 3.2 Deal Structure Indicators (8 features)

In [None]:
print("="*80)
print("ü§ù DEAL STRUCTURE INDICATORS (8 features)")
print("="*80)


def _add_deal_feature(position, column, values, note=''):
    """Store *values* as df_fe[column], log the creation, return *position*."""
    df_fe[column] = values
    print(f"{position}. {column} created{note}")
    return position


# Short aliases for the pitch-side inputs; none of them is modified here.
valuation = df_fe['Valuation Requested']
ask_amount = df_fe['Original Ask Amount']

feature_count = 0

# 1. Requested valuation as a multiple of yearly revenue, capped at 1000x
feature_count = _add_deal_feature(
    feature_count + 1, 'revenue_multiple',
    (valuation / (df_fe['Yearly Revenue'] + 1)).clip(0, 1000))

# 2. Ask amount as a percentage of the requested valuation
feature_count = _add_deal_feature(
    feature_count + 1, 'ask_percentage',
    (ask_amount / (valuation + 1)) * 100)

# 3. Revenue multiple relative to the industry's median multiple, capped at 10
industry_median_multiple = df_fe.groupby('Industry')['revenue_multiple'].transform('median')
feature_count = _add_deal_feature(
    feature_count + 1, 'valuation_reasonableness',
    (df_fe['revenue_multiple'] / (industry_median_multiple + 1)).clip(0, 10))

# 4. Equity the founders offered going in
feature_count = _add_deal_feature(
    feature_count + 1, 'expected_equity_dilution',
    df_fe['Original Offered Equity'])

# 5. Ask-amount bucket
feature_count = _add_deal_feature(
    feature_count + 1, 'deal_size_category',
    pd.cut(ask_amount,
           bins=[-np.inf, 50, 100, 200, 500, np.inf],
           labels=[0, 1, 2, 3, 4]).astype(int),
    ' (ordinal 0-4)')

# 6. Requested valuation per unit of ask
feature_count = _add_deal_feature(
    feature_count + 1, 'valuation_to_ask_ratio',
    valuation / (ask_amount + 1))

# 7. Flag: valuation_reasonableness of at most 2
feature_count = _add_deal_feature(
    feature_count + 1, 'is_reasonable_valuation',
    (df_fe['valuation_reasonableness'] <= 2).astype(int))

# 8. Number of non-equity components present in the final deal.
# NOTE(review): this is built from 'Total Deal Debt', 'Royalty Percentage'
# and 'Advisory Shares Equity', which look like deal outcomes rather than
# pitch-time inputs -- confirm before using it to predict offers.
feature_count = _add_deal_feature(
    feature_count + 1, 'deal_complexity_score',
    ((df_fe['Total Deal Debt'] > 0).astype(int) +
     (df_fe['Royalty Percentage'] > 0).astype(int) +
     (df_fe['Advisory Shares Equity'] > 0).astype(int)),
    ' (0-3 scale)')

print(f"\n‚úÖ Created {feature_count} Deal Structure Indicators")

### 3.3 Team Composition Features (7 features)

In [None]:
print("="*80)
print("üë• TEAM COMPOSITION FEATURES (7 features)")
print("="*80)

feature_count = 0

# A missing per-gender count is treated as "no presenters of that gender".
# Without this, a NaN count propagates into male_ratio, female_ratio and
# gender_diversity_score.
male_presenters = df_fe['Male Presenters'].fillna(0)
female_presenters = df_fe['Female Presenters'].fillna(0)

# 1. Team size (a missing presenter count defaults to 1)
df_fe['team_size'] = df_fe['Number of Presenters'].fillna(1)
feature_count += 1
print(f"{feature_count}. team_size created")

# 2. Male ratio (the small epsilon guards against a zero team size)
df_fe['male_ratio'] = male_presenters / (df_fe['team_size'] + 0.001)
feature_count += 1
print(f"{feature_count}. male_ratio created")

# 3. Female ratio
df_fe['female_ratio'] = female_presenters / (df_fe['team_size'] + 0.001)
feature_count += 1
print(f"{feature_count}. female_ratio created")

# 4. Gender diversity score: close to 1 for an even split, close to 0 when
#    the team is all one gender
df_fe['gender_diversity_score'] = 1 - (df_fe['male_ratio'] - df_fe['female_ratio']).abs()
feature_count += 1
print(f"{feature_count}. gender_diversity_score created (0-1 scale)")

# 5. Is solo founder
df_fe['is_solo_founder'] = (df_fe['team_size'] == 1).astype(int)
feature_count += 1
print(f"{feature_count}. is_solo_founder created")

# 6. Is couple
df_fe['is_couple'] = df_fe['Couple Presenters'].fillna(0).astype(int)
feature_count += 1
print(f"{feature_count}. is_couple created")

# 7. Has female founder
df_fe['has_female_founder'] = (female_presenters > 0).astype(int)
feature_count += 1
print(f"{feature_count}. has_female_founder created")

print(f"\n‚úÖ Created {feature_count} Team Composition Features")

### 3.4 Innovation Indicators (4 features)

In [None]:
print("="*80)
print("üí° INNOVATION INDICATORS (4 features)")
print("="*80)

# Features 1-3 are cleaned copies of existing columns:
# (new column, source column, cast to int?). Missing values become 0.
innovation_copies = [
    ('has_patent', 'Has Patents', True),
    ('is_bootstrapped', 'Bootstrapped', True),
    ('sku_count', 'SKUs', False),
]

feature_count = 0
for new_col, source_col, as_integer in innovation_copies:
    cleaned = df_fe[source_col].fillna(0)
    df_fe[new_col] = cleaned.astype(int) if as_integer else cleaned
    feature_count += 1
    print(f"{feature_count}. {new_col} created")

# 4. Composite score: a patent counts double, bootstrapping counts once.
df_fe['innovation_score'] = df_fe['has_patent'] * 2 + df_fe['is_bootstrapped']
feature_count += 1
print(f"{feature_count}. innovation_score created (0-3 scale)")

print(f"\n‚úÖ Created {feature_count} Innovation Indicators")

### 3.5 Shark Affinity Scores (7 features)

In [None]:
print("="*80)
print("ü¶à SHARK AFFINITY SCORES (7 features)")
print("="*80)

feature_count = 0

# Sharks that get an affinity feature (this list has no 'Guest' entry).
shark_names = ['Namita', 'Aman', 'Anupam', 'Peyush', 'Vineeta', 'Ritesh', 'Amit']

for shark in shark_names:
    investment_col = f'{shark} Investment Amount'
    if investment_col not in df_fe.columns:
        continue

    # 1 where this shark put money into the pitch, else 0.
    made_investment = (df_fe[investment_col] > 0).astype(int)
    industry_investments = made_investment.groupby(df_fe['Industry']).sum()
    total_investments = made_investment.sum()

    # Share of the shark's deals that fall in each industry; all zeros when
    # the shark has no deals at all.
    if total_investments > 0:
        affinity = industry_investments / total_investments
    else:
        affinity = industry_investments * 0

    # Broadcast the per-industry share back onto every row of that industry.
    fit_col = f'{shark.lower()}_industry_fit'
    df_fe[fit_col] = df_fe['Industry'].map(affinity).fillna(0)
    feature_count += 1
    print(f"{feature_count}. {fit_col} created")

print(f"\n‚úÖ Created {feature_count} Shark Affinity Scores")

### 3.6 Industry Context Features (5 features)

In [None]:
print("="*80)
print("üè≠ INDUSTRY CONTEXT FEATURES (5 features)")
print("="*80)

# Features 1-3: one per-industry statistic each, mapped back onto every row.
# (new column, source column, aggregation)
# NOTE(review): the success rate and average equity are computed from
# outcome columns over the whole dataset, before the train/test split --
# confirm that is acceptable for the downstream models.
industry_aggregates = [
    ('industry_avg_success_rate', 'Received Offer', 'mean'),
    ('industry_avg_equity', 'Total Deal Equity', 'mean'),
    ('industry_median_valuation', 'Valuation Requested', 'median'),
]

feature_count = 0
for feature_name, source_col, statistic in industry_aggregates:
    per_industry = df_fe.groupby('Industry')[source_col].agg(statistic)
    df_fe[feature_name] = df_fe['Industry'].map(per_industry)
    feature_count += 1
    print(f"{feature_count}. {feature_name} created")

# 4. Number of pitches (non-null startup names) in the row's industry
df_fe['industry_pitch_count'] = df_fe.groupby('Industry')['Startup Name'].transform('count')
feature_count += 1
print(f"{feature_count}. industry_pitch_count created")

# 5. The industry's share of all pitches
total_pitches = len(df_fe)
df_fe['industry_competition_index'] = df_fe['industry_pitch_count'] / total_pitches
feature_count += 1
print(f"{feature_count}. industry_competition_index created")

print(f"\n‚úÖ Created {feature_count} Industry Context Features")

### 3.7 Geographic Features (4 features)

In [None]:
print("="*80)
print("üó∫Ô∏è  GEOGRAPHIC FEATURES (4 features)")
print("="*80)

# States counted as "metro" by the is_metro flag.
metro_states = ['Maharashtra', 'Delhi', 'Karnataka']


def _build_geographic_features():
    """Create each geographic column on df_fe, yielding its name once stored."""
    # 1. Mean of 'Received Offer' per state
    state_success = df_fe.groupby('Pitchers State')['Received Offer'].mean()
    df_fe['state_success_rate'] = df_fe['Pitchers State'].map(state_success)
    yield 'state_success_rate'

    # 2. Flag for pitches whose state is in metro_states
    df_fe['is_metro'] = df_fe['Pitchers State'].isin(metro_states).astype(int)
    yield 'is_metro'

    # 3. Number of pitches (non-null startup names) from the same state
    df_fe['state_pitch_density'] = df_fe.groupby('Pitchers State')['Startup Name'].transform('count')
    yield 'state_pitch_density'

    # 4. The state's share of all pitches (normalized pitch density)
    df_fe['geographic_diversity_score'] = df_fe['state_pitch_density'] / len(df_fe)
    yield 'geographic_diversity_score'


# The generator is lazy, so each feature is created and logged in turn.
feature_count = 0
for feature_count, created in enumerate(_build_geographic_features(), 1):
    print(f"{feature_count}. {created} created")

print(f"\n‚úÖ Created {feature_count} Geographic Features")

## 4. FEATURE SUMMARY

In [None]:
print("="*80)
print("üìä FEATURE ENGINEERING SUMMARY")
print("="*80)

# Engineered features = columns present in df_fe but not in the raw frame.
engineered_features = sorted(set(df_fe.columns) - set(df.columns))

# Planned size of each feature family (static, not derived from the data).
category_sizes = [
    ("Financial Health Indicators", 10),
    ("Deal Structure Indicators", 8),
    ("Team Composition Features", 7),
    ("Innovation Indicators", 4),
    ("Shark Affinity Scores", 7),
    ("Industry Context Features", 5),
    ("Geographic Features", 4),
    ("Outlier Flags", 2),
]

print(f"\n‚úÖ Total Engineered Features: {len(engineered_features)}")
print(f"\nFeature Categories:")
for position, (label, size) in enumerate(category_sizes, 1):
    print(f"   {position}. {label}: {size} features")
print(f"   " + "-"*50)
print(f"   TOTAL: {len(engineered_features)} features")

print(f"\nüìã All Engineered Features:")
for i, feat in enumerate(engineered_features, 1):
    print(f"   {i:2d}. {feat}")

print(f"\nüìä Dataset Shape After Feature Engineering:")
print(f"   Rows: {len(df_fe):,}")
print(f"   Columns: {len(df_fe.columns)} (original: {len(df.columns)}, new: {len(engineered_features)})")

## 5. TARGET VARIABLE ENGINEERING

In [None]:
print("="*80)
print("üéØ TARGET VARIABLE ENGINEERING")
print("="*80)

# Binary classification targets.
# A missing flag is treated as 0 before the integer cast, because
# `.astype(int)` raises on NaN. Presumably 'Accepted Offer' is blank for
# pitches that never received an offer -- confirm against the raw data.
df_fe['got_offer'] = df_fe['Received Offer'].fillna(0).astype(int)
df_fe['accepted_offer'] = df_fe['Accepted Offer'].fillna(0).astype(int)

print(f"‚úÖ Binary targets created:")
print(f"   - got_offer: {df_fe['got_offer'].sum()}/{len(df_fe)} ({df_fe['got_offer'].mean()*100:.1f}%)")
print(f"   - accepted_offer: {df_fe['accepted_offer'].sum()}/{len(df_fe)} ({df_fe['accepted_offer'].mean()*100:.1f}%)")

# Multi-label targets: one "did this shark invest" flag per shark whose
# investment column exists.
shark_targets = []
shark_names = ['Namita', 'Aman', 'Anupam', 'Peyush', 'Vineeta', 'Ritesh', 'Amit']

print(f"\n‚úÖ Multi-label targets (individual sharks):")
for shark in shark_names:
    investment_col = f'{shark} Investment Amount'
    if investment_col in df_fe.columns:
        target_col = f'{shark.lower()}_invested'
        df_fe[target_col] = (df_fe[investment_col] > 0).astype(int)
        shark_targets.append(target_col)
        print(f"   - {target_col}: {df_fe[target_col].sum()} deals ({df_fe[target_col].mean()*100:.1f}%)")

# Regression target: final deal equity minus the equity originally offered.
# NOTE(review): when 'Total Deal Equity' is 0 (e.g. no deal was struck) this
# equals minus the offered equity; consider restricting the regression to
# rows with an accepted deal.
df_fe['equity_dilution'] = df_fe['Total Deal Equity'] - df_fe['Original Offered Equity']
print(f"\n‚úÖ Regression target created:")
print(f"   - equity_dilution: Mean={df_fe['equity_dilution'].mean():.2f}%, Std={df_fe['equity_dilution'].std():.2f}%")

print(f"\n‚úÖ Total targets created: {2 + len(shark_targets) + 1} (2 binary + {len(shark_targets)} multi-label + 1 regression)")

## 6. FEATURE SELECTION & CORRELATION ANALYSIS

In [None]:
print("="*80)
print("üîç FEATURE SELECTION & CORRELATION ANALYSIS")
print("="*80)

# Only numeric columns are candidates for modeling.
numeric_features = df_fe.select_dtypes(include=[np.number]).columns.tolist()

# Identifiers and the targets themselves are never features.
exclude_cols = ['Season Number', 'Episode Number', 'Pitch Number', 'got_offer', 'accepted_offer',
                'equity_dilution', 'Received Offer', 'Accepted Offer'] + shark_targets

# Per-shark investment columns are excluded as well (affinity scores are
# used instead).
shark_investment_cols = [col for col in numeric_features
                         if 'Investment Amount' in col or 'Investment Equity' in col or 'Debt Amount' in col]
exclude_cols.extend(shark_investment_cols)

# Row-level deal outcomes are only known after the pitch, so using them as
# features leaks the targets: 'Total Deal Equity' is one of the two terms of
# `equity_dilution`, and `deal_complexity_score` is derived from three of
# these columns.
# NOTE(review): the per-industry / per-state aggregates of outcome columns
# (e.g. industry_avg_success_rate, state_success_rate, *_industry_fit) are
# left in; consider fitting them on the training split only.
deal_outcome_cols = ['Total Deal Amount', 'Total Deal Equity', 'Total Deal Debt', 'Debt Interest',
                     'Royalty Percentage', 'Royalty Recouped Amount', 'Advisory Shares Equity',
                     'deal_complexity_score']
exclude_cols.extend(col for col in deal_outcome_cols
                    if col in numeric_features and col not in exclude_cols)

feature_cols = [col for col in numeric_features if col not in exclude_cols]

print(f"\nüìä Feature Selection:")
print(f"   Total numeric columns: {len(numeric_features)}")
print(f"   Excluded columns: {len(exclude_cols)}")
print(f"   Selected features for modeling: {len(feature_cols)}")

# Correlation of every selected feature with the binary target. The target's
# own row is dropped by label rather than by assuming it sorts first.
print(f"\nüéØ Top 20 Features Correlated with 'got_offer':")
correlations = (df_fe[feature_cols + ['got_offer']]
                .corr()['got_offer']
                .drop('got_offer')
                .sort_values(ascending=False))
top_20_corr = correlations.head(20)

for i, (feat, corr) in enumerate(top_20_corr.items(), 1):
    print(f"   {i:2d}. {feat:40s} : {corr:6.3f}")

# Save a preliminary importance table, ranked by absolute correlation.
feature_importance_df = pd.DataFrame({
    'Feature': correlations.index,
    'Correlation_with_got_offer': correlations.values
}).sort_values('Correlation_with_got_offer', ascending=False, key=abs)

os.makedirs('../data/processed', exist_ok=True)
feature_importance_df.to_csv('../data/processed/feature_importance_preliminary.csv', index=False)
print(f"\n‚úÖ Saved: feature_importance_preliminary.csv")

# Multicollinearity check: look only at the strict upper triangle so each
# feature pair is reported once.
print(f"\nüîç Checking for Multicollinearity (correlation > 0.95):")
corr_matrix = df_fe[feature_cols].corr().abs()
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_pairs = [(column, row, corr_matrix.loc[row, column])
                   for column in upper_triangle.columns
                   for row in upper_triangle.index
                   if upper_triangle.loc[row, column] > 0.95]

if high_corr_pairs:
    print(f"   Found {len(high_corr_pairs)} highly correlated pairs:")
    for feat1, feat2, corr in high_corr_pairs[:10]:  # Show first 10
        print(f"   - {feat1} <-> {feat2}: {corr:.3f}")
else:
    print(f"   No highly correlated pairs found (good!)")

## 7. TRAIN/TEST SPLIT

In [None]:
print("="*80)
print("‚úÇÔ∏è  TRAIN/TEST SPLIT")
print("="*80)

# Feature matrix (remaining NaN replaced with 0) and the three target sets.
X = df_fe[feature_cols].fillna(0)
y_binary = df_fe['got_offer'].copy()
y_multilabel = df_fe[shark_targets].copy()
y_regression = df_fe['equity_dilution'].copy()

print(f"\nüìä Data Shapes:")
for label, data in (("X (features)", X), ("y_binary", y_binary),
                    ("y_multilabel", y_multilabel), ("y_regression", y_regression)):
    print(f"   {label}: {data.shape}")

# 80/20 split, stratified on industry so both sides keep the industry mix.
# NOTE(review): stratifying requires at least two rows per industry and no
# missing 'Industry' values -- verify against the data.
X_train, X_test, y_train_binary, y_test_binary = train_test_split(
    X, y_binary,
    test_size=0.2,
    random_state=42,
    stratify=df_fe['Industry'],
)

# Reuse the row labels chosen above to slice the remaining targets.
train_idx, test_idx = X_train.index, X_test.index
y_train_multilabel, y_test_multilabel = y_multilabel.loc[train_idx], y_multilabel.loc[test_idx]
y_train_regression, y_test_regression = y_regression.loc[train_idx], y_regression.loc[test_idx]

print(f"\n‚úÖ Train/Test Split Complete:")
for label, part in (("Training set", X_train), ("Test set", X_test)):
    print(f"   {label}: {len(part)} samples ({len(part)/len(X)*100:.1f}%)")
print(f"\n   Binary target distribution:")
for label, part in (("Train", y_train_binary), ("Test", y_test_binary)):
    print(f"   - {label}: {part.sum()}/{len(part)} offers ({part.mean()*100:.1f}%)")

## 8. SAVE PROCESSED DATA

In [None]:
print("="*80)
print("üíæ SAVING PROCESSED DATA")
print("="*80)

# Create the output folder here as well, so this cell does not depend on an
# earlier cell having created it.
os.makedirs('../data/processed', exist_ok=True)

# Save train/test splits (row index omitted; the files stay row-aligned).
X_train.to_csv('../data/processed/X_train.csv', index=False)
X_test.to_csv('../data/processed/X_test.csv', index=False)

y_train_binary.to_csv('../data/processed/y_train_binary.csv', index=False, header=['got_offer'])
y_test_binary.to_csv('../data/processed/y_test_binary.csv', index=False, header=['got_offer'])

y_train_multilabel.to_csv('../data/processed/y_train_multilabel.csv', index=False)
y_test_multilabel.to_csv('../data/processed/y_test_multilabel.csv', index=False)

y_train_regression.to_csv('../data/processed/y_train_regression.csv', index=False, header=['equity_dilution'])
y_test_regression.to_csv('../data/processed/y_test_regression.csv', index=False, header=['equity_dilution'])

print(f"‚úÖ Saved train/test splits:")
print(f"   - X_train.csv ({X_train.shape})")
print(f"   - X_test.csv ({X_test.shape})")
print(f"   - y_train_binary.csv")
print(f"   - y_test_binary.csv")
print(f"   - y_train_multilabel.csv ({y_train_multilabel.shape})")
print(f"   - y_test_multilabel.csv ({y_test_multilabel.shape})")
print(f"   - y_train_regression.csv")
print(f"   - y_test_regression.csv")

# Save the full engineered frame (raw columns, features and targets).
df_fe.to_csv('../data/processed/processed_data_full.csv', index=False)
print(f"\n‚úÖ Saved full processed dataset: processed_data_full.csv ({df_fe.shape})")

# Save the list of model features together with their dtypes.
feature_list_df = pd.DataFrame({
    'Feature': feature_cols,
    'Data_Type': [df_fe[col].dtype for col in feature_cols]
})
feature_list_df.to_csv('../data/processed/feature_list.csv', index=False)
print(f"‚úÖ Saved feature list: feature_list.csv ({len(feature_cols)} features)")

print(f"\n" + "="*80)
print(f"üéâ PHASE 3 COMPLETE: DATA PREPROCESSING & FEATURE ENGINEERING")
print(f"="*80)
print(f"\nüìä Summary:")
print(f"   ‚úÖ {len(engineered_features)} new features created")
print(f"   ‚úÖ {len(feature_cols)} features selected for modeling")
print(f"   ‚úÖ {len(X_train)} training samples, {len(X_test)} test samples")
print(f"   ‚úÖ 3 target types: binary, multi-label (7 sharks), regression")
print(f"   ‚úÖ All data saved to data/processed/")
print(f"\nüöÄ Ready for Phase 4: ML Model Training!")
print(f"="*80)