# Notebook 5: Feature Engineering & Pump Classification
## Social Media-Driven Stock Manipulation and Tail Risk Research

---

**Research Project:** Social Media-Driven Stock Manipulation and Tail Risk

**Purpose:** Engineer features for pump detection, train a classification model, and generate Pump Likelihood Scores (PLS) for each episode.

**Inputs:**
- Episodes with window metrics (Notebook 4)
- Ground truth labels (Notebook 1)

**Output:**
- Feature-engineered episode dataset
- Trained classification model
- Episodes with PLS scores

---

**Last Updated:** 2025

## 1. Environment Setup

In [None]:
# =============================================================================
# INSTALL REQUIRED PACKAGES
# =============================================================================

# Every dependency is pinned to an exact version.
# NOTE(review): shap is installed here but is not imported in any visible cell.
!pip install pandas==2.0.3
!pip install numpy==1.24.3
!pip install scikit-learn==1.3.2
!pip install scipy==1.11.4
!pip install tqdm==4.66.1
!pip install pyarrow==14.0.1
!pip install matplotlib==3.8.2
!pip install seaborn==0.13.0
!pip install shap==0.44.0

# NOTE(review): printed unconditionally; pip's exit status is not checked.
print("All packages installed successfully.")

In [None]:
# =============================================================================
# IMPORT LIBRARIES
# =============================================================================

import os
import json
import warnings
from datetime import datetime
from typing import List, Dict, Optional, Tuple

import pandas as pd
import numpy as np
from scipy import stats
from tqdm.notebook import tqdm

# Machine Learning
from sklearn.model_selection import (
    train_test_split, cross_val_score, cross_val_predict,
    StratifiedKFold, TimeSeriesSplit
)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    precision_recall_curve, roc_curve, f1_score, accuracy_score
)
from sklearn.inspection import permutation_importance

import matplotlib.pyplot as plt
import seaborn as sns

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Print all DataFrame columns and up to 100 characters per cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

# Shared plot styling for all figures produced below.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print(f"Environment setup complete. Timestamp: {datetime.now()}")

## 2. Configuration and Load Data

In [None]:
# =============================================================================
# CONFIGURATION
# =============================================================================

class ResearchConfig:
    """Central settings for the classification notebook.

    Holds the Random Forest hyper-parameters, the cross-validation
    settings, and the directory layout for processed data and results.
    All values are plain class attributes.
    """

    # --- Directory layout ---
    BASE_PATH = "/content/drive/MyDrive/Research/PumpDump/"
    PROCESSED_DATA_PATH = f"{BASE_PATH}data/processed/"
    RESULTS_PATH = f"{BASE_PATH}results/"

    # --- Cross-validation ---
    N_FOLDS = 5
    TEST_SIZE = 0.2

    # --- Model hyper-parameters ---
    N_ESTIMATORS = 100
    MAX_DEPTH = 5  # Shallow trees to limit overfitting with small samples
    RANDOM_STATE = 42

# Single settings object shared by the cells that follow.
config = ResearchConfig()

# Handle Colab vs local
try:
    # If google.colab is importable, mount Drive at the location BASE_PATH points into.
    from google.colab import drive
    drive.mount('/content/drive')
    IN_COLAB = True
except ImportError:
    # google.colab is unavailable: redirect all paths to a local folder.
    # These assignments create instance attributes that shadow the class defaults.
    IN_COLAB = False
    config.BASE_PATH = "./research_data/"
    config.PROCESSED_DATA_PATH = config.BASE_PATH + "data/processed/"
    config.RESULTS_PATH = config.BASE_PATH + "results/"

# Make sure the results directory exists before later cells read from or write to it.
os.makedirs(config.RESULTS_PATH, exist_ok=True)

In [None]:
# =============================================================================
# LOAD DATA
# =============================================================================

def load_episodes(results_path: str) -> pd.DataFrame:
    """Return the episode table stored as ``episodes.parquet`` in *results_path*.

    When the file does not exist, a message is printed and the synthetic
    table from ``create_sample_episodes`` is returned instead.
    """
    parquet_file = os.path.join(results_path, 'episodes.parquet')

    # Fallback: no saved episodes, so hand back the demonstration data.
    if not os.path.exists(parquet_file):
        print("Episodes file not found - creating sample")
        return create_sample_episodes()

    loaded = pd.read_parquet(parquet_file)
    print(f"Loaded episodes: {len(loaded)} rows")
    return loaded


def create_sample_episodes() -> pd.DataFrame:
    """Build a deterministic synthetic episode table for demonstration.

    The table has 200 weekly episodes starting from 2020-01-01, of which
    30 carry label 1 and 170 carry label 0. Several columns are drawn from
    different uniform ranges depending on the label. NumPy's global random
    state is seeded with 42, so repeated calls return identical data.
    """
    np.random.seed(42)
    n_episodes = 200
    ticker_pool = ['GME', 'AMC', 'BB', 'NOK', 'CLOV', 'WISH', 'MULN', 'FFIE']

    # Mix of confirmed and control, in shuffled order
    labels = np.concatenate([np.ones(30), np.zeros(170)])
    np.random.shuffle(labels)
    is_pump = labels == 1

    def draw_by_label(pump_range, control_range):
        # The pump draw is taken before the control draw; both are full-length
        # so the random stream is consumed in a fixed order.
        pump_values = np.random.uniform(pump_range[0], pump_range[1], n_episodes)
        control_values = np.random.uniform(control_range[0], control_range[1], n_episodes)
        return np.where(is_pump, pump_values, control_values)

    # Columns are filled in the exact order the random draws must happen.
    columns = {}
    columns['episode_id'] = range(1, n_episodes + 1)
    columns['ticker'] = np.random.choice(ticker_pool, n_episodes)
    columns['event_date'] = pd.date_range('2020-01-01', periods=n_episodes, freq='W')
    columns['label'] = labels.astype(int)
    columns['event_return'] = np.random.uniform(0.1, 0.5, n_episodes)
    columns['event_volume_ratio'] = np.random.uniform(2, 20, n_episodes)
    columns['return_5d'] = draw_by_label((-0.4, -0.1), (-0.2, 0.1))
    columns['return_20d'] = draw_by_label((-0.6, -0.2), (-0.3, 0.2))
    columns['max_drawdown_20d'] = draw_by_label((0.4, 0.7), (0.1, 0.4))
    # Message counts are doubled for label-1 rows
    columns['msg_count'] = np.random.poisson(50, n_episodes) * (1 + labels)
    columns['msg_zscore'] = np.random.uniform(3, 10, n_episodes)
    columns['promo_share'] = draw_by_label((0.3, 0.7), (0.05, 0.3))
    columns['user_concentration'] = draw_by_label((0.5, 0.9), (0.2, 0.5))
    columns['pre_avg_return'] = np.random.normal(0, 0.02, n_episodes)
    columns['pre_avg_volume'] = np.random.lognormal(14, 1, n_episodes)

    return pd.DataFrame(columns)


# Load episodes from the results directory and show the class balance of the labels
episodes_df = load_episodes(config.RESULTS_PATH)
print(f"\nLabel distribution:")
print(episodes_df['label'].value_counts())

## 3. Feature Engineering

In [None]:
# =============================================================================
# FEATURE ENGINEER
# =============================================================================

class FeatureEngineer:
    """Engineers features for pump classification.

    Feature Categories:
    A. Market Features (from price-volume data)
    B. Social Features (from message board data)
    C. Derived Features (combinations and ratios)

    Each ``add_*`` step works on a copy of its input and creates a derived
    column only when every column it reads is present, so partially
    populated episode tables pass through without raising.
    """

    # Raw input columns grouped by origin. These lists are reference
    # definitions; the columns that feed the model are selected in
    # ``get_feature_matrix``.
    MARKET_FEATURES = [
        'event_return',
        'event_volume_ratio',
        'return_5d',
        'return_20d',
        'max_drawdown_5d',
        'max_drawdown_20d',
        'pre_avg_return',
    ]

    SOCIAL_FEATURES = [
        'msg_zscore',
        'promo_share',
        'user_concentration',
        'msg_count',
    ]

    def __init__(self):
        # Column names returned by the most recent get_feature_matrix call
        self.feature_names = []

    def add_market_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with market-based derived columns added.

        Adds ``reversal_5d``, ``reversal_20d``, ``reversal_ratio_20d``,
        ``log_volume_ratio`` and ``pre_momentum`` where their source
        columns exist.
        """
        df = df.copy()

        # Reversal magnitude: sign-flipped post-event return, positive if the
        # price dropped. Only the return column itself is required.
        if 'return_5d' in df.columns:
            df['reversal_5d'] = -df['return_5d']
        if 'return_20d' in df.columns:
            df['reversal_20d'] = -df['return_20d']

        # Reversal relative to event return; the 0.01 offset keeps the
        # denominator away from zero for event returns at or near zero.
        if 'event_return' in df.columns and 'return_20d' in df.columns:
            df['reversal_ratio_20d'] = -df['return_20d'] / (df['event_return'] + 0.01)

        # Volume spike severity on a log scale
        if 'event_volume_ratio' in df.columns:
            df['log_volume_ratio'] = np.log1p(df['event_volume_ratio'])

        # Price momentum before event
        if 'pre_avg_return' in df.columns:
            df['pre_momentum'] = df['pre_avg_return'] * 20  # Approximate 20-day momentum

        return df

    def add_social_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with social-media derived columns added.

        Adds ``log_msg_count``, ``promo_intensity`` and
        ``coordination_score`` where their source columns exist.
        """
        df = df.copy()

        # Log message count
        if 'msg_count' in df.columns:
            df['log_msg_count'] = np.log1p(df['msg_count'])

        # Promo intensity (promo share * z-score)
        if 'promo_share' in df.columns and 'msg_zscore' in df.columns:
            df['promo_intensity'] = df['promo_share'] * df['msg_zscore']

        # Coordination score (high concentration + high promo)
        if 'user_concentration' in df.columns and 'promo_share' in df.columns:
            df['coordination_score'] = df['user_concentration'] * df['promo_share']

        return df

    def add_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with market x social interaction terms added.

        ``promo_reversal_interaction`` reads ``reversal_20d``, so
        ``add_market_features`` has to run first for it to be created.
        """
        df = df.copy()

        # Social * Volume interaction
        if 'msg_zscore' in df.columns and 'event_volume_ratio' in df.columns:
            df['social_volume_interaction'] = df['msg_zscore'] * np.log1p(df['event_volume_ratio'])

        # Promo * Reversal interaction
        if 'promo_share' in df.columns and 'reversal_20d' in df.columns:
            df['promo_reversal_interaction'] = df['promo_share'] * df['reversal_20d']

        return df

    def get_feature_matrix(self, df: pd.DataFrame) -> Tuple[np.ndarray, List[str]]:
        """Extract the model feature matrix from *df*.

        Returns:
            A ``(X, names)`` pair. ``X`` is a float64 array with one column
            per entry of ``names``; missing values are NaN. ``names`` is the
            subset of the model feature list present in *df*, in list order.
            It is also stored on ``self.feature_names``.
        """
        # Columns used by the model.
        # NOTE: reversal_5d, pre_momentum and promo_reversal_interaction are
        # engineered by the add_* steps but are not part of this list.
        all_features = [
            # Original market features
            'event_return', 'event_volume_ratio', 'return_5d', 'return_20d',
            'max_drawdown_20d',

            # Original social features
            'msg_zscore', 'promo_share', 'user_concentration',

            # Engineered features
            'reversal_20d', 'reversal_ratio_20d', 'log_volume_ratio',
            'log_msg_count', 'promo_intensity', 'coordination_score',
            'social_volume_interaction'
        ]

        # Filter to available features
        available = [f for f in all_features if f in df.columns]

        self.feature_names = available

        # Force a float array: integer, nullable or object columns would
        # otherwise give an int/object array that NaN handling cannot process.
        X = df[available].to_numpy(dtype=float, na_value=np.nan)

        return X, available

    def engineer_all(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run the full pipeline (market, social, interaction) and report the features.

        Returns the enriched copy of *df*; the input frame is not modified.
        """
        print("Engineering features...")

        df = self.add_market_features(df)
        df = self.add_social_features(df)
        df = self.add_interaction_features(df)

        # Called for the feature names (and to refresh self.feature_names)
        _, features = self.get_feature_matrix(df)

        print(f"Total features: {len(features)}")
        print(f"Feature names: {features}")

        return df


# Initialize engineer
feature_engineer = FeatureEngineer()

# Engineer features; episodes_df is rebound to the enriched copy
episodes_df = feature_engineer.engineer_all(episodes_df)

print("\nFeature-engineered data sample:")
print(episodes_df.head())

## 4. Train Classification Model

In [None]:
# =============================================================================
# PUMP CLASSIFIER
# =============================================================================

class PumpClassifier:
    """Trains and evaluates pump-and-dump classification model.

    Uses Random Forest with careful regularization to prevent
    overfitting on small labeled samples.

    Attributes set by the methods:
        model: the fitted RandomForestClassifier (None until train_model runs).
        scaler: StandardScaler fitted on the full training matrix.
        feature_names: column names matching the feature matrix columns.
        cv_results: metrics from the most recent train_model call.
    """

    def __init__(self, config: ResearchConfig):
        self.config = config
        self.model = None
        self.scaler = StandardScaler()
        self.feature_names = []
        self.cv_results = {}

    def _build_model(self) -> RandomForestClassifier:
        """Create an unfitted Random Forest configured from ``self.config``."""
        return RandomForestClassifier(
            n_estimators=self.config.N_ESTIMATORS,
            max_depth=self.config.MAX_DEPTH,
            class_weight='balanced',  # Handle imbalanced classes
            random_state=self.config.RANDOM_STATE,
            n_jobs=-1
        )

    def prepare_data(self, df: pd.DataFrame,
                     feature_engineer: FeatureEngineer) -> Tuple[np.ndarray, np.ndarray]:
        """Build the feature matrix and label vector from *df*.

        Feature columns come from ``feature_engineer.get_feature_matrix``;
        labels are read from the ``label`` column. NaN feature values are
        replaced with 0. The feature names are stored on
        ``self.feature_names``.

        Returns:
            ``(X, y)`` as NumPy arrays.
        """
        # Get feature matrix
        X, feature_names = feature_engineer.get_feature_matrix(df)
        self.feature_names = feature_names

        # Get labels
        y = df['label'].values

        # Handle missing values
        X = np.nan_to_num(X, nan=0)

        print(f"Data prepared: {X.shape[0]} samples, {X.shape[1]} features")
        print(f"Label distribution: {pd.Series(y).value_counts().to_dict()}")

        return X, y

    def train_model(self, X: np.ndarray, y: np.ndarray) -> Dict:
        """Cross-validate, then fit the final Random Forest on all of *X*.

        Cross-validated predictions come from a scaler + forest pipeline, so
        the scaler is refit on the training portion of every fold and the
        held-out fold never influences the preprocessing. Afterwards
        ``self.scaler`` and ``self.model`` are fitted on the full data.

        Returns:
            Dict with ``accuracy``, ``f1``, ``roc_auc``, ``confusion_matrix``
            and ``classification_report`` computed from the out-of-fold
            predictions (also stored on ``self.cv_results``).
        """
        # Local import: the notebook's import cell does not bring in pipelines.
        from sklearn.pipeline import make_pipeline

        # Initialize model
        self.model = self._build_model()

        # Cross-validation
        cv = StratifiedKFold(n_splits=self.config.N_FOLDS, shuffle=True,
                             random_state=self.config.RANDOM_STATE)

        # Get cross-validated predictions; scaling happens inside each fold
        cv_pipeline = make_pipeline(StandardScaler(), self._build_model())
        cv_probs = cross_val_predict(cv_pipeline, X, y, cv=cv, method='predict_proba')
        cv_preds = (cv_probs[:, 1] > 0.5).astype(int)

        # Calculate metrics
        self.cv_results = {
            'accuracy': accuracy_score(y, cv_preds),
            'f1': f1_score(y, cv_preds),
            'roc_auc': roc_auc_score(y, cv_probs[:, 1]) if len(np.unique(y)) > 1 else np.nan,
            'confusion_matrix': confusion_matrix(y, cv_preds).tolist(),
            'classification_report': classification_report(y, cv_preds, output_dict=True)
        }

        # Fit final scaler and model on all data
        X_scaled = self.scaler.fit_transform(X)
        self.model.fit(X_scaled, y)

        print("\nCross-Validation Results:")
        print(f"  Accuracy: {self.cv_results['accuracy']:.3f}")
        print(f"  F1 Score: {self.cv_results['f1']:.3f}")
        print(f"  ROC AUC: {self.cv_results['roc_auc']:.3f}")
        print(f"\nConfusion Matrix:")
        print(np.array(self.cv_results['confusion_matrix']))

        return self.cv_results

    def predict_pls(self, X: np.ndarray) -> np.ndarray:
        """Return the Pump Likelihood Score for each row of *X*.

        The score is the fitted model's predicted probability for the
        second class column, after applying the fitted scaler. Requires
        ``train_model`` to have been called.
        """
        X_scaled = self.scaler.transform(X)
        pls = self.model.predict_proba(X_scaled)[:, 1]
        return pls

    def get_feature_importance(self) -> pd.DataFrame:
        """Return the fitted model's feature importances, highest first.

        Returns:
            DataFrame with columns ``feature`` and ``importance``.
        """
        importance = pd.DataFrame({
            'feature': self.feature_names,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

        return importance

    def temporal_validation(self, df: pd.DataFrame,
                            feature_engineer: FeatureEngineer,
                            train_end_date: str) -> Dict:
        """Temporal split validation: train on early data, test on later.

        Rows with ``event_date`` on or before *train_end_date* form the
        training set; later rows form the test set. A separate scaler and
        model are fitted here, so ``self.model`` and ``self.scaler`` are not
        touched.

        Returns:
            Dict with split sizes, date ranges, ``accuracy``, ``f1`` and
            ``roc_auc``; an empty dict when either split is empty or the
            training split contains a single class.
        """
        df = df.copy()
        df['event_date'] = pd.to_datetime(df['event_date'])
        train_end = pd.to_datetime(train_end_date)

        # Split
        train_df = df[df['event_date'] <= train_end]
        test_df = df[df['event_date'] > train_end]

        if len(train_df) == 0 or len(test_df) == 0:
            print("Cannot perform temporal validation - insufficient data")
            return {}

        y_train = train_df['label'].values
        y_test = test_df['label'].values

        # A model fitted on one class exposes a single probability column,
        # so the positive-class lookup below would fail.
        if len(np.unique(y_train)) < 2:
            print("Cannot perform temporal validation - training period has a single class")
            return {}

        # Prepare data
        X_train, _ = feature_engineer.get_feature_matrix(train_df)
        X_test, _ = feature_engineer.get_feature_matrix(test_df)

        X_train = np.nan_to_num(X_train, nan=0)
        X_test = np.nan_to_num(X_test, nan=0)

        # Train and evaluate; the scaler only ever sees the training period
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = self._build_model()

        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_prob = model.predict_proba(X_test_scaled)[:, 1]

        results = {
            'train_size': len(train_df),
            'test_size': len(test_df),
            'train_period': f"{train_df['event_date'].min().date()} to {train_df['event_date'].max().date()}",
            'test_period': f"{test_df['event_date'].min().date()} to {test_df['event_date'].max().date()}",
            'accuracy': accuracy_score(y_test, y_pred),
            # F1 and AUC are reported as NaN when the test labels cannot support them
            'f1': f1_score(y_test, y_pred) if y_test.sum() > 0 else np.nan,
            'roc_auc': roc_auc_score(y_test, y_prob) if len(np.unique(y_test)) > 1 else np.nan
        }

        print("\nTemporal Validation Results:")
        print(f"  Train: {results['train_period']} ({results['train_size']} samples)")
        print(f"  Test: {results['test_period']} ({results['test_size']} samples)")
        print(f"  Accuracy: {results['accuracy']:.3f}")
        print(f"  F1 Score: {results['f1']:.3f}")
        print(f"  ROC AUC: {results['roc_auc']:.3f}")

        return results


# Initialize classifier with the shared notebook configuration
classifier = PumpClassifier(config)
print("Pump Classifier initialized")

In [None]:
# =============================================================================
# TRAIN MODEL
# =============================================================================

# Prepare data: feature matrix X and label vector y from the engineered episodes
X, y = classifier.prepare_data(episodes_df, feature_engineer)

# Train with cross-validation
print("\n" + "="*60)
print("TRAINING PUMP CLASSIFIER")
print("="*60)

# Returns the cross-validation metrics and leaves the classifier fitted on all rows
cv_results = classifier.train_model(X, y)

# Feature importance (sorted, highest first); show at most the top 15
importance_df = classifier.get_feature_importance()
print("\nFeature Importance:")
print(importance_df.head(15))

In [None]:
# =============================================================================
# TEMPORAL VALIDATION
# =============================================================================

# Run temporal validation: train on episodes dated on or before 2022-12-31,
# test on episodes dated after it
temporal_results = classifier.temporal_validation(
    episodes_df, 
    feature_engineer,
    train_end_date='2022-12-31'
)

## 5. Generate Pump Likelihood Scores

In [None]:
# =============================================================================
# GENERATE PLS SCORES
# =============================================================================

# Predict PLS for all episodes
# NOTE(review): the classifier was fitted on these same episodes, so the
# scores (and the decile table below) are in-sample.
X, _ = feature_engineer.get_feature_matrix(episodes_df)
X = np.nan_to_num(X, nan=0)  # same NaN -> 0 replacement used when preparing the training data

episodes_df['pls'] = classifier.predict_pls(X)

print("Pump Likelihood Scores (PLS) Generated")
print(f"\nPLS Distribution:")
print(episodes_df['pls'].describe())

# Decile analysis
# labels=False yields 0-based bin codes, shifted to start at 1;
# duplicates='drop' can leave fewer than 10 bins when bin edges coincide.
episodes_df['pls_decile'] = pd.qcut(episodes_df['pls'], q=10, labels=False, duplicates='drop') + 1

# Per-decile means of the score, the label rate and selected episode metrics
decile_analysis = episodes_df.groupby('pls_decile').agg({
    'pls': 'mean',
    'label': 'mean',
    'return_20d': 'mean',
    'max_drawdown_20d': 'mean',
    'promo_share': 'mean'
}).round(3)

print("\nPLS Decile Analysis:")
print(decile_analysis)

## 6. Visualizations

In [None]:
# =============================================================================
# VISUALIZATIONS
# =============================================================================

def plot_model_performance(cv_results: Dict, importance_df: pd.DataFrame):
    """Draw a three-panel summary of the cross-validated model.

    Panels, left to right: confusion-matrix heatmap, horizontal bars for the
    first ten rows of *importance_df*, and bars for the accuracy / F1 /
    ROC AUC values in *cv_results*. The figure is written to
    ``model_performance.png`` under ``config.RESULTS_PATH`` and then shown.
    """
    fig, (cm_ax, importance_ax, metrics_ax) = plt.subplots(1, 3, figsize=(15, 5))

    # Panel 1: confusion matrix
    class_names = ['Control', 'Pump']
    sns.heatmap(
        np.array(cv_results['confusion_matrix']),
        annot=True, fmt='d', cmap='Blues', ax=cm_ax,
        xticklabels=class_names, yticklabels=class_names,
    )
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    cm_ax.set_title('Confusion Matrix (CV)')

    # Panel 2: feature importance (first ten rows, first row drawn at the top)
    leading = importance_df.head(10)
    importance_ax.barh(leading['feature'], leading['importance'], color='steelblue')
    importance_ax.set_xlabel('Importance')
    importance_ax.set_title('Top 10 Feature Importance')
    importance_ax.invert_yaxis()

    # Panel 3: headline metrics; a metric missing from cv_results plots as 0
    metric_names = ['accuracy', 'f1', 'roc_auc']
    metric_values = [cv_results.get(name, 0) for name in metric_names]
    metrics_ax.bar(metric_names, metric_values, color=['blue', 'green', 'orange'])
    metrics_ax.set_ylim(0, 1)
    metrics_ax.set_ylabel('Score')
    metrics_ax.set_title('Cross-Validation Metrics')
    for position, score in enumerate(metric_values):
        # Value label just above each bar
        metrics_ax.text(position, score + 0.02, f'{score:.3f}', ha='center')

    plt.tight_layout()
    plt.savefig(os.path.join(config.RESULTS_PATH, 'model_performance.png'), dpi=150)
    plt.show()


def plot_pls_analysis(episodes_df: pd.DataFrame):
    """Draw a 2x2 figure relating the PLS to labels and outcomes.

    Panels: PLS histograms split by label, PLS against 20-day return,
    label rate per PLS decile with the overall rate as a baseline, and mean
    20-day max drawdown per PLS decile. Reads the columns ``pls``,
    ``label``, ``return_20d``, ``pls_decile`` and ``max_drawdown_20d``.
    The figure is written to ``pls_analysis.png`` under
    ``config.RESULTS_PATH`` and then shown.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    hist_ax, scatter_ax = axes[0, 0], axes[0, 1]
    lift_ax, drawdown_ax = axes[1, 0], axes[1, 1]

    labels = episodes_df['label']
    scores = episodes_df['pls']
    per_decile = episodes_df.groupby('pls_decile')

    # Top-left: score distribution per label (control drawn first)
    hist_ax.hist(scores[labels == 0], bins=30, alpha=0.5, label='Control', color='blue')
    hist_ax.hist(scores[labels == 1], bins=30, alpha=0.5, label='Confirmed Pump', color='red')
    hist_ax.set_xlabel('Pump Likelihood Score (PLS)')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('PLS Distribution by Label')
    hist_ax.legend()

    # Top-right: score against 20-day return in percent, coloured by label
    scatter_ax.scatter(scores, episodes_df['return_20d']*100,
                       c=labels, cmap='coolwarm', alpha=0.6)
    scatter_ax.axhline(y=0, color='black', linestyle='--')
    scatter_ax.set_xlabel('PLS')
    scatter_ax.set_ylabel('20-Day Return (%)')
    scatter_ax.set_title('PLS vs Post-Event Returns')

    # Bottom-left: label rate per decile against the overall label rate
    pump_rate = per_decile['label'].mean()
    baseline = labels.mean()
    lift_ax.bar(pump_rate.index, pump_rate.values, color='steelblue')
    lift_ax.axhline(y=baseline, color='red', linestyle='--', label=f'Baseline ({baseline:.2%})')
    lift_ax.set_xlabel('PLS Decile')
    lift_ax.set_ylabel('Confirmed Pump Rate')
    lift_ax.set_title('Lift Curve: Pump Rate by PLS Decile')
    lift_ax.legend()

    # Bottom-right: mean max drawdown per decile, in percent
    mean_drawdown = per_decile['max_drawdown_20d'].mean() * 100
    drawdown_ax.bar(mean_drawdown.index, mean_drawdown.values, color='darkred')
    drawdown_ax.set_xlabel('PLS Decile')
    drawdown_ax.set_ylabel('Average Max Drawdown (%)')
    drawdown_ax.set_title('Drawdown by PLS Decile')

    plt.tight_layout()
    plt.savefig(os.path.join(config.RESULTS_PATH, 'pls_analysis.png'), dpi=150)
    plt.show()


# Generate visualizations; each call saves a PNG under config.RESULTS_PATH and displays it
print("Generating visualizations...")
plot_model_performance(cv_results, importance_df)
plot_pls_analysis(episodes_df)

## 7. Robustness: Model Configuration Sensitivity

In [None]:
# =============================================================================
# ROBUSTNESS CHECKS
# =============================================================================

def run_robustness_checks(episodes_df: pd.DataFrame, 
                          feature_engineer: FeatureEngineer,
                          config: ResearchConfig) -> Dict:
    """Run robustness checks with different model configurations.

    Two sweeps are scored with stratified k-fold cross-validated F1:
    Random Forest at several ``max_depth`` values, and a comparison of
    Random Forest, Gradient Boosting and Logistic Regression. The number of
    folds, random seed, tree count and baseline depth are taken from
    *config*. Every candidate is wrapped in a scaler + model pipeline so the
    scaler is fitted on the training part of each fold only.

    Args:
        episodes_df: Engineered episode table containing a ``label`` column.
        feature_engineer: Supplies the feature matrix via ``get_feature_matrix``.
        config: Source of ``N_FOLDS``, ``RANDOM_STATE``, ``N_ESTIMATORS``
            and ``MAX_DEPTH``.

    Returns:
        Dict with ``depth_sensitivity`` and ``model_comparison``, each a list
        of records holding the setting plus ``f1_mean`` and ``f1_std``.
    """
    # Local import: the notebook's import cell does not bring in pipelines.
    from sklearn.pipeline import make_pipeline

    results = {}

    # Prepare data
    X, _ = feature_engineer.get_feature_matrix(episodes_df)
    X = np.nan_to_num(X, nan=0)
    y = episodes_df['label'].values

    cv = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True,
                         random_state=config.RANDOM_STATE)

    def _cv_f1(model):
        # Scaling lives inside the pipeline, so held-out folds never
        # influence the scaler statistics.
        pipeline = make_pipeline(StandardScaler(), model)
        return cross_val_score(pipeline, X, y, cv=cv, scoring='f1')

    # Test 1: Different max_depth (None lets the trees grow without a depth limit)
    print("Testing max_depth sensitivity...")
    depth_results = []
    for depth in [3, 5, 7, 10, None]:
        model = RandomForestClassifier(
            n_estimators=config.N_ESTIMATORS, max_depth=depth,
            class_weight='balanced', random_state=config.RANDOM_STATE
        )
        scores = _cv_f1(model)
        depth_results.append({'max_depth': depth, 'f1_mean': scores.mean(), 'f1_std': scores.std()})

    results['depth_sensitivity'] = depth_results

    # Test 2: Different models
    print("Testing different models...")
    model_results = []

    models = {
        'RandomForest': RandomForestClassifier(
            n_estimators=config.N_ESTIMATORS, max_depth=config.MAX_DEPTH,
            class_weight='balanced', random_state=config.RANDOM_STATE),
        # GradientBoostingClassifier is built without class weighting
        'GradientBoosting': GradientBoostingClassifier(
            n_estimators=config.N_ESTIMATORS, max_depth=3,
            random_state=config.RANDOM_STATE),
        'LogisticRegression': LogisticRegression(
            class_weight='balanced', random_state=config.RANDOM_STATE, max_iter=1000)
    }

    for name, model in models.items():
        scores = _cv_f1(model)
        model_results.append({'model': name, 'f1_mean': scores.mean(), 'f1_std': scores.std()})

    results['model_comparison'] = model_results

    # Print results
    print("\n" + "="*60)
    print("ROBUSTNESS CHECK RESULTS")
    print("="*60)

    print("\nMax Depth Sensitivity:")
    for r in depth_results:
        print(f"  depth={r['max_depth']}: F1 = {r['f1_mean']:.3f} (+/- {r['f1_std']:.3f})")

    print("\nModel Comparison:")
    for r in model_results:
        print(f"  {r['model']}: F1 = {r['f1_mean']:.3f} (+/- {r['f1_std']:.3f})")

    return results


# Run robustness checks (max_depth sweep and model-family comparison)
robustness_results = run_robustness_checks(episodes_df, feature_engineer, config)

## 8. Save Outputs

In [None]:
# =============================================================================
# SAVE OUTPUTS
# =============================================================================

def save_classification_results(episodes_df: pd.DataFrame,
                                 cv_results: Dict,
                                 importance_df: pd.DataFrame,
                                 robustness_results: Dict,
                                 output_dir: str):
    """Write the classification artefacts to *output_dir* and return the summary.

    Files written: the episode table as parquet and CSV, the feature
    importance table as CSV, and a JSON summary. *output_dir* is created if
    it does not exist. *episodes_df* must contain ``label`` and ``pls``.

    Returns:
        The summary dict that was serialised to ``notebook05_summary.json``.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Episode table with PLS: parquet first, then a CSV copy
    parquet_target = os.path.join(output_dir, 'episodes_with_pls.parquet')
    episodes_df.to_parquet(parquet_target, index=False)
    print(f"Saved episodes with PLS: {parquet_target}")

    csv_target = os.path.join(output_dir, 'episodes_with_pls.csv')
    episodes_df.to_csv(csv_target, index=False)
    print(f"Saved CSV: {csv_target}")

    # Feature importance table
    importance_target = os.path.join(output_dir, 'feature_importance.csv')
    importance_df.to_csv(importance_target, index=False)
    print(f"Saved feature importance: {importance_target}")

    # Summary of counts, metrics and score statistics
    label_column = episodes_df['label']
    summary = dict(
        episodes_total=len(episodes_df),
        confirmed_pumps=int((label_column == 1).sum()),
        control_episodes=int((label_column == 0).sum()),
        cv_results=cv_results,
        pls_stats=episodes_df['pls'].describe().to_dict(),
        top_features=importance_df.head(10).to_dict('records'),
        robustness=robustness_results,
        created_at=datetime.now().isoformat(),
    )

    summary_target = os.path.join(output_dir, 'notebook05_summary.json')
    with open(summary_target, 'w') as handle:
        # default=str stringifies any value json cannot encode natively
        handle.write(json.dumps(summary, indent=2, default=str))
    print(f"Saved summary: {summary_target}")

    return summary


# Save outputs into the results directory and keep the returned summary dict
output_summary = save_classification_results(
    episodes_df=episodes_df,
    cv_results=cv_results,
    importance_df=importance_df,
    robustness_results=robustness_results,
    output_dir=config.RESULTS_PATH
)

# Echo the headline cross-validation metrics from the saved summary
print("\n" + "="*60)
print("Output Summary (key metrics):")
print(f"  Episodes: {output_summary['episodes_total']}")
print(f"  CV Accuracy: {output_summary['cv_results']['accuracy']:.3f}")
print(f"  CV F1: {output_summary['cv_results']['f1']:.3f}")
print(f"  CV AUC: {output_summary['cv_results']['roc_auc']:.3f}")

## 9. Summary and Next Steps

In [None]:
# =============================================================================
# NOTEBOOK 5 SUMMARY
# =============================================================================

# Static end-of-notebook banner; the text is fixed and not derived from the run's results.
print("""
╔══════════════════════════════════════════════════════════════════════════════╗
║        NOTEBOOK 5: FEATURE ENGINEERING & CLASSIFICATION COMPLETE             ║
╚══════════════════════════════════════════════════════════════════════════════╝

OUTPUT FILES:
─────────────
• episodes_with_pls.parquet       - Episodes with Pump Likelihood Scores
• episodes_with_pls.csv           - CSV for inspection
• feature_importance.csv          - Feature importance rankings
• model_performance.png           - Performance visualizations
• pls_analysis.png                - PLS analysis plots
• notebook05_summary.json         - Summary statistics

FEATURES ENGINEERED:
────────────────────
Market Features:
• Event return, volume ratio
• 5/20-day reversals and drawdowns
• Pre-event momentum

Social Features:
• Message z-score, promotional share
• User concentration (Gini)
• Promo intensity, coordination score

Interaction Features:
• Social-volume interaction
• Promo-reversal interaction

MODEL DETAILS:
──────────────
• Algorithm: Random Forest
• Regularization: max_depth=5, balanced classes
• Validation: 5-fold stratified CV

PUMP LIKELIHOOD SCORE (PLS):
────────────────────────────
• Range: 0 to 1
• 0 = Low manipulation likelihood
• 1 = High manipulation likelihood
• Use as continuous proxy for manipulation risk

NEXT STEPS:
───────────
→ Notebook 6: Tail Risk Analysis
  - Compute VaR and Expected Shortfall
  - Portfolio-level analysis
  - Spillover analysis
  - Regression models

""")

In [None]:
# =============================================================================
# ENVIRONMENT INFO
# =============================================================================

# Imported here because they are only needed for this report cell.
import sys
import platform
import sklearn

# Record interpreter, OS and key library versions alongside the run timestamp.
print("Environment Information:")
print(f"  Python: {sys.version}")
print(f"  Platform: {platform.platform()}")
print(f"  Pandas: {pd.__version__}")
print(f"  NumPy: {np.__version__}")
print(f"  Scikit-learn: {sklearn.__version__}")
print(f"  Timestamp: {datetime.now().isoformat()}")