In [None]:
import numpy as np
import pandas as pd
import gc
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit

import xgboost as xgb
import lightgbm as lgb

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tools.sm_exceptions import ConvergenceWarning

try:
    import matplotlib.pyplot as plt
except ImportError:
    print("Warning: matplotlib not available, plotting functions will be disabled")
    plt = None

from app_config import Config
from model_utils import ModelDataProcessor, ModelEvaluator, ModelManager

# Instantiate the project-local configuration object and helper utilities
# (imported above from app_config / model_utils) at import time.
# NOTE(review): in the visible part of this file only Config is used again
# (main() builds its own instance); data_processor, evaluator and
# model_manager are not referenced below -- confirm they are used elsewhere.
config = Config()
data_processor = ModelDataProcessor()
evaluator = ModelEvaluator()
model_manager = ModelManager()

# Mount Google Drive if in Colab environment. Outside Colab the google.colab
# import raises ImportError and the mount is skipped.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("✅ Google Drive mounted")
except ImportError:
    print("ℹ️ Not in Colab environment, skipping Google Drive mount")

print("✅ All modules and utilities loaded successfully!")

class MemoryEfficientStockPredictor:
    def __init__(self, look_back=60):
        self.look_back = look_back
        self.models = {}
        self.predictions = {}
        self.weights = {}
        # Updated scaler patterns to match your naming convention
        self.scaler_patterns = ['_minmax', '_robust', '_z', '_std']
        self.equity_cols = []
        self.futures_cols = []
        self.options_cols = []
        self.nse_index_cols = []
        self.technical_indicator_cols = []

    def load_data(self, train_path, test_path):
        """Read the training and test parquet files and categorize the columns.

        Args:
            train_path: Location of the training parquet file.
            test_path: Location of the test parquet file.

        Side effects:
            Sets ``self.train_data`` and ``self.test_data`` and rebuilds the
            column groups from the training columns.
        """
        print("Loading training data...")
        train_frame = pd.read_parquet(train_path)
        self.train_data = train_frame
        print(f"Training data shape: {train_frame.shape}")

        print("Loading test data...")
        test_frame = pd.read_parquet(test_path)
        self.test_data = test_frame
        print(f"Test data shape: {test_frame.shape}")

        # Column groups are derived from the training frame only.
        self._categorize_columns(train_frame.columns)

        # Ask the collector to release temporaries created while loading.
        gc.collect()

    def _categorize_columns(self, columns):
        """Categorize columns based on data type and naming patterns"""
        print("Categorizing data columns...")

        # Reset column lists
        self.equity_cols = []
        self.futures_cols = []
        self.options_cols = []
        self.nse_index_cols = []
        self.technical_indicator_cols = []

        # Common technical indicators you mentioned
        tech_indicators = ['RSI', 'MACD', 'SMA', 'EMA', 'BB', 'ATR', 'ADX', 'STOCH', 'CCI', 'MFI',
                          'OBV', 'VWAP', 'BOLL', 'SAR', 'WILLIAMS', 'ROC', 'MOM', 'TSI', 'UO',
                          'crossover', 'divergence', 'weight_opt', 'strategy', 'oi_buildup',
                          'unusual_volume', 'rolling_mean', 'rolling_var', 'rolling_std']

        option_keywords = ['CE', 'PE', 'CALL', 'PUT', 'STRIKE', 'EXPIRY', 'IV', 'DELTA', 'GAMMA',
                          'THETA', 'VEGA', 'RHO', 'option_chain', 'pcr', 'max_pain']

        futures_keywords = ['FUT', 'FUTURE', 'basis', 'contango', 'backwardation', 'roll_yield']

        nse_index_keywords = ['NIFTY', 'SENSEX', 'BANKNIFTY', 'INDEX', 'VIX']

        for col in columns:
            col_upper = col.upper()

            # Check for technical indicators
            if any(indicator.upper() in col_upper for indicator in tech_indicators):
                self.technical_indicator_cols.append(col)

            # Check for options data
            elif any(keyword in col_upper for keyword in option_keywords):
                self.options_cols.append(col)

            # Check for futures data
            elif any(keyword in col_upper for keyword in futures_keywords):
                self.futures_cols.append(col)

            # Check for NSE indices
            elif any(index_name in col_upper for index_name in nse_index_keywords):
                self.nse_index_cols.append(col)

            # Default to equity if basic OHLCV pattern
            elif any(ohlcv in col_upper for ohlcv in ['OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOLUME']):
                self.equity_cols.append(col)

        print(f"Equity columns: {len(self.equity_cols)}")
        print(f"Futures columns: {len(self.futures_cols)}")
        print(f"Options columns: {len(self.options_cols)}")
        print(f"NSE Index columns: {len(self.nse_index_cols)}")
        print(f"Technical Indicator columns: {len(self.technical_indicator_cols)}")
        print(f"Total columns: {len(columns)}")

    def prepare_features(self, data, target_col='Close'):
        """Prepare features for different model types using existing calculations"""
        print("Preparing features from existing calculations...")

        # Since you've already calculated most indicators, we'll use them directly
        # Get all scaled columns based on your naming convention
        scaled_cols = []
        for col in data.columns:
            if any(pattern in col for pattern in self.scaler_patterns):
                scaled_cols.append(col)

        print(f"Found {len(scaled_cols)} scaled columns")

        # Create feature groups for different model types
        feature_groups = {
            'equity_features': self.equity_cols,
            'futures_features': self.futures_cols,
            'options_features': self.options_cols,
            'nse_index_features': self.nse_index_cols,
            'technical_features': self.technical_indicator_cols,
            'scaled_features': scaled_cols
        }

        # Combine all available features (excluding target)
        all_features = []
        for group_name, features in feature_groups.items():
            available_features = [col for col in features if col in data.columns and col != target_col]
            all_features.extend(available_features)
            print(f"{group_name}: {len(available_features)} features")

        # Remove duplicates while preserving order
        all_features = list(dict.fromkeys(all_features))

        # Add any remaining numeric columns that might be useful
        numeric_cols = data.select_dtypes(include=[np.number]).columns.tolist()
        for col in numeric_cols:
            if col not in all_features and col != target_col:
                all_features.append(col)

        print(f"Total features selected: {len(all_features)}")

        # Ensure target column exists
        if target_col not in data.columns:
            print(f"Warning: Target column '{target_col}' not found. Using first equity close column.")
            close_cols = [col for col in data.columns if 'close' in col.lower()]
            if close_cols:
                target_col = close_cols[0]
            else:
                raise ValueError("No suitable target column found")

        # Create final dataset with features and target
        final_features = [col for col in all_features if col in data.columns]
        final_data = data[final_features + [target_col]].copy()

        # Handle any remaining NaN values
        final_data = final_data.fillna(method='ffill').fillna(method='bfill')

        # Remove any remaining NaN rows
        final_data = final_data.dropna()

        print(f"Final prepared data shape: {final_data.shape}")
        return final_data

    def calculate_rsi(self, prices, window=14):
        """Calculate RSI indicator"""
        delta = prices.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
        rs = gain / loss
        return 100 - (100 / (1 + rs))

    def calculate_macd(self, prices, fast=12, slow=26):
        """Calculate MACD indicator"""
        ema_fast = prices.ewm(span=fast).mean()
        ema_slow = prices.ewm(span=slow).mean()
        return ema_fast - ema_slow

    def create_sequences(self, data, target_col='Close'):
        """Create sequences for LSTM/GRU models with smart feature selection"""
        print("Creating sequences for deep learning models...")

        # Select most important features for sequence models to manage memory
        # Prioritize scaled features and key technical indicators
        priority_features = []

        # Add scaled features (these are already normalized)
        for col in data.columns:
            if any(pattern in col for pattern in self.scaler_patterns):
                priority_features.append(col)

        # Add key technical indicators
        key_indicators = ['RSI', 'MACD', 'SMA', 'EMA', 'ATR', 'VWAP', 'BOLL']
        for col in data.columns:
            if any(indicator in col.upper() for indicator in key_indicators) and col not in priority_features:
                priority_features.append(col)

        # Add volume and OI related features
        volume_oi_features = ['VOLUME', 'OI', 'BUILDUP', 'UNUSUAL']
        for col in data.columns:
            if any(feature in col.upper() for feature in volume_oi_features) and col not in priority_features:
                priority_features.append(col)

        # Limit features to prevent memory issues (max 50 features for sequences)
        if len(priority_features) > 50:
            priority_features = priority_features[:50]

        # Ensure we have the features in the data
        available_features = [col for col in priority_features if col in data.columns and col != target_col]

        print(f"Using {len(available_features)} features for sequence creation")

        if not available_features:
            # Fallback to all numeric columns except target
            available_features = [col for col in data.select_dtypes(include=[np.number]).columns
                                if col != target_col][:30]  # Limit to 30 features

        sequences = []
        targets = []

        for i in range(self.look_back, len(data)):
            sequences.append(data[available_features].iloc[i-self.look_back:i].values)
            targets.append(data[target_col].iloc[i])

        return np.array(sequences), np.array(targets)

    def train_arima(self, data, target_col='Close'):
        """Train ARIMA model with memory efficiency"""
        print("Training ARIMA model...")
        try:
            # Use a subset for ARIMA if data is too large
            if len(data) > 5000:
                data_subset = data.tail(5000)
            else:
                data_subset = data

            model = ARIMA(data_subset[target_col], order=(2, 1, 2))
            fitted_model = model.fit()
            self.models['ARIMA'] = fitted_model
            print("ARIMA model trained successfully")
        except Exception as e:
            print(f"ARIMA training failed: {e}")
            self.models['ARIMA'] = None

    def train_lstm(self, X_seq, y_seq):
        """Train a two-layer bidirectional LSTM and store it as 'BiLSTM'.

        The first 80% of the samples are used for fitting and the remaining
        20% for validation; training stops early on stalled validation loss.

        Args:
            X_seq: Array indexed as (samples, timesteps, features).
            y_seq: Targets, one per sample.
        """
        print("Training Bi-LSTM model...")

        # Drop state left behind by previously built Keras models.
        tf.keras.backend.clear_session()

        window_shape = (X_seq.shape[1], X_seq.shape[2])
        network = Sequential([
            Bidirectional(LSTM(50, return_sequences=True), input_shape=window_shape),
            Dropout(0.2),
            Bidirectional(LSTM(50, return_sequences=False)),
            Dropout(0.2),
            Dense(25),
            Dense(1)
        ])
        network.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

        # Ordered 80/20 split: the tail of the arrays is the validation set.
        cut = int(0.8 * len(X_seq))
        fit_X, holdout_X = X_seq[:cut], X_seq[cut:]
        fit_y, holdout_y = y_seq[:cut], y_seq[cut:]

        stopper = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        network.fit(fit_X, fit_y,
                    validation_data=(holdout_X, holdout_y),
                    epochs=50, batch_size=32,
                    callbacks=[stopper], verbose=0)

        self.models['BiLSTM'] = network
        print("Bi-LSTM model trained successfully")

        # Release the split references before collecting.
        del fit_X, holdout_X, fit_y, holdout_y
        gc.collect()

    def train_gru(self, X_seq, y_seq):
        """Train a two-layer GRU network and store it as 'GRU'.

        The first 80% of the samples are used for fitting and the remaining
        20% for validation; training stops early on stalled validation loss.

        Args:
            X_seq: Array indexed as (samples, timesteps, features).
            y_seq: Targets, one per sample.
        """
        print("Training GRU model...")

        # Drop state left behind by previously built Keras models.
        tf.keras.backend.clear_session()

        window_shape = (X_seq.shape[1], X_seq.shape[2])
        network = Sequential([
            GRU(50, return_sequences=True, input_shape=window_shape),
            Dropout(0.2),
            GRU(50, return_sequences=False),
            Dropout(0.2),
            Dense(25),
            Dense(1)
        ])
        network.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

        # Ordered 80/20 split: the tail of the arrays is the validation set.
        cut = int(0.8 * len(X_seq))
        fit_X, holdout_X = X_seq[:cut], X_seq[cut:]
        fit_y, holdout_y = y_seq[:cut], y_seq[cut:]

        stopper = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        network.fit(fit_X, fit_y,
                    validation_data=(holdout_X, holdout_y),
                    epochs=50, batch_size=32,
                    callbacks=[stopper], verbose=0)

        self.models['GRU'] = network
        print("GRU model trained successfully")

        # Release the split references before collecting.
        del fit_X, holdout_X, fit_y, holdout_y
        gc.collect()

    def train_tree_models(self, X_train, y_train):
        """Fit XGBoost, LightGBM and RandomForest regressors.

        When there are more than 100 feature columns, only the 100 with the
        largest absolute correlation to the target are used. The columns
        actually used are remembered in ``self.tree_model_features`` so that
        prediction can select the same ones.

        Args:
            X_train: Feature DataFrame.
            y_train: Target Series aligned with ``X_train``.
        """
        print("Training tree-based models...")

        max_tree_features = 100
        n_columns = X_train.shape[1]
        if n_columns > max_tree_features:
            print(f"Reducing features from {n_columns} to top 100 most important")
            # Quick filter: rank columns by |correlation| with the target.
            ranked = X_train.corrwith(y_train).abs().sort_values(ascending=False)
            X_fit = X_train[ranked.head(max_tree_features).index.tolist()]
        else:
            X_fit = X_train

        print(f"Training with {X_fit.shape[1]} features")

        # Settings shared by both gradient-boosting libraries.
        boosting_params = dict(
            n_estimators=200,
            max_depth=8,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
            reg_alpha=0.1,
            reg_lambda=0.1,
        )

        # Each model is registered before fitting, then fitted in place.
        booster = xgb.XGBRegressor(**boosting_params)
        self.models['XGBoost'] = booster
        booster.fit(X_fit, y_train)

        light = lgb.LGBMRegressor(verbose=-1, importance_type='gain', **boosting_params)
        self.models['LightGBM'] = light
        light.fit(X_fit, y_train)

        forest = RandomForestRegressor(
            n_estimators=150,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',
            random_state=42,
            n_jobs=-1
        )
        self.models['RandomForest'] = forest
        forest.fit(X_fit, y_train)

        # Remember the training columns for predict_single_model().
        self.tree_model_features = X_fit.columns.tolist()

        print("Tree-based models trained successfully")

        # Report the LightGBM importances when the estimator exposes them.
        if not hasattr(light, 'feature_importances_'):
            return

        ranking = pd.DataFrame({
            'feature': self.tree_model_features,
            'importance': light.feature_importances_
        }).sort_values('importance', ascending=False)

        print("\nTop 10 Most Important Features:")
        print(ranking.head(10))

    def train_other_models(self, X_train, y_train):
        """Fit the SVR and KNN regressors.

        SVR is fitted on a random sample of 5000 rows (drawn without
        replacement) when more rows than that are available; KNN always uses
        every row.

        Args:
            X_train: Feature DataFrame.
            y_train: Target Series aligned with ``X_train``.
        """
        print("Training SVR and KNN models...")

        svr_row_cap = 5000
        if len(X_train) > svr_row_cap:
            # Positional sample shared by features and targets.
            picked = np.random.choice(len(X_train), svr_row_cap, replace=False)
            svr_X, svr_y = X_train.iloc[picked], y_train.iloc[picked]
        else:
            svr_X, svr_y = X_train, y_train

        support_vector = SVR(kernel='rbf', C=100, gamma='scale')
        self.models['SVR'] = support_vector
        support_vector.fit(svr_X, svr_y)

        neighbours = KNeighborsRegressor(n_neighbors=5, n_jobs=-1)
        self.models['KNN'] = neighbours
        neighbours.fit(X_train, y_train)

        print("SVR and KNN models trained successfully")

    def train_all_models(self):
        """Train every model of the ensemble on ``self.train_data``.

        Order: ARIMA, Bi-LSTM, GRU, tree-based models, then SVR/KNN. The
        sequence arrays are released before the tabular models are trained to
        keep peak memory down.
        """
        print("Preparing training data...")
        # prepare_features() copies the columns it selects and does not modify
        # its input, so duplicating the whole raw frame first only wastes
        # memory.
        train_data = self.prepare_features(self.train_data)

        # prepare_features() always places the target last. Using that column
        # (instead of a hard-coded 'Close') keeps training working when
        # prepare_features() had to fall back to another close column.
        target_col = train_data.columns[-1]

        # Prepare features for non-sequential models
        feature_cols = [col for col in train_data.columns if col != target_col]
        X_train = train_data[feature_cols]
        y_train = train_data[target_col]

        # Train ARIMA
        self.train_arima(train_data, target_col=target_col)

        # Prepare sequences for LSTM/GRU
        print("Preparing sequences for deep learning models...")
        X_seq, y_seq = self.create_sequences(train_data, target_col=target_col)

        # Train deep learning models
        self.train_lstm(X_seq, y_seq)
        self.train_gru(X_seq, y_seq)

        # Clear sequences from memory
        del X_seq, y_seq
        gc.collect()

        # Train tree-based models
        self.train_tree_models(X_train, y_train)

        # Train other models
        self.train_other_models(X_train, y_train)

        print("All models trained successfully!")

    def predict_single_model(self, model_name, test_data):
        """Make predictions with a single model"""
        if model_name not in self.models or self.models[model_name] is None:
            return None

        model = self.models[model_name]

        if model_name == 'ARIMA':
            try:
                forecast = model.forecast(steps=len(test_data))
                return forecast
            except:
                return None

        elif model_name in ['BiLSTM', 'GRU']:
            X_seq, _ = self.create_sequences(test_data)
            if len(X_seq) == 0:
                return None
            predictions = model.predict(X_seq, verbose=0)
            return predictions.flatten()

        else:  # Tree-based and other models
            if model_name in ['XGBoost', 'LightGBM', 'RandomForest'] and hasattr(self, 'tree_model_features'):
                # Use the same features that were selected during training
                available_features = [col for col in self.tree_model_features if col in test_data.columns]
                if len(available_features) != len(self.tree_model_features):
                    print(f"Warning: Only {len(available_features)}/{len(self.tree_model_features)} features available for {model_name}")
                X_test = test_data[available_features]
            else:
                # For SVR and KNN, use all available features except target
                feature_cols = [col for col in test_data.columns if col not in ['Close']]
                X_test = test_data[feature_cols]

            return model.predict(X_test)

    def calculate_model_weights(self, validation_data):
        """Calculate weights for ensemble based on validation performance"""
        print("Calculating model weights...")

        val_data = self.prepare_features(validation_data.copy())
        val_target = val_data['Close'].values

        weights = {}
        errors = {}

        for model_name in self.models.keys():
            if self.models[model_name] is not None:
                try:
                    pred = self.predict_single_model(model_name, val_data)
                    if pred is not None:
                        # Align predictions with targets
                        if model_name in ['BiLSTM', 'GRU']:
                            aligned_target = val_target[self.look_back:]
                        elif model_name == 'ARIMA':
                            aligned_target = val_target[-len(pred):]
                        else:
                            aligned_target = val_target

                        if len(pred) == len(aligned_target):
                            mse = mean_squared_error(aligned_target, pred)
                            errors[model_name] = mse
                            weights[model_name] = 1 / (mse + 1e-6)  # Inverse of error
                        else:
                            weights[model_name] = 0
                    else:
                        weights[model_name] = 0
                except Exception as e:
                    print(f"Error calculating weight for {model_name}: {e}")
                    weights[model_name] = 0
            else:
                weights[model_name] = 0

        # Normalize weights
        total_weight = sum(weights.values())
        if total_weight > 0:
            self.weights = {k: v/total_weight for k, v in weights.items()}
        else:
            # Equal weights if all models failed
            self.weights = {k: 1/len(weights) for k in weights.keys()}

        print("Model weights calculated:")
        for model, weight in self.weights.items():
            print(f"{model}: {weight:.4f}")

    def make_ensemble_predictions(self, test_data):
        """Make ensemble predictions"""
        print("Making ensemble predictions...")

        test_data_prepared = self.prepare_features(test_data.copy())
        all_predictions = {}

        # Get predictions from each model
        for model_name in self.models.keys():
            pred = self.predict_single_model(model_name, test_data_prepared)
            if pred is not None:
                all_predictions[model_name] = pred
                print(f"{model_name} predictions shape: {len(pred)}")

        # Create ensemble prediction
        if not all_predictions:
            raise ValueError("No valid predictions from any model")

        # Find the minimum length to align all predictions
        min_length = min(len(pred) for pred in all_predictions.values())

        ensemble_pred = np.zeros(min_length)

        for model_name, pred in all_predictions.items():
            weight = self.weights.get(model_name, 0)
            if weight > 0:
                # Take the last min_length predictions
                aligned_pred = pred[-min_length:] if len(pred) > min_length else pred
                ensemble_pred += weight * aligned_pred

        self.predictions = all_predictions
        return ensemble_pred

    def evaluate_predictions(self, y_true, y_pred, model_name="Ensemble"):
        """Compute and print MSE, MAE, RMSE and R² for one set of predictions.

        Args:
            y_true: Actual values.
            y_pred: Predicted values, aligned with ``y_true``.
            model_name: Label used in the printed header.

        Returns:
            Dict with the keys 'MSE', 'MAE', 'RMSE' and 'R2'.
        """
        squared_error = mean_squared_error(y_true, y_pred)
        metrics = {
            'MSE': squared_error,
            'MAE': mean_absolute_error(y_true, y_pred),
            'RMSE': np.sqrt(squared_error),
            'R2': r2_score(y_true, y_pred),
        }

        print(f"\n{model_name} Performance:")
        # (printed label, dict key) -- only R² differs between the two.
        for label, key in (('MSE', 'MSE'), ('MAE', 'MAE'), ('RMSE', 'RMSE'), ('R²', 'R2')):
            print(f"{label}: {metrics[key]:.6f}")

        return metrics

    def plot_predictions(self, y_true, y_pred, title="Stock Price Predictions"):
        """Plot actual vs predicted prices with additional analysis"""
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Main prediction plot
        axes[0, 0].plot(y_true, label='Actual', alpha=0.8, linewidth=1.5)
        axes[0, 0].plot(y_pred, label='Predicted', alpha=0.8, linewidth=1.5)
        axes[0, 0].set_title(title)
        axes[0, 0].set_xlabel('Time')
        axes[0, 0].set_ylabel('Price')
        axes[0, 0].legend()
        axes[0, 0].grid(True, alpha=0.3)

        # Residuals plot
        residuals = y_true - y_pred
        axes[0, 1].plot(residuals, alpha=0.7)
        axes[0, 1].axhline(y=0, color='r', linestyle='--', alpha=0.7)
        axes[0, 1].set_title('Residuals')
        axes[0, 1].set_xlabel('Time')
        axes[0, 1].set_ylabel('Residual')
        axes[0, 1].grid(True, alpha=0.3)

        # Scatter plot
        axes[1, 0].scatter(y_true, y_pred, alpha=0.6, s=1)
        axes[1, 0].plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', alpha=0.8)
        axes[1, 0].set_xlabel('Actual')
        axes[1, 0].set_ylabel('Predicted')
        axes[1, 0].set_title('Actual vs Predicted')
        axes[1, 0].grid(True, alpha=0.3)

        # Error distribution
        axes[1, 1].hist(residuals, bins=50, alpha=0.7, edgecolor='black')
        axes[1, 1].set_title('Error Distribution')
        axes[1, 1].set_xlabel('Residual')
        axes[1, 1].set_ylabel('Frequency')
        axes[1, 1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        # Additional analysis plots for model comparison
        if hasattr(self, 'predictions') and len(self.predictions) > 1:
            self.plot_model_comparison(y_true)

    def plot_model_comparison(self, y_true):
        """Plot comparison of individual model performances"""
        fig, axes = plt.subplots(2, 1, figsize=(15, 10))

        # Plot all model predictions
        axes[0].plot(y_true, label='Actual', linewidth=2, alpha=0.8)

        colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'pink', 'gray']
        for i, (model_name, pred) in enumerate(self.predictions.items()):
            if len(pred) == len(y_true):
                color = colors[i % len(colors)]
                axes[0].plot(pred, label=f'{model_name}', alpha=0.6, color=color)

        axes[0].set_title('All Model Predictions Comparison')
        axes[0].set_xlabel('Time')
        axes[0].set_ylabel('Price')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

        # Plot model weights
        if hasattr(self, 'weights') and self.weights:
            models = list(self.weights.keys())
            weights = list(self.weights.values())

            axes[1].bar(models, weights, alpha=0.7, color=colors[:len(models)])
            axes[1].set_title('Model Weights in Ensemble')
            axes[1].set_xlabel('Models')
            axes[1].set_ylabel('Weight')
            axes[1].tick_params(axis='x', rotation=45)
            axes[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# Usage Example
def main():
    """Run the full pipeline: load, train, weight, predict, evaluate, plot.

    The last 20% of the training rows are held out as validation data for the
    ensemble weights.

    Returns:
        Tuple ``(predictor, ensemble_predictions, test_targets)``.
    """
    def _banner(title):
        # Three-line section header used between pipeline stages.
        print("\n" + "="*50)
        print(title)
        print("="*50)

    # Initialize predictor with appropriate look_back for your time series
    predictor = MemoryEfficientStockPredictor(look_back=20)

    # Both parquet files are located relative to the configured path.
    config = Config()
    train_path = config.get_model_save_path() + 'combined_training_data.parquet'
    test_path = config.get_model_save_path() + 'batch_3.parquet'

    predictor.load_data(train_path, test_path)

    print("\nDataset Overview:")
    print(f"Training data columns: {len(predictor.train_data.columns)}")
    print(f"Test data columns: {len(predictor.test_data.columns)}")

    # Ordered 80/20 split: the tail of the training frame becomes validation.
    cut = int(0.8 * len(predictor.train_data))
    validation_data = predictor.train_data[cut:].copy()
    predictor.train_data = predictor.train_data[:cut].copy()

    print(f"Training samples: {len(predictor.train_data)}")
    print(f"Validation samples: {len(validation_data)}")
    print(f"Test samples: {len(predictor.test_data)}")

    _banner("STARTING MODEL TRAINING")
    predictor.train_all_models()

    _banner("CALCULATING ENSEMBLE WEIGHTS")
    predictor.calculate_model_weights(validation_data)

    _banner("MAKING PREDICTIONS")
    ensemble_predictions = predictor.make_ensemble_predictions(predictor.test_data)

    # Prepare test targets for evaluation
    prepared_test = predictor.prepare_features(predictor.test_data.copy())

    # Fall back to the first "close"-like column when 'Close' is absent.
    target_col = 'Close'
    if target_col not in prepared_test.columns:
        candidates = [col for col in prepared_test.columns if 'close' in col.lower()]
        if candidates:
            target_col = candidates[0]
            print(f"Using target column: {target_col}")

    # The ensemble covers only the tail of the test set, so take the matching
    # tail of the targets.
    test_targets = prepared_test[target_col].values[-len(ensemble_predictions):]

    _banner("EVALUATION RESULTS")
    ensemble_metrics = predictor.evaluate_predictions(test_targets, ensemble_predictions, "Ensemble")

    print("\nIndividual Model Performance:")
    individual_metrics = {}
    for model_name, pred in predictor.predictions.items():
        if len(pred) < len(test_targets):
            continue
        tail = pred[-len(test_targets):]
        individual_metrics[model_name] = predictor.evaluate_predictions(
            test_targets, tail, model_name
        )

    _banner("GENERATING PLOTS")
    predictor.plot_predictions(test_targets, ensemble_predictions,
                               "Hybrid Ensemble: Equity-Futures-Options Prediction")

    _banner("SUMMARY REPORT")
    best_model = min(individual_metrics, key=lambda name: individual_metrics[name]['RMSE'])
    print(f"Best individual model: {best_model}")
    print(f"Ensemble RMSE: {ensemble_metrics['RMSE']:.6f}")
    print(f"Ensemble R²: {ensemble_metrics['R2']:.6f}")

    return predictor, ensemble_predictions, test_targets

# Additional utility functions for multi-asset analysis
def analyze_data_distribution(data):
    """Count columns per asset type (by name) and report memory usage.

    Matching is case-insensitive on the column name. The option tags
    ``CE``/``PE`` only count when they are not embedded in a longer alphabetic
    word, so names such as 'Open' or 'Price' are not mistaken for options.

    Args:
        data: DataFrame to analyze.

    Returns:
        Dict with the keys 'equity_count', 'futures_count', 'options_count',
        'index_count', 'scaled_count' and 'memory_mb'.
    """
    import re  # local import: ``re`` is not imported at module level

    print("Data Distribution Analysis:")
    print(f"Total columns: {len(data.columns)}")

    names = [str(col) for col in data.columns]  # tolerate non-string labels
    upper_names = [name.upper() for name in names]

    # 'CE'/'PE' as a stand-alone tag, e.g. 'NIFTY_CE' or '18000PE', but not
    # the 'PE' inside 'OPEN'.
    option_tag = re.compile(r'(?<![A-Z])(?:CE|PE)(?![A-Z])')

    def _is_option(upper_name):
        return (option_tag.search(upper_name) is not None
                or any(term in upper_name for term in ('CALL', 'PUT')))

    ohlcv_terms = ('OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOLUME')
    index_terms = ('NIFTY', 'SENSEX', 'BANK')
    scaler_suffixes = ('_minmax', '_robust', '_z', '_std')

    # Equity: an OHLCV name that carries neither a futures nor an option tag.
    equity_count = sum(
        1 for upper_name in upper_names
        if any(term in upper_name for term in ohlcv_terms)
        and 'FUT' not in upper_name
        and option_tag.search(upper_name) is None
    )
    futures_count = sum(1 for upper_name in upper_names if 'FUT' in upper_name)
    options_count = sum(1 for upper_name in upper_names if _is_option(upper_name))
    index_count = sum(1 for upper_name in upper_names
                      if any(term in upper_name for term in index_terms))
    # Scaler suffixes are matched case-sensitively on the original name.
    scaled_count = sum(1 for name in names
                       if any(pattern in name for pattern in scaler_suffixes))

    print(f"Equity-related columns: {equity_count}")
    print(f"Futures-related columns: {futures_count}")
    print(f"Options-related columns: {options_count}")
    print(f"Index-related columns: {index_count}")
    print(f"Scaled columns: {scaled_count}")

    # Memory usage analysis
    memory_usage = data.memory_usage(deep=True).sum() / (1024**2)  # MB
    print(f"Memory usage: {memory_usage:.2f} MB")

    return {
        'equity_count': equity_count,
        'futures_count': futures_count,
        'options_count': options_count,
        'index_count': index_count,
        'scaled_count': scaled_count,
        'memory_mb': memory_usage
    }

def optimize_for_colab(data, max_features=200, target_col='Close', corr_threshold=0.95):
    """Reduce the number of columns so the dataset fits Colab memory limits.

    Two reductions are applied in order:

    1. For every pair of numeric columns whose absolute correlation exceeds
       ``corr_threshold``, the column that appears later in column order is
       dropped.
    2. If more than ``max_features`` columns remain, only the ``max_features``
       numeric columns with the largest variance are kept (non-numeric
       columns are discarded by this step).

    ``target_col`` is never removed by either step when it is present.
    Previously the correlation step could drop it (a close price is usually
    almost perfectly correlated with open/high/low), leaving nothing for the
    variance step to preserve.

    Args:
        data: Input DataFrame.
        max_features: Maximum number of variance-ranked columns to keep.
        target_col: Name of the prediction target that must survive.
        corr_threshold: Absolute correlation above which the later column of
            a pair is considered redundant.

    Returns:
        The reduced DataFrame (the input itself is not modified).
    """
    print(f"Optimizing dataset - Original shape: {data.shape}")

    # Remove highly correlated features. Only the strict upper triangle is
    # inspected so each pair is examined once and the earlier column wins.
    correlation_matrix = data.select_dtypes(include=[np.number]).corr().abs()
    upper_triangle = correlation_matrix.where(
        np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
    )

    # NaN correlations compare as False, so they never trigger a drop.
    high_corr_features = [
        column for column in upper_triangle.columns
        if column != target_col and (upper_triangle[column] > corr_threshold).any()
    ]

    if high_corr_features:
        print(f"Removing {len(high_corr_features)} highly correlated features")
        data = data.drop(columns=high_corr_features)

    # If still too many features, select based on variance
    if data.shape[1] > max_features:
        numeric_data = data.select_dtypes(include=[np.number])
        variances = numeric_data.var().sort_values(ascending=False)
        selected_features = variances.head(max_features).index.tolist()

        # Always keep the target column if it exists
        if target_col in data.columns and target_col not in selected_features:
            selected_features.append(target_col)

        data = data[selected_features]
        print(f"Reduced to top {len(selected_features)} features by variance")

    print(f"Optimized shape: {data.shape}")
    return data

# Enhanced main function with data analysis
def main_with_analysis():
    """Run the multi-asset pipeline with data analysis and a results export.

    Steps, in order: load the train/test parquet files, print a column and
    memory analysis for both, optionally shrink the feature set when the
    training frame exceeds 8000 MB, hold out the last 20% of training rows
    for validation, train all models, derive ensemble weights on the
    validation rows, predict on the test set, evaluate the ensemble and each
    individual model, plot, and write a CSV of the predictions.

    Returns:
        tuple: (predictor, ensemble_predictions, test_targets, results_df,
        individual_metrics)
    """
    print("="*60)
    print("MULTI-ASSET STOCK PREDICTION SYSTEM")
    print("Equity | Futures | Options | NSE Indices")
    print("="*60)

    # Initialize predictor
    predictor = MemoryEfficientStockPredictor(look_back=60)

    # Load data using secure configuration
    config = Config()
    train_path = config.get_model_save_path() + 'combined_training_data.parquet'
    test_path = config.get_model_save_path() + 'batch_3.parquet'

    predictor.load_data(train_path, test_path)

    # Analyze data distribution
    print("\nTRAINING DATA ANALYSIS:")
    train_analysis = analyze_data_distribution(predictor.train_data)

    print("\nTEST DATA ANALYSIS:")
    analyze_data_distribution(predictor.test_data)

    # Optimize for Colab if needed
    if train_analysis['memory_mb'] > 8000:  # If > 8GB
        print("\nOptimizing data for Colab memory constraints...")
        predictor.train_data = optimize_for_colab(predictor.train_data)

        # Feature selection is data dependent, so running it separately on
        # the test set could keep a different set of columns than the one
        # the models are trained on. Reuse the training selection instead.
        kept_columns = [col for col in predictor.train_data.columns
                        if col in predictor.test_data.columns]
        missing_in_test = len(predictor.train_data.columns) - len(kept_columns)
        if missing_in_test:
            print(f"Warning: {missing_in_test} selected training columns are missing from the test data")
        predictor.test_data = predictor.test_data[kept_columns]
        print(f"Test data aligned to training features: {predictor.test_data.shape}")

    # Continue with the regular training process.
    # Chronological split: the most recent 20% of rows become validation data.
    train_size = int(0.8 * len(predictor.train_data))
    validation_data = predictor.train_data[train_size:].copy()
    predictor.train_data = predictor.train_data[:train_size].copy()

    print("\nData Split:")
    print(f"Training: {len(predictor.train_data)} samples")
    print(f"Validation: {len(validation_data)} samples")
    print(f"Test: {len(predictor.test_data)} samples")

    # Train models
    print("\n" + "="*50)
    print("TRAINING HYBRID ENSEMBLE MODELS")
    print("="*50)
    predictor.train_all_models()

    # Calculate ensemble weights
    print("\n" + "="*50)
    print("OPTIMIZING ENSEMBLE WEIGHTS")
    print("="*50)
    predictor.calculate_model_weights(validation_data)

    # Make predictions
    print("\n" + "="*50)
    print("GENERATING PREDICTIONS")
    print("="*50)
    ensemble_predictions = predictor.make_ensemble_predictions(predictor.test_data)

    # Evaluation
    test_data_prepared = predictor.prepare_features(predictor.test_data.copy())

    # Smart target column detection: fall back to the first column whose
    # name contains "close" when there is no exact 'Close' column.
    target_col = 'Close'
    if target_col not in test_data_prepared.columns:
        close_cols = [col for col in test_data_prepared.columns if 'close' in col.lower()]
        if close_cols:
            target_col = close_cols[0]

    # Align targets with the predictions by taking the trailing rows.
    test_targets = test_data_prepared[target_col].values[-len(ensemble_predictions):]

    print("\n" + "="*50)
    print("PERFORMANCE EVALUATION")
    print("="*50)

    # Ensemble evaluation
    ensemble_metrics = predictor.evaluate_predictions(test_targets, ensemble_predictions, "Hybrid Ensemble")

    # Individual model evaluation (models with too few predictions are skipped)
    individual_metrics = {}
    print("\nIndividual Model Performance:")
    for model_name, pred in predictor.predictions.items():
        if len(pred) >= len(test_targets):
            aligned_pred = pred[-len(test_targets):]
            individual_metrics[model_name] = predictor.evaluate_predictions(
                test_targets, aligned_pred, model_name
            )

    # Generate comprehensive plots
    predictor.plot_predictions(test_targets, ensemble_predictions,
                             "Multi-Asset Hybrid Ensemble Prediction")

    # Final summary
    print("\n" + "="*60)
    print("FINAL PERFORMANCE SUMMARY")
    print("="*60)

    # min() raises ValueError on an empty mapping, which happens when no
    # individual model produced enough predictions to be evaluated.
    best_rmse = None
    if individual_metrics:
        best_individual = min(individual_metrics.items(), key=lambda x: x[1]['RMSE'])
        best_rmse = best_individual[1]['RMSE']
        print(f"Best Individual Model: {best_individual[0]} (RMSE: {best_rmse:.6f})")
    else:
        print("Best Individual Model: n/a (no individual model could be evaluated)")

    print(f"Ensemble RMSE: {ensemble_metrics['RMSE']:.6f}")

    # The relative improvement is undefined for a perfect (zero-RMSE) model.
    if best_rmse is not None and best_rmse > 0:
        improvement = ((best_rmse - ensemble_metrics['RMSE']) / best_rmse) * 100
        print(f"Improvement over best individual: {improvement:.2f}%")

    print(f"Ensemble R²: {ensemble_metrics['R2']:.6f}")

    # Save comprehensive results
    results_df = pd.DataFrame({
        'Actual': test_targets,
        'Ensemble': ensemble_predictions
    })

    for model_name, pred in predictor.predictions.items():
        if len(pred) >= len(test_targets):
            results_df[model_name] = pred[-len(test_targets):]

    # Saving is best-effort: a failure is reported (with its cause) but must
    # not discard the results that were just computed.
    try:
        results_path = config.get_model_save_path() + 'multi_asset_predictions.csv'
        results_df.to_csv(results_path, index=False)
        print(f"\nDetailed results saved to: {results_path}")
    except Exception as exc:
        print(f"Could not save results to drive: {exc}")

    return predictor, ensemble_predictions, test_targets, results_df, individual_metrics

# Script entry point: runs the basic pipeline through main() and keeps its
# three return values as module-level names. main_with_analysis() is an
# alternative entry point defined above that is not invoked here.
if __name__ == "__main__":
    predictor, predictions, targets = main()

In [None]:
# =====================================
# 📊 ML Data Validation Framework
# =====================================

# The required_columns annotation is quoted so that defining this function
# does not depend on typing.List having been imported.
def validate_ml_dataframe(df: pd.DataFrame, required_columns: "list[str] | None" = None,
                         target_column: str = 'Close') -> pd.DataFrame:
    """
    Validate DataFrame for ML training with comprehensive checks.

    The input frame is never modified; all clean-up is done on a copy, which
    is what gets returned. The clean-up steps are:

    * resolve the target column (falling back to the first column whose name
      contains "close" when ``target_column`` is absent),
    * report missing ``required_columns`` (they are not synthesised: an
      all-NaN placeholder would be removed again by the next step),
    * drop columns that are entirely NaN,
    * drop numeric columns holding at most one distinct non-NaN value, except
      the target column,
    * replace +/-inf with NaN in numeric columns,
    * cap target values lying more than 10 standard deviations from the mean.

    Args:
        df: Input DataFrame
        required_columns: List of required column names
        target_column: Name of target column

    Returns:
        Validated DataFrame ready for ML processing

    Raises:
        ValueError: If the frame is None/empty, no target column can be
            found, or the target column contains only NaN values.
    """
    if df is None or df.empty:
        raise ValueError("Cannot train models on empty DataFrame")

    print(f"🔍 Starting ML data validation for {len(df)} records...")

    # Work on a copy: the steps below replace and cap values, and the caller's
    # frame must not change as a side effect of validation.
    df = df.copy()

    # Basic structure validation
    if len(df) < 100:
        print(f"⚠️ Warning: Dataset has only {len(df)} records, which may be insufficient for training")

    # Check for target column
    if target_column not in df.columns:
        # Try to find suitable target
        close_cols = [col for col in df.columns if 'close' in col.lower()]
        if close_cols:
            target_column = close_cols[0]
            print(f"🎯 Using {target_column} as target column")
        else:
            raise ValueError(f"Target column '{target_column}' not found and no suitable alternative")

    # Validate target column
    if df[target_column].isna().all():
        raise ValueError(f"Target column '{target_column}' contains only NaN values")

    # Check for sufficient non-NaN target values
    valid_target_count = df[target_column].notna().sum()
    if valid_target_count < len(df) * 0.7:  # Less than 70% valid targets
        print(f"⚠️ Warning: Only {valid_target_count}/{len(df)} ({valid_target_count/len(df)*100:.1f}%) target values are valid")

    # Validate required columns if specified (report only, see docstring)
    if required_columns:
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            print(f"⚠️ Missing required columns: {missing_cols}")

    # Remove columns with all NaN values
    all_nan_cols = df.columns[df.isna().all()].tolist()
    if all_nan_cols:
        print(f"Removing {len(all_nan_cols)} columns with all NaN values")
        df = df.drop(columns=all_nan_cols)

    # Remove columns with single unique value (excluding NaN). The target is
    # excluded from the candidates rather than used as an all-or-nothing
    # guard, so a constant target no longer prevents other constant columns
    # from being removed.
    distinct_counts = df.select_dtypes(include=[np.number]).nunique()
    single_value_cols = [
        col for col, count in distinct_counts.items()
        if count <= 1 and col != target_column
    ]

    if single_value_cols:
        print(f"Removing {len(single_value_cols)} columns with single values: {single_value_cols[:5]}{'...' if len(single_value_cols) > 5 else ''}")
        df = df.drop(columns=single_value_cols)

    # Handle infinite values
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    inf_counts = {}
    for col in numeric_cols:
        inf_count = np.isinf(df[col]).sum()
        if inf_count > 0:
            inf_counts[col] = inf_count
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)

    if inf_counts:
        print(f"Replaced infinite values in {len(inf_counts)} columns")

    # Check for extreme outliers in target
    target_mean = df[target_column].mean()
    target_std = df[target_column].std()

    # A NaN or zero standard deviation (single row / constant target) means
    # z-scores are undefined, so the capping step is skipped.
    if target_std > 0:
        z_scores = ((df[target_column] - target_mean) / target_std).abs()
        extreme_outliers = int((z_scores > 10).sum())

        if extreme_outliers > 0:
            print(f"⚠️ Found {extreme_outliers} extreme outliers in target column (>10 std dev)")
            # Cap outliers instead of removing them: values beyond
            # mean +/- 10 std are pulled back to that boundary.
            df[target_column] = df[target_column].clip(
                lower=target_mean - 10 * target_std,
                upper=target_mean + 10 * target_std,
            )

    print(f"✅ ML data validation completed: {len(df)} records, {len(df.columns)} features")
    return df

def validate_ml_features(X, y, feature_names=None):
    """
    Validate feature matrix and target vector for ML training.

    Both inputs are converted to numpy arrays first, so lists, numpy arrays
    and pandas objects are accepted. The clean-up then:

    * drops feature columns that are more than 50% NaN,
    * fills the remaining NaN feature values with the column median
      (scikit-learn ``SimpleImputer``),
    * drops samples whose target is NaN,
    * drops feature columns with zero standard deviation.

    Args:
        X: Feature matrix
        y: Target vector
        feature_names: List of feature names (optional). Any sequence is
            accepted, including numpy arrays and pandas Index objects.

    Returns:
        Validated X, y, and feature_names. ``feature_names`` is returned as a
        list whenever columns were removed, otherwise unchanged.

    Raises:
        ValueError: If X and y differ in length, X is empty, every sample has
            a NaN target, or no feature column survives validation.
    """
    # Convert to numpy if needed. This happens before anything reads
    # ``.shape`` so that plain Python lists are supported as well.
    if hasattr(X, 'values'):
        X = X.values
    if hasattr(y, 'values'):
        y = y.values

    X = np.array(X)
    y = np.array(y)

    print(f"🔍 Validating ML features: X{X.shape}, y{y.shape}")

    # Basic shape validation
    if len(X) != len(y):
        raise ValueError(f"Feature matrix length ({len(X)}) doesn't match target length ({len(y)})")

    if len(X) == 0:
        raise ValueError("Empty feature matrix")

    # Check for NaN values
    nan_features = np.isnan(X).any(axis=0)
    nan_target = np.isnan(y)

    if nan_features.any():
        nan_feature_count = nan_features.sum()
        print(f"⚠️ Found NaN values in {nan_feature_count} features")

        # Remove features that are mostly NaN
        nan_ratio = np.isnan(X).mean(axis=0)
        high_nan_features = nan_ratio > 0.5

        if high_nan_features.any():
            print(f"Removing {high_nan_features.sum()} features with >50% NaN values")
            X = X[:, ~high_nan_features]
            # ``is not None`` instead of truthiness: arrays and Index objects
            # raise on bool(), and an empty list needs no filtering anyway.
            if feature_names is not None:
                feature_names = [name for name, drop in zip(feature_names, high_nan_features) if not drop]

        # Impute remaining NaN values
        from sklearn.impute import SimpleImputer
        imputer = SimpleImputer(strategy='median')
        X = imputer.fit_transform(X)

    if nan_target.any():
        valid_mask = ~nan_target
        print(f"Removing {nan_target.sum()} samples with NaN target values")
        X = X[valid_mask]
        y = y[valid_mask]

        # Without this check an empty matrix would be returned silently.
        if len(X) == 0:
            raise ValueError("No valid samples remaining after removing NaN targets")

    # Check for constant features
    feature_std = np.std(X, axis=0)
    constant_features = feature_std == 0

    if constant_features.any():
        print(f"Removing {constant_features.sum()} constant features")
        X = X[:, ~constant_features]
        if feature_names is not None:
            feature_names = [name for name, drop in zip(feature_names, constant_features) if not drop]

    # Final validation
    if X.shape[1] == 0:
        raise ValueError("No valid features remaining after validation")

    print(f"✅ Feature validation completed: X{X.shape}, y{y.shape}")
    return X, y, feature_names

def safe_model_training(model_func, X, y, *args, **kwargs):
    """
    Run a training callable behind input validation and a catch-all guard.

    The inputs are first passed through validate_ml_features; the cleaned
    arrays (not the originals) are what model_func receives.

    Args:
        model_func: Callable invoked as model_func(X, y, *args, **kwargs)
        X: Feature matrix
        y: Target vector
        *args, **kwargs: Extra arguments forwarded unchanged to model_func

    Returns:
        Whatever model_func returns, or None when validation or training
        raises any Exception (the error text is printed, not re-raised).
    """
    try:
        # Feature names are not tracked here, so the third result is unused.
        clean_X, clean_y, _names = validate_ml_features(X, y)

        sample_count = len(clean_X)
        if sample_count < 50:
            print(f"⚠️ Warning: Training with only {sample_count} samples")

        trained = model_func(clean_X, clean_y, *args, **kwargs)
        print("✅ Model training completed successfully")
        return trained

    except Exception as exc:
        # Deliberate best-effort boundary: report and signal failure via None.
        print(f"❌ Model training failed: {exc}")
        return None

# Printed once, after the validation helpers above have been defined.
print("✅ ML data validation utilities loaded successfully!")
