In [1]:
import pandas as pd
import numpy as np
import logging
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from typing import Dict, List, Tuple
import ta

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
class EnhancedFinancialPreprocessor:
    def __init__(self, sequence_length: int = 20):
        """
        Initialize preprocessor with your dataset's fields plus additional features
        """
        self.sequence_length = sequence_length
        self.scalers = {}
        
        # Feature groups based on your actual data fields
        self.feature_groups = {
            'price': [
                'Open', 'High', 'Low', 'Close', 'Adj Close',
                'Returns', 'True_Range', 'ATR'
            ],
            
            'moving_averages': [
                'MA20',
                'MA50',  # Will be calculated
                'MA200', # Will be calculated
                'EMA20', # Will be calculated
            ],
            
            'volatility': [
                '20dSTD',
                'Upper_Band',
                'Lower_Band',
                'BB_Width',    # Will be calculated
                'Volatility',
                'ATR',         # Will be calculated
            ],
            
            'momentum': [
                'RSI',
                'MACD',
                'Signal_Line',
                'MACD_Histogram',  # Will be calculated
                'ROC',            # Will be calculated
                'Momentum'        # Will be calculated
            ],
            
            'volume': [
                'Volume',
                'Volume_MA',
                'Volume_Ratio',
                'OBV'            # Will be calculated
            ],
            
            'events': [
                'Dividends',
                'Stock Splits'
            ]
        }
        
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
    def calculate_additional_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate additional technical indicators not in original data
        """
        try:
            df = df.copy()
            
            # 1. Additional Moving Averages
            df['MA50'] = df['Close'].rolling(window=50).mean()
            df['MA200'] = df['Close'].rolling(window=200).mean()
            df['EMA20'] = df['Close'].ewm(span=20, adjust=False).mean()
            
            # 2. Bollinger Band Width
            df['BB_Width'] = (df['Upper_Band'] - df['Lower_Band']) / df['MA20']
            
            # 3. Average True Range (ATR)
            high_low = df['High'] - df['Low']
            high_close = np.abs(df['High'] - df['Close'].shift())
            low_close = np.abs(df['Low'] - df['Close'].shift())
            df['True_Range'] = np.maximum(high_low, np.maximum(high_close, low_close))
            df['ATR'] = df['True_Range'].rolling(window=14).mean()
            
            # 4. MACD Histogram
            df['MACD_Histogram'] = df['MACD'] - df['Signal_Line']
            
            # 5. Rate of Change (ROC)
            df['ROC'] = ((df['Close'] - df['Close'].shift(10)) / 
                        df['Close'].shift(10)) * 100
            
            # 6. Momentum
            df['Momentum'] = df['Close'] - df['Close'].shift(10)
            
            # 7. On Balance Volume (OBV)
            df['OBV'] = np.where(df['Close'] > df['Close'].shift(),
                                df['Volume'],
                                np.where(df['Close'] < df['Close'].shift(),
                                        -df['Volume'], 0)).cumsum()
            
            return df
            
        except Exception as e:
            self.logger.error(f"Error calculating additional features: {str(e)}")
            raise
    
    def normalize_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Normalize features based on their characteristics
        """
        df = df.copy()
        
        # Price normalization
        price_features = self.feature_groups['price']
        self.scalers['price'] = MinMaxScaler()
        df[price_features] = self.scalers['price'].fit_transform(df[price_features])
        
        # Volume normalization (log transform then scale)
        volume_features = ['Volume', 'Volume_MA', 'OBV']
        df[volume_features] = np.log1p(df[volume_features])
        self.scalers['volume'] = MinMaxScaler()
        df[volume_features] = self.scalers['volume'].fit_transform(df[volume_features])
        
        # Note: RSI, MACD, etc. are already normalized
        
        return df
    
    def prepare_data(self, df: pd.DataFrame, train_split: float = 0.8) -> Dict:
        """
        Complete data preparation pipeline
        """
        try:
            # 1. Ensure date is index

            df['Date'] = pd.to_datetime(df['Date'])
            df = df.set_index('Date')
            
            # 2. Calculate additional features
            df = self.calculate_additional_features(df)
            
            # 3. Handle missing values
            df = self.handle_missing_values(df)
            
            # 4. Normalize features
            df = self.normalize_features(df)
            
            # 5. Create sequences
            X, y = self.create_sequences(df)
            
            # 6. Train/validation split
            split_idx = int(len(X) * train_split)
            
            return {
                'train': {
                    'X': X[:split_idx],
                    'y': y[:split_idx]
                },
                'val': {
                    'X': X[split_idx:],
                    'y': y[split_idx:]
                },
                'scalers': self.scalers,
                'feature_groups': self.feature_groups
            }
            
        except Exception as e:
            self.logger.error(f"Error in data preparation: {str(e)}")
            raise
            
    def handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Handle missing values based on feature type
        """
        df = df.copy()
        
        # Forward fill price and MA data
        price_ma_features = (self.feature_groups['price'] + 
                           self.feature_groups['moving_averages'])
        df[price_ma_features] = df[price_ma_features].fillna(method='ffill')
        
        # Zero fill volume data
        df[self.feature_groups['volume']] = df[self.feature_groups['volume']].fillna(0)
        
        # Forward fill other indicators
        df = df.fillna(method='ffill')
        
        return df
    
    def create_sequences(self, df: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """
        Create sequences for CNN-LSTM model
        """
        # Get all features except events
        features = [feat for group, feats in self.feature_groups.items() 
                   if group != 'events' for feat in feats]
        
        # Create sequences
        X, y = [], []
        for i in range(len(df) - self.sequence_length):
            sequence = df[features].iloc[i:(i + self.sequence_length)]
            target = df['Returns'].iloc[i + self.sequence_length]
            
            X.append(sequence.values)
            y.append(target)
            
        return np.array(X), np.array(y)



In [13]:
def load_and_analyze_data():
    """
    Load and analyze the SP500 master data
    """
    try:
        # Load the data
        logger.info("Loading SP500 master data...")
        df = pd.read_csv('sp500_master_data.csv')
        
        # Convert date to datetime and handle timezone
        df['Date'] = pd.to_datetime(df['Date']).dt.tz_localize(None)
        
        # Basic data analysis
        logger.info("\nDataset Overview:")
        logger.info(f"Total rows: {len(df)}")
        logger.info(f"Total columns: {len(df.columns)}")
        logger.info(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
        logger.info(f"Number of unique stocks: {df['Symbol'].nunique()}")
        
        return df
        
    except Exception as e:
        logger.error(f"Error loading data: {str(e)}")
        raise

def prepare_stock_data(df: pd.DataFrame, symbol: str = None):
    """
    Prepare data for a single stock or all stocks
    """
    try:
        # Filter for specific stock if provided
        if symbol:
            df = df[df['Symbol'] == symbol].copy()
            logger.info(f"\nPreparing data for {symbol}")
            
            if len(df) == 0:
                raise ValueError(f"No data found for symbol {symbol}")
        
        # Ensure date is timezone-naive
        df['Date'] = pd.to_datetime(df['Date'])
        
        # Sort by date
        df = df.sort_values('Date')
        
        # Initialize preprocessor
        preprocessor = EnhancedFinancialPreprocessor(sequence_length=20)
        
        # Prepare data
        prepared_data = preprocessor.prepare_data(df)
        
        # Log preparation summary
        logger.info("\nData Preparation Summary:")
        logger.info(f"Training sequences shape: {prepared_data['train']['X'].shape}")
        logger.info(f"Validation sequences shape: {prepared_data['val']['X'].shape}")
        
        return prepared_data
        
    except Exception as e:
        logger.error(f"Error preparing data: {str(e)}")
        raise

def visualize_features(df: pd.DataFrame, symbol: str):
    """
    Create visualization of key features for a stock
    """
    try:
        # Filter data for the specified stock
        stock_data = df[df['Symbol'] == symbol].copy()
        
        if len(stock_data) == 0:
            raise ValueError(f"No data found for symbol {symbol}")
            
        stock_data['Date'] = pd.to_datetime(stock_data['Date']).dt.tz_localize(None)
        
        # Create visualization
        fig, axes = plt.subplots(3, 1, figsize=(15, 12))
        
        # Price and Moving Averages
        axes[0].plot(stock_data['Date'], stock_data['Close'], label='Close')
        axes[0].plot(stock_data['Date'], stock_data['MA20'], label='MA20')
        axes[0].set_title(f'{symbol} - Price and Moving Averages')
        axes[0].legend()
        
        # Technical Indicators
        axes[1].plot(stock_data['Date'], stock_data['RSI'], label='RSI')
        axes[1].plot(stock_data['Date'], stock_data['MACD'], label='MACD')
        axes[1].set_title('Technical Indicators')
        axes[1].legend()
        
        # Volume
        axes[2].bar(stock_data['Date'], stock_data['Volume'])
        axes[2].set_title('Volume')
        
        plt.tight_layout()
        plt.show()
        
    except Exception as e:
        logger.error(f"Error creating visualization: {str(e)}")
        raise
    """
    Create visualization of key features for a stock
    """
    try:
        # Filter data for the specified stock
        stock_data = df[df['Symbol'] == Symbol].copy()
        stock_data['Date'] = pd.to_datetime(stock_data['Date'])
        
        # Create visualization
        fig, axes = plt.subplots(3, 1, figsize=(15, 12))
        
        # Price and Moving Averages
        axes[0].plot(stock_data['Date'], stock_data['Close'], label='Close')
        axes[0].plot(stock_data['Date'], stock_data['MA_20'], label='MA_20')
        axes[0].plot(stock_data['Date'], stock_data['MA_50'], label='MA_50')
        axes[0].set_title(f'{Symbol} - Price and Moving Averages')
        axes[0].legend()
        
        # Technical Indicators
        axes[1].plot(stock_data['Date'], stock_data['RSI_14'], label='RSI_14')
        axes[1].plot(stock_data['Date'], stock_data['MACD'], label='MACD')
        axes[1].set_title('Technical Indicators')
        axes[1].legend()
        
        # Volume
        axes[2].bar(stock_data['Date'], stock_data['Volume'])
        axes[2].set_title('Volume')
        
        plt.tight_layout()
        plt.show()
        
    except Exception as e:
        logger.error(f"Error creating visualization: {str(e)}")
        raise


In [14]:
# Main execution
if __name__ == "__main__":
    try:
        # Load and analyze data
        df = load_and_analyze_data()
        
        # Example: Prepare data for a single stock (e.g., AAPL)
        aapl_data = prepare_stock_data(df, 'AAPL')
        
        # Visualize features
        visualize_features(df, 'AAPL')
        
        # Optional: Prepare data for all stocks
        # all_stocks_data = prepare_stock_data(df)
        
    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise


INFO:__main__:Loading SP500 master data...
  df['Date'] = pd.to_datetime(df['Date']).dt.tz_localize(None)
ERROR:__main__:Error loading data: Can only use .dt accessor with datetimelike values
ERROR:__main__:Error in main execution: Can only use .dt accessor with datetimelike values


AttributeError: Can only use .dt accessor with datetimelike values