# Data Preparation for LSTM Models

This notebook prepares stock data for LSTM models by:
1. Loading clean data from `../data/cleaned/stock.csv`
2. Adding technical indicators relevant for prediction
3. Creating target variables for different prediction periods (day, week, month)
4. Selecting features based on correlation with targets
5. Preparing and exporting data to `../data/lstm/period/stock_lstm_period.csv`

## 1. Import Libraries and Setup

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# Define the list of stocks to process
stocks = [
    'AAPL', 'MSFT', 'GOOG', 'AMZN', 'TSLA', 'META', 'NVDA', 'SPY', 'V', 'DIS',
    'NFLX', 'PYPL', 'BABA', 'IBM', 'AMD', 'BA', 'INTC', 'T', 'GS', 'NKE'
]

# Create output directories
for period in ['day', 'week', 'month']:
    os.makedirs(f'../data/lstm/{period}', exist_ok=True)
print("Output directories created.")

## 2. Data Loading Function

In [None]:
def load_data(stock):
    """Load stock data from cleaned files"""
    file_path = f'../data/cleaned/{stock}.csv'
    
    if not os.path.exists(file_path):
        print(f"Warning: File {file_path} does not exist. Skipping.")
        return None
    
    df = pd.read_csv(file_path)
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)
    
    return df

# Test with a sample stock
sample_stock = 'AAPL'
sample_df = load_data(sample_stock)

if sample_df is not None:
    print(f"Loaded {sample_stock} data with shape: {sample_df.shape}")
    print(f"Columns: {', '.join(sample_df.columns[:5])}...")
    print(f"Date range: {sample_df.index.min()} to {sample_df.index.max()}")

## 3. Add Technical Indicators

In [None]:
def add_technical_indicators(df):
    """Add technical indicators useful for LSTM prediction"""
    df_new = df.copy()
    
    # Bollinger Bands Percentage
    if all(col in df_new.columns for col in ['BB_Upper', 'BB_Lower']):
        df_new['BB_Percentage'] = (df_new['Close'] - df_new['BB_Lower']) / (df_new['BB_Upper'] - df_new['BB_Lower'])
    
    # Rate of Change (ROC)
    for period in [5, 10, 20]:
        df_new[f'ROC_{period}'] = df_new['Close'].pct_change(periods=period) * 100
    
    # True Range and ATR
    df_new['TR'] = np.maximum(
        df_new['High'] - df_new['Low'],
        np.maximum(
            abs(df_new['High'] - df_new['Close'].shift(1)),
            abs(df_new['Low'] - df_new['Close'].shift(1))
        )
    )
    df_new['ATR_14'] = df_new['TR'].rolling(window=14).mean()
    
    # Price momentum and acceleration
    df_new['Momentum_5'] = df_new['Close'] - df_new['Close'].shift(5)
    df_new['Momentum_10'] = df_new['Close'] - df_new['Close'].shift(10)
    df_new['Acceleration_5'] = df_new['Momentum_5'] - df_new['Momentum_5'].shift(5)
    
    # Log returns
    df_new['Log_Return_1D'] = np.log(df_new['Close'] / df_new['Close'].shift(1))
    
    # Drop NaN values
    df_new = df_new.dropna()
    
    return df_new

# Test with sample data
if sample_df is not None:
    enhanced_df = add_technical_indicators(sample_df)
    new_columns = set(enhanced_df.columns) - set(sample_df.columns)
    print(f"Added {len(new_columns)} new technical indicators")
    print(f"New shape: {enhanced_df.shape}")

## 4. Create Target Variables

In [59]:
def create_target_variables(df):
    """Create target variables for different prediction periods"""
    df_new = df.copy()
    
    # Next day price direction (1 if up, 0 if down)
    df_new['Target_Next_Day'] = (df_new['Close'].shift(-1) > df_new['Close']).astype(int)
    
    # Next week average price direction
    next_week_mean = df_new['Close'].rolling(window=5).mean().shift(-5)
    df_new['Target_Next_Week'] = (next_week_mean > df_new['Close']).astype(int)
    
    # Next month average price direction
    next_month_mean = df_new['Close'].rolling(window=21).mean().shift(-21)
    df_new['Target_Next_Month'] = (next_month_mean > df_new['Close']).astype(int)
    
    # Drop rows with NaN target values
    df_new = df_new.dropna(subset=['Target_Next_Day', 'Target_Next_Week', 'Target_Next_Month'])
    
    return df_new

# Test with enhanced data
if 'enhanced_df' in locals():
    target_df = create_target_variables(enhanced_df)
    print("Target class distribution:")
    for col in ['Target_Next_Day', 'Target_Next_Week', 'Target_Next_Month']:
        up_pct = target_df[col].mean() * 100
        print(f"  {col}: {up_pct:.1f}% Up, {100-up_pct:.1f}% Down")

Target class distribution:
  Target_Next_Day: 52.9% Up, 47.1% Down
  Target_Next_Week: 57.3% Up, 42.7% Down
  Target_Next_Month: 61.3% Up, 38.7% Down


## 5. Feature Selection Based on Correlation

In [60]:
def select_features(df, target_column, threshold=0.8, verbose=False):
    """Select features based on correlation with target and remove highly correlated features"""
    # Calculate correlation matrix
    corr_matrix = df.corr(numeric_only=True)
    
    # Get target correlations
    target_corr = corr_matrix[target_column].drop(target_column)
    
    # Find highly correlated feature pairs
    high_corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            if abs(corr_matrix.iloc[i, j]) >= threshold:
                col1, col2 = corr_matrix.columns[i], corr_matrix.columns[j]
                # Skip if either column is the target
                if col1 == target_column or col2 == target_column:
                    continue
                high_corr_pairs.append((col1, col2, abs(corr_matrix.iloc[i, j])))
    
    if verbose and high_corr_pairs:
        print(f"Found {len(high_corr_pairs)} feature pairs with correlation >= {threshold}")
    
    # Sort features by absolute correlation with target
    sorted_features = target_corr.abs().sort_values(ascending=False)
    
    # For each pair of highly correlated features, keep the one with higher correlation to target
    removed_features = set()
    for col1, col2, _ in high_corr_pairs:
        if col1 in removed_features or col2 in removed_features:
            continue
            
        # Keep the feature with higher correlation to target
        corr1 = abs(target_corr.get(col1, 0))
        corr2 = abs(target_corr.get(col2, 0))
        
        if corr1 >= corr2:
            removed_features.add(col2)
        else:
            removed_features.add(col1)
    
    # Select features not in removed_features
    selected_features = [col for col in sorted_features.index if col not in removed_features]
    
    # Add essential price and volume features if not already included
    essential_features = ['Open', 'High', 'Low', 'Close', 'Volume']
    for feature in essential_features:
        if feature in df.columns and feature not in selected_features and feature not in removed_features:
            selected_features.append(feature)
    
    if verbose:
        print(f"Selected {len(selected_features)} features, removed {len(removed_features)} highly correlated features")
    
    return selected_features

# Test feature selection
if 'target_df' in locals():
    selected_features_day = select_features(target_df, 'Target_Next_Day', verbose=True)
    print(f"Top 5 features: {', '.join(selected_features_day[:5])}")

Found 108 feature pairs with correlation >= 0.8
Selected 22 features, removed 24 highly correlated features
Top 5 features: Target_1D, Target_Next_Week, Target_1W, Target_Next_Month, Target_1M


## 6. Prepare Data for LSTM

In [61]:
def prepare_data_for_lstm(df, feature_cols, target_col):
    """Prepare data for LSTM model by scaling features and adding date"""
    # Select only needed columns
    df_selected = df[feature_cols + [target_col]].copy()
    
    # Scale features (keeping the target as is)
    scaler = MinMaxScaler()
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df_selected[feature_cols]),
        columns=feature_cols,
        index=df_selected.index
    )
    
    # Add target column back
    df_scaled[target_col] = df_selected[target_col]
    
    # Add date column for reference
    df_scaled['date'] = df_scaled.index
    
    return df_scaled

# Test data preparation
if 'target_df' in locals() and 'selected_features_day' in locals():
    prepared_day = prepare_data_for_lstm(target_df, selected_features_day, 'Target_Next_Day')
    print(f"Prepared data shape: {prepared_day.shape}")
    print(f"Columns: {prepared_day.columns[:5].tolist()}...")

Prepared data shape: (3645, 24)
Columns: ['Target_1D', 'Target_Next_Week', 'Target_1W', 'Target_Next_Month', 'Target_1M']...


## 7. Main Processing Function

In [62]:
def process_stock(stock, verbose=False):
    """Process a single stock for all prediction periods"""
    print(f"Processing {stock}...")
    
    # Load data
    df = load_data(stock)
    if df is None:
        return
    
    # Add technical indicators
    df_enhanced = add_technical_indicators(df)
    
    # Create target variables
    df_targets = create_target_variables(df_enhanced)
    
    # Process for each prediction period
    periods = {
        'day': 'Target_Next_Day',
        'week': 'Target_Next_Week',
        'month': 'Target_Next_Month'
    }
    
    for period, target_col in periods.items():
        # Select features based on correlation
        selected_features = select_features(df_targets, target_col, verbose=verbose)
        
        # Prepare data for LSTM
        prepared_data = prepare_data_for_lstm(df_targets, selected_features, target_col)
        
        # Export prepared data
        output_path = f'../data/lstm/{period}/{stock}_lstm_{period}.csv'
        prepared_data.to_csv(output_path, index=False)
        
        print(f"  {period.capitalize()}: {prepared_data.shape[0]} records, {len(selected_features)} features")

## 8. Process All Stocks

In [None]:
# Process all stocks
for stock in stocks:
    process_stock(stock)

## 9. Data Quality Check

In [64]:
def check_data_quality(stock, period):
    """Check the quality of prepared data"""
    file_path = f'../data/lstm/{period}/{stock}_lstm_{period}.csv'
    
    if not os.path.exists(file_path):
        print(f"Warning: File {file_path} does not exist.")
        return None
    
    # Load data
    df = pd.read_csv(file_path)
    
    # Basic checks
    print(f"\nData Quality Check - {stock} {period}:")
    print(f"  Records: {df.shape[0]}, Features: {df.shape[1]}")
    
    # Check for missing values
    missing_values = df.isnull().sum().sum()
    print(f"  Missing values: {missing_values}")
    
    # Check target distribution
    target_col = f"Target_Next_{period.capitalize()}"
    if target_col in df.columns:
        up_pct = df[target_col].mean() * 100
        print(f"  Target distribution: {up_pct:.1f}% Up, {100-up_pct:.1f}% Down")
    
    # Simple feature correlation with target
    df_numeric = df.select_dtypes(include=['float64', 'int64'])
    if 'date' in df_numeric.columns:
        df_numeric = df_numeric.drop(columns=['date'])
        
    if target_col in df_numeric.columns:
        target_corr = df_numeric.corr()[target_col].abs().sort_values(ascending=False)
        print(f"  Top 3 features correlated with target:")
        for feature, corr in target_corr.drop(target_col).head(3).items():
            print(f"    - {feature}: {corr:.4f}")
    
    return df

# Check data quality for a sample stock
sample_stock = 'AAPL'
for period in ['day', 'week', 'month']:
    check_data_quality(sample_stock, period)


Data Quality Check - AAPL day:
  Records: 3645, Features: 24
  Missing values: 0
  Target distribution: 52.9% Up, 47.1% Down
  Top 3 features correlated with target:
    - Target_1D: 1.0000
    - Target_Next_Week: 0.4551
    - Target_1W: 0.2829

Data Quality Check - AAPL week:
  Records: 3645, Features: 22
  Missing values: 0
  Target distribution: 57.3% Up, 42.7% Down
  Top 3 features correlated with target:
    - Target_1W: 0.7332
    - Target_Next_Month: 0.4661
    - Target_1D: 0.4551

Data Quality Check - AAPL month:
  Records: 3645, Features: 22
  Missing values: 0
  Target distribution: 61.3% Up, 38.7% Down
  Top 3 features correlated with target:
    - Target_1M: 0.6963
    - Target_1W: 0.5251
    - Target_Next_Week: 0.4661


## 10. Overall Processing Summary

In [65]:
def summarize_processing():
    """Summarize the data processing results"""
    print("\nProcessing Summary:")
    
    all_files = []
    missing_count = 0
    for period in ['day', 'week', 'month']:
        period_files = [f for f in os.listdir(f'../data/lstm/{period}') if f.endswith('.csv')]
        all_files.extend([f"../data/lstm/{period}/{f}" for f in period_files])
        
        # Check for missing files
        expected_count = len(stocks)
        missing = expected_count - len(period_files)
        missing_count += missing
        
        print(f"  {period.capitalize()} prediction: {len(period_files)}/{expected_count} files created")
    
    # Calculate total data size
    total_size = sum(os.path.getsize(f) for f in all_files) / (1024 * 1024)  # Size in MB
    print(f"  Total storage used: {total_size:.2f} MB")
    
    # Check for any issues
    if missing_count > 0:
        print(f"\nWarning: {missing_count} files are missing. Check for errors in processing.")
    else:
        print("\nAll files have been created successfully!")
    
    # Sample a few files to check target balance
    print("\nTarget balance check (sample of stocks):")
    sample_stocks = stocks[:3]  # First 3 stocks as sample
    for period in ['day', 'week', 'month']:
        print(f"  {period.capitalize()} prediction:")
        for stock in sample_stocks:
            file_path = f'../data/lstm/{period}/{stock}_lstm_{period}.csv'
            if os.path.exists(file_path):
                df = pd.read_csv(file_path)
                target_col = f"Target_Next_{period.capitalize()}"
                up_pct = df[target_col].mean() * 100
                print(f"    - {stock}: {up_pct:.1f}% Up, {100-up_pct:.1f}% Down")

summarize_processing()


Processing Summary:
  Day prediction: 20/20 files created
  Week prediction: 20/20 files created
  Month prediction: 20/20 files created
  Total storage used: 74.45 MB

All files have been created successfully!

Target balance check (sample of stocks):
  Day prediction:
    - AAPL: 52.9% Up, 47.1% Down
    - MSFT: 52.4% Up, 47.6% Down
    - GOOG: 53.2% Up, 46.8% Down
  Week prediction:
    - AAPL: 57.3% Up, 42.7% Down
    - MSFT: 57.3% Up, 42.7% Down
    - GOOG: 55.4% Up, 44.6% Down
  Month prediction:
    - AAPL: 61.3% Up, 38.7% Down
    - MSFT: 62.2% Up, 37.8% Down
    - GOOG: 59.6% Up, 40.4% Down


sample of outputs 

In [66]:
day=pd.read_csv('../data/lstm/day/AAPL_lstm_day.csv')
week=pd.read_csv('../data/lstm/week/AAPL_lstm_week.csv')
month=pd.read_csv('../data/lstm/month/AAPL_lstm_month.csv')

In [67]:
day.head(3)

Unnamed: 0,Target_1D,Target_Next_Week,Target_1W,Target_Next_Month,Target_1M,Monthly_Return,BB_Percentage,Daily_Return,Volume_MA_20,DayOfWeek,...,Weekly_Return,Acceleration_5,Volatility_30D,MACD_Signal,Month,BB_Width,Momentum_5,Daily_Change,Target_Next_Day,date
0,0.0,0.0,1.0,1.0,1.0,0.402416,0.374633,0.444898,0.602307,0.0,...,0.464342,0.378171,0.171696,0.513175,0.909091,0.112911,0.626432,0.345732,0,2010-11-15
1,0.0,1.0,1.0,1.0,1.0,0.360384,0.24595,0.393341,0.564933,0.25,...,0.441265,0.379279,0.154242,0.512199,0.909091,0.128078,0.625344,0.343752,0,2010-11-16
2,1.0,1.0,1.0,1.0,1.0,0.395318,0.24766,0.443479,0.549196,0.5,...,0.418725,0.378922,0.154822,0.511036,0.909091,0.144562,0.624207,0.346259,1,2010-11-17


In [68]:
week.head(3)

Unnamed: 0,Target_1W,Target_Next_Month,Target_1D,Target_1M,ROC_20,BB_Percentage,Volatility_30D,Volatility_10D,MACD_Hist,Volume_MA_20,...,DayOfWeek,Momentum_5,Log_Return_1D,MACD,Month,BB_Width,Daily_Change,Acceleration_5,Target_Next_Week,date
0,1.0,1.0,0.0,1.0,0.380506,0.374633,0.171696,0.131246,0.592761,0.602307,...,0.0,0.626432,0.479758,0.551346,0.909091,0.112911,0.345732,0.378171,0,2010-11-15
1,1.0,1.0,0.0,1.0,0.395089,0.24595,0.154242,0.129883,0.590088,0.564933,...,0.25,0.625344,0.427354,0.549526,0.909091,0.128078,0.343752,0.379279,1,2010-11-16
2,1.0,1.0,1.0,1.0,0.38404,0.24766,0.154822,0.118344,0.588365,0.549196,...,0.5,0.624207,0.478326,0.547898,0.909091,0.144562,0.346259,0.378922,1,2010-11-17


In [69]:
month.head(3)

Unnamed: 0,Target_1M,Target_1W,Target_Next_Week,Target_1D,ATR_14,Volume_MA_20,Volatility_30D,BB_Percentage,ROC_20,Volatility_10D,...,Log_Return_1D,MACD_Hist,Daily_Change,Momentum_5,DayOfWeek,MACD,Acceleration_5,Month,Target_Next_Month,date
0,1.0,1.0,0.0,0.0,0.006022,0.602307,0.171696,0.374633,0.380506,0.131246,...,0.479758,0.592761,0.345732,0.626432,0.0,0.551346,0.378171,0.909091,1,2010-11-15
1,1.0,1.0,1.0,0.0,0.006643,0.564933,0.154242,0.24595,0.395089,0.129883,...,0.427354,0.590088,0.343752,0.625344,0.25,0.549526,0.379279,0.909091,1,2010-11-16
2,1.0,1.0,1.0,1.0,0.006507,0.549196,0.154822,0.24766,0.38404,0.118344,...,0.478326,0.588365,0.346259,0.624207,0.5,0.547898,0.378922,0.909091,1,2010-11-17
