# LSTM Model Training for Stock Prediction

This notebook trains LSTM models for predicting stock price movements for:
- 20 different stocks
- 3 prediction periods (day, week, month)

Each trained model is saved to `../model/lstm/{period}/lstm_{stock}_{period}.h5`, where `{stock}` is the ticker symbol and `{period}` is one of `day`, `week`, or `month`.

## 1. Import Libraries and Setup

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import warnings
warnings.filterwarnings('ignore')

# Seed all three random number generators used in this notebook
# (NumPy, Python's built-in `random`, and TensorFlow) with the same fixed
# value so that repeated runs start from the same random state.
import random
import tensorflow as tf
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

# Ticker symbols to train models for (20 in total)
stocks = (
    'AAPL MSFT GOOG AMZN TSLA META NVDA SPY V DIS '
    'NFLX PYPL BABA IBM AMD BA INTC T GS NKE'
).split()

# Prediction horizons; each one gets its own data and model sub-directory
periods = ['day', 'week', 'month']

# Make sure one model directory per horizon exists before any training starts
for period in periods:
    os.makedirs(f'../model/lstm/{period}', exist_ok=True)
print("Output directories created.")

## 2. Data Loading and Preparation Functions

In [None]:
def load_data(stock, period):
    """Read the prepared LSTM dataset for one stock/period pair.

    The CSV is looked up at ``../data/lstm/<period>/<stock>_lstm_<period>.csv``.

    Returns:
        The parsed DataFrame, or ``None`` (after printing a warning) when the
        file is not present.
    """
    csv_path = f'../data/lstm/{period}/{stock}_lstm_{period}.csv'

    if os.path.exists(csv_path):
        return pd.read_csv(csv_path)

    print(f"Warning: File {csv_path} does not exist. Skipping.")
    return None

In [None]:
def prepare_lstm_data(df, target_col, time_steps=10, test_size=0.2, validation_size=0.2):
    """Build sliding-window sequences and split them chronologically.

    Args:
        df: DataFrame holding the feature columns and ``target_col``. When a
            ``date`` column is present the rows are ordered by it and the
            column is then dropped. The caller's frame is never modified.
        target_col: Name of the label column.
        time_steps: Number of consecutive rows in each input window.
        test_size: Fraction of all sequences held out, from the end, for
            testing.
        validation_size: Fraction of all sequences used for validation; it is
            taken from the end of what remains after the test split.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``. Each ``X``
        array has shape ``(n_sequences, time_steps, n_features)``.

    Raises:
        ValueError: If ``time_steps`` is smaller than 1, or ``df`` has too few
            rows to form a single sequence.
    """
    if time_steps < 1:
        raise ValueError(f"time_steps must be at least 1, got {time_steps}")

    if 'date' in df.columns:
        # assign / sort_values / drop each return a new frame, so the caller's
        # DataFrame keeps its original dtypes, row order and columns.
        df = (
            df.assign(date=pd.to_datetime(df['date']))
            .sort_values('date')
            .drop(columns=['date'])
        )

    # Extract features and target as plain arrays
    y = df[target_col].values
    X = df.drop(columns=[target_col]).values

    n_sequences = len(X) - time_steps
    if n_sequences < 1:
        # Fail early with an explicit message instead of letting the split
        # below complain about an empty sample set.
        raise ValueError(
            f"Need more than {time_steps} rows to build sequences of length "
            f"{time_steps}, got {len(X)}"
        )

    # Window i covers rows [i, i + time_steps); its label is read from row
    # i + time_steps, i.e. the row immediately after the window.
    # NOTE(review): if target_col already encodes the *next* period relative
    # to its own row, this pairing skips the most recent row's features --
    # confirm the intended alignment against the data-preparation step.
    X_seq = np.array([X[i:i + time_steps] for i in range(n_sequences)])
    y_seq = np.array([y[i + time_steps] for i in range(n_sequences)])

    # shuffle=False keeps time order: the test set is the most recent block
    X_train_temp, X_test, y_train_temp, y_test = train_test_split(
        X_seq, y_seq, test_size=test_size, shuffle=False
    )

    # Rescale so validation_size is a fraction of ALL sequences, not just of
    # the part left after the test split.
    val_size_adjusted = validation_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_temp, y_train_temp, test_size=val_size_adjusted, shuffle=False
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

## 3. Model Building Function

In [None]:
def build_lstm_model(input_shape, lstm_units=64, dropout_rate=0.2, learning_rate=0.001):
    """Assemble and compile a two-layer LSTM binary classifier.

    Args:
        input_shape: Shape of one input sample, passed to the first LSTM layer.
        lstm_units: Units in the first LSTM layer; the second layer uses half
            as many (integer division).
        dropout_rate: Dropout rate applied after each LSTM layer.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The compiled Keras ``Sequential`` model, ending in a single sigmoid
        unit and trained with binary cross-entropy.
    """
    network = Sequential()

    # First recurrent layer returns the full sequence so a second LSTM can
    # be stacked on top of it
    network.add(LSTM(lstm_units, return_sequences=True, input_shape=input_shape))
    network.add(Dropout(dropout_rate))

    # Second recurrent layer collapses the sequence to its final output
    network.add(LSTM(lstm_units // 2))
    network.add(Dropout(dropout_rate))

    # Dense head
    network.add(Dense(32, activation='relu'))
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

## 4. Quick Parameter Selection Function

In [None]:
def select_parameters(X_train, X_val, y_train, y_val, verbose=0):
    """Pick LSTM hyper-parameters with a short exhaustive grid search.

    Every combination of LSTM units, dropout rate and learning rate is
    trained for 5 epochs, and the combination with the highest final-epoch
    validation accuracy wins. Ties keep the earliest combination.

    Args:
        X_train, y_train: Training sequences and labels.
        X_val, y_val: Validation sequences and labels.
        verbose: Forwarded to ``model.fit``; when greater than 0 each
            candidate's score is also printed.

    Returns:
        Dict with keys ``'lstm_units'``, ``'dropout_rate'`` and
        ``'learning_rate'``. It is always fully populated, even when every
        candidate scores 0.0 or NaN.
    """
    from itertools import product

    # Candidate values to try
    lstm_units_options = [32, 64]
    dropout_options = [0.2, 0.3]
    learning_rate_options = [0.001, 0.0005]

    # None (rather than an empty dict plus a 0 baseline) guarantees that the
    # first candidate is always recorded, so callers never get a dict that
    # is missing the keys they index into.
    best_params = None
    best_accuracy = float('-inf')

    input_shape = (X_train.shape[1], X_train.shape[2])

    # product() visits the combinations in the same order as three nested
    # loops over units -> dropout -> learning rate.
    for lstm_units, dropout_rate, learning_rate in product(
        lstm_units_options, dropout_options, learning_rate_options
    ):
        model = build_lstm_model(
            input_shape=input_shape,
            lstm_units=lstm_units,
            dropout_rate=dropout_rate,
            learning_rate=learning_rate
        )

        # Train for a few epochs only: this is a quick estimate, not a full fit
        history = model.fit(
            X_train, y_train,
            epochs=5,
            batch_size=32,
            validation_data=(X_val, y_val),
            verbose=verbose
        )

        # Score the candidate by its last-epoch validation accuracy
        val_accuracy = history.history['val_accuracy'][-1]

        if verbose > 0:
            print(f"LSTM units: {lstm_units}, Dropout: {dropout_rate}, LR: {learning_rate}, Val Acc: {val_accuracy:.4f}")

        if best_params is None or val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_params = {
                'lstm_units': lstm_units,
                'dropout_rate': dropout_rate,
                'learning_rate': learning_rate
            }

    print(f"Best parameters: {best_params}, Best validation accuracy: {best_accuracy:.4f}")
    return best_params

## 5. Model Training Function

In [None]:
def train_lstm_model(X_train, X_val, y_train, y_val, params, epochs=50, batch_size=32):
    """Fit an LSTM classifier, stopping early when validation loss stalls.

    Args:
        X_train, y_train: Training sequences and labels.
        X_val, y_val: Validation sequences and labels, monitored for early
            stopping.
        params: Dict providing ``'lstm_units'``, ``'dropout_rate'`` and
            ``'learning_rate'``.
        epochs: Upper bound on the number of training epochs.
        batch_size: Mini-batch size.

    Returns:
        Tuple ``(model, history)``: the trained model and the object returned
        by ``model.fit``.
    """
    sample_shape = (X_train.shape[1], X_train.shape[2])
    model = build_lstm_model(
        input_shape=sample_shape,
        lstm_units=params['lstm_units'],
        dropout_rate=params['dropout_rate'],
        learning_rate=params['learning_rate'],
    )

    # Stop after 10 epochs without val_loss improvement and restore the
    # weights from the best epoch
    stopper = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    fit_options = {
        'epochs': epochs,
        'batch_size': batch_size,
        'validation_data': (X_val, y_val),
        'callbacks': [stopper],
        'verbose': 1,
    }
    history = model.fit(X_train, y_train, **fit_options)

    return model, history

## 6. Evaluate and Plot Results

In [None]:
def evaluate_lstm_model(model, X_test, y_test, history, stock, period):
    """Evaluate a trained model on the test set and plot diagnostics.

    Prints the test accuracy and a classification report, and shows two
    figures: training/validation accuracy and loss curves, and the test-set
    confusion matrix.

    Args:
        model: Trained Keras model compiled with an accuracy metric.
        X_test, y_test: Test sequences and labels.
        history: Object returned by ``model.fit``; its ``history`` dict must
            contain ``accuracy``, ``val_accuracy``, ``loss`` and ``val_loss``.
        stock, period: Used only in the plot titles.

    Returns:
        The test accuracy reported by ``model.evaluate``.
    """
    from sklearn.metrics import confusion_matrix, classification_report

    # Evaluate on test set
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test accuracy: {test_accuracy:.4f}")

    # Plot training history
    plt.figure(figsize=(12, 5))

    # Plot accuracy
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title(f'{stock} {period} - Model Accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='lower right')

    # Plot loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title(f'{stock} {period} - Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper right')

    plt.tight_layout()
    plt.show()

    # Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions
    y_pred_prob = model.predict(X_test, verbose=0)
    y_pred = (y_pred_prob > 0.5).astype(int).flatten()

    # Pin the label set so the matrix is always 2x2 and lines up with the
    # Down/Up tick labels, even when the test split or the predictions
    # contain only one class.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Plot confusion matrix
    plt.figure(figsize=(6, 5))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title(f'{stock} {period} - Confusion Matrix')
    plt.colorbar()
    classes = ['Down', 'Up']
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # Annotate each cell; switch to white text on the darker cells
    thresh = cm.max() / 2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass after the axis labels exist so they are included
    plt.tight_layout()
    plt.show()

    return test_accuracy

## 7. Main Processing Function for Single Stock

In [None]:
def process_stock_lstm(stock, period, time_steps=10, param_selection=True, plot=True):
    """Train, evaluate and save the LSTM model for one stock and period.

    Args:
        stock: Ticker symbol; selects the input CSV and names the saved model.
        period: Prediction period; the target column is expected to be named
            ``Target_Next_<Period>`` (period capitalized).
        time_steps: Window length used when building input sequences.
        param_selection: When True, run the quick grid search; otherwise use
            the default hyper-parameters.
        plot: When True, show training curves and the confusion matrix;
            otherwise only print the test accuracy.

    Returns:
        The trained model, or ``None`` when the data file is missing, the
        target column is absent, or the data cannot be turned into sequences.
    """
    print(f"\nProcessing {stock} for {period} prediction...")

    # Load data
    df = load_data(stock, period)
    if df is None:
        return None

    # Get target column name
    target_col = f'Target_Next_{period.capitalize()}'
    if target_col not in df.columns:
        print(f"Error: Target column {target_col} not found in data.")
        return None

    # Prepare data. A dataset too short to split raises ValueError; report it
    # and skip this model, like the other data problems above, rather than
    # aborting a whole batch run.
    try:
        X_train, X_val, X_test, y_train, y_val, y_test = prepare_lstm_data(
            df, target_col, time_steps=time_steps
        )
    except ValueError as exc:
        print(f"Error: Could not prepare data for {stock} ({period}): {exc}")
        return None

    print(f"Data shapes: X_train: {X_train.shape}, X_val: {X_val.shape}, X_test: {X_test.shape}")

    # Quick parameter selection if enabled
    if param_selection:
        params = select_parameters(X_train, X_val, y_train, y_val)
    else:
        # Default parameters
        params = {
            'lstm_units': 64,
            'dropout_rate': 0.2,
            'learning_rate': 0.001
        }

    # Train model
    model, history = train_lstm_model(X_train, X_val, y_train, y_val, params)

    # Evaluate model
    if plot:
        evaluate_lstm_model(model, X_test, y_test, history, stock, period)
    else:
        _, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
        print(f"Test accuracy: {test_accuracy:.4f}")

    # Save model
    model_path = f'../model/lstm/{period}/lstm_{stock}_{period}.h5'
    model.save(model_path)
    print(f"Model saved to {model_path}")

    return model

## 8. Test with Sample Stock

In [None]:
# Smoke-test the whole pipeline on a single stock/period pair before
# launching the full batch
sample_stock, sample_period = 'AAPL', 'day'

test_model = process_stock_lstm(
    stock=sample_stock,
    period=sample_period,
    time_steps=10,
    param_selection=True,
    plot=True,
)

## 9. Process All Stocks

In [None]:
def process_all_stocks(time_steps=10, param_selection=False, plot=False):
    """Train a model for every stock/period combination and report outcomes.

    Args:
        time_steps: Window length forwarded to ``process_stock_lstm``.
        param_selection: Whether each model runs the quick grid search.
        plot: Whether each model shows its evaluation plots.

    Returns:
        Nested dict ``{stock: {period: 'Success' | 'Failed'}}``.
    """
    banner = '=' * 50
    results = {}

    for stock in stocks:
        per_period = {}
        results[stock] = per_period

        for period in periods:
            print(f"\n{banner}")
            print(f"Training {stock} {period} model")
            print(f"{banner}")

            model = process_stock_lstm(
                stock=stock,
                period=period,
                time_steps=time_steps,
                param_selection=param_selection,
                plot=plot,
            )

            # A None model means the combination was skipped or failed
            per_period[period] = 'Failed' if model is None else 'Success'

    # Summary of every combination
    print("\nProcessing Summary:")
    for stock in stocks:
        print(f"\n{stock}:")
        stock_status = results.get(stock, {})
        for period in periods:
            status = stock_status.get(period, 'Not processed')
            print(f"  {period}: {status}")

    return results

## 10. Run Full Processing

In [None]:
# Set to True when ready to process all stocks
process_all = True

if process_all:
    # Process all stocks
    # Set param_selection=False to use default parameters for all models
    # Set plot=False to disable plotting for each model
    processing_results = process_all_stocks(time_steps=10, param_selection=False, plot=False)
else:
    print("Full processing is disabled. Set 'process_all = True' to train all models.")

## 11. Verify Saved Models

In [None]:
def verify_saved_models():
    """Report which expected model files exist on disk.

    One ``.h5`` file is expected per stock/period combination under
    ``../model/lstm/<period>/``. Prints how many were found and lists up to
    ten of the missing ones.
    """
    print("Verifying saved models...")

    # Collect every stock/period pair whose model file is absent
    missing_models = [
        f"{stock}_{period}"
        for period in periods
        for stock in stocks
        if not os.path.exists(f'../model/lstm/{period}/lstm_{stock}_{period}.h5')
    ]

    expected_count = len(stocks) * len(periods)
    found_count = expected_count - len(missing_models)
    print(f"Found {found_count} out of {expected_count} expected models.")

    if not missing_models:
        print("All expected models have been saved successfully!")
        return

    print(f"Missing {len(missing_models)} models:")
    # Cap the listing at ten entries to keep the output short
    for name in missing_models[:10]:
        print(f"  - {name}")
    overflow = len(missing_models) - 10
    if overflow > 0:
        print(f"  ...and {overflow} more")

In [None]:
# Check the saved model files only when the full training run was enabled
# (process_all is the switch set in the full-processing cell).
if process_all:
    verify_saved_models()