# Stock Market Prediction with LSTM

**Generated notebook** for the uploaded CSV: `/mnt/data/portfolio_data.csv`.

- Detected columns: `Date, AMZN, DPZ, BTC, NFLX`.
- Suggested date column: `Date`.
- Suggested target (price) column: `NFLX`.

This notebook contains a full pipeline: exploratory data analysis (EDA), preprocessing, LSTM model building, training, evaluation, and single-step inference with the trained model to predict the next value.

Run the cells sequentially. If your CSV uses different column names, update the variables in the **Configuration** cell accordingly.

In [None]:
# Configuration - edit these if your column names differ
# The cells below read these module-level names, so run this cell first.

# Input CSV that is loaded with pd.read_csv.
CSV_PATH = r'/mnt/data/portfolio_data.csv'
# Column parsed as datetime and used to sort the rows chronologically.
DATE_COL = 'Date'
# Single price column that is scaled, windowed and predicted.
TARGET_COL = 'NFLX'
TEST_SIZE = 0.2          # fraction of data to use for testing
SEQUENCE_LENGTH = 60     # number of past days used to predict the next day
# Mini-batch size passed to model.fit.
BATCH_SIZE = 32
# Upper bound on training epochs; the EarlyStopping callback may stop sooner.
EPOCHS = 50
# Seed value for reproducibility; it only has an effect where a cell passes it
# to a seeding call.
RANDOM_STATE = 42
# File the ModelCheckpoint callback writes the best model to.
MODEL_SAVE_PATH = '/mnt/data/stock_lstm_model.h5'
# Pickle file that stores the fitted MinMaxScaler.
SCALER_SAVE_PATH = '/mnt/data/scaler.pkl'


In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import pickle
import os
print('TensorFlow version:', tf.__version__)


In [None]:
# Load data
# Reads the CSV named in the configuration cell, shows a preview, and (when the
# date column exists) leaves `df` sorted chronologically with a fresh 0..n-1 index.
df = pd.read_csv(CSV_PATH)
print('Shape:', df.shape)
print('Columns:', df.columns.tolist())
display(df.head())

# Ensure date column is datetime and sort by date if present
if DATE_COL in df.columns:
    df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors='coerce')

    # errors='coerce' turns missing/unparseable dates into NaT, and sort_values
    # places NaT rows last. Left in, those rows would masquerade as the most
    # recent prices and end up in the test split and in the final forecast, so
    # report them and drop them instead of keeping them silently.
    n_bad_dates = int(df[DATE_COL].isna().sum())
    if n_bad_dates:
        print(f'Warning: dropping {n_bad_dates} row(s) with missing or unparseable {DATE_COL} values.')
        df = df.dropna(subset=[DATE_COL])

    # A stable sort keeps rows that share a date in their original file order.
    df = df.sort_values(DATE_COL, kind='stable').reset_index(drop=True)
    print(f'Using date column: {DATE_COL} (range: {df[DATE_COL].min()} to {df[DATE_COL].max()})')
else:
    print('Date column not found; proceeding without date index.')


In [None]:
# EDA - quick visual check of the target series, then summary statistics
if TARGET_COL not in df.columns:
    print(f'Target column {TARGET_COL} not found in data.')
else:
    # One wide figure with the target plotted against the row index.
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.plot(df[TARGET_COL], label=TARGET_COL)
    ax.set_title(f'{TARGET_COL} over time')
    ax.set_xlabel('Index (time-sorted rows)')
    ax.set_ylabel(TARGET_COL)
    ax.legend()
    plt.show()

# Descriptive statistics followed by the per-column count of missing values
summary_stats = df.describe()
display(summary_stats)
print('\nMissing values per column:')
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Preprocessing
# We will use only the TARGET_COL for univariate LSTM (predict next price from past prices).
data = df[[TARGET_COL]].copy()

# Handle missing values: carry the last known price forward, then back-fill any
# gap at the very start of the series. DataFrame.ffill()/bfill() replace the
# deprecated fillna(method=...) spelling.
data = data.ffill().bfill()
values = data.to_numpy().reshape(-1, 1)

# Scaling
# Fit the scaler on the training rows only. Fitting on the whole series would
# leak the test period's min/max into training. The row count mirrors the
# chronological split applied to the sequence windows: the first
# int(n_windows * (1 - TEST_SIZE)) windows are used for training, and together
# they touch that many rows plus SEQUENCE_LENGTH.
n_windows = max(len(values) - SEQUENCE_LENGTH, 0)
n_train_rows = min(len(values), int(n_windows * (1 - TEST_SIZE)) + SEQUENCE_LENGTH)
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(values[:n_train_rows])
# Test-period values may now fall slightly outside [0, 1]; that is expected.
scaled = scaler.transform(values)

# Save scaler for later use
with open(SCALER_SAVE_PATH, 'wb') as f:
    pickle.dump(scaler, f)

print('Scaled data shape:', scaled.shape)

In [None]:
# Create sequences for LSTM
def create_sequences(data, seq_length):
    """Slice a series into overlapping input windows and next-step targets.

    Window ``i`` holds ``data[i : i + seq_length, 0]`` and its target is
    ``data[i + seq_length, 0]``.

    Args:
        data: 2-D array-like of shape ``(n_rows, n_columns)``; only column 0
            is used.
        seq_length: Number of consecutive rows in each input window.

    Returns:
        A tuple ``(X, y)`` where ``X`` has shape
        ``(n_rows - seq_length, seq_length)`` and ``y`` has shape
        ``(n_rows - seq_length,)``. When there are not enough rows to form a
        single window, ``X`` has shape ``(0, seq_length)`` and ``y`` has shape
        ``(0,)``, so callers can still reshape ``X`` by its second dimension.
    """
    series = np.asarray(data)[:, 0]
    n_samples = len(series) - seq_length

    if n_samples <= 0:
        # Keep the time-step axis even when empty; a bare np.array([]) would
        # have shape (0,) and break any later reshape that reads X.shape[1].
        return (
            np.empty((0, seq_length), dtype=series.dtype),
            np.empty((0,), dtype=series.dtype),
        )

    # Row i of the index grid is [i, i + 1, ..., i + seq_length - 1], so one
    # fancy-indexing gather builds every window without a Python loop.
    window_index = np.arange(n_samples)[:, None] + np.arange(seq_length)[None, :]
    X = series[window_index]
    y = series[seq_length:].copy()
    return X, y

# Window the scaled series; SEQ_LEN is the alias the later cells use.
SEQ_LEN = SEQUENCE_LENGTH
X, y = create_sequences(scaled, SEQ_LEN)
print('X shape:', X.shape, 'y shape:', y.shape)

# Add a trailing feature axis: LSTM layers expect [samples, time_steps, features]
X = np.reshape(X, (X.shape[0], X.shape[1], 1))

# Chronological split: the earliest windows train, the latest windows test (no shuffling)
split_idx = int(X.shape[0] * (1 - TEST_SIZE))
train_part = slice(None, split_idx)
test_part = slice(split_idx, None)
X_train, y_train = X[train_part], y[train_part]
X_test, y_test = X[test_part], y[test_part]
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)

In [None]:
# Build LSTM model
def build_lstm(input_shape):
    """Build and compile the two-layer stacked LSTM regressor.

    Args:
        input_shape: Shape of one sample without the batch dimension, i.e.
            ``(time_steps, features)``.

    Returns:
        A compiled ``Sequential`` model: LSTM(64) -> Dropout(0.2) -> LSTM(32)
        -> Dropout(0.2) -> Dense(16, relu) -> Dense(1), using the Adam
        optimizer, MSE loss and an MAE metric.
    """
    model = Sequential([
        # Declare the input explicitly instead of passing input_shape= to the
        # first layer, which newer Keras versions warn about.
        tf.keras.Input(shape=input_shape),
        # The first LSTM returns the full sequence so the second can consume it.
        LSTM(64, return_sequences=True),
        Dropout(0.2),
        # The second LSTM returns only its final state for the dense head.
        LSTM(32, return_sequences=False),
        Dropout(0.2),
        Dense(16, activation='relu'),
        # Single linear unit: the predicted (scaled) next value.
        Dense(1),
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Seed NumPy and TensorFlow before the model is created so that weight
# initialisation, dropout masks and batch shuffling are repeatable between
# runs; RANDOM_STATE comes from the configuration cell.
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# One sample has shape (time_steps, features), taken from the training tensor.
model = build_lstm((X_train.shape[1], X_train.shape[2]))
model.summary()

In [None]:
# Training
# Early stopping, the learning-rate schedule and checkpoint selection all
# monitor val_loss. If that were computed on the test windows, the model would
# be tuned on the test set and the test metrics would be optimistically biased.
# Hold out the most recent slice of the *training* windows for validation so
# the test set stays untouched until evaluation.
VALIDATION_FRACTION = 0.1
n_val = max(1, int(len(X_train) * VALIDATION_FRACTION))
if n_val >= len(X_train):
    raise ValueError(
        f'Need at least 2 training windows to carve out a validation set, got {len(X_train)}.'
    )
X_fit, y_fit = X_train[:-n_val], y_train[:-n_val]
X_val, y_val = X_train[-n_val:], y_train[-n_val:]

callbacks = [
    # Stop when val_loss has not improved for 10 epochs and restore the best weights.
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    # Halve the learning rate after 5 stagnant epochs, never going below 1e-6.
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6),
    # Persist only the best model seen so far to MODEL_SAVE_PATH.
    ModelCheckpoint(MODEL_SAVE_PATH, monitor='val_loss', save_best_only=True)
]

history = model.fit(
    X_fit, y_fit,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)

In [None]:
# Evaluation - loss curves, test-set error metrics, and an actual-vs-predicted chart
plt.figure(figsize=(10, 4))
for history_key, curve_label in (('loss', 'train_loss'), ('val_loss', 'val_loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.title('Loss')
plt.show()

# Predict on the held-out windows (outputs are still in scaled units)
preds = model.predict(X_test)

# Reload the persisted scaler and map predictions and targets back to price units
with open(SCALER_SAVE_PATH, 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
y_test_inv = scaler.inverse_transform(np.reshape(y_test, (-1, 1)))
preds_inv = scaler.inverse_transform(preds)

# Error metrics in original price units
mse, mae = (
    metric(y_test_inv, preds_inv)
    for metric in (mean_squared_error, mean_absolute_error)
)
print(f'MSE: {mse:.4f}, MAE: {mae:.4f}')

# Overlay the two series on one chart
plt.figure(figsize=(12, 4))
for series, series_label in ((y_test_inv, 'Actual'), (preds_inv, 'Predicted')):
    plt.plot(series, label=series_label)
plt.legend()
plt.title('Actual vs Predicted')
plt.show()

In [None]:
# The ModelCheckpoint callback has already written the model during training;
# this cell reports the path and demonstrates a single-step forecast.
print('Model saved to', MODEL_SAVE_PATH)

# Take the most recent SEQ_LEN scaled values and shape them as one sample:
# (batch=1, time_steps=SEQ_LEN, features=1)
latest_window = scaled[-SEQ_LEN:]
last_seq = np.reshape(latest_window, (1, SEQ_LEN, 1))

# Predict in scaled units, then convert back to the original price scale
pred_scaled = model.predict(last_seq)
pred_unscaled = scaler.inverse_transform(pred_scaled)
pred_price = pred_unscaled[0, 0]
print('Predicted next price (single-step):', pred_price)

## Notes & Next steps

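- This is a univariate LSTM using only the chosen target price column. For improved results, consider a multivariate model: the other columns already in this CSV (`AMZN`, `DPZ`, `BTC`) can serve as additional inputs, and — if you add a data source that provides them — so can features like Open, High, Low, Volume, technical indicators (moving averages, RSI, MACD), and exogenous variables.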
- Try hyperparameter tuning (number of layers, units, learning rate, sequence length).
- Consider using walk-forward validation for better time-series evaluation.
- If your CSV uses different column names, change `DATE_COL` and `TARGET_COL` in the configuration cell.

---

*Notebook generated automatically.*