In [1]:
import numpy as np
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so repeated runs give comparable loss curves.
SEED = 42
set_random_seed(SEED)

# Load the processed data.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can execute
# arbitrary code. Only load this archive from a trusted source; if 'X' and 'y' are stored
# as plain numeric arrays the flag can be dropped.
data = np.load("../data/processed/processed_data.npz", allow_pickle=True)
X = data['X']
y = data['y']

# Fail early on malformed inputs instead of deep inside model.fit.
assert X.ndim == 3, f"expected X with 3 dimensions, got shape {X.shape}"
assert len(X) == len(y), f"X and y disagree on sample count: {len(X)} vs {len(y)}"

# For illustration, print the shapes
print("X shape:", X.shape)  # (num_samples, window_size, feature_dimension)
print("y shape:", y.shape)  # (num_samples,)

# Define the model.
# The input shape is declared with an explicit Input layer: passing `input_shape=` to the
# first layer of a Sequential model makes Keras emit a UserWarning.
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)  # Predicting the next day's adjusted close
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Positional 70/30 split: the first 70% of samples train, the rest validate.
# NOTE(review): this is only a true time-based split if the samples in X are stored in
# chronological order — confirm against the preprocessing step.
split_idx = int(0.7 * len(X))
X_train, X_val = X[:split_idx], X[split_idx:]
y_train, y_val = y[:split_idx], y[split_idx:]

# Train the model; `history` keeps the per-epoch loss / val_loss for later inspection.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=50, batch_size=32)

# Now the model is ready for evaluation or further use


X shape: (1470, 10, 14)
y shape: (1470,)
Epoch 1/50


  super().__init__(**kwargs)


[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - loss: 0.5621 - val_loss: 0.0949
Epoch 2/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0932 - val_loss: 0.0687
Epoch 3/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0820 - val_loss: 0.0623
Epoch 4/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0662 - val_loss: 0.0537
Epoch 5/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0630 - val_loss: 0.0548
Epoch 6/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0593 - val_loss: 0.0471
Epoch 7/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0507 - val_loss: 0.0465
Epoch 8/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0487 - val_loss: 0.0421
Epoch 9/50
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [None]:
# Report the trained model's loss on a held-out test set.
# NOTE(review): X_test and y_test are not defined in any cell visible in this notebook;
# they must be prepared the same way as X and y before this cell can run — TODO confirm
# where they are supposed to come from.
# The model above is compiled with loss='mse' and no extra metrics, so the value printed
# here is the mean squared error.
loss = model.evaluate(X_test, y_test)
print("Test Loss (MSE):", loss)


In [None]:
import numpy as np
import pandas as pd

# Predict the next adjusted close for every ticker and rank tickers by predicted ROI.
#
# This cell expects the following names to exist already:
# - df: the original DataFrame with data for all tickers ("Ticker", "Date" and the
#   dynamic feature columns).
# - window_size: the size of the sliding window the model was trained with.
# - static_features_df: DataFrame with a "Ticker" column plus one-hot encoded static
#   features, one row per ticker.
# - scalers_dict: dictionary mapping each ticker to its fitted StandardScaler.
# - model: the trained Keras model.

# Column order must match the order the scalers were fitted with.
dynamic_features = ["Adj Close", "Close", "High", "Low", "Open", "Volume"]
adj_close_idx = dynamic_features.index("Adj Close")

predictions = []

# Process each ticker individually
for ticker, ticker_df in df.groupby("Ticker"):
    # Sort by date to ensure proper time order
    ticker_df = ticker_df.sort_values("Date")

    # A ticker with too little history cannot fill a window; skip it instead of
    # failing later with a shape mismatch.
    if len(ticker_df) < window_size:
        continue

    # Extract the latest window of dynamic features (original price scale)
    last_window = ticker_df.iloc[-window_size:][dynamic_features].values

    # Retrieve the corresponding scaler for the ticker and normalize the window
    scaler = scalers_dict[ticker]  # You should have saved this during training
    normalized_window = scaler.transform(last_window)

    # Get one-hot encoded static features for the ticker (as a 1D vector).
    # Zero or several matching rows would silently produce a vector of the wrong length.
    static_rows = static_features_df[static_features_df["Ticker"] == ticker]\
                        .drop(columns=["Ticker"])
    if len(static_rows) != 1:
        raise ValueError(
            f"Expected exactly one static-feature row for ticker {ticker!r}, "
            f"found {len(static_rows)}"
        )
    static_vector = static_rows.values.flatten()

    # Repeat the static vector for each day in the window and concatenate with dynamic features
    static_repeated = np.tile(static_vector, (window_size, 1))
    input_seq = np.hstack([normalized_window, static_repeated])

    # Reshape for model input (batch dimension)
    input_seq = input_seq[np.newaxis, ...]

    # The model predicts on the normalized scale; verbose=0 avoids one progress bar per ticker.
    normalized_pred = float(model.predict(input_seq, verbose=0)[0, 0])

    # Undo the standardization of "Adj Close" so the prediction is a price again.
    # Without this step the ROI below would compare a normalized value with a raw price.
    # Assumes the training target was scaled with this same per-ticker scaler.
    predicted_price = (normalized_pred * scaler.scale_[adj_close_idx]
                       + scaler.mean_[adj_close_idx])

    # Get the current (last available) price
    current_price = ticker_df.iloc[-1]["Adj Close"]

    # Calculate predicted ROI
    roi = (predicted_price - current_price) / current_price

    predictions.append({
        "Ticker": ticker,
        "Predicted Price": predicted_price,
        "Current Price": current_price,
        "Predicted ROI": roi
    })

# Create a DataFrame and sort by ROI (highest first).
# Passing the columns explicitly keeps the sort valid even when no ticker qualified.
predictions_df = pd.DataFrame(
    predictions,
    columns=["Ticker", "Predicted Price", "Current Price", "Predicted ROI"],
)
predictions_df = predictions_df.sort_values(by="Predicted ROI", ascending=False)
print(predictions_df)


## V2: model training and ROI prediction with predictions converted back to price scale

In [None]:
###########################
# MODEL DEFINITION & TRAINING
###########################

# Seed Python, NumPy and TensorFlow so repeated runs give comparable loss curves.
SEED = 42
set_random_seed(SEED)

# X and y are reused from the data-loading cell earlier in the notebook.
print("X shape:", X.shape)  # (num_samples, window_size, feature_dimension)
print("y shape:", y.shape)  # (num_samples,)

# Define the model architecture.
# The input shape is declared with an explicit Input layer: passing `input_shape=` to the
# first layer of a Sequential model makes Keras emit a UserWarning.
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)  # Output: next day's (normalized) adjusted close
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Positional 70/30 split: the first 70% of samples train, the rest validate.
# NOTE(review): this is only a true time-based split if the samples in X are stored in
# chronological order — confirm against the preprocessing step.
split_idx = int(0.7 * len(X))
X_train, X_val = X[:split_idx], X[split_idx:]
y_train, y_val = y[:split_idx], y[split_idx:]

# Train the model; `history` keeps the per-epoch loss / val_loss for later inspection.
history = model.fit(X_train, y_train, validation_data=(X_val, y_val),
                    epochs=50, batch_size=32)

In [None]:
###########################
# PREDICTION & EVALUATION: TOMORROW'S PRICES & ROI CALCULATION
###########################

# Expects df_original, window_size, scalers_dict, static_features_df and the trained
# `model` to exist already.

# Dynamic feature columns, in the order the per-ticker scalers were fitted with.
dynamic_features = ["Adj Close", "Close", "High", "Low", "Open", "Volume"]
# Look the target column up by name instead of hard-coding position 0, so reordering
# dynamic_features cannot silently de-normalize with the wrong statistics.
adj_close_idx = dynamic_features.index("Adj Close")
result_columns = ["Ticker", "Predicted Price", "Current Price", "Predicted ROI"]

# Phase 1: build one input window per ticker. `pending` stays aligned with `input_seqs`
# and carries what is needed to turn each prediction back into a price.
input_seqs = []
pending = []

for ticker, ticker_df in df_original.groupby("Ticker"):
    ticker_df = ticker_df.sort_values("Date")

    # Ensure there are at least 'window_size' records
    if len(ticker_df) < window_size:
        continue

    # Latest window (most recent window_size days) of dynamic features in original scale
    latest_window = ticker_df.iloc[-window_size:][dynamic_features].values

    # Normalize the window with this ticker's own scaler
    scaler = scalers_dict[ticker]
    normalized_window = scaler.transform(latest_window)

    # One-hot encoded static features for this ticker. Zero or several matching rows
    # would yield a vector of the wrong length, so fail with a clear message instead.
    static_rows = static_features_df[static_features_df["Ticker"] == ticker]\
                        .drop(columns=["Ticker"])
    if len(static_rows) != 1:
        raise ValueError(
            f"Expected exactly one static-feature row for ticker {ticker!r}, "
            f"found {len(static_rows)}"
        )
    static_vector = static_rows.values.flatten()
    static_repeated = np.tile(static_vector, (window_size, 1))

    # Concatenate normalized dynamic features with the repeated static info
    input_seqs.append(np.hstack([normalized_window, static_repeated]))

    # Current (most recent) price from the original data
    pending.append((ticker, scaler, ticker_df.iloc[-1]["Adj Close"]))

# Phase 2: a single batched forward pass instead of one model.predict call per ticker.
predictions = []
if input_seqs:
    normalized_preds = model.predict(np.stack(input_seqs), verbose=0)[:, 0]

    for (ticker, scaler, current_price), normalized_pred in zip(pending, normalized_preds):
        # Invert the standardization of "Adj Close" to recover the actual price
        predicted_price = (float(normalized_pred) * scaler.scale_[adj_close_idx]
                           + scaler.mean_[adj_close_idx])

        # Predicted ROI: (predicted - current) / current
        roi = (predicted_price - current_price) / current_price

        predictions.append({
            "Ticker": ticker,
            "Predicted Price": predicted_price,
            "Current Price": current_price,
            "Predicted ROI": roi
        })

# Create a DataFrame with the predictions and sort by predicted ROI (highest first).
# Passing the columns explicitly keeps the sort valid even when no ticker qualified.
predictions_df = pd.DataFrame(predictions, columns=result_columns)
predictions_df = predictions_df.sort_values(by="Predicted ROI", ascending=False)
print(predictions_df)