# Sentiment-Based Trading Model Training

This notebook trains a neural network model that incorporates news sentiment as a feature for predicting stock price movements.

## Data collection

In [78]:
import os
import sys
from dotenv import load_dotenv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from pathlib import Path
from tqdm.notebook import tqdm
import pickle
from alpaca.data.historical import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest, StockLatestQuoteRequest
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
import time

from trading_bot_llm_sentiment_brian import TradingBotLLMSentiment

In [2]:
load_dotenv()

# Set seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Initialize the trading bot to use its data collection methods
bot = TradingBotLLMSentiment()
print(f"Bot initialized with symbols: {bot.symbols}")

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

2025-03-27 21:19:26,790 - trading_bot_llm_sentiment_brian - INFO - Trading bot trading_bot_llm_sentiment_brian initialized with symbols: ['AAPL', 'MSFT', 'META', 'GOOGL', 'AMZN', 'NVDA']
Bot initialized with symbols: ['AAPL', 'MSFT', 'META', 'GOOGL', 'AMZN', 'NVDA']


In [72]:
days=365
symbols = ['AAPL', 'MSFT', 'META', 'GOOGL', 'AMZN', 'NVDA']

symbol = 'AAPL'
filename = 'combined_historical_with_daily_sentiment.csv'

In [76]:
api_key = os.environ['ALPACA_API_KEY']
api_secret = os.environ['ALPACA_API_SECRET']
data_client = StockHistoricalDataClient(api_key, api_secret)

In [79]:
multisymbol_request_params = StockLatestQuoteRequest(symbol_or_symbols=symbols)
latest_quotes = data_client.get_stock_latest_quote(multisymbol_request_params)

In [81]:
current_quote = latest_quotes[symbol]

In [82]:
current_quote.ask_price

228.0

In [80]:
print(latest_quotes)

{'META': {   'ask_exchange': ' ',
    'ask_price': 0.0,
    'ask_size': 0.0,
    'bid_exchange': 'V',
    'bid_price': 600.0,
    'bid_size': 6.0,
    'conditions': ['R'],
    'symbol': 'META',
    'tape': 'C',
    'timestamp': datetime.datetime(2025, 3, 27, 19, 59, 59, 999779, tzinfo=TzInfo(UTC))}, 'MSFT': {   'ask_exchange': 'V',
    'ask_price': 399.7,
    'ask_size': 1.0,
    'bid_exchange': 'V',
    'bid_price': 390.51,
    'bid_size': 1.0,
    'conditions': ['R'],
    'symbol': 'MSFT',
    'tape': 'C',
    'timestamp': datetime.datetime(2025, 3, 27, 19, 59, 59, 998011, tzinfo=TzInfo(UTC))}, 'AAPL': {   'ask_exchange': 'V',
    'ask_price': 228.0,
    'ask_size': 2.0,
    'bid_exchange': 'V',
    'bid_price': 222.7,
    'bid_size': 2.0,
    'conditions': ['R'],
    'symbol': 'AAPL',
    'tape': 'C',
    'timestamp': datetime.datetime(2025, 3, 27, 19, 59, 58, 353500, tzinfo=TzInfo(UTC))}, 'NVDA': {   'ask_exchange': 'V',
    'ask_price': 115.15,
    'ask_size': 6.0,
    'bid_exchan

In [31]:
df = bot.get_historical_data(symbol, days=days)

2025-03-27 21:29:53,934 - trading_bot_llm_sentiment_brian - INFO - Retrieved 250 bars for AAPL


In [48]:
df.head()

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
0,AAPL,2025-03-17 04:00:00+00:00,213.31,215.22,209.97,214.0,48073426.0,577436.0,213.202242
1,AAPL,2025-03-18 04:00:00+00:00,214.16,215.15,211.49,212.69,42432426.0,493004.0,213.10947
2,AAPL,2025-03-19 04:00:00+00:00,214.22,218.76,213.75,215.24,54385391.0,524678.0,215.609629
3,AAPL,2025-03-20 04:00:00+00:00,213.99,217.4899,212.22,214.1,48862947.0,499769.0,214.396693
4,AAPL,2025-03-21 04:00:00+00:00,211.56,218.84,211.28,218.27,94127768.0,496948.0,215.734078


In [5]:
def collect_historical_data_with_daily_sentiment(symbol, days=365):
    """
    Collect historical price data and daily sentiment data for a given symbol.
    
    Args:
        symbol (str): Stock symbol.
        days (int): Number of days of historical data to collect.
        
    Returns:
        DataFrame: Combined price and sentiment data.
    """
    print(f"Collecting data for {symbol}...")
    
    # Get historical price data
    df = bot.get_historical_data(symbol, days=days)
    if df is None:
        print(f"No historical data found for {symbol}")
        return None
    
    # Create a copy to avoid modifying the original dataframe
    df = df.copy()
    
    # Extract the date from the timestamp for daily grouping
    df['date'] = pd.to_datetime(df['timestamp']).dt.date
    daily_dates = df['date'].unique().tolist()
    
    print(f"Collected {len(df)} price data points, calculating sentiment for {len(daily_dates)} days...")
    
    # Add sentiment column
    df['sentiment'] = np.nan
    
    # Get sentiment for each day
    for date in daily_dates:
        date_str = pd.to_datetime(date).strftime('%Y-%m-%d')
        print(f"Getting sentiment for {symbol} for {date_str}")
        
        articles = 10
        news_date = pd.to_datetime(date)
        # Using a 1-day lookback range to fetch daily sentiment
        lookback_range = timedelta(days=1)
        
        sentiment = bot.get_sentiment_signal(symbol, articles, news_date, lookback_range)
        df.loc[df['date'] == date, 'sentiment'] = sentiment
        time.sleep(1)
    
    
    # Drop rows with NaN values
    # df = df.dropna()
    
    print(f"Final dataset: {len(df)} rows for {symbol}")
    return df


## Process all symbols and save the dataset

In [None]:
# List to store dataframes for each symbol
dfs = []

for symbol in symbols:
    try:
        data = collect_historical_data_with_daily_sentiment(symbol)
        if data is not None:
            # Optionally add a symbol column if you want a combined DF later
            data['symbol'] = symbol  
            
            # Save individual CSV for each symbol
            data.to_csv(f"data/{symbol}_historical_with_daily_sentiment.csv", index=False)
            print(f"Saved data for {symbol}")
            
            # Append to our list for later combining
            dfs.append(data)
        else:
            print(f"No data found for {symbol}")
    except Exception as e:
        print(f"Error processing {symbol}: {e}")

# If you want a single combined DataFrame for all symbols:
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df.to_csv("data/combined_historical_with_daily_sentiment.csv", index=False)
    print("Saved combined data for all symbols.")
else:
    print("No data to combine.")


Collecting data for META...
2025-03-26 19:05:42,850 - trading_bot_llm_sentiment_brian - INFO - Retrieved 250 bars for META
Collected 250 price data points, calculating sentiment for 250 days...
Getting sentiment for META for 2024-03-27
Getting sentiment for META for 2024-03-28
Getting sentiment for META for 2024-04-01
2025-03-26 19:05:45,483 - trading_bot_llm_sentiment_brian - INFO - News articles for META:
2025-03-26 19:05:45,484 - trading_bot_llm_sentiment_brian - INFO -   Article 1: https://finnhub.io/api/news?id=d9dcc7807959f9508f5981f9a49ca85ea0820782a355ebeb17d6ce5857f02ae4
2025-03-26 19:05:45,485 - trading_bot_llm_sentiment_brian - INFO -   Article 2: https://finnhub.io/api/news?id=5ff100437475a5fae8b93ed294211e0ebcd0e659128db6d4aa987aec26730d81
2025-03-26 19:05:45,486 - trading_bot_llm_sentiment_brian - INFO -   Article 3: https://finnhub.io/api/news?id=999024a9e1737a39ed43fd26cef6ff2af23222d2d3d06faf3e378f01e1c6df2e
2025-03-26 19:05:45,486 - trading_bot_llm_sentiment_brian - I

## Model Training

In [39]:
SEQ_LENGTH = 30  # use past 30 days
FEATURES = ['open', 'high', 'low', 'close', 'volume', 'sentiment']
TRAIN_RATIO = 0.8

In [32]:
combined_file = "data/combined_historical_with_daily_sentiment.csv"
if not os.path.exists(combined_file):
    raise FileNotFoundError(f"{combined_file} does not exist.")


In [33]:
df = pd.read_csv(combined_file, index_col=0)
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values(['symbol', 'timestamp'])

In [35]:
df.tail(1)

Unnamed: 0_level_0,timestamp,open,high,low,close,volume,trade_count,vwap,date,sentiment
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
NVDA,2025-03-27 04:00:00+00:00,111.35,114.45,110.66,111.43,236658701.0,1787874.0,112.34066,2025-03-27,-0.5


In [36]:
X_train_list, y_train_list = [], []
X_test_list, y_test_list = [], []

In [47]:
def create_sequences(df, seq_length=SEQ_LENGTH, feature_columns=FEATURES):
    """
    Create sequences from the DataFrame using a sliding window.
    For each sequence of past `seq_length` days, the target is the closing price on day seq_length+1.
    """
    X, y = [], []
    if len(df) < seq_length + 1:
        return None, None
    for i in range(len(df) - seq_length):
        # Sequence of features for past seq_length days
        seq = df.iloc[i:i+seq_length][feature_columns].values
        # Target is next day's closing price
        target = df.iloc[i+seq_length]['close']
        X.append(seq)
        y.append(target)
    return np.array(X), np.array(y)

In [48]:
for symbol, group in df.groupby('symbol'):
    group = group.sort_values('timestamp').reset_index(drop=True)
    
    X_symbol, y_symbol = create_sequences(group, seq_length=SEQ_LENGTH)
    
    if len(X_symbol) == 0:
        print(f"Not enough data for {symbol}; skipping.")
        continue
    
    # Split data by time (first TRAIN_RATIO for training, rest for testing)
    split_idx = int(len(X_symbol) * TRAIN_RATIO)
    X_train_list.append(X_symbol[:split_idx])
    y_train_list.append(y_symbol[:split_idx])
    X_test_list.append(X_symbol[split_idx:])
    y_test_list.append(y_symbol[split_idx:])
    
    print(f"{symbol}: {len(X_symbol)} sequences, {split_idx} training samples and {len(X_symbol) - split_idx} testing samples.")


AAPL: 221 sequences, 176 training samples and 45 testing samples.
AMZN: 221 sequences, 176 training samples and 45 testing samples.
GOOGL: 221 sequences, 176 training samples and 45 testing samples.
META: 221 sequences, 176 training samples and 45 testing samples.
MSFT: 221 sequences, 176 training samples and 45 testing samples.
NVDA: 221 sequences, 176 training samples and 45 testing samples.


In [49]:
X_train = np.concatenate(X_train_list, axis=0)
y_train = np.concatenate(y_train_list, axis=0)
X_test = np.concatenate(X_test_list, axis=0)
y_test = np.concatenate(y_test_list, axis=0)

In [50]:
print("Combined training shape:", X_train.shape)
print("Combined testing shape:", X_test.shape)

Combined training shape: (1056, 30, 6)
Combined testing shape: (270, 30, 6)


In [51]:
num_train_samples, seq_len, num_features = X_train.shape
X_train_2d = X_train.reshape(-1, num_features)
X_test_2d = X_test.reshape(-1, num_features)

In [59]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_2d)
X_test_scaled = scaler.transform(X_test_2d)

In [53]:
X_train_scaled = X_train_scaled.reshape(num_train_samples, seq_len, num_features)
num_test_samples = X_test.shape[0]
X_test_scaled = X_test_scaled.reshape(num_test_samples, seq_len, num_features)


In [60]:
with open('data/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [54]:
model = Sequential()
model.add(LSTM(64, activation='tanh', return_sequences=True, input_shape=(SEQ_LENGTH, num_features)))
model.add(Dropout(0.2))  # Dropout to reduce overfitting

# Second LSTM layer
model.add(LSTM(32, activation='tanh'))
model.add(Dropout(0.2))

# An intermediate dense layer
model.add(Dense(16, activation='relu'))

# Final output layer for regression
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')
model.summary()

In [55]:
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [56]:
# history = model.fit(X_train_scaled, y_train, 
#                     epochs=50, batch_size=32, 
#                     validation_split=0.2, callbacks=[early_stop])

history = model.fit(X_train_scaled, y_train, 
                    epochs=100, batch_size=32, 
                    validation_split=0.2, callbacks=[])

Epoch 1/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 122149.3359 - val_loss: 150955.8438
Epoch 2/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 120361.4219 - val_loss: 149708.5469
Epoch 3/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 118707.0859 - val_loss: 147717.0469
Epoch 4/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 116307.1953 - val_loss: 145179.3906
Epoch 5/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 113122.1328 - val_loss: 142266.1875
Epoch 6/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - loss: 109749.5469 - val_loss: 139010.8906
Epoch 7/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 105761.0547 - val_loss: 135454.0469
Epoch 8/100
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - los

In [57]:
test_loss = model.evaluate(X_test_scaled, y_test)
print("Test Loss (MSE):", test_loss)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 23154.1504
Test Loss (MSE): 33324.23046875


In [66]:
now = datetime.now().strftime('%Y-%m-%d')
print(now)

2025-03-27


In [69]:
os.makedirs('data/models', exist_ok=True)
date = datetime.now().strftime('%Y-%m-%d')
path = 'data'
model_name = f"lstm_combined_model_{date}.keras"
model.save(f"data/models/{model_name}")
print(f"Saved LSTM model to models/{model_name}")

Saved LSTM model to models/lstm_combined_model_2025-03-27.keras


In [65]:
import pickle
from tensorflow.keras.models import load_model
import path

def predict_todays_closing_price_enriched(symbol):
    """
    Predict today's closing price for the given symbol using enriched price data that includes sentiment.
    
    Process:
      1. Load and update historical sentiment data.
      2. Filter the enriched data for the given symbol.
      3. Use the last TIME_SERIES_LENGTH rows (the most recent trading days) as the input sequence.
      4. Scale the sequence and predict today's closing price using the trained model.
    
    Returns:
        float or None: The predicted closing price for today, or None if not enough data.
    """
    # Load the scaler
    with open('data/scaler.pkl', 'rb') as f:
        SCALER = pickle.load(f)
    
    # Load the trained model.
    MODEL = load_model("data/models/lstm_combined_model_2025-03-27.keras")
    
    # Step 1: Load and update historical sentiment data.
    enriched_df = bot.load_and_update_sentiment_data(30)
    if enriched_df is None or enriched_df.empty:
        print("Failed to load sentiment data.")
        return None

    # Step 2: Filter for the specific symbol and sort by timestamp.
    symbol_df = enriched_df[enriched_df['symbol'] == symbol].sort_values(by="timestamp")
    if symbol_df.empty:
        print(f"No data available for {symbol}.")
        return None

    # Step 3: Check if there are at least TIME_SERIES_LENGTH rows.
    if len(symbol_df) < TIME_SERIES_LENGTH:
        print("Not enough data to form a prediction sequence.")
        return None
    else:
        # Use the last TIME_SERIES_LENGTH rows for prediction.
        feature_columns = ['open', 'high', 'low', 'close', 'volume', 'sentiment']
        latest_seq = symbol_df.iloc[-TIME_SERIES_LENGTH:][feature_columns].values

    # Step 4: Reshape and scale the sequence.
    latest_seq_2d = latest_seq.reshape(-1, NUM_FEATURES)
    latest_seq_scaled_2d = SCALER.transform(latest_seq_2d)
    latest_seq_scaled = latest_seq_scaled_2d.reshape(1, TIME_SERIES_LENGTH, NUM_FEATURES)
    
    # Predict using the trained model.
    predicted_price = MODEL.predict(latest_seq_scaled)
    predicted_value = predicted_price[0][0]
    print(f"Predicted closing price for {symbol} is {predicted_value:.2f}")
    return predicted_value


ModuleNotFoundError: No module named 'path'

In [69]:
import pickle
from tensorflow.keras.models import load_model

def predict_todays_closing_price_enriched(symbol):
    """
    Predict today's closing price for the given symbol using enriched price data that includes sentiment.
    
    Process:
      1. Load and update historical sentiment data.
      2. Filter the enriched data for the given symbol.
      3. Use the last TIME_SERIES_LENGTH rows (the most recent trading days) as the input sequence.
      4. Scale the sequence and predict today's closing price using the trained model.
    
    Returns:
        float or None: The predicted closing price for today, or None if not enough data.
    """
    # Load the scaler
    with open('data/scaler.pkl', 'rb') as f:
        SCALER = pickle.load(f)
    
    # Load the trained model.
    MODEL = load_model("data/models/lstm_combined_model_2025-03-27.keras")
    
    # Step 1: Load and update historical sentiment data.
    enriched_df = bot.load_and_update_sentiment_data(30)
    if enriched_df is None or enriched_df.empty:
        print("Failed to load sentiment data.")
        return None

    # Step 2: Filter for the specific symbol and sort by timestamp.
    symbol_df = enriched_df[enriched_df['symbol'] == symbol].sort_values(by="timestamp")
    if symbol_df.empty:
        print(f"No data available for {symbol}.")
        return None

    # Step 3: Check if there are at least TIME_SERIES_LENGTH rows.
    if len(symbol_df) < TIME_SERIES_LENGTH:
        print("Not enough data to form a prediction sequence.")
        return None
    else:
        # Use the last TIME_SERIES_LENGTH rows for prediction.
        feature_columns = ['open', 'high', 'low', 'close', 'volume', 'sentiment']
        latest_seq = symbol_df.iloc[-TIME_SERIES_LENGTH:][feature_columns].values

    # Step 4: Reshape and scale the sequence.
    latest_seq_2d = latest_seq.reshape(-1, NUM_FEATURES)
    latest_seq_scaled_2d = SCALER.transform(latest_seq_2d)
    latest_seq_scaled = latest_seq_scaled_2d.reshape(1, TIME_SERIES_LENGTH, NUM_FEATURES)
    
    # Predict using the trained model.
    predicted_price = MODEL.predict(latest_seq_scaled)
    predicted_value = predicted_price[0][0]
    print(f"Predicted closing price for {symbol} is {predicted_value:.2f}")
    return predicted_value


In [70]:
symbol = 'AAPL'

In [71]:
prediction = predict_todays_closing_price_enriched(symbol)

2025-03-27 21:39:00,351 - trading_bot_llm_sentiment_brian - INFO - Loaded combined historical data from data/combined_historical_with_daily_sentiment.csv
2025-03-27 21:39:00,442 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for AAPL
2025-03-27 21:39:00,543 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for MSFT


  saveable.load_own_variables(weights_store.get(inner_path))


2025-03-27 21:39:00,644 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for META
2025-03-27 21:39:00,752 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for GOOGL
2025-03-27 21:39:00,871 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for AMZN
2025-03-27 21:39:00,970 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for NVDA
2025-03-27 21:39:00,975 - trading_bot_llm_sentiment_brian - INFO - Combined historical data is up-to-date.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
Predicted closing price for AAPL is 305.51


In [26]:
scaler = MinMaxScaler()


In [7]:
df = bot.load_and_update_sentiment_data()

2025-03-27 21:20:37,784 - trading_bot_llm_sentiment_brian - INFO - Loaded combined historical data from data/combined_historical_with_daily_sentiment.csv
2025-03-27 21:20:37,936 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for AAPL
2025-03-27 21:20:38,042 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for MSFT
2025-03-27 21:20:38,138 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for META
2025-03-27 21:20:38,240 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for GOOGL
2025-03-27 21:20:38,343 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for AMZN
2025-03-27 21:20:38,451 - trading_bot_llm_sentiment_brian - INFO - Retrieved 22 bars for NVDA
2025-03-27 21:20:38,456 - trading_bot_llm_sentiment_brian - INFO - Combined historical data is up-to-date.


In [8]:
len(df)

180