In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [2]:
# Step 1: Load the historical data
data = pd.read_csv('historical_data.csv')


In [3]:
# Step 2: Load the sentiment scores data
sentiment_data = pd.read_csv('daily_sentiment_scores.csv')

In [4]:
# Step 3: Filter the required columns and sort by date and ticker
data = data[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Ticker']]
data['Date'] = pd.to_datetime(data['Date'])
data.sort_values(['Ticker', 'Date'], inplace=True)

In [5]:
# Step 4: Merge sentiment scores (neutral, positive, negative) with historical data
sentiment_data['datetime'] = pd.to_datetime(sentiment_data['datetime'])
data = pd.merge(data, sentiment_data[['datetime', 'neutral', 'positive', 'negative']], left_on='Date', right_on='datetime', how='left')

In [6]:
# Step 5: Encode the Ticker column
encoder = OneHotEncoder(sparse=False)
tickers_encoded = encoder.fit_transform(data[['Ticker']])



In [7]:
# Step 6: Add the encoded tickers as additional features
encoded_columns = [f'Ticker_{i}' for i in range(tickers_encoded.shape[1])]
encoded_df = pd.DataFrame(tickers_encoded, columns=encoded_columns, index=data.index)
data = pd.concat([data.reset_index(drop=True), encoded_df], axis=1)

In [8]:
# Step 7: Scale numerical features (Open, High, Low, Close, Volume)
scaler = MinMaxScaler(feature_range=(0, 1))
numerical_features = ['Open', 'High', 'Low', 'Close', 'Volume', 'neutral', 'negative', 'positive']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

In [9]:
# The sentiment columns are already between 0 and 1, so we don't scale them again
numerical_features_with_sentiment = numerical_features + ['neutral', 'positive', 'negative']

In [10]:
# Step 8: Preprocessing function
def preprocess_data(data, n_steps=10):
    X, y = [], []
    feature_columns = numerical_features + encoded_columns
    for ticker in data['Ticker'].unique():
        ticker_data = data[data['Ticker'] == ticker]  # Ensuring sequences are within the same stock
        ticker_data = ticker_data[feature_columns].values
        for i in range(n_steps, len(ticker_data)):
            X.append(ticker_data[i - n_steps:i, :])  # Last n_steps rows as features
            y.append(ticker_data[i, 3])  # Predict 'Close' price
    return np.array(X), np.array(y)

In [11]:
# Step 9: Preprocess the data
X, y = preprocess_data(data, n_steps=10)

In [12]:
# Step 10: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Step 11: Build the LSTM model
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(50, return_sequences=False),
    Dropout(0.2),
    Dense(25, activation='relu'),
    Dense(1)  
])

  super().__init__(**kwargs)


In [14]:
# Step 12: Compile the model
model.compile(optimizer='adam', loss='mse')

In [15]:
# Step 13: Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)


Epoch 1/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 11ms/step - loss: 0.0073 - val_loss: 0.0075
Epoch 2/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - loss: 0.0066 - val_loss: 0.0075
Epoch 3/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 0.0068 - val_loss: 0.0075
Epoch 4/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 0.0068 - val_loss: 0.0075
Epoch 5/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 0.0068 - val_loss: 0.0075
Epoch 6/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 0.0069 - val_loss: 0.0075
Epoch 7/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - loss: 0.0069 - val_loss: 0.0075
Epoch 8/20
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - loss: 0.0075 - val_loss: 0.0075
Epoch 9/20
[1m847/847[0m [32m━━━━

In [16]:
# Step 14: Make predictions
predictions = model.predict(X_test)

[1m236/236[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step


In [19]:
# Step 13: Inverse scale the predictions and actual values
y_test_scaled = scaler.inverse_transform(
    np.concatenate((np.zeros((y_test.shape[0], 7)), y_test.reshape(-1, 1)), axis=1))[:, -1]
predictions_scaled = scaler.inverse_transform(
    np.concatenate((np.zeros((predictions.shape[0], 7)), predictions), axis=1))[:, -1]

In [20]:
# Step 16: Evaluate the model
mae = mean_absolute_error(predictions_scaled, y_test_scaled)
mape = mean_absolute_percentage_error(predictions_scaled, y_test_scaled)
acc2 = 1 - mape

In [21]:
# Step 17: Print performance metrics
print(f"Mean Absolute Error = {mae}")
print(f"Mean Absolute Percentage Error = {mape*100:.2f}%")
print(f"Accuracy = {acc2 *100:.2f}%")

Mean Absolute Error = 0.01412253912858771
Mean Absolute Percentage Error = 8.03%
Accuracy = 91.97%
