In [1]:
import yfinance as yf
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
import ta
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.layers import Input
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Set TensorFlow logging level
tf.get_logger().setLevel('ERROR')

In [3]:
# Fetch data for a stock (e.g., Apple) from a start date till today
today = datetime.today().strftime('%Y-%m-%d')
data = yf.download("MSFT", start="2022-01-01", end=today)

[*********************100%***********************]  1 of 1 completed


In [4]:
# Calculate technical indicators
data['SMA_20'] = data['Close'].rolling(window=20).mean()
data['SMA_50'] = data['Close'].rolling(window=50).mean()
data['RSI'] = ta.momentum.RSIIndicator(data['Close']).rsi()
data['MACD'] = ta.trend.MACD(data['Close']).macd_diff()

# Lag features
data['Lag_1'] = data['Close'].shift(1)

In [5]:
# Clip outliers
data['Close'] = data['Close'].clip(lower=data['Close'].quantile(0.05), upper=data['Close'].quantile(0.95))

In [6]:
# Percentage change in closing price
data['Price_Change'] = data['Close'].pct_change()

In [7]:
# Binary classification: 1 for price increase, 0 for decrease or no change
data['Target'] = data['Price_Change'].apply(lambda x: 1 if x > 0 else 0)

In [8]:
# Shift labels to align with features
data['Target'] = data['Target'].shift(-1)

In [9]:
# Handle NaNs in the features before scaling
data.ffill(inplace=True)
data.dropna(inplace=True)  # Drop any remaining NaNs, if necessary

In [10]:
# Features to scale
features_to_scale = ['Close', 'Volume', 'SMA_20', 'SMA_50', 'RSI', 'MACD']

# Initialize the scaler
scaler = MinMaxScaler(feature_range=(0, 1))

# Apply scaling
data[features_to_scale] = scaler.fit_transform(data[features_to_scale])

# Verify the scaling
print(data[features_to_scale].head())
print(data[features_to_scale].tail())

               Close    Volume    SMA_20    SMA_50       RSI      MACD
Date                                                                  
2022-03-15  0.258590  0.325669  0.258576  0.311933  0.405958  0.565335
2022-03-16  0.296395  0.372237  0.257209  0.307940  0.493969  0.681291
2022-03-17  0.300728  0.281085  0.256247  0.304597  0.503758  0.762339
2022-03-18  0.327933  0.444594  0.258427  0.303019  0.564725  0.852818
2022-03-21  0.321302  0.249026  0.260952  0.301563  0.546667  0.890707
               Close    Volume    SMA_20    SMA_50       RSI      MACD
Date                                                                  
2024-08-21  0.973905  0.089290  0.819698  0.981411  0.505734  0.977280
2024-08-22  0.929051  0.132132  0.819057  0.979716  0.371686  0.906796
2024-08-23  0.935526  0.120843  0.817151  0.977315  0.393331  0.863401
2024-08-26  0.918294  0.051391  0.814175  0.974536  0.344655  0.801395
2024-08-27  0.920122  0.055566  0.812133  0.971694  0.351467  0.760455


In [11]:
# Prepare features and labels
X = data[['SMA_20', 'SMA_50', 'RSI', 'MACD', 'Lag_1']].values
y = data['Target'].values

X = X.reshape((X.shape[0], 1, X.shape[1]))

In [12]:
# Define LSTM model with dropout and regularization
model = Sequential()
model.add(Input(shape=(X.shape[1], X.shape[2])))
model.add(LSTM(50, return_sequences=True, dropout=0.2, kernel_regularizer=regularizers.l2(0.01)))
model.add(LSTM(50, dropout=0.2, kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
# Use early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [14]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [15]:
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 131ms/step - accuracy: 0.5133 - loss: 1.5426 - val_accuracy: 0.4242 - val_loss: 1.4058
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5435 - loss: 1.3489 - val_accuracy: 0.4242 - val_loss: 1.2434
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5505 - loss: 1.1944 - val_accuracy: 0.4242 - val_loss: 1.1117
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5297 - loss: 1.0739 - val_accuracy: 0.4242 - val_loss: 1.0076
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5513 - loss: 0.9785 - val_accuracy: 0.4242 - val_loss: 0.9328
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5419 - loss: 0.9045 - val_accuracy: 0.4242 - val_loss: 0.8780
Epoch 7/50
[1m13/13[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x26d79fe9f90>

In [16]:
# Predict on the test data
predictions = (model.predict(X_test) > 0.5).astype(int)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 263ms/step


In [17]:
# Evaluate the model
print(classification_report(y_test, predictions, zero_division=1))
print(confusion_matrix(y_test, predictions))

# Check the distribution of predictions
unique, counts = np.unique(predictions, return_counts=True)
print("Predictions distribution:", dict(zip(unique, counts)))

              precision    recall  f1-score   support

         0.0       0.60      1.00      0.75        75
         1.0       1.00      0.00      0.00        49

    accuracy                           0.60       124
   macro avg       0.80      0.50      0.38       124
weighted avg       0.76      0.60      0.46       124

[[75  0]
 [49  0]]
Predictions distribution: {0: 124}
