In [1]:
import pandas as pd
import numpy as np
from pyspark.sql.functions import *
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import matplotlib.pyplot as plt

StatementMeta(spark001, 10, 2, Finished, Available)

2024-06-22 13:04:05.110469: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load every stock-price CSV under the data-lake folder, taking column names from
# each file's header row. No schema is supplied or inferred here; the columns the
# notebook needs are cast explicitly in a later cell.
df = spark.read.csv(
    'abfss://files@datalake9vbgk2l.dfs.core.windows.net/synapse/workspaces/data/stock_price/*.csv',
    header=True,
)

StatementMeta(spark001, 10, 3, Finished, Available)

In [4]:
# Cast the columns used below to proper types: close -> float, date -> date
df = df.withColumn('close', col('close').cast('float')).withColumn('date', to_date(col('date')))


def _latest_closes(prices, symbol, n_rows=100):
    """Return the `n_rows` most recent (date, close) rows for `symbol`, newest first.

    Rows with a null/NaN `close` or a null `date` (e.g. values the casts above
    could not parse) are dropped *before* the limit is applied. This keeps NaNs
    out of the scaler and the LSTM, and the window still holds `n_rows` usable
    prices whenever that many exist.

    Parameters
    ----------
    prices : Spark DataFrame with `symbol`, `date` and `close` columns.
    symbol : ticker to keep, compared for equality against the `symbol` column.
    n_rows : maximum number of most-recent rows to return (default 100).
    """
    return (
        prices.filter(col("symbol") == symbol)
        .select("date", "close")
        .dropna(subset=["date", "close"])
        .orderBy("date", ascending=False)
        .limit(n_rows)
    )


# Most recent rows for NVDA and AAPL (same pipeline, different ticker)
df_nvda = _latest_closes(df, "NVDA")
df_aapl = _latest_closes(df, "AAPL")

# Collect to pandas for local processing and flip to chronological order
# (Spark returned newest first; the sequence builder needs oldest first).
df_nvda_pd = df_nvda.toPandas().iloc[::-1].reset_index(drop=True)
df_aapl_pd = df_aapl.toPandas().iloc[::-1].reset_index(drop=True)

# Closing prices as NumPy arrays
nvda_prices = df_nvda_pd['close'].values
aapl_prices = df_aapl_pd['close'].values

StatementMeta(spark001, 10, 5, Finished, Available)

In [5]:
# Each ticker gets its own MinMaxScaler, so the fitted scaler can later be used
# to map that ticker's predictions back to price units.
scaler_nvda, scaler_aapl = MinMaxScaler(), MinMaxScaler()

# fit_transform works on a 2-D array, so present the prices as a single column
nvda_scaled = scaler_nvda.fit_transform(nvda_prices.reshape(len(nvda_prices), 1))
aapl_scaled = scaler_aapl.fit_transform(aapl_prices.reshape(len(aapl_prices), 1))

# Create sequences for LSTM
def create_sequences(data, seq_length):
    """Slice `data` into overlapping windows and the value that follows each one.

    Window ``i`` is ``data[i:i + seq_length]`` and its target is
    ``data[i + seq_length]``, for every ``i`` where the target exists.

    Parameters
    ----------
    data : array-like, indexed along its first axis (time).
    seq_length : int, number of consecutive rows per window; must be >= 1.

    Returns
    -------
    (X, y) : tuple of np.ndarray
        ``X`` has shape ``(n, seq_length, *data.shape[1:])`` and ``y`` has shape
        ``(n, *data.shape[1:])`` with ``n = max(len(data) - seq_length, 0)``.
        When there are too few rows, both arrays are empty but keep that rank,
        so callers can still read ``X.shape[1]``.

    Raises
    ------
    ValueError
        If `seq_length` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be a positive integer, got {seq_length}")

    data = np.asarray(data)
    n_samples = len(data) - seq_length
    trailing_shape = data.shape[1:]

    if n_samples <= 0:
        # Not enough rows for even one (window, target) pair: return empty
        # arrays with the same rank a non-empty result would have.
        return (
            np.empty((0, seq_length) + trailing_shape, dtype=data.dtype),
            np.empty((0,) + trailing_shape, dtype=data.dtype),
        )

    X = np.stack([data[start:start + seq_length] for start in range(n_samples)])
    # The targets are simply the rows after the first window; copy so the
    # result does not alias the caller's array.
    y = data[seq_length:].copy()
    return X, y

# Window size: number of consecutive scaled closes fed to the model per sample
# (adjust as needed)
seq_length = 10

# Build (window, next value) training pairs for each ticker
X_nvda, y_nvda = create_sequences(nvda_scaled, seq_length)
X_aapl, y_aapl = create_sequences(aapl_scaled, seq_length)

# Make the LSTM input layout explicit: (samples, time steps, 1 feature)
X_nvda = X_nvda.reshape((len(X_nvda), X_nvda.shape[1], 1))
X_aapl = X_aapl.reshape((len(X_aapl), X_aapl.shape[1], 1))

# Define LSTM model function
def build_lstm_model(input_shape):
    """Create and compile a small LSTM regressor.

    The network is one LSTM layer with 50 units followed by a Dense layer with
    a single output, compiled with the Adam optimizer and mean-squared-error
    loss.

    Parameters
    ----------
    input_shape : forwarded unchanged as the LSTM layer's ``input_shape``
        (callers in this notebook pass ``(time steps, features)``).

    Returns
    -------
    The compiled ``Sequential`` model (not yet trained).
    """
    layers = [
        LSTM(units=50, input_shape=input_shape),
        Dense(units=1),
    ]
    model = Sequential(layers)
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# Seed the Python, NumPy and TensorFlow random generators before any weights are
# created, so weight initialisation and batch shuffling repeat when this cell is
# re-run in the same environment.
keras.utils.set_random_seed(42)

# Build one LSTM model per ticker; the input shape is (time steps, features)
model_nvda = build_lstm_model(input_shape=(X_nvda.shape[1], X_nvda.shape[2]))
model_aapl = build_lstm_model(input_shape=(X_aapl.shape[1], X_aapl.shape[2]))

# Train the models. The History objects are kept so the loss curves can be
# inspected later; assigning them also stops the cell echoing a History repr.
history_nvda = model_nvda.fit(X_nvda, y_nvda, epochs=50, batch_size=32, verbose=1)
history_aapl = model_aapl.fit(X_aapl, y_aapl, epochs=50, batch_size=32, verbose=1)


StatementMeta(spark001, 10, 6, Finished, Available)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

<keras.src.callbacks.History at 0x7e3f86958340>

In [None]:
# Function to predict next n days
def predict_next_days(model, data, scaler, seq_length, future_days):
    """Forecast `future_days` values by feeding each prediction back as input.

    Starting from the last `seq_length` rows of `data`, the model predicts one
    step. That prediction is appended to the window and the oldest value is
    dropped, then the process repeats. Later steps therefore depend on earlier
    predictions rather than on observed data.

    Parameters
    ----------
    model : Keras-style model whose ``predict(batch, verbose=0)`` takes an array
        of shape ``(1, seq_length, 1)`` and returns an array indexable as
        ``[0, 0]``.
    data : array-like of scaled, single-feature values, oldest first.
    scaler : object with ``inverse_transform`` accepting an ``(n, 1)`` array,
        used to map the predictions back to the original units.
    seq_length : int, window size the model was trained with; must be >= 1.
    future_days : int, number of steps to forecast.

    Returns
    -------
    np.ndarray
        1-D array of `future_days` inverse-transformed predictions; an empty
        array when `future_days` is zero or negative.

    Raises
    ------
    ValueError
        If `seq_length` is smaller than 1, or `data` does not supply exactly
        `seq_length` single-feature values for the starting window.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be a positive integer, got {seq_length}")

    # Flatten the starting window once; it is reshaped for the model on each step.
    window = np.asarray(data)[-seq_length:].reshape(-1)
    if window.size != seq_length:
        raise ValueError(
            f"data must provide at least {seq_length} single-feature rows, "
            f"got a starting window of {window.size} values"
        )

    if future_days <= 0:
        # Nothing to forecast; skip the scaler, which may reject empty input.
        return np.array([])

    predicted = []
    for _ in range(future_days):
        # verbose=0 avoids printing a progress bar for every single-step call
        next_scaled = model.predict(window.reshape(1, seq_length, 1), verbose=0)[0, 0]
        predicted.append(next_scaled)
        # Slide the window: drop the oldest value, append the new prediction
        window = np.append(window[1:], next_scaled)

    return scaler.inverse_transform(np.array(predicted).reshape(-1, 1)).flatten()

# Forecast horizon: number of future days to predict
future_days = 30

# Recursive forecasts for NVDA and AAPL, returned in original price units
nvda_predictions = predict_next_days(model_nvda, nvda_scaled, scaler_nvda, seq_length, future_days)
aapl_predictions = predict_next_days(model_aapl, aapl_scaled, scaler_aapl, seq_length, future_days)

# Print predicted prices; the headings are built from future_days so they stay
# correct if the horizon is changed.
print(f"Predicted NVDA prices for the next {future_days} days:")
print(nvda_predictions)
print(f"\nPredicted AAPL prices for the next {future_days} days:")
print(aapl_predictions)
