In [1]:
import numpy as np
from tensorflow.keras.regularizers import l2
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import SimpleRNN, Dense, LSTM, Dropout, GRU
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError
from tensorflow.keras.losses import MeanSquaredError
import matplotlib.pyplot as plt
import tensorflow as tf
import warnings

warnings.simplefilter("ignore", category=Warning)

In [2]:
# Set the global seed
tf.random.set_seed(42)

In [3]:
data=pd.read_csv("Amazon_Stock_Price.csv")

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Volume
0,0,2000-01-03,4.075,4.478125,3.952344,4.46875,322352000
1,1,2000-01-04,4.26875,4.575,4.0875,4.096875,349748000
2,2,2000-01-05,3.525,3.75625,3.4,3.4875,769148000
3,3,2000-01-06,3.565625,3.634375,3.2,3.278125,375040000
4,4,2000-01-07,3.35,3.525,3.309375,3.478125,210108000


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6050 entries, 0 to 6049
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  6050 non-null   int64  
 1   Date        6050 non-null   object 
 2   Open        6050 non-null   float64
 3   High        6050 non-null   float64
 4   Low         6050 non-null   float64
 5   Close       6050 non-null   float64
 6   Volume      6050 non-null   int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 331.0+ KB


In [6]:
data = data.drop(columns=['Unnamed: 0'])

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6050 entries, 0 to 6049
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    6050 non-null   object 
 1   Open    6050 non-null   float64
 2   High    6050 non-null   float64
 3   Low     6050 non-null   float64
 4   Close   6050 non-null   float64
 5   Volume  6050 non-null   int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 283.7+ KB


In [8]:
data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume
6045,2024-01-12,155.389999,156.199997,154.009995,154.619995,40460300
6046,2024-01-16,153.529999,154.990005,152.149994,153.160004,41384600
6047,2024-01-17,151.490005,152.149994,149.910004,151.710007,34953400
6048,2024-01-18,152.770004,153.779999,151.820007,153.5,37850200
6049,2024-01-19,153.830002,155.759995,152.740005,155.339996,51033700


In [9]:
#Sorting the DataFrame by date and setting the 'Date' column as the index is a crucial step when working with time series data.
data['Date']=pd.to_datetime(data['Date'])
data.sort_values(by='Date', inplace=True)
data.set_index('Date', inplace =True)

In [10]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-03,4.075,4.478125,3.952344,4.46875,322352000
2000-01-04,4.26875,4.575,4.0875,4.096875,349748000
2000-01-05,3.525,3.75625,3.4,3.4875,769148000
2000-01-06,3.565625,3.634375,3.2,3.278125,375040000
2000-01-07,3.35,3.525,3.309375,3.478125,210108000


In [11]:
# Feature engineering
data['Year'] = data.index.year
data['Month'] = data.index.month
data['Day'] = data.index.day

In [12]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Year,Month,Day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-01-03,4.075,4.478125,3.952344,4.46875,322352000,2000,1,3
2000-01-04,4.26875,4.575,4.0875,4.096875,349748000,2000,1,4
2000-01-05,3.525,3.75625,3.4,3.4875,769148000,2000,1,5
2000-01-06,3.565625,3.634375,3.2,3.278125,375040000,2000,1,6
2000-01-07,3.35,3.525,3.309375,3.478125,210108000,2000,1,7


In [13]:
#Adding the season as a feature based on the month is another excellent way to enhance the model's understanding of the data. 
data['Season']=data['Month'].apply(lambda month: 1 if month in [12,1,2] else 2 if month in [3,4,5] else 3 if month in [6,7,8] else 4)

#The season column created are defined as follows:
#Season 1: December, January, February (Winter)
#Season 2: March, April, May (Spring)
#Season 3: June, July, August (Summer)
#Season 4: September, October, November (Fall)

In [14]:
# Correlation analysis to select top features
corr_matrix = data.corr().abs()
target = 'Close'
normalized_corr = (corr_matrix[target] - corr_matrix[target].min()) / (corr_matrix[target].max() - corr_matrix[target].min())

In [15]:
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Year,Month,Day,Season
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2000-01-03,4.075,4.478125,3.952344,4.46875,322352000,2000,1,3,1
2000-01-04,4.26875,4.575,4.0875,4.096875,349748000,2000,1,4,1
2000-01-05,3.525,3.75625,3.4,3.4875,769148000,2000,1,5,1
2000-01-06,3.565625,3.634375,3.2,3.278125,375040000,2000,1,6,1
2000-01-07,3.35,3.525,3.309375,3.478125,210108000,2000,1,7,1


In [16]:
normalized_corr

Open      0.999783
High      0.999898
Low       0.999903
Close     1.000000
Volume    0.301610
Year      0.828067
Month     0.036163
Day       0.000000
Season    0.027937
Name: Close, dtype: float64

In [17]:
# Select top 5 features correlated with the target
n = 5
top_features = normalized_corr.sort_values(ascending=False).index[1:n+1]
top_features = list(top_features)


In [18]:
top_features

['Low', 'High', 'Open', 'Year', 'Volume']

In [19]:
top_features.append('Close')

In [20]:
top_features

['Low', 'High', 'Open', 'Year', 'Volume', 'Close']

In [21]:
#Function to create sequences for RNN model
def create_sequences_optimized(data, seq_length, target_idx):
    # Convert the DataFrame to a NumPy array of type float32
    data_values = data.values.astype('float32')
    
    # Calculate the number of samples based on the sequence length
    num_samples = len(data) - seq_length
    
    # Get the number of features from the data
    num_features = data.shape[1]
    
    # Initialize the xs array to hold the input sequences
    xs = np.empty((num_samples, seq_length, num_features), dtype='float32')
    
    # Initialize the ys array to hold the target values
    ys = np.empty(num_samples, dtype='float32')
    
    # Loop through each sample to create the sequences
    for i in range(num_samples):
        # Get the sequence of data for the current sample
        xs[i] = data_values[i:i + seq_length]
        
        # Get the target value for the current sample
        ys[i] = data_values[i + seq_length, target_idx]
    
    # Return the sequences and the corresponding target values
    return xs, ys


In [22]:
# Prepare the data
data = data[top_features].copy()
scalers = {}
for feature in top_features:
    scaler = MinMaxScaler(feature_range=(0, 1))
    data[feature] = scaler.fit_transform(data[[feature]])
    scalers[feature] = scaler

In [23]:
# Parameters for the LSTM model
#pass in values for sequence and batch_size
sequence =10 
target_column = 'Close'

In [24]:
#code below creates sequences for the RNN model
target_idx = top_features.index(target_column)
seq_length = sequence
x, y = create_sequences_optimized(data, seq_length, target_idx)

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
#Split the data using train_test_split 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42, shuffle=False)


In [27]:
# create fxn for r2 score
def r2_score(y_true,y_pred):
  ss_res = K.sum(K.square(y_true - y_pred))
  ss_tot = K.sum(K.square(y_true - K.mean(y_true)))
  return 1-ss_res/(ss_tot+K.epsilon())


In [28]:
# Define the model
# 6 in the input shape because there are 6 columns or features
model = Sequential()
model.add(LSTM(100, input_shape=(seq_length, 6), return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(64))
model.add(Dense(1, activation='relu'))

In [29]:
# Compile the model
model.compile(optimizer='adam', loss= 'mse', metrics= ['mse','r2_score'])


In [30]:
# Setup ModelCheckpoint callback to save the entire model
checkpoint= ModelCheckpoint ('model.keras', save_best_only= True, save_weights_only = False, verbose=1)

In [31]:
# Train the model
history = model.fit(x_train, y_train, epochs=30, batch_size=32, validation_data=(x_test, y_test), callbacks=[checkpoint])

Epoch 1/30
[1m130/133[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 10ms/step - loss: 3.7027e-04 - mse: 3.7027e-04 - r2_score: 0.8438
Epoch 1: val_loss improved from inf to 0.00142, saving model to model.keras
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - loss: 3.6374e-04 - mse: 3.6374e-04 - r2_score: 0.8465 - val_loss: 0.0014 - val_mse: 0.0014 - val_r2_score: 0.9703
Epoch 2/30
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 2.8502e-05 - mse: 2.8502e-05 - r2_score: 0.9881
Epoch 2: val_loss improved from 0.00142 to 0.00116, saving model to model.keras
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 34ms/step - loss: 2.8477e-05 - mse: 2.8477e-05 - r2_score: 0.9881 - val_loss: 0.0012 - val_mse: 0.0012 - val_r2_score: 0.9759
Epoch 3/30
[1m131/133[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 22ms/step - loss: 3.2246e-05 - mse: 3.2246e-05 - r2_score: 0.9865
Epoch 3: val_loss did not impro

In [32]:
historydf = pd.DataFrame(history.history)

In [33]:
historydf.head()

Unnamed: 0,loss,mse,r2_score,val_loss,val_mse,val_r2_score
0,0.000151,0.000151,0.93611,0.001424,0.001424,0.970311
1,2.5e-05,2.5e-05,0.989362,0.001158,0.001158,0.975865
2,2.5e-05,2.5e-05,0.989612,0.005538,0.005538,0.884546
3,2.1e-05,2.1e-05,0.991207,0.003644,0.003644,0.924028
4,1.7e-05,1.7e-05,0.992785,0.004002,0.004002,0.916567


In [34]:

# Evaluate the model
test_loss, test_mse, test_r2 = model.evaluate(x_test, y_test)
print(f'Test Loss (MSE): {test_loss:.4f}')
print(f'Test MSE: {test_mse:.4f}')
print(f'Test R²: {test_r2:.4f}')


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0017 - mse: 0.0017 - r2_score: 0.9058     
Test Loss (MSE): 0.0037
Test MSE: 0.0037
Test R²: 0.9221


In [35]:
#Load the entire model
model = load_model('model.keras')

In [36]:
#To make predictions
y_pred = model.predict(x_test)
y_train_pred = model.predict(x_train)

[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step


In [37]:
#code below to inverse transforms the predictions and actual values
y_test_actual = scalers[target_column].inverse_transform(y_test.reshape(-1, 1)).flatten()
y_pred_actual = scalers[target_column].inverse_transform(y_pred).flatten()
y_train_actual = scalers[target_column].inverse_transform(y_train.reshape(-1, 1)).flatten()
y_train_pred_actual = scalers[target_column].inverse_transform(y_train_pred).flatten()


In [38]:
#code below to calculates R2 scores for train and test sets
from sklearn.metrics import r2_score
r2_train = r2_score(y_train_actual, y_train_pred_actual)
r2_test = r2_score(y_test_actual, y_pred_actual)

In [39]:
r2_train

0.9965803523957171

In [40]:
r2_test

0.9797623766378079

In [41]:
#code below to prints the results
print(f'Inverse Test Min: {y_test_actual.min()}, Inversprovide an accurate code to improve on the model e Test Max: {y_test_actual.max()}')
print(f'Inverse Pred Min: {y_pred_actual.min()}, Inverse Pred Max: {y_pred_actual.max()}')
print(f'R² Score For Train Data: {round(r2_train * 100, 2)}%')
print(f'R² Score For Test Data: {round(r2_test * 100, 2)}%')

Inverse Test Min: 35.95349884033203, Inversprovide an accurate code to improve on the model e Test Max: 186.57049560546875
Inverse Pred Min: 38.146949768066406, Inverse Pred Max: 171.9779815673828
R² Score For Train Data: 99.66%
R² Score For Test Data: 97.98%
