In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import warnings

warnings.simplefilter("ignore", category=Warning)


In [2]:

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Set the global seed
tf.random.set_seed(42)

In [4]:
Stocks = pd.read_csv('/content/drive/My Drive/Amazon Stocks/Amazon_Stock_Price.csv', index_col=0)

In [5]:
Stocks.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2000-01-03,4.075,4.478125,3.952344,4.46875,322352000
1,2000-01-04,4.26875,4.575,4.0875,4.096875,349748000
2,2000-01-05,3.525,3.75625,3.4,3.4875,769148000
3,2000-01-06,3.565625,3.634375,3.2,3.278125,375040000
4,2000-01-07,3.35,3.525,3.309375,3.478125,210108000


In [6]:
#Removed the column with the header Unnamed
Stocks = pd.read_csv('/content/drive/My Drive/Amazon Stocks/Amazon_Stock_Price.csv', usecols=lambda column: column != 'Unnamed: 0')

In [7]:
Stocks.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2000-01-03,4.075,4.478125,3.952344,4.46875,322352000
1,2000-01-04,4.26875,4.575,4.0875,4.096875,349748000
2,2000-01-05,3.525,3.75625,3.4,3.4875,769148000
3,2000-01-06,3.565625,3.634375,3.2,3.278125,375040000
4,2000-01-07,3.35,3.525,3.309375,3.478125,210108000


In [8]:
Stocks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6050 entries, 0 to 6049
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    6050 non-null   object 
 1   Open    6050 non-null   float64
 2   High    6050 non-null   float64
 3   Low     6050 non-null   float64
 4   Close   6050 non-null   float64
 5   Volume  6050 non-null   int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 283.7+ KB


In [9]:
Stocks.describe()

Unnamed: 0,Open,High,Low,Close,Volume
count,6050.0,6050.0,6050.0,6050.0,6050.0
mean,38.160734,38.614074,37.670999,38.152548,121571500.0
std,51.780179,52.388542,51.116488,51.754349,98246910.0
min,0.2955,0.305,0.2755,0.2985,17626000.0
25%,2.269625,2.30725,2.231313,2.275125,65470000.0
50%,10.35325,10.63475,10.21975,10.48625,99175000.0
75%,64.440501,65.273502,63.576249,64.713247,145984500.0
max,187.199997,188.654007,184.839493,186.570496,2086584000.0


In [10]:
Stocks.shape

(6050, 6)

In [11]:
Stocks['Date'] = pd.to_datetime(Stocks['Date'])
Stocks.sort_values(by='Date', inplace=True)
Stocks.set_index('Date', inplace=True)

In [12]:
Stocks.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-03,4.075,4.478125,3.952344,4.46875,322352000
2000-01-04,4.26875,4.575,4.0875,4.096875,349748000
2000-01-05,3.525,3.75625,3.4,3.4875,769148000
2000-01-06,3.565625,3.634375,3.2,3.278125,375040000
2000-01-07,3.35,3.525,3.309375,3.478125,210108000


In [13]:
# Feature engineering
Stocks['year'] = Stocks.index.year
Stocks['month'] = Stocks.index.month
Stocks['day'] = Stocks.index.day

In [14]:
Stocks['season'] = Stocks['month'].apply(lambda month: 1 if month in [12, 1, 2] else 2
                                 if month in [3, 4, 5] else 3 if month in [6, 7, 8] else 4)

In [15]:
# Correlation analysis to select top features
corr_matrix = Stocks.corr().abs()
target = 'Close'
normalized_corr = (corr_matrix[target] - corr_matrix[target].min()) / (corr_matrix[target].max() - corr_matrix[target].min())

In [16]:
Stocks.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,year,month,day,season
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2000-01-03,4.075,4.478125,3.952344,4.46875,322352000,2000,1,3,1
2000-01-04,4.26875,4.575,4.0875,4.096875,349748000,2000,1,4,1
2000-01-05,3.525,3.75625,3.4,3.4875,769148000,2000,1,5,1
2000-01-06,3.565625,3.634375,3.2,3.278125,375040000,2000,1,6,1
2000-01-07,3.35,3.525,3.309375,3.478125,210108000,2000,1,7,1


In [17]:
#Applying normalizer
normalized_corr

Open      0.999783
High      0.999898
Low       0.999903
Close     1.000000
Volume    0.301610
year      0.828067
month     0.036163
day       0.000000
season    0.027937
Name: Close, dtype: float64

In [18]:
# Select top 5 features correlated with the target
n = 5
top_features = normalized_corr.sort_values(ascending=False).index[1:n+1]
top_features = list(top_features)

In [19]:
top_features

['Low', 'High', 'Open', 'year', 'Volume']

In [20]:
#Creating a new column known as "Close"
top_features.append('Close')

In [21]:
top_features

['Low', 'High', 'Open', 'year', 'Volume', 'Close']

In [22]:
# Function to create sequences for the RNN model
def create_sequences_optimized(data, seq_length, target_idx):
    data_values = data.values.astype('float32')
    num_samples = len(data) - seq_length
    num_features = data.shape[1]

    xs = np.empty((num_samples, seq_length, num_features), dtype='float32')
    ys = np.empty(num_samples, dtype='float32')

    for i in range(num_samples):
        xs[i] = data_values[i:i+seq_length]
        ys[i] = data_values[i+seq_length, target_idx]

    return xs, ys

In [23]:
# Prepare the data
STK = Stocks[top_features].copy()
scalers = {}
for feature in top_features:
    scaler = MinMaxScaler(feature_range=(0, 1))
    STK[feature] = scaler.fit_transform(STK[[feature]])
    scalers[feature] = scaler

In [24]:
# Parameters for the LSTM model
# Passing in values for sequence and batch_size
sequence = 12
batch_size = 32
target_column = 'Close'

In [25]:
# Creating sequences for the RNN model
target_idx = top_features.index(target_column)
seq_length = sequence
X, y = create_sequences_optimized(STK, seq_length, target_idx)

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
# Splitting the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

In [28]:
from tensorflow.keras import backend as K

In [29]:
def r2_score(y_true,y_pred):
  ss_res = K.sum(K.square(y_true - y_pred))
  ss_tot = K.sum(K.square(y_true - K.mean(y_true)))
  return 1-ss_res/(ss_tot+K.epsilon())

In [30]:
#RNN model
# Build the RNN model
model = Sequential()
model.add(LSTM(128, input_shape=(seq_length, 6), return_sequences=True))
model.add(LSTM(128))
model.add(Dense(1, activation='relu'))

In [31]:
# Compile the model
model.compile(optimizer='adam', loss='mse')

In [32]:
# Set up ModelCheckpoint callback to save the entire model
checkpoint = ModelCheckpoint('model.h5', save_best_only=True, save_weights_only=False, verbose=1)

In [33]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), callbacks=[checkpoint])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.00087, saving model to model.h5
Epoch 2/10
Epoch 2: val_loss improved from 0.00087 to 0.00085, saving model to model.h5
Epoch 3/10
Epoch 3: val_loss did not improve from 0.00085
Epoch 4/10
Epoch 4: val_loss did not improve from 0.00085
Epoch 5/10
Epoch 5: val_loss did not improve from 0.00085
Epoch 6/10
Epoch 6: val_loss improved from 0.00085 to 0.00077, saving model to model.h5
Epoch 7/10
Epoch 7: val_loss improved from 0.00077 to 0.00071, saving model to model.h5
Epoch 8/10
Epoch 8: val_loss did not improve from 0.00071
Epoch 9/10
Epoch 9: val_loss improved from 0.00071 to 0.00062, saving model to model.h5
Epoch 10/10
Epoch 10: val_loss did not improve from 0.00062


In [34]:

model.save('model.h5')

In [35]:

# Load the best model and make predictions
model = load_model('model.h5')
y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)



In [36]:
# Inverse transformation of the predictions and actual values
y_test_actual = scalers[target_column].inverse_transform(y_test.reshape(-1, 1)).flatten()
y_pred_actual = scalers[target_column].inverse_transform(y_pred).flatten()
y_train_actual = scalers[target_column].inverse_transform(y_train.reshape(-1, 1)).flatten()
y_train_pred_actual = scalers[target_column].inverse_transform(y_train_pred).flatten()

In [37]:
from sklearn.metrics import r2_score

In [38]:
#Calculate R2 scores for train and test sets
r2_train = r2_score(y_train_actual, y_train_pred_actual)
r2_test = r2_score(y_test_actual, y_pred_actual)

In [39]:
# Print the results
# Summarizing and evaluating the performance of a regression model, both in terms of the value ranges and the goodness of fit.
print(f'Inverse Test Min: {y_test_actual.min()}, Inverse Test Max: {y_test_actual.max()}')
print(f'Inverse Pred Min: {y_pred_actual.min()}, Inverse Pred Max: {y_pred_actual.max()}')
print(f'R² Score For Train Data: {round(r2_train * 100, 2)}%')
print(f'R² Score For Test Data: {round(r2_test * 100, 2)}%')

Inverse Test Min: 81.81999969482422, Inverse Test Max: 186.57049560546875
Inverse Pred Min: 85.219970703125, Inverse Pred Max: 178.1699676513672
R² Score For Train Data: 99.83%
R² Score For Test Data: 97.22%


The R² score for both training and test data is a key metric to evaluate the model's performance.

the closer the R² is to 100%, the better the model fits the data.

With the R² score of the train and test data edging closer to 100%m we can conclude that the model is a better fit for the data.