## Define Imports

In [None]:
import math # Mathematical functions 
import numpy as np # Fundamental package for scientific computing with Python
import pandas as pd # Additional functions for analysing and manipulating data
from datetime import date, timedelta, datetime # Date Functions
from pandas.plotting import register_matplotlib_converters # This function adds plotting functions for calender dates
import matplotlib.pyplot as plt # Important package for visualization - we use this to plot the market data
import matplotlib.dates as mdates # Formatting dates
import tensorflow as tf
from sklearn.metrics import mean_absolute_error, mean_squared_error # Packages for measuring model performance / errors
from keras import Sequential # Deep learning library, used for neural networks
from keras.layers import LSTM, Dense, Dropout # Deep learning classes for recurrent and regular densely-connected layers
from keras.callbacks import EarlyStopping # EarlyStopping during model training
# from keras.models import Sequential
# from keras.layers import Dense, SimpleRNN , Dropout, LSTM, GRU
from sklearn.preprocessing import RobustScaler, MinMaxScaler # This Scaler removes the median and scales the data according to the quantile range to normalize the price data 
import seaborn as sns # Visualization
sns.set_style('white', { 'axes.spines.right': False, 'axes.spines.top': False})
%matplotlib inline
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# Mount Google Drive at /content/drive (this import only resolves inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')
# Path of the CSV that is read later with pd.read_csv; despite the name it points to a file, not a directory.
data_dir = '/content/drive/MyDrive/Deep_Learning/MSFT.csv'
import os
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort. It is set after
# numpy/tensorflow were already imported above — confirm it still takes effect at this point.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
from sklearn.preprocessing import MinMaxScaler

In [None]:
# import yfinance
# msft = yf.download(MSFT, start=start_date, end=end_date)
# hist = msft.history(period='max')
# hist

In [None]:
# Read the price history from the CSV and use the 'Date' column as the row index.
MSFT = pd.read_csv(data_dir).set_index('Date')
display(MSFT)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Interactive view of the adjusted close over time.
# The secondary y-axis is kept in the spec so a Volume trace can be overlaid later, e.g.:
#   fig.add_trace(go.Bar(x=MSFT.index, y=MSFT['Volume'], name='Volume'), secondary_y=True)
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(
    # The series plotted is 'Adj Close', so the legend entry says so.
    go.Scatter(x=MSFT.index, y=MSFT['Adj Close'], name='Adjusted Closing Price'),
    secondary_y=False,
)
fig.update_layout(title_text='MSFT adjusted closing price', xaxis_title='Date')
fig.update_yaxes(title_text='Adj Close', secondary_y=False)
fig.show()

In [None]:
# Plot line charts: one panel per column with a shared x-axis.
# Drawing every column on a single axis lets the largest-scale column flatten all the others
# (NOTE(review): assumes a 'Volume'-like column is present, as the plotly cell's commented trace suggests).
df_plot = MSFT.copy()
df_plot.plot(subplots=True, sharex=True, figsize=(14, 7))
plt.tight_layout()
plt.show()

In [None]:
class Basenet():
    """Chronological train/validation/test splitting and sliding-window extraction.

    ``split_data`` fills ``train_set``, ``val_set`` and ``test_set`` with consecutive
    row ranges of the input; ``partition_dataset`` turns one such 2-D array into
    (window, next-step target) pairs for single-step prediction.
    """

    def __init__(self,index_Close):
        """Store the column index that ``partition_dataset`` uses as the target."""
        self.train_set = None
        self.val_set = None
        self.test_set = None
        self.index_Close = index_Close

    def partition_dataset(self,sequence_length, data):
        """Cut ``data`` into overlapping windows and their next-step targets.

        Args:
            sequence_length: number of consecutive rows per window (must be >= 1).
            data: array-like of shape (rows, features); a 1-D input is treated as
                a single feature column.

        Returns:
            ``(x, y)`` where ``x`` has shape (rows - sequence_length, sequence_length,
            features) and ``y[i]`` is column ``index_Close`` of the row right after
            window ``i``. When ``data`` has no more than ``sequence_length`` rows both
            arrays are empty but keep their trailing dimensions.

        Raises:
            ValueError: if ``sequence_length`` is smaller than 1.
        """
        if sequence_length < 1:
            raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")
        data = np.asarray(data)
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        n_windows = max(data.shape[0] - sequence_length, 0)
        # Row i of window_idx lists the row numbers i .. i + sequence_length - 1.
        window_idx = np.arange(n_windows)[:, None] + np.arange(sequence_length)[None, :]
        x = data[window_idx]
        # Target of window i is the row at i + sequence_length; copy so callers never alias `data`.
        y = data[sequence_length:, self.index_Close].copy()
        return x, y

    def split_data(self,stock,splitsize):
        """Split ``stock`` by row order into train, validation and test sets.

        The last ``splitsize`` fraction of the rows becomes the test set; of the
        remaining rows, the last ``splitsize / 2`` fraction becomes the validation
        set and the rest the training set. Results are stored on the instance and
        the sizes are printed.

        Args:
            stock: array-like of shape (rows, features); it is copied, not modified.
            splitsize: test fraction, between 0 and 1 inclusive.

        Returns:
            True once the three sets have been stored.

        Raises:
            ValueError: if ``splitsize`` is outside [0, 1].
        """
        if not 0 <= splitsize <= 1:
            raise ValueError(f"splitsize must be within [0, 1], got {splitsize}")
        data = np.array(stock)  # np.array copies, so the caller's data is left untouched
        if data.ndim == 1:
            data = data.reshape(-1, 1)

        test_size = int(np.round(len(data)*splitsize))
        bg_train_size = len(data) - test_size  # rows left for train + validation
        val_size = int(np.round(bg_train_size*(splitsize/2)))
        train_size = bg_train_size - val_size
        print(f"Train :{train_size}, Val :{val_size}, test :{test_size}")
        print("bg_train :",bg_train_size)

        self.train_set = data[:train_size,:]
        self.val_set = data[train_size:bg_train_size,:]
        self.test_set = data[bg_train_size:,:]
        print("Training Size :",len(self.train_set))
        print("Validation Size :",len(self.val_set))
        print("Testing Size :",len(self.test_set))
        return True


  

## Data pre-processing

>  We do not use the Date column as a feature (it is kept only as the index), since it gives no valuable information about the stock price and is just a measure of time passing.

> We drop Adjusted Close since it is calculated after market close, following the dividend calculation, and is not a continuous flow.



In [None]:
# Feature table: every column except 'Adj Close', plus a 'Prediction' column that mirrors 'Close'.
filtered_df = MSFT.drop(columns=['Adj Close'])
filtered_df['Prediction'] = filtered_df['Close']
# Show the last rows of the feature table
display(filtered_df.tail())

# Min-max scaling: bring every feature column into the range 0..1
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(filtered_df)
display(data_scaled[7500:,:])

# A second scaler fitted on the 'Close' column alone, used for scaling predictions
scaler_pred = MinMaxScaler()
df_Close = filtered_df[['Close']]
np_Close_scaled = scaler_pred.fit_transform(df_Close)

In [None]:
# Sanity-check the chronological split on the scaled data.
# Basenet requires the target column index (3 matches the index passed to LSTM_cls below), and
# split_data takes only the data and the test fraction; the window length is not one of its arguments.
l = Basenet(3)
l.split_data(data_scaled, .2)

# LSTM network

In [None]:

class LSTM_cls(Basenet):
  """Two-layer LSTM regressor built on the splits stored by ``Basenet.split_data``."""

  def __init__(self,index_Close):
    super().__init__(index_Close)
    self.optimizer = None
    self.x_train = None

  def _windows(self, sequence_length, split, split_name):
    """Window one stored split, failing with a clear message if it cannot be used."""
    if split is None:
      raise RuntimeError("split_data() must be called before building or training the model")
    x, y = self.partition_dataset(sequence_length, split)
    if x.ndim != 3 or len(x) == 0:
      raise ValueError(
          f"{split_name} split has {len(split)} rows, which is too short for "
          f"sequence_length={sequence_length}")
    return x, y

  def train(self,model,sequence_length,epochs=50,batch_size=16,callbacks=None):
    """Fit ``model`` on the training split, validating on the validation split.

    The test split is deliberately not passed to ``fit`` so it stays unseen until
    final evaluation. If the validation split is too short to yield a single
    window, training runs without validation data.

    Args:
      model: compiled Keras model, e.g. the one returned by ``LSTM_model``.
      sequence_length: rows per input window; must match the model's input shape.
      epochs: number of training epochs.
      batch_size: mini-batch size.
      callbacks: optional list of Keras callbacks (for example an ``EarlyStopping``
        instance); none are used by default.

    Returns:
      The Keras ``History`` object returned by ``model.fit``.
    """
    x_train,y_train = self._windows(sequence_length, self.train_set, "training")
    print(f"trainx : {x_train.shape}")
    print(f"y_train : {y_train.shape}")

    validation_data = None
    if self.val_set is not None and len(self.val_set) > sequence_length:
      x_val,y_val = self.partition_dataset(sequence_length,self.val_set)
      print(f"valx : {x_val.shape}")
      print(f"y_val : {y_val.shape}")
      validation_data = (x_val, y_val)

    # Training the model
    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=validation_data,
                        callbacks=callbacks
                      )
    return history

  def LSTM_model(self,sequence_length,optimizer = 'adam',lr=0.001):
    """Build and compile the stacked LSTM for windows of ``sequence_length`` rows.

    Args:
      sequence_length: rows per input window.
      optimizer: ``'adam'`` or ``'sgd'``.
      lr: learning rate passed to the optimizer.

    Returns:
      A compiled Keras ``Sequential`` model with mean-squared-error loss.

    Raises:
      ValueError: if ``optimizer`` is not one of the supported names, or the
        training split is too short for ``sequence_length``.
      RuntimeError: if ``split_data`` has not been called yet.
    """
    # Reject unknown names up front instead of compiling with a None or stale optimizer.
    if optimizer not in ('adam', 'sgd'):
      raise ValueError(f"Unsupported optimizer {optimizer!r}; expected 'adam' or 'sgd'")

    x_train,_ = self._windows(sequence_length, self.train_set, "training")
    self.x_train = x_train
    if optimizer == 'adam':
      self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    else:
      self.optimizer = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.0)

    timesteps, n_features = self.x_train.shape[1], self.x_train.shape[2]
    # Layer width is tied to the input size: one unit per (time step, feature) pair.
    n_neurons = timesteps * n_features

    model = Sequential()
    model.add(
        LSTM(
            n_neurons, return_sequences=True,
             input_shape=(timesteps, n_features))
        )
    model.add(
        LSTM(n_neurons, return_sequences=False)
        )
    model.add(Dense(5))
    model.add(Dense(1))

    # Compile the model
    model.compile(optimizer=self.optimizer, loss='mse')
    return model


In [None]:
# Scratch check of the training split's shape.
# NOTE(review): `lstm` is first assigned in the following cell, so this raises NameError on a
# fresh kernel. split_data stores train_set as a 2-D slice (data[:train_size,:]), so `[:, -1]`
# already yields a 1-D array and the chained `[:,3]` would then raise IndexError — confirm what
# was intended (e.g. lstm.train_set[:, 3] or lstm.x_train[:, -1][:, 3]) and move it below that cell.
lstm.train_set[:, -1][:,3].shape

In [None]:
# Build and train the LSTM, holding out the last 20% of the rows as the test split.
SEQUENCE_LENGTH = 20  # rows per input window; the model and the training windows must agree on it
lstm = LSTM_cls(3)
lstm.split_data(data_scaled,.2)
lstm_model = lstm.LSTM_model(SEQUENCE_LENGTH)
history = lstm.train(lstm_model,SEQUENCE_LENGTH)

# Plot training & validation loss values
fig, ax = plt.subplots(figsize=(16, 5), sharex=True)
ax.plot(history.history["loss"], label="Train")
# val_loss is only recorded when validation data was supplied to fit().
if "val_loss" in history.history:
    ax.plot(history.history["val_loss"], label="Validation")
ax.set_title("Model loss")
ax.set_ylabel("Loss")
ax.set_xlabel("Epoch")
ax.xaxis.set_major_locator(plt.MaxNLocator(50))
ax.legend(loc="upper left")
ax.grid()
plt.show()