# Stock Price Prediction Using LSTM

In [1]:

import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



### Feature Engineering

In [2]:
df = pd.read_csv(R"..\data-fetcher\data\total_data.csv")
df_with_hours = df.loc[df['target'].notnull()][['time','hit']]
hours = [1,2,4,8]
for hour in hours:
    df_with_hours[str(hour) + '_hour'] = df_with_hours.apply(lambda x: 1 if((x['hit'] - x['time']) < hour*3600) else 0 , axis = 'columns')
df_with_hours.drop(['time','hit'], axis='columns',inplace=True)
df = df.join(df_with_hours)
df[df[['1_hour','2_hour','4_hour','8_hour']].notnull().any(axis= 'columns')]
df.to_csv(R"..\data-fetcher\data\total_data_with_hours.csv")





### Data Cleaning

In [3]:


df = pd.read_csv(R"..\data-fetcher\data\total_data_with_hours.csv",index_col = 0)

df['time'] = pd.to_datetime(df['time'], unit = "s")
df.set_index(pd.DatetimeIndex(df['time']), inplace = True)
df = df.drop('time', axis= 'columns')

def filter_weekend_hours(df):
    # Extract day of the week and hour
    df['day_of_week'] = df.index.dayofweek  # Monday=0, Sunday=6
    df['hour'] = df.index.hour

    # Define the condition to remove entries between Friday 8 PM to Monday 4 AM
    condition = ~(
        ((df['day_of_week'] == 4) & (df['hour'] >= 20)) |  # Last 4 hours of Friday
        (df['day_of_week'] == 5) |                         # Entire Saturday
        (df['day_of_week'] == 6) |                         # Entire Sunday
        ((df['day_of_week'] == 0) & (df['hour'] < 4))      # First 4 hours of Monday
    )

    # Apply filter and drop temporary columns
    filtered_df = df[condition].drop(columns=['day_of_week', 'hour'])
    return filtered_df

# Apply the function
filtered_df = filter_weekend_hours(df)
filtered_df.loc[filtered_df.index.dayofweek == 0]
filtered_df.to_csv(R"..\data-fetcher\data\total_data_with_hours.csv")

### Find Earliest Times 

In [4]:
def get_earliest_times_by_day(df):
    """
    Finds the earliest time for each day of the week in a DataFrame with a datetime index.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with a datetime index.

    Returns:
    - pd.Series: A series with the days of the week as the index and their earliest times as values.
    """
    # Ensure the DataFrame has a datetime index
    if not isinstance(df.index, pd.DatetimeIndex):
        raise ValueError("The DataFrame must have a datetime index.")
    
    # Extract the day of the week
    df['day_of_week'] = df.index.dayofweek  # Monday=0, Sunday=6

    # Group by day_of_week and find the earliest time for each day
    earliest_times = df.groupby('day_of_week').apply(lambda x: x.index.min())

    # Map day_of_week to day names for readability
    day_names = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 
                 4: "Friday", 5: "Saturday", 6: "Sunday"}
    earliest_times.index = earliest_times.index.map(day_names)

    return earliest_times
get_earliest_times_by_day(filtered_df)

  earliest_times = df.groupby('day_of_week').apply(lambda x: x.index.min())


day_of_week
Monday      2024-11-25 14:56:00
Tuesday     2024-11-26 00:00:00
Wednesday   2024-11-27 00:00:00
Thursday    2024-11-28 00:00:00
Friday      2024-11-29 00:00:00
dtype: datetime64[ns]

In [4]:
filtered_df = pd.read_csv(R"..\data-fetcher\data\total_data_with_hours.csv")

filtered_df.set_index(pd.DatetimeIndex(filtered_df['time']), inplace = True)
filtered_df = filtered_df.drop('time', axis= 'columns')
filtered_df = filtered_df.loc[pd.DatetimeIndex()]



TypeError: DatetimeIndex(...) must be called with a collection of some kind, None was passed

In [8]:
box_indices = filtered_df.index[filtered_df['target'].notnull()].to_list()
box_indices
dataset = []
y = []
max_length = 0
#filtered_df['close'] = scaler.fit_transform(filtered_df['close'])
#filtered_df['moving_average'] = scaler.fit_transform(filtered_df['moving_average'])
#filtered_df[['high_reward','high_risk','low_reward','low_risk']] = scaler.fit_transform(filtered_df[['high_reward','high_risk','low_reward','low_risk']],axis= 0)
for i in box_indices:
    close = [filtered_df['close'].loc[i]]
    box_data = filtered_df[['high_reward','high_risk','low_reward','low_risk']].loc[i].to_list()


    prev_bars = filtered_df['close'].loc[i - pd.Timedelta(minutes = 60):i - pd.Timedelta(minutes = 1)].to_list()
    prev_ma = filtered_df['moving_average'].loc[i - pd.Timedelta(minutes = 60):i - pd.Timedelta(minutes = 1)].to_list()
    

    y_ = filtered_df[['1_hour','2_hour','4_hour','8_hour']].loc[i].to_list()


    
    if(len(prev_bars) == 60): 
        dataset.append(close+prev_bars+prev_ma+box_data)
        y.append(y_)
    else:
        continue
training_len = int(len(dataset)*0.80)
training_data,training_y = scaler.fit_transform(dataset[:training_len]),y[:training_len]
testing_data,testing_y = scaler.fit_transform(dataset[training_len:]),y[training_len:]













KeyboardInterrupt: 

In [9]:
len(box_indices)

14372

In [60]:
n = 20
training_set,testing_set = [], []
target_train, target_test = [],[]

for i in range(training_len - n):
    training_set.append(training_data[i:i+n])
    target_train.append(training_y[i + n - 1])

for i in range(len(testing_data) - n):
    testing_set.append(testing_data[i:i+n])
    target_test.append(testing_y[i + n - 1])







### Model Building

In [105]:
class Price_Prediction_Model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers,output_size):
        super(Price_Prediction_Model, self).__init__()
        self.lstm = nn.LSTM(input_size = input_size, hidden_size=hidden_size, num_layers = num_layers,batch_first = True)
        self.fc = nn.Linear(hidden_size, output_size)
    def forward(self,x):
        x = x.unsqueeze(0)
        lstm_out,_ = self.lstm(x)

        fc_out = self.fc(lstm_out[:,-1,:])
        hit_prob = torch.sigmoid(fc_out)
        return hit_prob

        


### More Data Preprocessing

In [62]:
from numpy import float32


n = 20
training_set,testing_set = [], []
target_train, target_test = [],[]

for i in range(training_len - n):
    training_set.append(training_data[i:i+n])
    target_train.append(training_y[i + n - 1])

for i in range(len(testing_data) - n):
    testing_set.append(testing_data[i:i+n])
    target_test.append(testing_y[i + n - 1])


training_set = torch.tensor(training_set, device = device, dtype=torch.float32)
testing_set = torch.tensor(testing_set, device = device, dtype=torch.float32)
target_train = torch.tensor(target_train, device = device, dtype=torch.float32)
target_test = torch.tensor(target_test, device = device, dtype=torch.float32)





    










    

    

20

In [96]:


EPOCHS = 100



model = Price_Prediction_Model(125,128,1,4)
model = model.to(device)
loss_function = nn.BCELoss()
optimizer = optim.SGD(model.parameters())

for epoch in range(EPOCHS):

    epoch_loss = 0

    model.train()

    
    optimizer.zero_grad()

    pred = model(training_set)
        

    loss = loss_function(pred,target_train)
    epoch_loss += loss.item()
    loss.backward()
    optimizer.step()
#    model.eval()
#    with torch.no_grad():
#        validation_loss = 0
#        for sequence,target in zip(testing_set,target_test):
#            pred = model(sequence).view(4,)
#            loss = loss_function(pred,target)
#            validation_loss += loss.item()



    if((epoch+1)%20 == 1):
        print("Epoch: %d, loss: %1.5f, validation_loss: %1.5f" % (epoch+1, epoch_loss/len(training_set), 0.01))


















Epoch: 1, loss: 0.06906, validation_loss: 0.01000
Epoch: 21, loss: 0.06899, validation_loss: 0.01000
Epoch: 41, loss: 0.06892, validation_loss: 0.01000
Epoch: 61, loss: 0.06885, validation_loss: 0.01000
Epoch: 81, loss: 0.06878, validation_loss: 0.01000


### Reset Model Weights

In [76]:
for layer in model.children():
   if hasattr(layer, 'reset_parameters'):
       layer.reset_parameters()

### Model Testing

In [108]:


model.eval()
with torch.no_grad():
    validation_loss = 0
    
    pred = model(training_set)
    loss = loss_function(pred,target_train)
    validation_loss += loss.item()
    print(pred)
    print("validation loss: %1.5f" % (validation_loss))
    


tensor([[0.4770, 0.4781, 0.4812, 0.5308],
        [0.4769, 0.4782, 0.4812, 0.5307],
        [0.4770, 0.4781, 0.4812, 0.5307],
        [0.4769, 0.4781, 0.4812, 0.5307],
        [0.4766, 0.4779, 0.4813, 0.5307],
        [0.4773, 0.4780, 0.4814, 0.5306],
        [0.4772, 0.4782, 0.4813, 0.5306],
        [0.4771, 0.4782, 0.4812, 0.5307],
        [0.4771, 0.4782, 0.4812, 0.5307],
        [0.4771, 0.4782, 0.4811, 0.5307]], device='cuda:0')
validation loss: 0.68713
