In [2]:
### **Install the Libraries**

In [3]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from tqdm.auto import tqdm

from datetime import date

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Use GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cpu


In [5]:
# Free up unused cached memory
if device == 'cuda':
    torch.cuda.empty_cache()

In [6]:
# Ticker symbol for S&P 500
ticker = "^GSPC"

#yyyy-mm-dd
start_date = "1957-03-04"
end_date = "2024-09-01"

In [7]:
def download_needed_data_to_csv(start_date: str, end_date:str) -> None:
    yf.download(ticker, start=start_date, end=end_date, interval='1d').to_csv('../training_data/s&p500.csv')

download_needed_data_to_csv(start_date, end_date)

[*********************100%***********************]  1 of 1 completed


In [8]:
all_data = pd.read_csv("../training_data/s&p500.csv")

In [9]:
# chunks = np.array_split(all_data, 15)

# # Create a plot
# plt.figure(figsize=(10, 6))

# for i, chunk in enumerate(chunks):
#     plt.figure(figsize=(10, 5))
    
#     # Assuming you want to plot one column (e.g., 'Close')
#     plt.plot(chunk.index, chunk['Close'], label=f'Chunk {i+1}')
    
#     # Add title and labels
#     plt.title(f'Chunk {i+1} of the Data')
#     plt.xlabel('Index')
#     plt.ylabel('Close Price')
#     plt.legend()
    
#     # Show the plot
#     plt.show()

In [10]:
all_data_copy = all_data.drop('Date', axis=1)
corr_matrix = all_data_copy.corr()
print(corr_matrix['Close'].sort_values(ascending=False))

Close        1.000000
Adj Close    1.000000
Low          0.999969
High         0.999964
Open         0.999364
Volume       0.774932
Name: Close, dtype: float64


In [11]:
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler to scale data between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit and transform the data (excluding the date if necessary)
scaled_data = scaler.fit_transform(all_data_copy)

# Convert back to DataFrame to visualize (optional)
scaled_data = pd.DataFrame(scaled_data, columns=['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'])
print(scaled_data.head())


       Open      High       Low     Close  Adj Close    Volume
0  0.007806  0.000902  0.000907  0.000903   0.000903  0.000055
1  0.007835  0.000931  0.000936  0.000931   0.000931  0.000052
2  0.007837  0.000932  0.000937  0.000933   0.000933  0.000051
3  0.007833  0.000929  0.000934  0.000929   0.000929  0.000050
4  0.007808  0.000904  0.000909  0.000904   0.000904  0.000032


In [13]:
from sklearn.model_selection import train_test_split
from math import ceil

def create_sequences(data: pd.DataFrame, seq_length: int):
    """ Returns tuple of needed DataFrames for training and validation"""
    columns = data.columns
    X, y = pd.DataFrame(columns=data.columns), pd.DataFrame(columns=data.columns)
    print(type(data.iloc[1:3]))
    # print(data.head())
    # print(data.tail())
    
    for i in range(1, len(data) - seq_length):
        # print(data.loc[i-1:i-1 + seq_length])
        # print(data.loc[i-1 + seq_length])
        try:
            X = pd.concat([X, data.loc[i-1:i-1 + seq_length]], ignore_index=True)
            y = pd.concat([y, data.loc[i-1 + seq_length]], ignore_index=True)
        except IndexError:
            X = pd.concat([X, data.loc[i-1:len(data) - 1]], ignore_index=True)
            y = pd.concat([y, data.loc[len(data) - 1]], ignore_index=True)
            
    print(X[0])
    print(X[-1])
    
    #X, y = clear_tail_from_unused_space(X), clear_tail_from_unused_space(y)
    
    X, y = pd.DataFrame(X, columns=columns), pd.DataFrame(y, columns=columns)
    
    X, y = X.drop(['Close', 'Adj Close'], axis=1), y['Close'].copy()
    
    return X, y


def clear_tail_from_unused_space(data_array: np.array) -> np.array:
    i = len(data_array) - 1
    #print(data_array[i])
    while np.all(data_array[i] == 0):
        i -= 1
    # print(data_array[:i])
    # print(data_array[:i+1])
    return data_array[:i]


#creating sequences of data
seq_length = 7
x_data, y_data = create_sequences(scaled_data, seq_length)

print(type(x_data.iloc[:, -1].values))
print(type(x_data.iloc[:, 3].values))
print(y_data.tail())

# Split the dataset: 80% train, 20% validation
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.20, random_state=42)

# print(X_train.tail())
# print(X_test.tail())

<class 'pandas.core.frame.DataFrame'>


  X = pd.concat([X, data.loc[i-1:i-1 + seq_length]], ignore_index=True)


KeyError: 0

In [12]:
# Format datasets for PyTorch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

def format_data_for_torch(data_x: pd.DataFrame, data_y: pd.DataFrame) -> DataLoader:
    data_x_tensor = torch.FloatTensor(data_x.values)
    data_y_tensor = torch.FloatTensor(data_y.values)
    dataset = CustomDataset(data_x_tensor, data_y_tensor)
    return DataLoader(dataset=dataset, batch_size=64, shuffle=True)

train_loader = format_data_for_torch(X_train, y_train)
test_loader = format_data_for_torch(X_test, y_test)


In [13]:
class LSTMModel(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # Define the LSTM layer
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        
        # Define the output layer (fully connected)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Initialize hidden and cell states
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # Forward propagate through the LSTM
        out, (hn, cn) = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)
        
        # Take the output of the last time step
        out = out[:, -1, :]  # Take the last output
        
        # Pass the last time step output to the fully connected layer
        out = self.fc(out)
        return out



model = LSTMModel(4, 16, 2, 1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LSTMModel(
  (lstm): LSTM(4, 16, num_layers=2, batch_first=True)
  (fc): Linear(in_features=16, out_features=1, bias=True)
)

In [14]:
# Set up the optimizer and learning rate scheduler
num_epochs = 64
num_training_steps = num_epochs * len(train_loader)
learning_rate = 0.1

loss_function = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) 

In [15]:
# Load accuracy metric
from evaluate import load
metric = load('accuracy')

# Initialize lists to store loss and accuracy
avg_train_losses = []
avg_val_losses = []
validation_accuracies = []

In [16]:
#train and validation
for epoch in range(num_epochs):
    model.train()
    train_losses = []
    
    for batch_id, (x, y) in enumerate(train_loader):
        inputs, targets = x.unsqueeze(1).to(device), y.float().unsqueeze(1).to(device)
        
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_losses.append(loss.item())
        
    avg_train_loss = sum(train_losses) / len(train_losses)
    avg_train_losses.append(avg_train_loss)
    print(f'Average training loss for epoch {epoch+1}: {avg_train_loss}')
    
    model.eval()
    
    var_losses = []
    
    with torch.no_grad():
        for x, y in test_loader:
            inputs, targets = x.unsqueeze(1).to(device), y.float().unsqueeze(1).to(device)
            
            outputs = model(inputs)
            test_loss = loss_function(outputs, targets)
            
            var_losses.append(test_loss.item())
        
        avg_val_loss = sum(var_losses) / len(var_losses)
        avg_val_losses.append(avg_val_loss)
        print(f'Average validation loss for epoch {epoch+1}: {avg_val_loss}')


# Save model
torch.save(model.state_dict(), 'lstm_model.pth')

Average training loss for epoch 1: 0.041738062305594556
Average validation loss for epoch 1: 0.03938803754539953
Average training loss for epoch 2: 0.038782117582262965
Average validation loss for epoch 2: 0.03730627754703164
Average training loss for epoch 3: 0.036589854606755186
Average validation loss for epoch 3: 0.03441309063108983
Average training loss for epoch 4: 0.03246757507044385
Average validation loss for epoch 4: 0.029103083518782148
Average training loss for epoch 5: 0.024321941173517367
Average validation loss for epoch 5: 0.01808148660455589
Average training loss for epoch 6: 0.011471670806329225
Average validation loss for epoch 6: 0.005495335242745501
Average training loss for epoch 7: 0.00288779040245982
Average validation loss for epoch 7: 0.0015924644723310376
Average training loss for epoch 8: 0.0012262876149230885
Average validation loss for epoch 8: 0.0011368857635210992
Average training loss for epoch 9: 0.000965843398010297
Average validation loss for epoch 9

In [31]:
test_data = yf.download(ticker, start='2024-09-30', end='2024-10-01', interval='1d')
#test_data = test_data.drop('Date')
test_data_x = test_data.drop(['Close', 'Adj Close'], axis=1)
print(test_data_x)
label= test_data['Close'].copy()
model.eval()
with torch.no_grad():
    y_pred = model(test_data_x)
y_pred = y_pred.cpu().numpy()
print(y_pred)
print(label)

[*********************100%***********************]  1 of 1 completed

                  Open         High          Low     Volume
Date                                                       
2024-09-30  5726.52002  5743.859863  5724.350098  992236769





TypeError: 'int' object is not callable