In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, TensorDataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os


In [2]:
def combine_csvs_from_folder(folder_path):
    """
    Combines all CSV files in a folder into a single pandas DataFrame.

    Args:
        folder_path (str): The path to the folder containing the CSV files.

    Returns:
        A pandas DataFrame containing the concatenated data from all CSV files in the input folder.
    """
    # Use a list comprehension to read all CSV files in the folder into a list of DataFrames.
    dfs = [pd.read_csv(os.path.join(folder_path, f)) for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    # Concatenate all of the DataFrames into a single DataFrame.
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

df = combine_csvs_from_folder('market_data/TimeSeries/')

# Drop the date column for now
df = df.drop(['date'], axis=1)

In [3]:
def normalize_columns(df):
    return (df - df.min()) / (df.max() - df.min())

df = normalize_columns(df)

In [4]:
# in the future this should be done in setup but I just did it here for now

# this is actually backward of what it should be since the df starts at present and goes back in time
def add_up_column(df):
    # Create empty 'up' and 'down' columns
    df['up'] = 0
    
    # Loop over the rows (skipping the first row)
    for i in range(1, len(df)):
        if df.loc[i, '4. close'] > df.loc[i-1, '4. close']:
            df.loc[i, 'up'] = 1
    return df


df = add_up_column(df)
df.head()


Unnamed: 0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,up
0,0.176046,0.173569,0.175851,0.174652,0.182913,0.179121,0.0,0.0,0
1,0.182066,0.179904,0.178948,0.180088,0.188316,0.155993,0.0,0.0,1
2,0.179602,0.179119,0.178785,0.179098,0.187332,0.164548,0.0,0.0,0
3,0.182288,0.181835,0.181323,0.178229,0.186468,0.190227,0.0,0.0,0
4,0.186631,0.185254,0.186305,0.186453,0.194644,0.191133,0.0,0.0,1


In [5]:
# neural networks require tensors, so we need to convert our dataframes to tensors

def df_to_tensor(df):
    #right now this is just getting columns 1-4 (open, low, high, close)
    #without date for now
    inputs = torch.from_numpy(df.iloc[:, 0:4].values.astype('float32'))
    targets = torch.from_numpy(df.iloc[:, 8:].values.astype('float32'))
    return inputs, targets


inputs, targets = df_to_tensor(df)
print(inputs.shape)
print(targets.shape)

torch.Size([1100, 4])
torch.Size([1100, 1])


In [6]:
# making a training and validation dataset
# just making a training dataset for now

dataset = TensorDataset(inputs, targets)

# val_percent = 0.2
# num_rows = len(df.index)
# val_size = int(num_rows * val_percent)
# train_size = num_rows - val_size
# train_ds, val_ds = random_split(dataset, [train_size, val_size])

In [7]:
# pytorch likes to use data loaders to load data in batches

batch_size = 1
train_loader = DataLoader(dataset, batch_size, shuffle = True, num_workers = 0)
#val_loader = DataLoader(val_ds, batch_size, num_workers = 0)

In [8]:
# use gpu if avaliable
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
class baselinePredictor(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.layer_1 = nn.Linear(4, 64) 
        self.layer_2 = nn.Linear(64, 64)
        self.layer_out = nn.Linear(64, 1) 
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        

    def forward(self, x):
        x = self.relu(self.layer_1(inputs))
        x = self.relu(self.layer_2(x))
        x = self.dropout(x)
        x = self.layer_out(x)
        return x

# input size is 4 because we are using open, low, high, close
# output size is 2 because we are predicting up or down
input_size = 4
output_size = 1
model = baselinePredictor(input_size, output_size)
model.to(device)

baselinePredictor(
  (layer_1): Linear(in_features=4, out_features=64, bias=True)
  (layer_2): Linear(in_features=64, out_features=64, bias=True)
  (layer_out): Linear(in_features=64, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
)

In [10]:
# hyperparameters for training
# will need to change these a bunch to find out what works best
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.01)
num_epochs = 100

In [11]:
# training loop
for epoch in range(num_epochs):
    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        # forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if(epoch % 10 == 0):
        print(f'epoch: {epoch}, loss: {loss.item()}')

epoch: 0, loss: 0.7361469864845276
epoch: 10, loss: 0.7416725754737854
epoch: 20, loss: 0.7417689561843872
epoch: 30, loss: 0.6503648161888123
epoch: 40, loss: 0.6917323470115662
epoch: 50, loss: 0.6901987195014954
epoch: 60, loss: 0.6590849161148071
epoch: 70, loss: 0.7203540802001953
epoch: 80, loss: 0.6869814395904541
epoch: 90, loss: 0.6461443305015564


In [12]:
# not working yet

# def get_accuracy(model, loader):
#     num_correct = 0
#     num_samples = 0
#     model.eval()
#     with torch.no_grad():
#         for x, y in loader:
#             outputs = model(x)
#             _, predictions = outputs.max(1)
#             num_correct += (predictions == y).sum()
#             num_samples += predictions.size(0)
#         print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')

# get_accuracy(model, val_loader)



In [18]:
# test out some sample predictions with the trained model

def single_prediction(model, inputs):
    with torch.no_grad():
        outputs = model(inputs)
        # uncomment to see the raw outputs
        #return outputs
        return torch.clamp(outputs, 0, 1)


In [23]:
testdf = pd.read_csv('market_data/TimeSeries/AMZN.csv')
testdf = testdf.drop(['date'], axis=1)
testdf = normalize_columns(testdf)
testInputs, testOutputs = df_to_tensor(testdf)

for inputs in testInputs:
    print(single_prediction(model, inputs)) 


tensor([0.0314])
tensor([0.0347])
tensor([0.0365])
tensor([0.0300])
tensor([0.0385])
tensor([0.0409])
tensor([0.0429])
tensor([0.0408])
tensor([0.0380])
tensor([0.0395])
tensor([0.0349])
tensor([0.0353])
tensor([0.0378])
tensor([0.0293])
tensor([0.0125])
tensor([0.])
tensor([0.0281])
tensor([0.0320])
tensor([0.0382])
tensor([0.0344])
tensor([0.0411])
tensor([0.0373])
tensor([0.0377])
tensor([0.0390])
tensor([0.0385])
tensor([0.0321])
tensor([0.0309])
tensor([0.0376])
tensor([0.0394])
tensor([0.0346])
tensor([0.0322])
tensor([0.0326])
tensor([0.0328])
tensor([0.0349])
tensor([0.0350])
tensor([0.0334])
tensor([0.0334])
tensor([0.0351])
tensor([0.0328])
tensor([0.0344])
tensor([0.0350])
tensor([0.0340])
tensor([0.0320])
tensor([0.0342])
tensor([0.0349])
tensor([0.0341])
tensor([0.0314])
tensor([0.0316])
tensor([0.0302])
tensor([0.0305])
tensor([0.0316])
tensor([0.0316])
tensor([0.0308])
tensor([0.0323])
tensor([0.0305])
tensor([0.0309])
tensor([0.0317])
tensor([0.0354])
tensor([0.0364])
t

tensor([0.])