In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, TensorDataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


In [2]:
def combine_csvs_from_folder(folder_path):
    """
    Combines all CSV files in a folder into a single pandas DataFrame.

    Args:
        folder_path (str): The path to the folder containing the CSV files.

    Returns:
        A pandas DataFrame containing the concatenated data from all CSV files in the input folder.
    """
    # Use a list comprehension to read all CSV files in the folder into a list of DataFrames.
    dfs = [pd.read_csv(os.path.join(folder_path, f)) for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    # Use a list comprehension to get the filenames of all CSV files in the folder.
    filenames = [os.path.splitext(os.path.basename(f))[0] for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    # Add a column to each DataFrame containing the filename.
    for df, filename in zip(dfs, filenames):
        df['company'] = filename

    # Concatenate all of the DataFrames into a single DataFrame.
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

df = combine_csvs_from_folder('market_data/TimeSeries/')

# convert the company column to integers
df['company'] = df['company'].astype('category')
df['company'] = df['company'].cat.codes


# Drop the date column for now
df = df.drop(['date'], axis=1)

# reverse the dataframe so that the oldest data is at the top 
df = df[::-1]
df.head()

Unnamed: 0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,company
1099,179.34,183.36,178.15,181.65,180.874583,8842330.0,0.0,1.0,10
1098,185.05,186.2125,183.07,185.65,184.857508,7081350.0,0.0,1.0,10
1097,183.34,188.82,182.36,187.67,186.868885,6119876.0,0.0,1.0,10
1096,186.65,188.05,185.25,185.59,184.797764,5163043.0,0.0,1.0,10
1095,183.45,184.61,181.93,183.83,183.045277,5610282.0,0.0,1.0,10


In [3]:
def normalize_columns(df):
    return (df - df.min()) / (df.max() - df.min())

df = normalize_columns(df)
df.head()

Unnamed: 0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,company
1099,0.241152,0.246314,0.24194,0.245256,0.251545,0.026703,0.0,0.0,1.0
1098,0.252687,0.252051,0.251967,0.253339,0.259547,0.020947,0.0,0.0,1.0
1097,0.249232,0.257296,0.25052,0.257421,0.263587,0.017804,0.0,0.0,1.0
1096,0.255919,0.255747,0.256409,0.253218,0.259426,0.014676,0.0,0.0,1.0
1095,0.249455,0.248828,0.249643,0.249662,0.255906,0.016138,0.0,0.0,1.0


In [4]:
# in the future this should be done in setup but I just did it here for now

def add_up_column(df):
    # Create empty 'up' and 'down' columns
    df['up'] = 0
    
    # Loop over the rows (skipping the first row)
    for i in range(1, len(df)):
        if df.loc[i, '4. close'] > df.loc[i-1, '4. close']:
            df.loc[i, 'up'] = 1
    return df


df = add_up_column(df)
df.head()


Unnamed: 0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,company,up
1099,0.241152,0.246314,0.24194,0.245256,0.251545,0.026703,0.0,0.0,1.0,0
1098,0.252687,0.252051,0.251967,0.253339,0.259547,0.020947,0.0,0.0,1.0,0
1097,0.249232,0.257296,0.25052,0.257421,0.263587,0.017804,0.0,0.0,1.0,1
1096,0.255919,0.255747,0.256409,0.253218,0.259426,0.014676,0.0,0.0,1.0,1
1095,0.249455,0.248828,0.249643,0.249662,0.255906,0.016138,0.0,0.0,1.0,1


In [5]:
# neural networks require tensors, so we need to convert our dataframes to tensors

def df_to_tensor(df):
    #right now this is just getting columns 1-4 (open, low, high, close)
    #without date for now
    columns = ['1. open', '2. high', '3. low', '4. close', '6. volume', 'company']
    inputs = torch.from_numpy(df.loc[:, columns].values.astype('float32'))
    targets = torch.from_numpy(df.loc[:, ['up']].values.astype('float32'))
    return inputs, targets


inputs, targets = df_to_tensor(df)
print(inputs[0])
print(inputs.shape)
print(targets.shape)

tensor([0.2412, 0.2463, 0.2419, 0.2453, 0.0267, 1.0000])
torch.Size([1100, 6])
torch.Size([1100, 1])


In [6]:
# making a training and validation dataset

dataset = TensorDataset(inputs, targets)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [7]:
# pytorch likes to use data loaders to load data in batches

batch_size = 16
train_loader = DataLoader(dataset, batch_size, shuffle = True, num_workers = 0)
val_loader = DataLoader(val_dataset, batch_size, shuffle = False, num_workers = 0)

In [8]:
# use gpu if avaliable
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
class baselinePredictor(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_size),
            nn.Sigmoid()
            

        )

        

    def forward(self, x):
        x = self.model(x)
        
        return x

# input size is 4 because we are using open, low, high, close
# output size is 1 because we are predicting up=1 or down=0
input_size = 6
output_size = 1
model = baselinePredictor(input_size, output_size)
model.to(device)

baselinePredictor(
  (model): Sequential(
    (0): Linear(in_features=6, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=1, bias=True)
    (7): Sigmoid()
  )
)

In [10]:
# hyperparameters for training
# will need to change these a bunch to find out what works best
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr = 0.001)
num_epochs = 100

In [11]:
# training loop
for epoch in range(num_epochs):
    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        # forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if(epoch % 10 == 0 or epoch == num_epochs - 1):
        print(f'epoch: {epoch}, loss: {loss.item()}')




epoch: 0, loss: 0.6878068447113037
epoch: 10, loss: 0.7009245753288269
epoch: 20, loss: 0.6889095306396484
epoch: 30, loss: 0.6914362907409668
epoch: 40, loss: 0.6959040760993958
epoch: 50, loss: 0.6925799250602722
epoch: 60, loss: 0.6944217085838318
epoch: 70, loss: 0.6929605603218079
epoch: 80, loss: 0.6947753429412842
epoch: 90, loss: 0.6927652955055237
epoch: 99, loss: 0.694158136844635


In [12]:
#not sure if this is working
def validate(model, dataloader, loss_fn, device):
    model.eval()  # set model to evaluation mode
    
    val_loss = 0.0
    val_acc = 0.0
    total_samples = 0
    
    with torch.no_grad():  # disable gradient computation
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            inputs, targets = inputs.to(device), targets.to(device)

            # forward pass
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)
            val_loss += loss.item() * inputs.size(0)

            # calculate accuracy
            _, predicted = torch.max(outputs.data, 1)
            val_acc += (predicted == targets).sum().item()
            total_samples += targets.size(0)

    # calculate average loss and accuracy
    avg_loss = val_loss / total_samples
    avg_acc = val_acc / total_samples

    return avg_loss, avg_acc

val_loss, val_acc = validate(model, val_loader, criterion, device)
print(f'val_loss: {val_loss}, val_acc: {val_acc}')

val_loss: 0.6933567903258584, val_acc: 8.036363636363637


In [16]:
# test out some sample predictions with the trained model

def single_prediction(model, inputs):
    with torch.no_grad():
        outputs = model(inputs)
        # uncomment to see the raw outputs
        # return outputs
        return torch.clamp(outputs, 0, 1)


In [17]:
testinput = inputs[0:20]
print(single_prediction(model, testinput)) 


tensor([[0.4993],
        [0.4967],
        [0.5027],
        [0.4960],
        [0.4984],
        [0.4980],
        [0.4975],
        [0.4975],
        [0.4994],
        [0.5042],
        [0.4976],
        [0.4957]])
