In [28]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, TensorDataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os


In [29]:
def combine_csvs_from_folder(folder_path):
    """
    Combines all CSV files in a folder into a single pandas DataFrame.

    Args:
        folder_path (str): The path to the folder containing the CSV files.

    Returns:
        A pandas DataFrame containing the concatenated data from all CSV files in the input folder.
    """
    # Use a list comprehension to read all CSV files in the folder into a list of DataFrames.
    dfs = [pd.read_csv(os.path.join(folder_path, f)) for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    # Use a list comprehension to get the filenames of all CSV files in the folder.
    filenames = [os.path.splitext(os.path.basename(f))[0] for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    # Add a column to each DataFrame containing the filename.
    for df, filename in zip(dfs, filenames):
        df['company'] = filename

    # Concatenate all of the DataFrames into a single DataFrame.
    combined_df = pd.concat(dfs, ignore_index=True)
    
    return combined_df

df = combine_csvs_from_folder('market_data/TimeSeries/')

# convert the company column to integers
df['company'] = df['company'].astype('category')
df['company'] = df['company'].cat.codes


# Drop the date column for now
df = df.drop(['date'], axis=1)

# reverse the dataframe so that the oldest data is at the top 
df = df[::-1]
df.head()

Unnamed: 0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,company
54967,59.5,69.0,55.0,56.5,12.705125,177121500.0,0.0,1.0,10
54966,58.47,65.0,57.5,64.35,14.470351,49746300.0,0.0,1.0,10
54965,67.24,67.7,59.0,59.73,13.431454,37391600.0,0.0,1.0,10
54964,60.63,64.25,59.82,63.25,14.222994,21773000.0,0.0,1.0,10
54963,62.5,64.48,61.57,63.96,14.382652,10777900.0,0.0,1.0,10


In [30]:
def normalize_columns(df):
    return (df - df.min()) / (df.max() - df.min())

df = normalize_columns(df)
df.head()

Unnamed: 0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,company
54967,0.014484,0.016836,0.013461,0.013728,0.022605,0.299662,0.0,0.0,1.0
54966,0.014208,0.015774,0.014138,0.015834,0.025795,0.084153,0.0,0.0,1.0
54965,0.016554,0.016491,0.014544,0.014595,0.023918,0.06325,0.0,0.0,1.0
54964,0.014786,0.015575,0.014766,0.015539,0.025348,0.036825,0.0,0.0,1.0
54963,0.015286,0.015636,0.01524,0.01573,0.025637,0.018222,0.0,0.0,1.0


In [31]:
# in the future this should be done in setup but I just did it here for now

def add_up_column(df):
    # Create empty 'up' and 'down' columns
    df['up'] = 0
    
    # Loop over the rows (skipping the first row)
    for i in range(1, len(df)):
        if df.loc[i, '4. close'] > df.loc[i-1, '4. close']:
            df.loc[i, 'up'] = 1
    return df


df = add_up_column(df)
df.head()


Unnamed: 0,1. open,2. high,3. low,4. close,5. adjusted close,6. volume,7. dividend amount,8. split coefficient,company,up
54967,0.014484,0.016836,0.013461,0.013728,0.022605,0.299662,0.0,0.0,1.0,0
54966,0.014208,0.015774,0.014138,0.015834,0.025795,0.084153,0.0,0.0,1.0,1
54965,0.016554,0.016491,0.014544,0.014595,0.023918,0.06325,0.0,0.0,1.0,0
54964,0.014786,0.015575,0.014766,0.015539,0.025348,0.036825,0.0,0.0,1.0,0
54963,0.015286,0.015636,0.01524,0.01573,0.025637,0.018222,0.0,0.0,1.0,1


In [32]:
# neural networks require tensors, so we need to convert our dataframes to tensors

def df_to_tensor(df):
    #right now this is just getting columns 1-4 (open, low, high, close)
    #without date for now
    columns = ['1. open', '2. high', '3. low', '4. close', '6. volume', 'company']
    inputs = torch.from_numpy(df.loc[:, columns].values.astype('float32'))
    targets = torch.from_numpy(df.loc[:, ['up']].values.astype('float32'))
    return inputs, targets


inputs, targets = df_to_tensor(df)
print(inputs[0])
print(inputs.shape)
print(targets.shape)

tensor([0.0145, 0.0168, 0.0135, 0.0137, 0.2997, 1.0000])
torch.Size([54968, 6])
torch.Size([54968, 1])


In [33]:
# making a training and validation dataset

dataset = TensorDataset(inputs, targets)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [34]:
# pytorch likes to use data loaders to load data in batches

batch_size = 16
train_loader = DataLoader(dataset, batch_size, shuffle = True, num_workers = 0)
val_loader = DataLoader(val_dataset, batch_size, shuffle = False, num_workers = 0)

In [35]:
# use gpu if avaliable
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [36]:
class baselinePredictor(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_size),
            nn.Sigmoid()
            

        )

        

    def forward(self, x):
        x = self.model(x)
        
        return x

# input size is 4 because we are using open, low, high, close
# output size is 1 because we are predicting up=1 or down=0
input_size = 6
output_size = 1
model = baselinePredictor(input_size, output_size)
model.to(device)

baselinePredictor(
  (model): Sequential(
    (0): Linear(in_features=6, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=32, bias=True)
    (5): ReLU()
    (6): Linear(in_features=32, out_features=1, bias=True)
    (7): Sigmoid()
  )
)

In [37]:
# hyperparameters for training
# will need to change these a bunch to find out what works best
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr = 0.001)
num_epochs = 10

In [38]:
# training loop
for epoch in range(num_epochs):
    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        # forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    if(epoch % 10 == 0 or epoch == num_epochs - 1):
        print(f'epoch: {epoch}, loss: {loss.item()}')




epoch: 0, loss: 0.693450391292572
epoch: 9, loss: 0.7045601606369019


In [39]:
def validate(model, dataloader, criterion):
    model.eval() # Set the model to evaluation mode
    val_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad(): # Disable gradient calculation for efficiency
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device) # Move data to GPU if available
            outputs = model(inputs)
            loss = criterion(outputs, targets.float()) # BCE loss expects float inputs
            val_loss += loss.item() * inputs.size(0) # Track total validation loss
            
            # Calculate accuracy
            predicted = torch.round(torch.sigmoid(outputs))
            total += targets.size(0)
            correct += (predicted == targets).sum().item()
    
    # Calculate average validation loss and accuracy
    val_loss /= len(dataloader.dataset)
    accuracy = correct / total
    
    return val_loss, accuracy

val_loss, val_acc = validate(model, val_loader, criterion)
print(f'val_loss: {val_loss}, val_acc: {val_acc}')

val_loss: 0.6919563686455166, val_acc: 0.47535019101327997


In [40]:
# test out some sample predictions with the trained model

def single_prediction(model, inputs):
    with torch.no_grad():
        outputs = model(inputs)
        # uncomment to see the raw outputs
        # return outputs
        return torch.clamp(outputs, 0, 1)


In [58]:
print(inputs[2])
print(targets[2])
testinput = inputs[7]
print(single_prediction(model, testinput)) 


tensor([0.0294, 0.0293, 0.0294, 0.0292, 0.0046, 1.0000])
tensor([0.])
tensor([0.4787])
