In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import math
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import pad

raw_data = pd.read_csv(r"/content/drive/MyDrive/483/1_ETHUSDT_1.1.2018-1.2.2024_1hour.csv", header=None)
raw_data.columns = ['date', 'open', 'high', 'low', 'close', 'volume'] + list(raw_data.columns[6:])
raw_data.drop(raw_data.columns[6:], axis=1, inplace=True)
raw_data['date'] = pd.to_datetime(raw_data['date'])
raw_data.set_index('date', inplace=True)

# Create a new column 'next_10_close' that contains the next 10 'close' values
next_10_close_values = [list(raw_data['close'].iloc[i+1:i+11]) for i in range(len(raw_data)-10)] + [None]*10
raw_data['next_10_close'] = next_10_close_values
raw_data = raw_data.dropna()

# Normalize the data
scaler = MinMaxScaler()
normalized_data = pd.DataFrame(scaler.fit_transform(raw_data.drop(columns='next_10_close')), columns=raw_data.drop(columns='next_10_close').columns, index=raw_data.index)

# Normalize 'next_10_close' separately
next_10_close_scaler = MinMaxScaler()

# Flatten the list of lists and reshape it to fit the scaler
flattened = np.array([val for sublist in raw_data['next_10_close'].tolist() for val in sublist]).reshape(-1, 1)

next_10_close_scaler.fit(flattened)

next_10_close = raw_data['next_10_close'].apply(lambda x: [next_10_close_scaler.transform(np.array(val).reshape(-1, 1))[0][0] for val in x])

normalized_data = pd.concat([normalized_data, next_10_close.rename('next_10_close')], axis=1)

class TimeSeriesDataset(Dataset):
    def __init__(self, sequences, targets, seq_length):
        self.sequences = sequences
        self.targets = targets
        self.seq_length = seq_length

    def __len__(self):
        return self.sequences.shape[0] - self.seq_length

    def __getitem__(self, index):
        sequence = torch.tensor(self.sequences[index:index+self.seq_length].values, dtype=torch.float32)
        target = torch.tensor(self.targets[index + self.seq_length], dtype=torch.float32)
        if target.dim() == 0:
            target = target.view(1)
        if len(target) < 10:
            return None
        return sequence, target

features = normalized_data[['open', 'high', 'low', 'close', 'volume']]
targets = normalized_data['next_10_close']
seq_length = 24
dataset = TimeSeriesDataset(features, targets, seq_length)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        if d_model % 2 == 0:
            pe[:, 1::2] = torch.cos(position * div_term)
        else:
            pe[:, 1::2] = torch.cos(position * div_term)[:, :-1]
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x) -> torch.Tensor:
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    def __init__(self, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=ninp, nhead=nhead, dim_feedforward=nhid, dropout=dropout, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=nlayers)
        self.decoder = nn.Linear(ninp, 10)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output)
        return output

# Define the size of each split
train_size = int(len(dataset) * 0.7)
val_size = int(len(dataset) * 0.15)
test_size = len(dataset) - train_size - val_size

# Split the data
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

class FilteredDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = [item for item in dataset if item is not None]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        return self.dataset[index]

train_loader = DataLoader(train_dataset, batch_size=24, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=24, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=24, shuffle=False, drop_last=True)

model = TransformerModel(ninp=5, nhead=5, nhid=256, nlayers=4, dropout=0.1)



In [48]:
normalized_data

Unnamed: 0_level_0,open,high,low,close,volume,next_10_close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-12-31 19:00:00,0.136596,0.136168,0.134327,0.135470,0.004270,"[0.1334441520062797, 0.13472024581596542, 0.13..."
2017-12-31 20:00:00,0.135337,0.135641,0.133647,0.133444,0.004675,"[0.13472024581596542, 0.13691353205136278, 0.1..."
2017-12-31 21:00:00,0.133377,0.134335,0.133813,0.134720,0.004392,"[0.13691353205136278, 0.13907953338622406, 0.1..."
2017-12-31 22:00:00,0.134695,0.136893,0.134889,0.136914,0.004381,"[0.13907953338622406, 0.1399547490418802, 0.14..."
2017-12-31 23:00:00,0.137012,0.138354,0.136427,0.139080,0.004735,"[0.1399547490418802, 0.14272311702703722, 0.14..."
...,...,...,...,...,...,...
2024-01-31 05:00:00,0.467816,0.466196,0.462981,0.462129,0.063831,"[0.4655244787534578, 0.46644377001767223, 0.46..."
2024-01-31 06:00:00,0.462109,0.463423,0.462807,0.465524,0.039286,"[0.46644377001767223, 0.4681879048134762, 0.46..."
2024-01-31 07:00:00,0.465503,0.464892,0.466199,0.466444,0.025457,"[0.4681879048134762, 0.46703984015245964, 0.47..."
2024-01-31 08:00:00,0.466424,0.467119,0.466578,0.468188,0.030382,"[0.46703984015245964, 0.47166148253556484, 0.4..."


In [49]:
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=0.01)
criterion = nn.MSELoss()

def train_and_evaluate(model, train_loader, val_loader, optimizer, criterion, epochs=30):
    model.train()
    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)
    best_val_loss = float('inf')
    for epoch in range(epochs):
        total_train_loss = 0
        for seq, target in train_loader:
            optimizer.zero_grad()
            output = model(seq)[-10:].squeeze()
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
        scheduler.step()

        val_loss = evaluate(model, criterion, val_loader)
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), '/content/drive/MyDrive/483/best_model.pth')
            print(f'Saved new best model at epoch {epoch+1}')

        print(f'Epoch {epoch+1}, Training Loss: {total_train_loss / len(train_loader)}, Validation Loss: {val_loss}')

def evaluate(model, criterion, data_loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for seq, target in data_loader:
            output = model(seq)[-10:].squeeze()
            loss = criterion(output, target)
            total_loss += loss.item()
    return total_loss / len(data_loader)

train_and_evaluate(model, train_loader, val_loader, optimizer, criterion, epochs=10)

def load_model(model_path, model):
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model

def predict_and_evaluate(model, data_loader, criterion):
    total_loss = 0
    predictions = []
    actuals = []
    with torch.no_grad():
        for seq, target in data_loader:
            output = model(seq)[-10:].squeeze()
            loss = criterion(output, target)
            total_loss += loss.item()
            predictions.extend(output.tolist())
            actuals.extend(target.tolist())
    average_loss = total_loss / len(data_loader)
    return average_loss, predictions, actuals

# model_path = '/content/drive/MyDrive/483/best_model.pth'
# model = load_model(model_path, model)

  return F.mse_loss(input, target, reduction=self.reduction)


Saved new best model at epoch 1
Epoch 1, Training Loss: 0.056634948215176985, Validation Loss: 0.055587945264730466
Saved new best model at epoch 2
Epoch 2, Training Loss: 0.0561948369888048, Validation Loss: 0.055135421247322514
Epoch 3, Training Loss: 0.05616401759126494, Validation Loss: 0.05517468438764293
Saved new best model at epoch 4
Epoch 4, Training Loss: 0.05622189953803055, Validation Loss: 0.055041602571461214
Epoch 5, Training Loss: 0.05620175637184612, Validation Loss: 0.05607232923271606
Saved new best model at epoch 6
Epoch 6, Training Loss: 0.056027814614917, Validation Loss: 0.055035438596169994
Epoch 7, Training Loss: 0.05600362738534327, Validation Loss: 0.055068991612643
Epoch 8, Training Loss: 0.0559929821435963, Validation Loss: 0.0550552348168679
Epoch 9, Training Loss: 0.055980385982942193, Validation Loss: 0.05504336838150419
Saved new best model at epoch 10
Epoch 10, Training Loss: 0.05598485339432955, Validation Loss: 0.05503416782334806


In [50]:
test_loss, test_predictions, test_actuals = predict_and_evaluate(model, test_loader, criterion)


In [36]:
test_loss

0.056397126505084064

In [55]:
for i in range(10):
    print(f"Predicted: {test_predictions[i]}, Actual: {test_actuals[i]}")

Predicted: [[0.24939605593681335, 0.24952849745750427, 0.24960358440876007, 0.24939647316932678, 0.24962428212165833, 0.24979503452777863, 0.2498684674501419, 0.24964618682861328, 0.24974261224269867, 0.24971795082092285], [0.24939605593681335, 0.24952849745750427, 0.24960358440876007, 0.24939647316932678, 0.24962428212165833, 0.24979503452777863, 0.2498684674501419, 0.24964618682861328, 0.24974261224269867, 0.24971795082092285], [0.24939605593681335, 0.24952849745750427, 0.24960358440876007, 0.24939647316932678, 0.24962428212165833, 0.24979503452777863, 0.2498684674501419, 0.24964618682861328, 0.24974261224269867, 0.24971795082092285], [0.24939605593681335, 0.24952849745750427, 0.24960358440876007, 0.24939647316932678, 0.24962428212165833, 0.24979503452777863, 0.2498684674501419, 0.24964618682861328, 0.24974261224269867, 0.24971795082092285], [0.24939605593681335, 0.24952849745750427, 0.24960358440876007, 0.24939647316932678, 0.24962428212165833, 0.24979503452777863, 0.249868467450141