In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import time
from multiprocessing import Pool
import random

In [2]:
def process_window(args):
    i, features_values, labels_values, WINDOW_SIZE, label_index = args
    window_features = features_values[i:i+WINDOW_SIZE]
    label_value = labels_values[i + WINDOW_SIZE + [5, 10, 20, 40, 60][label_index] - 1, label_index]
    return (window_features, label_value)

def process_single_file(file_path, label_index=0):
    data = pd.read_csv(file_path)
    data = data.sort_values('time')
    
    WINDOW_SIZE = 100
    PREDICTION_STEPS = [5, 10, 20, 40, 60]
    
    data['time'] = data['time'].astype('datetime64[ns]').dt.hour
    features = data.drop(columns=['date','label_5', 'label_10', 'label_20', 'label_40', 'label_60'])
    labels = data[['label_5', 'label_10', 'label_20', 'label_40', 'label_60']]
    
    scaler = StandardScaler()
    features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)
    
    total_samples = len(data) - WINDOW_SIZE - PREDICTION_STEPS[label_index]
    
    features_values = features.values
    labels_values = labels.values
    
    args_list = [(i, features_values, labels_values, WINDOW_SIZE, label_index) for i in range(total_samples)]
    
    with Pool() as p:
        results = p.map(process_window, args_list)
    X, y = zip(*results)
    
    X = np.array(X)
    y = np.array(y)
    
    return train_test_split(X, y, test_size=0.3, shuffle=False, random_state=42)

def process_folder(folder_path, label_index=0, sample_size=None, random_seed=42):
    all_X_train = []
    all_y_train = []
    all_X_test = []
    all_y_test = []
    
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    random.seed(random_seed)
    if sample_size is not None and sample_size < len(csv_files):
        csv_files = random.sample(csv_files, sample_size)
    
    print(f"Processing {len(csv_files)} files: {csv_files}")
    
    for file_name in csv_files:
        file_path = os.path.join(folder_path, file_name)
        X_train, X_test, y_train, y_test = process_single_file(file_path, label_index)
        
        all_X_train.append(X_train)
        all_y_train.append(y_train)
        all_X_test.append(X_test)
        all_y_test.append(y_test)
    
    final_X_train = np.concatenate(all_X_train, axis=0)
    final_y_train = np.concatenate(all_y_train, axis=0)
    final_X_test = np.concatenate(all_X_test, axis=0)
    final_y_test = np.concatenate(all_y_test, axis=0)
    
    return final_X_train, final_X_test, final_y_train, final_y_test

In [3]:
label_index = 0 
folder_path = 'train_set'
sample_size = 60

X_train, X_test, y_train, y_test = process_folder(
    folder_path, 
    label_index=label_index, 
    sample_size=sample_size
)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

Processing 60 files: ['snapshot_sym4_date58_pm.csv', 'snapshot_sym3_date30_pm.csv', 'snapshot_sym2_date50_am.csv', 'snapshot_sym2_date29_am.csv', 'snapshot_sym4_date16_am.csv', 'snapshot_sym0_date93_pm.csv', 'snapshot_sym3_date28_am.csv', 'snapshot_sym4_date42_am.csv', 'snapshot_sym1_date51_am.csv', 'snapshot_sym0_date44_pm.csv', 'snapshot_sym0_date96_pm.csv', 'snapshot_sym0_date76_am.csv', 'snapshot_sym3_date10_pm.csv', 'snapshot_sym1_date65_pm.csv', 'snapshot_sym3_date53_am.csv', 'snapshot_sym3_date97_pm.csv', 'snapshot_sym0_date20_pm.csv', 'snapshot_sym1_date9_pm.csv', 'snapshot_sym0_date54_pm.csv', 'snapshot_sym2_date98_am.csv', 'snapshot_sym3_date31_pm.csv', 'snapshot_sym0_date70_am.csv', 'snapshot_sym2_date45_am.csv', 'snapshot_sym3_date17_pm.csv', 'snapshot_sym4_date10_pm.csv', 'snapshot_sym3_date64_pm.csv', 'snapshot_sym0_date10_am.csv', 'snapshot_sym4_date18_am.csv', 'snapshot_sym0_date16_am.csv', 'snapshot_sym4_date16_pm.csv', 'snapshot_sym1_date44_am.csv', 'snapshot_sym0_dat

In [52]:
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, dropout = 0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim, hidden_dim, num_layers,
            batch_first=True, bidirectional=True
        )
        self.dropout = nn.Dropout(dropout)
        self.attention = nn.Sequential( 
            nn.Linear(hidden_dim*2, 1),
            nn.Softmax(dim=1)
        )
        self.fc = nn.Linear(hidden_dim*2, num_classes)
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x) 
        
        attn_weights = self.attention(lstm_out)
        context = torch.sum(attn_weights * lstm_out, dim=1)
        
        out = self.dropout(context)
        return self.fc(out)

In [53]:
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

X_train_tensor = torch.from_numpy(X_train).float().to(device)
y_train_tensor = torch.from_numpy(y_train).long().to(device)
X_test_tensor = torch.from_numpy(X_test).float().to(device)
y_test_tensor = torch.from_numpy(y_test).long().to(device)

batch_size = 256
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

Using device: cuda:1


In [54]:
input_dim = 28
hidden_dim = 16
num_layers = 2
num_epochs = 100

model = LSTM(input_dim, hidden_dim, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimiser = torch.optim.RAdam(model.parameters(), lr=0.001)

print(model)

LSTM(
  (lstm): LSTM(28, 16, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (attention): Sequential(
    (0): Linear(in_features=32, out_features=1, bias=True)
    (1): Softmax(dim=1)
  )
  (fc): Linear(in_features=32, out_features=3, bias=True)
)


In [55]:
hist = np.zeros(num_epochs)
start_time = time.time()

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch_x, batch_y in train_loader:
        optimiser.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimiser.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    hist[epoch] = avg_loss
    print(f"Epoch {epoch}, Avg Loss: {avg_loss}")

training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")

Epoch 0, Avg Loss: 0.9894709302971401
Epoch 1, Avg Loss: 0.9300250156246936
Epoch 2, Avg Loss: 0.9134460174975617
Epoch 3, Avg Loss: 0.8997903852265116
Epoch 4, Avg Loss: 0.8861745021503824
Epoch 5, Avg Loss: 0.8768098984357606
Epoch 6, Avg Loss: 0.862171410433369
Epoch 7, Avg Loss: 0.8546797484931551
Epoch 8, Avg Loss: 0.8423251789159726
Epoch 9, Avg Loss: 0.8346286084058989
Epoch 10, Avg Loss: 0.8242968325479043
Epoch 11, Avg Loss: 0.8321465315596427
Epoch 12, Avg Loss: 0.8292993640652593
Epoch 13, Avg Loss: 0.8243850705845986
Epoch 14, Avg Loss: 0.809547046305602
Epoch 15, Avg Loss: 0.797074392515143


KeyboardInterrupt: 

In [42]:
model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predicted = torch.max(outputs.data, 1)
    accuracy = (predicted == y_test_tensor).sum().item() / y_test_tensor.size(0)
    print(f"Test Accuracy: {accuracy:.4f}")

ValueError: too many values to unpack (expected 2)

In [32]:
torch.save(model.state_dict(), f"lstm_model_label_{[5,10,20,40,60][label_index]}.pth")
print(f"Model saved as lstm_model_label_{[5,10,20,40,60][label_index]}.pth")

Model saved as lstm_model_label_20.pth


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2
 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 