In [8]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from hmmlearn import hmm
from torch.utils.data import Dataset, DataLoader
import joblib  

In [9]:
# --- Hyperparameters ---------------------------------------------------------
SEQUENCE_LENGTH = 100   # pressure samples per LSTM input window
BATCH_SIZE = 64
N_EPOCHS = 50
LEARNING_RATE = 1e-3
HMM_COMPONENTS = 3      # number of HMM hidden states; also the RNN's number of output classes

# --- Reproducibility / resources ---------------------------------------------
SEED = 42                   # seeds NumPy and torch below so reruns are comparable
GPU_MEMORY_FRACTION = 0.75  # share of GPU 0 memory this process is allowed to use

# --- Paths (relative to the notebook's working directory) --------------------
TRAIN_DIR = "../src/train_denoised/"
MODEL_HMM_PATH = "../models/hmm_model.pkl"
MODEL_RNN_PATH = "../models/rnn_model.pth"

# The HMM fit, the LSTM weight initialisation and the DataLoader shuffling are
# all stochastic; seed the global generators once, before any of them run.
np.random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
    device = "cuda"
    # Release cached blocks left over from earlier cells/kernels before capping usage.
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    torch.cuda.set_per_process_memory_fraction(GPU_MEMORY_FRACTION, device=0)
else:
    device = "cpu"

In [10]:
def train_hmm(directory, n_components=HMM_COMPONENTS, random_state=None):
    """Fit a Gaussian HMM on the pressure traces found in ``directory`` and save it.

    Every regular file in ``directory`` is parsed as whitespace-separated
    ``time pressure`` columns. Each file is treated as its own observation
    sequence: the traces are stacked into one array and their individual
    lengths are handed to ``fit`` so that no transition is learned across the
    boundary between two unrelated files.

    Args:
        directory: Folder containing the training files.
        n_components: Number of hidden states of the HMM.
        random_state: Optional seed / RandomState forwarded to ``GaussianHMM``
            for a reproducible fit. ``None`` keeps the library default.

    Returns:
        The fitted ``hmm.GaussianHMM`` instance (also written to
        ``MODEL_HMM_PATH`` with joblib).

    Raises:
        ValueError: If ``directory`` yields no non-empty training file.
    """
    sequences = []

    # sorted(): os.listdir order is arbitrary; a fixed order keeps the fit reproducible.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # Skip sub-directories (e.g. ".ipynb_checkpoints"), which read_csv cannot open.
        if not os.path.isfile(file_path):
            continue

        try:
            df = pd.read_csv(file_path, sep="\\s+", names=["time", "pressure"])
        except pd.errors.EmptyDataError:
            # Defensive: treat a parser-level "no data" error like an empty frame.
            continue
        if df.empty:
            continue

        sequences.append(df["pressure"].values.reshape(-1, 1))

    if not sequences:
        raise ValueError(f"No non-empty training files found in {directory!r}")

    X = np.concatenate(sequences, axis=0)
    lengths = [len(seq) for seq in sequences]  # one entry per file, sums to len(X)

    hmm_model = hmm.GaussianHMM(
        n_components=n_components,
        covariance_type="diag",
        n_iter=100,
        random_state=random_state,
    )
    hmm_model.fit(X, lengths)

    # Create the target folder first so a long fit is not lost to a missing directory.
    model_dir = os.path.dirname(MODEL_HMM_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(hmm_model, MODEL_HMM_PATH)
    print(f"HMM Model saved at {MODEL_HMM_PATH}")

    return hmm_model

def label_data_with_hmm(directory, hmm_model):
    """Annotate every pressure trace in ``directory`` with its decoded HMM state.

    Each regular file is parsed as whitespace-separated ``time pressure``
    columns, the pressure column is passed to ``hmm_model.predict`` as an
    ``(n_samples, 1)`` array, and the result is stored in a new ``state``
    column.

    Args:
        directory: Folder containing the files to label.
        hmm_model: Object exposing ``predict(X)`` that returns one state per row of ``X``.

    Returns:
        A list of DataFrames with columns ``time``, ``pressure`` and ``state``,
        one per non-empty file, ordered by file name.
    """
    labeled_data = []

    # sorted(): os.listdir order is arbitrary; a fixed order makes the output reproducible.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # Skip sub-directories (e.g. ".ipynb_checkpoints"), which read_csv cannot open.
        if not os.path.isfile(file_path):
            continue

        try:
            df = pd.read_csv(file_path, sep="\\s+", names=["time", "pressure"])
        except pd.errors.EmptyDataError:
            # Defensive: treat a parser-level "no data" error like an empty frame.
            continue
        if df.empty:
            continue

        observations = df["pressure"].values.reshape(-1, 1)
        df["state"] = hmm_model.predict(observations)

        labeled_data.append(df)

    return labeled_data

class TimeSeriesDataset(Dataset):
    """Sliding-window dataset pairing pressure windows with the following state.

    For every DataFrame in ``labeled_data`` and every start index ``i``, one
    sample is produced: the ``sequence_length`` pressure values starting at
    ``i`` (as float32, shape ``(sequence_length, 1)``) together with the
    ``state`` value at position ``i + sequence_length``, i.e. the state of the
    sample immediately after the window. Frames with at most
    ``sequence_length`` rows contribute no samples.

    Args:
        labeled_data: Iterable of DataFrames with ``pressure`` and ``state`` columns.
        sequence_length: Window length. ``None`` (default) uses the notebook-level
            ``SEQUENCE_LENGTH``, read when the dataset is constructed.

    Attributes:
        data: float32 array of shape ``(n_samples, sequence_length, 1)``.
        labels: 1-D array with one state label per sample.
    """

    def __init__(self, labeled_data, sequence_length=None):
        if sequence_length is None:
            sequence_length = SEQUENCE_LENGTH
        if sequence_length < 1:
            raise ValueError("sequence_length must be a positive integer")
        self.sequence_length = sequence_length

        window_chunks = []
        label_chunks = []

        for df in labeled_data:
            n_windows = len(df) - sequence_length
            if n_windows <= 0:
                continue  # too short to yield a window plus a following label

            pressure = df["pressure"].to_numpy(dtype=np.float32)
            states = df["state"].to_numpy()

            # Vectorised replacement for a per-row pandas slice: the view has
            # len(df) - sequence_length + 1 rows; the last one has no label
            # after it, so only the first n_windows rows are kept.
            windows = np.lib.stride_tricks.sliding_window_view(pressure, sequence_length)
            window_chunks.append(windows[:n_windows])
            label_chunks.append(states[sequence_length:])

        if window_chunks:
            # concatenate materialises the strided views into one owned array.
            data = np.concatenate(window_chunks, axis=0)
            labels = np.concatenate(label_chunks, axis=0)
        else:
            data = np.empty((0, sequence_length), dtype=np.float32)
            labels = np.empty((0,), dtype=np.int64)

        self.data = data.reshape(-1, sequence_length, 1)
        self.labels = labels

    def __len__(self):
        """Return the number of (window, label) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(float32 window tensor, long label tensor)``."""
        return torch.tensor(self.data[idx]), torch.tensor(self.labels[idx], dtype=torch.long)
    

class RNNModel(nn.Module):
    """LSTM classifier for univariate sequences.

    A single ``nn.LSTM`` (one input feature, batch-first) is followed by a
    linear layer that maps the LSTM output at the last time step to
    ``output_dim`` logits.

    Args:
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output logits (defaults to ``HMM_COMPONENTS``).
    """

    def __init__(self, hidden_dim=128, output_dim=HMM_COMPONENTS):
        super().__init__()
        # The attribute names `lstm` and `fc` become the state_dict keys; keep them stable.
        self.lstm = nn.LSTM(1, hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return logits computed from the LSTM output at the final time step."""
        hidden_seq, _state = self.lstm(x)
        final_step = hidden_seq[:, -1, :]
        return self.fc(final_step)

In [11]:
def train_rnn(dataset):
    """Train an ``RNNModel`` on ``dataset`` and save its weights.

    Runs ``N_EPOCHS`` epochs of Adam (learning rate ``LEARNING_RATE``) with
    cross-entropy loss over shuffled mini-batches of ``BATCH_SIZE`` samples on
    ``device``, printing the mean batch loss after each epoch. The final
    ``state_dict`` is written to ``MODEL_RNN_PATH``.

    Args:
        dataset: Dataset yielding ``(inputs, labels)`` pairs accepted by
            ``RNNModel`` and ``nn.CrossEntropyLoss``.

    Returns:
        The trained ``RNNModel`` (still on ``device``).

    Raises:
        ValueError: If ``dataset`` contains no samples.
    """
    # Without this guard an empty dataset only surfaces later as an obscure
    # sampler error or a ZeroDivisionError in the per-epoch average.
    if len(dataset) == 0:
        raise ValueError("Cannot train RNN: dataset contains no samples")

    # Make sure the output folder exists before training, so that all epochs
    # are not lost to a failing save at the very end.
    model_dir = os.path.dirname(MODEL_RNN_PATH)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

    model = RNNModel().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(N_EPOCHS):
        total_loss = 0.0
        model.train()

        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{N_EPOCHS}, Loss: {total_loss / len(dataloader):.4f}")

    torch.save(model.state_dict(), MODEL_RNN_PATH)
    print(f"RNN Model saved at {MODEL_RNN_PATH}")

    return model

In [12]:
# Fit the HMM on the files in TRAIN_DIR; train_hmm also writes it to MODEL_HMM_PATH.
hmm_model = train_hmm(directory=TRAIN_DIR)

KeyboardInterrupt: 

In [None]:
# Decode an HMM state for every sample of every training file, then cut the
# labelled traces into fixed-length windows for the RNN.
labeled_data = label_data_with_hmm(directory=TRAIN_DIR, hmm_model=hmm_model)
train_dataset = TimeSeriesDataset(labeled_data=labeled_data)

In [None]:
# Train the LSTM classifier on the windowed dataset; train_rnn also saves the weights.
rnn_model = train_rnn(dataset=train_dataset)