In [None]:
import pandas as pd
import numpy as np
import os
import warnings
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

In [2]:
# Move into the raw-data directory so the relative paths used later in the
# notebook ('ground_truth.csv', '../train_reduced/', '../../models/...') resolve.
# The kernel's starting directory is remembered on the first execution so that
# re-running this cell does not apply the relative chdir a second time.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '../src/raw_data'))

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and performance warnings from pandas/torch.
warnings.filterwarnings('ignore')

# Seed the torch and legacy numpy global RNGs (weight init, shuffling, noise).
torch.manual_seed(52)
np.random.seed(52)

In [3]:
# Ground-truth annotation table, stored as ';'-delimited text and read relative
# to the current working directory. The displayed output shows the columns
# file, mark, recovery and drop.
label_df = pd.read_csv(filepath_or_buffer='ground_truth.csv', delimiter=';')
label_df

Unnamed: 0,file,mark,recovery,drop
0,00e03657-8e1e-4c8c-a724-1d3c77b48510,"[0.0,235.9225,237.06666666666666,2076.06055555...","[[2419.9805555555554,2437.4241666666667],[3177...","[[3453.6875,3763.9605555555554]]"
1,00e4dba2-36d2-42b4-beb1-c55aed75f506,"[0.0,7979.234444444444,13284.465,19439.8005555...",[],"[[13284.465,19439.800555555557]]"
2,00f035b7-ad7a-4f30-9081-522a3c10805b,"[0.0,42.75,2438.3330555555553]",[],"[[0.0,42.75]]"
3,01a0c034-6afc-4e73-95fa-621f702a0b7d,"[0.0,491.98305555555555,1439.9830555555557,154...",[],"[[0.0,491.98305555555555]]"
4,01a530d3-6496-4515-9fbb-4f44e298fd29,"[0.0,1287.0341666666666,1288.0483333333334,156...",[],"[[4920.376666666667,6208.231666666667]]"
...,...,...,...,...
95,1dfaf03c-e297-4d92-a0bf-40b1a829391f,"[0.0,7.4,7.933055555555556,14.466666666666667,...",[],[]
96,1e149fbd-41c6-4779-b87d-c5dc17fbb4c0,"[0.0,635.3127777777778]",[],"[[0.0,635.3127777777778]]"
97,1e19b77c-8a0e-4749-a384-9c1e679035bf,"[0.0,82.16555555555556,216.66027777777776,229....",[],[]
98,1e4b4c18-1e32-45eb-917a-5760e33fbaca,"[0.0,1217.8258333333333,1223.6030555555556,125...","[[9541.77638888889,10288.5075]]","[[10339.343055555555,10739.613055555556],[1311..."


In [None]:
# --- Windowing / augmentation defaults (used as default arguments of
#     load_raw_data and TorchTypeDataset) ---
SEQUENCE_LENGTH = 100   # number of consecutive samples per training window
NOISE_FACTOR = 0.05     # multiplier applied to standard-normal noise

# --- Mini-batch size handed to the DataLoader ---
BATCH_SIZE = 32

# --- Paths, relative to the current working directory ---
TRAIN_DIR = "../train_reduced/"                    # directory with the raw training files
CHECKPOINT_PATH = '../../models/checkpoint.pth'    # weights loaded before training, if present

In [None]:
def load_raw_data(file_path: str, sequence_length = SEQUENCE_LENGTH, noise_factor = NOISE_FACTOR):
    """Load one raw recording and cut it into clean / noisy sliding windows.

    The file is read as whitespace-separated text whose two columns are given
    the names ``time`` and ``pressure``. The pressure column is min-max scaled
    per file, split into overlapping windows with stride 1, and a noisy copy of
    every window is produced by adding ``noise_factor`` times standard-normal
    noise drawn from the global ``np.random`` generator.

    Args:
        file_path: Path of the raw data file.
        sequence_length: Number of samples per window (must be >= 1).
        noise_factor: Scale of the Gaussian noise added to each window.

    Returns:
        ``(noisy, clean)`` float32 tensors, each of shape
        ``(n_windows, sequence_length, 1)`` with
        ``n_windows = len(file) - sequence_length``. When the file is empty or
        too short, both tensors have shape ``(0, sequence_length, 1)``.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

    # `names=` always creates the "pressure" column, so only the row count
    # needs checking below.
    df = pd.read_csv(file_path, sep="\\s+", names=["time", "pressure"])

    # NOTE(review): this keeps the original window count, which leaves out the
    # final full window (start index len(df) - sequence_length). Confirm
    # whether that last window should be included.
    n_windows = len(df) - sequence_length
    if n_windows <= 0:
        # One consistent empty shape so callers can rely on the tensor rank.
        empty = torch.empty((0, sequence_length, 1), dtype=torch.float32)
        return empty, empty.clone()

    scaler = MinMaxScaler()
    pressure = np.asarray(scaler.fit_transform(df[["pressure"]])).ravel()

    # Zero-copy view of all stride-1 windows; replaces the Python loop and the
    # slow conversion of a list of ndarrays into a tensor.
    windows = np.lib.stride_tricks.sliding_window_view(pressure, sequence_length)[:n_windows]

    # One bulk draw instead of one draw per window.
    noise = np.random.normal(0, 1, windows.shape)
    noisy_windows = windows + noise_factor * noise

    # astype() copies, so the tensors own writable, contiguous memory.
    noisy = torch.from_numpy(noisy_windows.astype(np.float32)).unsqueeze(-1)
    clean = torch.from_numpy(windows.astype(np.float32)).unsqueeze(-1)
    return noisy, clean

In [12]:
class TorchTypeDataset(Dataset):
    """In-memory dataset of (noisy, clean) windows gathered from several files.

    Every existing path in ``file_paths`` is loaded with ``load_raw_data``; the
    windows of all usable files are concatenated along the first dimension.
    Paths that do not exist, and files that yield fewer than two windows, are
    skipped silently.

    Args:
        file_paths: Iterable of raw data file paths.
        sequence_length: Window length forwarded to ``load_raw_data``.
        noise_factor: Noise scale forwarded to ``load_raw_data``.

    Raises:
        RuntimeError: If no file contributed any windows.
    """

    def __init__(self, file_paths, sequence_length = SEQUENCE_LENGTH, noise_factor = NOISE_FACTOR):
        noisy_parts, clean_parts = [], []

        for file_path in file_paths:
            if not os.path.exists(file_path):
                continue

            noisy, clean = load_raw_data(file_path, sequence_length, noise_factor)

            if noisy.shape[0] > 1 and clean.shape[0] > 1:
                # Accept loaders that return (n_windows, seq_len) by adding
                # the trailing feature dimension.
                noisy = noisy.unsqueeze(-1) if noisy.dim() == 2 else noisy
                clean = clean.unsqueeze(-1) if clean.dim() == 2 else clean

                noisy_parts.append(noisy)
                clean_parts.append(clean)

        if not noisy_parts:
            # torch.cat on an empty list fails with an opaque message; report
            # the actual cause instead (same exception type as before).
            raise RuntimeError(
                "TorchTypeDataset: none of the given files produced training windows "
                "(paths missing, files empty, or shorter than sequence_length)"
            )

        self.noisy_data = torch.cat(noisy_parts, dim=0)
        self.clean_data = torch.cat(clean_parts, dim=0)

    def __len__(self):
        """Return the total number of windows."""
        return len(self.clean_data)

    def __getitem__(self, idx):
        """Return the ``(noisy, clean)`` pair at position ``idx``."""
        return self.noisy_data[idx], self.clean_data[idx]

In [None]:
class CharbonnierLoss(nn.Module):
    """Charbonnier loss: mean over all elements of sqrt((x - y)^2 + epsilon^2).

    Args:
        epsilon: Constant whose square is added under the square root.
    """

    def __init__(self, epsilon = 1e-3):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, x, y):
        """Return the scalar loss between tensors ``x`` and ``y``."""
        residual = x - y
        smoothed_abs = (residual.pow(2) + self.epsilon ** 2).sqrt()
        return smoothed_abs.mean()

class Attention(nn.Module):
    """Self-attention layer wrapping ``nn.MultiheadAttention`` (batch-first).

    Args:
        dim: Embedding size of the input and of the output.
        num_heads: Number of attention heads. Defaults to 4, the value that was
            previously hard-coded; ``nn.MultiheadAttention`` requires ``dim``
            to be divisible by it.
    """

    def __init__(self, dim, num_heads = 4):
        super(Attention, self).__init__()
        self.attn = nn.MultiheadAttention(embed_dim = dim, num_heads = num_heads, batch_first = True)

    def forward(self, x):
        """Attend ``x`` to itself (query = key = value) and return the output.

        The attention weights returned by the wrapped layer are discarded.
        """
        attn_output, _ = self.attn(x, x, x)
        return attn_output

class DAE(nn.Module):
    """Denoising autoencoder for sequences: Conv1d -> LSTM -> attention -> linear
    bottleneck, then linear -> LSTM -> Conv1d back to the input width.

    Inputs and outputs have shape ``(batch, seq_len, input_dim)``: both
    convolutions use kernel size 5 with padding 2, so the sequence length is
    preserved, and the linear layers act on each time step independently.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Hidden size of both LSTMs and of the attention layer.
        bottleneck_dim: Width of the per-time-step bottleneck projection.
    """

    def __init__(self, input_dim = 1, hidden_dim = 64, bottleneck_dim = 32):
        super().__init__()

        # Encoder. Attribute names and creation order are left untouched so
        # state_dict keys and seeded initialisation stay the same.
        self.conv1 = nn.Conv1d(input_dim, 32, kernel_size = 5, padding = 2)
        self.lstm1 = nn.LSTM(32, hidden_dim, batch_first = True, bidirectional = False)
        self.attn = Attention(hidden_dim)
        self.bottleneck = nn.Linear(hidden_dim, bottleneck_dim)

        # Decoder.
        self.fc = nn.Linear(bottleneck_dim, hidden_dim)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, batch_first = True, bidirectional = False)
        self.conv2 = nn.Conv1d(hidden_dim, input_dim, kernel_size = 5, padding = 2)

    def forward(self, x):
        """Reconstruct ``x``; the result has the same shape as the input."""
        # Conv1d expects (batch, channels, seq_len), hence the axis swaps.
        channels_first = x.permute(0, 2, 1)
        features = F.relu(self.conv1(channels_first)).permute(0, 2, 1)
        encoded, _ = self.lstm1(features)
        attended = self.attn(encoded)
        latent = self.bottleneck(attended)

        expanded = self.fc(latent)
        decoded, _ = self.lstm2(expanded)
        reconstruction = self.conv2(decoded.permute(0, 2, 1))
        return reconstruction.permute(0, 2, 1)

In [17]:
def DAE_train(model, dataloader, epochs = 50, lr = 1e-3, device='cuda', save_path = "../../models/dae_checkpoint.pth", final_save_path = "../../models/dae_final.pth"):
    """Train ``model`` as a denoiser with Adam and the Charbonnier loss.

    After every epoch the mean batch loss is printed; whenever it improves on
    the best value seen so far the weights are written to ``save_path``. After
    the last epoch the weights are written to ``final_save_path``.

    Args:
        model: Module mapping a noisy batch to a reconstruction.
        dataloader: Iterable yielding ``(noisy, clean)`` tensor batches.
        epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.
        device: Device to train on. A CUDA device is replaced by ``'cpu'``
            (with a printed notice) when CUDA is not available.
        save_path: Destination of the best-so-far ``state_dict``.
        final_save_path: Destination of the final ``state_dict``.

    Returns:
        The trained model (the same object that was passed in).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        print("CUDA is not available; training on CPU instead.")
        device = 'cpu'

    # Create the output directories up front so a missing folder does not
    # abort the run at the first checkpoint.
    for target in (save_path, final_save_path):
        parent_dir = os.path.dirname(target)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok = True)

    model.to(device)
    # Ensure training mode even if the caller left the model in eval mode.
    model.train()
    optimizer = optim.Adam(model.parameters(), lr = lr)
    criterion = CharbonnierLoss()

    best_loss = float("inf")

    for epoch in range(epochs):
        total_loss = 0.0
        n_batches = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1} / {epochs}")

        for noisy, clean in progress_bar:
            noisy, clean = noisy.to(device), clean.to(device)
            optimizer.zero_grad()
            output = model(noisy)
            loss = criterion(output, clean)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            n_batches += 1

            progress_bar.set_postfix(loss=batch_loss)

        if n_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        avg_loss = total_loss / n_batches
        print(f"Epoch {epoch + 1} / {epochs}, Avg Loss: {avg_loss:.6f}")

        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), save_path)
            print(f"Model checkpoint saved at {save_path} (Loss improved: {best_loss:.6f})")

    torch.save(model.state_dict(), final_save_path)
    print(f"Final model saved at {final_save_path}")

    return model

In [None]:
# List the training directory once; the original comprehension called
# os.listdir(TRAIN_DIR) again for every candidate file.
available_files = set(os.listdir(TRAIN_DIR))

# Only the first 100 labelled files are considered, and only those that are
# actually present in TRAIN_DIR.
file_paths = [os.path.join(TRAIN_DIR, file) for file in label_df['file'][:100] if file in available_files]

dataset = TorchTypeDataset(file_paths)
dataloader = DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = True)

In [None]:
# Sanity check: fetch a single batch and show the shapes of both tensors.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    noisy_batch, clean_batch = first_batch
    print("Noisy batch shape:", noisy_batch.shape, "Clean batch shape:", clean_batch.shape)

In [None]:
# Pick the device once so the notebook also runs on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

dae_model = DAE()

# NOTE(review): training writes its best checkpoint to DAE_train's default
# save_path ('../../models/dae_checkpoint.pth'), which differs from
# CHECKPOINT_PATH, so a later run does not pick that file up here unless it is
# copied or renamed - confirm which path is intended.
if os.path.exists(CHECKPOINT_PATH):
    # map_location lets a checkpoint saved from a GPU run load on any machine;
    # DAE_train moves the model to the target device afterwards.
    dae_model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location = 'cpu'))
    print(f"Checkpoint loaded from {CHECKPOINT_PATH}")

trained_model = DAE_train(dae_model, dataloader, epochs = 20, lr=1e-3, device=device)