In [88]:
import pandas as pd
import numpy as np
import os
import warnings
import torch 
import torch.nn as nn 
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

In [90]:
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()
torch.cuda.ipc_collect()
torch.cuda.set_per_process_memory_fraction(0.7, device = 0)
warnings.filterwarnings('ignore')
torch.manual_seed(52)
np.random.seed(52)

In [91]:
label_df = pd.read_csv('ground_truth.csv', sep = ';')
label_df

Unnamed: 0,file,mark,recovery,drop
0,00e03657-8e1e-4c8c-a724-1d3c77b48510,"[0.0,235.9225,237.06666666666666,2076.06055555...","[[2419.9805555555554,2437.4241666666667],[3177...","[[3453.6875,3763.9605555555554]]"
1,00e4dba2-36d2-42b4-beb1-c55aed75f506,"[0.0,7979.234444444444,13284.465,19439.8005555...",[],"[[13284.465,19439.800555555557]]"
2,00f035b7-ad7a-4f30-9081-522a3c10805b,"[0.0,42.75,2438.3330555555553]",[],"[[0.0,42.75]]"
3,01a0c034-6afc-4e73-95fa-621f702a0b7d,"[0.0,491.98305555555555,1439.9830555555557,154...",[],"[[0.0,491.98305555555555]]"
4,01a530d3-6496-4515-9fbb-4f44e298fd29,"[0.0,1287.0341666666666,1288.0483333333334,156...",[],"[[4920.376666666667,6208.231666666667]]"
...,...,...,...,...
95,1dfaf03c-e297-4d92-a0bf-40b1a829391f,"[0.0,7.4,7.933055555555556,14.466666666666667,...",[],[]
96,1e149fbd-41c6-4779-b87d-c5dc17fbb4c0,"[0.0,635.3127777777778]",[],"[[0.0,635.3127777777778]]"
97,1e19b77c-8a0e-4749-a384-9c1e679035bf,"[0.0,82.16555555555556,216.66027777777776,229....",[],[]
98,1e4b4c18-1e32-45eb-917a-5760e33fbaca,"[0.0,1217.8258333333333,1223.6030555555556,125...","[[9541.77638888889,10288.5075]]","[[10339.343055555555,10739.613055555556],[1311..."


In [92]:
SEQUENCE_LENGTH = 100  
BATCH_SIZE = 32
NOISE_FACTOR = 0.05
TRAIN_DIR = "../train_reduced/"
CHECKPOINT_PATH = '../../models/checkpoint.pth'

In [93]:
def load_raw_data(file_path: str, sequence_length = SEQUENCE_LENGTH, noise_factor = NOISE_FACTOR):
    df = pd.read_csv(file_path, sep="\\s+", names=["time", "pressure"])  
    
    if df.empty or "pressure" not in df:
        return torch.empty(0), torch.empty(0) 
    
    scaler = MinMaxScaler()
    df["pressure"] = scaler.fit_transform(df[["pressure"]]) 

    sequences, noisy_sequences = [], []
    
    for i in range(len(df) - sequence_length):
        seq = df["pressure"].iloc[i : i + sequence_length].values
        noisy_seq = seq + noise_factor * np.random.normal(0, 1, seq.shape) 
        
        sequences.append(seq)
        noisy_sequences.append(noisy_seq)

    return torch.tensor(noisy_sequences, dtype=torch.float32).unsqueeze(-1), torch.tensor(sequences, dtype=torch.float32).unsqueeze(-1)

In [94]:
class TorchTypeDataset(Dataset):
    def __init__(self, file_paths, sequence_length = SEQUENCE_LENGTH, noise_factor = NOISE_FACTOR):
        self.noisy_data, self.clean_data = [], []
        
        for file_path in file_paths:
            if os.path.exists(file_path): 
                noisy, clean = load_raw_data(file_path, sequence_length, noise_factor)

                if noisy.shape[0] > 1 and clean.shape[0] > 1:  
                    noisy = noisy.unsqueeze(-1) if noisy.dim() == 2 else noisy
                    clean = clean.unsqueeze(-1) if clean.dim() == 2 else clean

                    self.noisy_data.append(noisy)
                    self.clean_data.append(clean)

        self.noisy_data = torch.cat(self.noisy_data, dim=0)  
        self.clean_data = torch.cat(self.clean_data, dim=0)

    def __len__(self):
        return len(self.clean_data)

    def __getitem__(self, idx):
        return self.noisy_data[idx], self.clean_data[idx]

In [95]:
class CharbonnierLoss(nn.Module):
    def __init__(self, epsilon = 1e-3):
        super(CharbonnierLoss, self).__init__()
        self.epsilon = epsilon
    
    def forward(self, x, y):
        return torch.mean(torch.sqrt((x - y) ** 2 + self.epsilon ** 2))

class Attention(nn.Module):
    def __init__(self, dim):
        super(Attention, self).__init__()
        self.attn = nn.MultiheadAttention(embed_dim = dim, num_heads = 4, batch_first = True)
    
    def forward(self, x):
        attn_output, _ = self.attn(x, x, x)
        return attn_output

class DAE(nn.Module):
    def __init__(self, input_dim = 1, hidden_dim = 64, bottleneck_dim = 32):
        super(DAE, self).__init__()  
      
        self.conv1 = nn.Conv1d(in_channels = input_dim, out_channels = 32, kernel_size = 5, padding = 2)
        self.lstm1 = nn.LSTM(input_size = 32, hidden_size = hidden_dim, batch_first = True, bidirectional = False)
        self.attn = Attention(hidden_dim)
        self.bottleneck = nn.Linear(hidden_dim, bottleneck_dim)
        
        self.fc = nn.Linear(bottleneck_dim, hidden_dim)
        self.lstm2 = nn.LSTM(input_size = hidden_dim, hidden_size = hidden_dim, batch_first = True, bidirectional = False)
        self.conv2 = nn.Conv1d(in_channels = hidden_dim, out_channels = input_dim, kernel_size = 5, padding = 2)
    
    def forward(self, x):
        x = self.conv1(x.transpose(1, 2))
        x = F.relu(x).transpose(1, 2)
        x, _ = self.lstm1(x)
        x = self.attn(x)
        x = self.bottleneck(x)
        
        x = self.fc(x)
        x, _ = self.lstm2(x)
        x = self.conv2(x.transpose(1, 2)).transpose(1, 2)
        return x

In [111]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Subset

def split_dataset(dataset, val_split=0.2):
    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(val_split * dataset_size)

    train_indices, val_indices = train_test_split(indices, test_size=val_split, random_state=42)
    train_set = Subset(dataset, train_indices)
    val_set = Subset(dataset, val_indices)

    return train_set, val_set


def DAE_train(model, dataloader, epochs=50, lr=1e-3, device='cuda',
              save_path="../../models/dae_checkpoint.pth",
              final_save_path="../../models/dae_final.pth",
              patience=5,
              batch_size=32): 

    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = CharbonnierLoss()


    train_set, val_set = split_dataset(dataloader.dataset, val_split=0.2)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)


    best_val_loss = float("inf")
    patience_counter = 0 

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0.0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch + 1} / {epochs}") 
        
        for noisy, clean in progress_bar:
            noisy, clean = noisy.to(device), clean.to(device)
            optimizer.zero_grad()
            output = model(noisy)
            loss = criterion(output, clean)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
            progress_bar.set_postfix(train_loss=loss.item())

        avg_train_loss = total_train_loss / len(train_loader)

        model.eval()
        total_val_loss = 0.0

        with torch.no_grad():
            for noisy, clean in val_loader: 
                noisy, clean = noisy.to(device), clean.to(device)
                output = model(noisy)
                val_loss = criterion(output, clean)
                total_val_loss += val_loss.item()

        avg_val_loss = total_val_loss / len(val_loader)


        print(f"Epoch {epoch + 1} / {epochs}, Train Loss: {avg_train_loss:.6f}, Val Loss: {avg_val_loss:.6f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), save_path) 
            print(f"Model checkpoint saved at {save_path} (Loss improved: {best_val_loss:.6f})")
        else:
            patience_counter += 1 
            print(f"⚠ No improvement. Patience: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print(f"⏹ Early Stopping triggered at epoch {epoch + 1}. Best loss: {best_val_loss:.6f}")
            break 

    torch.save(model.state_dict(), final_save_path) # надо заменить на torch.save(model, final_save_path), чтобы сохранять модель вместе со структурой 
    print(f"Final model saved at {final_save_path}")

    return model

In [112]:
def run_inference(model, input_tensor, scaler, chunk_size=500, device='cuda'):
    model.to(device)
    input_tensor = input_tensor.to(device)
    denoised_chunks = []
    
    with torch.no_grad():
        for chunk in torch.split(input_tensor, chunk_size, dim=0):
            denoised_chunk = model(chunk)
            denoised_chunks.append(denoised_chunk.cpu())
    
    denoised_output = torch.cat(denoised_chunks, dim=0)
    return scaler.inverse_transform(denoised_output.numpy().squeeze())


In [100]:
file_paths = [os.path.join(TRAIN_DIR, file) for file in label_df['file'][:100] if file in os.listdir(TRAIN_DIR)]

dataset = TorchTypeDataset(file_paths)
dataloader = DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = True)

In [114]:
dae_model = DAE()

if os.path.exists(CHECKPOINT_PATH):
    dae_model.load_state_dict(torch.load(CHECKPOINT_PATH))
    print(f"Checkpoint loaded from {CHECKPOINT_PATH}")

trained_model = DAE_train(dae_model, dataloader, epochs = 25, lr=1e-3, device='cuda')

Epoch 1 / 25: 100%|██████████| 13539/13539 [01:47<00:00, 125.58it/s, train_loss=0.00997]


Epoch 1 / 25, Train Loss: 0.015667, Val Loss: 0.010784
Model checkpoint saved at ../../models/dae_checkpoint.pth (Loss improved: 0.010784)


Epoch 2 / 25: 100%|██████████| 13539/13539 [01:44<00:00, 129.16it/s, train_loss=0.0163] 


Epoch 2 / 25, Train Loss: 0.013061, Val Loss: 0.009844
Model checkpoint saved at ../../models/dae_checkpoint.pth (Loss improved: 0.009844)


Epoch 3 / 25: 100%|██████████| 13539/13539 [01:51<00:00, 121.81it/s, train_loss=0.0105] 


Epoch 3 / 25, Train Loss: 0.012632, Val Loss: 0.010171
⚠ No improvement. Patience: 1/5


Epoch 4 / 25: 100%|██████████| 13539/13539 [01:48<00:00, 124.54it/s, train_loss=0.00881]


Epoch 4 / 25, Train Loss: 0.012364, Val Loss: 0.009794
Model checkpoint saved at ../../models/dae_checkpoint.pth (Loss improved: 0.009794)


Epoch 5 / 25: 100%|██████████| 13539/13539 [01:47<00:00, 126.41it/s, train_loss=0.0071] 


Epoch 5 / 25, Train Loss: 0.012191, Val Loss: 0.010134
⚠ No improvement. Patience: 1/5


Epoch 6 / 25: 100%|██████████| 13539/13539 [01:47<00:00, 125.79it/s, train_loss=0.0101] 


Epoch 6 / 25, Train Loss: 0.012097, Val Loss: 0.009468
Model checkpoint saved at ../../models/dae_checkpoint.pth (Loss improved: 0.009468)


Epoch 7 / 25: 100%|██████████| 13539/13539 [01:44<00:00, 129.72it/s, train_loss=0.01]   


Epoch 7 / 25, Train Loss: 0.012002, Val Loss: 0.009455
Model checkpoint saved at ../../models/dae_checkpoint.pth (Loss improved: 0.009455)


Epoch 8 / 25: 100%|██████████| 13539/13539 [01:49<00:00, 123.66it/s, train_loss=0.00958]


Epoch 8 / 25, Train Loss: 0.011953, Val Loss: 0.009691
⚠ No improvement. Patience: 1/5


Epoch 9 / 25: 100%|██████████| 13539/13539 [01:45<00:00, 128.28it/s, train_loss=0.00701]


Epoch 9 / 25, Train Loss: 0.011871, Val Loss: 0.009343
Model checkpoint saved at ../../models/dae_checkpoint.pth (Loss improved: 0.009343)


Epoch 10 / 25: 100%|██████████| 13539/13539 [01:47<00:00, 125.52it/s, train_loss=0.00924]


Epoch 10 / 25, Train Loss: 0.011824, Val Loss: 0.009331
Model checkpoint saved at ../../models/dae_checkpoint.pth (Loss improved: 0.009331)


Epoch 11 / 25: 100%|██████████| 13539/13539 [01:45<00:00, 127.99it/s, train_loss=0.00925]


Epoch 11 / 25, Train Loss: 0.011759, Val Loss: 0.009293
Model checkpoint saved at ../../models/dae_checkpoint.pth (Loss improved: 0.009293)


Epoch 12 / 25: 100%|██████████| 13539/13539 [01:48<00:00, 124.46it/s, train_loss=0.00925]


Epoch 12 / 25, Train Loss: 0.011718, Val Loss: 0.009339
⚠ No improvement. Patience: 1/5


Epoch 13 / 25: 100%|██████████| 13539/13539 [01:45<00:00, 128.57it/s, train_loss=0.00871]


Epoch 13 / 25, Train Loss: 0.011694, Val Loss: 0.009336
⚠ No improvement. Patience: 2/5


Epoch 14 / 25: 100%|██████████| 13539/13539 [01:45<00:00, 128.48it/s, train_loss=0.00948]


Epoch 14 / 25, Train Loss: 0.011670, Val Loss: 0.009298
⚠ No improvement. Patience: 3/5


Epoch 15 / 25: 100%|██████████| 13539/13539 [01:44<00:00, 129.23it/s, train_loss=0.00842]


Epoch 15 / 25, Train Loss: 0.011647, Val Loss: 0.009147
Model checkpoint saved at ../../models/dae_checkpoint.pth (Loss improved: 0.009147)


Epoch 16 / 25: 100%|██████████| 13539/13539 [01:45<00:00, 128.81it/s, train_loss=0.0133] 


Epoch 16 / 25, Train Loss: 0.011625, Val Loss: 0.009384
⚠ No improvement. Patience: 1/5


Epoch 17 / 25: 100%|██████████| 13539/13539 [01:45<00:00, 128.71it/s, train_loss=0.0105] 


Epoch 17 / 25, Train Loss: 0.011612, Val Loss: 0.009122
Model checkpoint saved at ../../models/dae_checkpoint.pth (Loss improved: 0.009122)


Epoch 18 / 25: 100%|██████████| 13539/13539 [01:44<00:00, 129.01it/s, train_loss=0.0122] 


Epoch 18 / 25, Train Loss: 0.011598, Val Loss: 0.009396
⚠ No improvement. Patience: 1/5


Epoch 19 / 25: 100%|██████████| 13539/13539 [01:45<00:00, 128.50it/s, train_loss=0.00733]


Epoch 19 / 25, Train Loss: 0.011584, Val Loss: 0.009287
⚠ No improvement. Patience: 2/5


Epoch 20 / 25: 100%|██████████| 13539/13539 [01:44<00:00, 129.34it/s, train_loss=0.0103] 


Epoch 20 / 25, Train Loss: 0.011571, Val Loss: 0.009124
⚠ No improvement. Patience: 3/5


Epoch 21 / 25: 100%|██████████| 13539/13539 [01:43<00:00, 131.24it/s, train_loss=0.00989]


Epoch 21 / 25, Train Loss: 0.011553, Val Loss: 0.009810
⚠ No improvement. Patience: 4/5


Epoch 22 / 25: 100%|██████████| 13539/13539 [01:42<00:00, 132.03it/s, train_loss=0.00905]


Epoch 22 / 25, Train Loss: 0.011545, Val Loss: 0.009316
⚠ No improvement. Patience: 5/5
⏹ Early Stopping triggered at epoch 22. Best loss: 0.009122
Final model saved at ../../models/dae_final.pth


In [115]:
trained_model.eval()
trained_model.to(device)

def preprocess_file(file_path, sequence_length=100):
    df = pd.read_csv(file_path, sep="\\s+", names=["time", "pressure"])
    
    scaler = MinMaxScaler()
    df["pressure"] = scaler.fit_transform(df[["pressure"]])
    
    sequences = []
    for i in range(len(df) - sequence_length):
        seq = df["pressure"].iloc[i : i + sequence_length].values
        sequences.append(seq)

    input_tensor = torch.tensor(np.array(sequences), dtype=torch.float32).unsqueeze(-1)  # (batch, seq_len, 1)

    if "time" in df and len(df["time"]) >= len(sequences):
        time_values = df["time"][:len(sequences)].values
        return input_tensor, scaler, time_values
    else:
        return input_tensor, scaler



# Проверка

In [None]:
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

test_file = '../raw_data/test/1c9db047-e335-46ac-8039-effd8589b25b'
result = preprocess_file(test_file)

In [193]:
if len(result) == 3:
    input_tensor, scaler, time_values = result
else:
    input_tensor, scaler = result
    time_values = None

input_tensor = input_tensor.to(device)

chunk_size = 500  
denoised_chunks = []

with torch.no_grad():
    
    for chunk in torch.split(input_tensor, chunk_size, dim=0):
        denoised_chunk = dae_model(chunk.to(device))  
        denoised_chunks.append(denoised_chunk.cpu()) 

denoised_output = torch.cat(denoised_chunks, dim=0)
denoised_data = scaler.inverse_transform(denoised_output.numpy().squeeze())

In [195]:
orig_df = pd.read_csv(test_file, sep="\\s+", names=["time", "pressure"])

In [196]:
if denoised_data.shape[1] > 1:  
    denoised_data = denoised_data[:, -1].reshape(-1, 1)  

full_denoised = np.concatenate([orig_df["pressure"].values[:100].reshape(-1, 1), denoised_data], axis=0)

orig_df["denoised_pressure"] = full_denoised


Размер итогового `orig_df`: (55129, 3)


In [198]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=orig_df["time"],
    y=orig_df["pressure"],
    mode='lines',
    name='Original Pressure',
    line=dict(color='blue')
))

fig.add_trace(go.Scatter(
    x=orig_df["time"],
    y=orig_df["denoised_pressure"],
    mode='lines',
    name='Denoised Pressure',
    line=dict(color='red')
))

fig.update_layout(
    title="Pressure vs Denoised Pressure Over Time",
    xaxis_title="Time",
    yaxis_title="Pressure",
    legend=dict(x=0, y=1),
)

fig.show()