Ноутбук с применением PyTorch для классификации

In [214]:
libpath = "../../scripts"

import os
import sys

import numpy as np  
import pandas as pd
from functools import cache

sys.path.append(libpath)
from preprocessing import TsfreshDatasetTransformer

from torch.utils.data import Dataset, DataLoader, random_split
import torch
import torch.nn as nn
import torch.nn.functional as F


In [215]:
class TorchDataset(Dataset):
    """
    PyTorch ``Dataset`` wrapper around directories stored in the tsfresh layout.

    Every source directory is loaded through ``TsfreshDatasetTransformer`` as a
    ``(data, labels)`` pair. ``data`` is a long-format frame in which the rows
    of one sample share the same value in the ``id`` column and carry the
    signal in ``signal_raw``; ``labels`` holds one class name per sample.
    All sources are concatenated, optionally filtered / renamed / balanced,
    and exposed as ``(signal_tensor, class_id)`` items.
    """

    def __init__(self, paths_to_nested: list[str], old_paths: list[str] = None,
                 labels_to_delete: list[str] = None, dict_to_rename: dict[str, str] = None,
                 balanced_classes: bool = False, output_size: int = None):
        """
        Load, merge and post-process the datasets.

        Args:
            paths_to_nested: directories read with
                ``make_tsfresh_structure_from_nested_directory``.
            old_paths: optional directories read with
                ``make_tsfresh_structure_from_simple_directory``.
            labels_to_delete: class names whose samples are removed.
            dict_to_rename: ``{old_name: new_name}`` mapping applied to labels.
            balanced_classes: if true, every class is cut down to the size of
                the smallest one.
            output_size: if set, every signal is resampled to this length with
                adaptive average pooling.
        """
        super().__init__()
        self.paths_to_nested = paths_to_nested
        self.old_paths = old_paths
        self.data_getter = TsfreshDatasetTransformer()
        self.output_size = output_size
        self.adaptive_average_pooling1d = nn.AdaptiveAvgPool1d(self.output_size)

        self.list_of_datasets = [
            self.data_getter.make_tsfresh_structure_from_nested_directory(path)
            for path in paths_to_nested
        ]
        if old_paths:
            self.list_of_datasets.extend(
                self.data_getter.make_tsfresh_structure_from_simple_directory(path)
                for path in old_paths
            )
        self.data_hdf5, self.label_hdf5 = self.concat_datasets()

        if labels_to_delete:
            self.drop_classes(labels_to_delete)
        if dict_to_rename:
            self.rename_classes(dict_to_rename)

        # Dropping classes leaves holes in the id sequence; close them so that
        # ids line up again with the freshly reset label index.
        self.data_hdf5["id"] = self.rename_idxs(self.data_hdf5["id"])
        self.data_hdf5.reset_index(drop=True, inplace=True)
        self.label_hdf5.reset_index(drop=True, inplace=True)
        if balanced_classes:
            self.truncate_classes()

        # Class ids follow the order of first appearance of each label.
        self.id_to_label = dict(enumerate(self.label_hdf5.unique()))
        self.label_to_id = {label: idx for idx, label in self.id_to_label.items()}
        # After truncation the label index is no longer contiguous, so the
        # positional index of an item is translated through this lookup.
        self.indexes = self.label_hdf5.index

    def __getitem__(self, index):
        """
        Return ``(x, y)`` for the ``index``-th sample.

        ``x`` is a float32 tensor with the sample's ``signal_raw`` values
        (pooled to ``output_size`` points when it is set); ``y`` is the integer
        class id taken from ``label_to_id``.
        """
        sample_id = self.indexes[index]
        # NOTE: this is a linear scan over the whole frame for every item.
        rows = self.data_hdf5["id"] == sample_id
        signal = self.data_hdf5.loc[rows, "signal_raw"].to_numpy()
        x = torch.as_tensor(signal, dtype=torch.float32)
        if self.output_size:
            x = self.transform(x)
        y = self.label_to_id[self.label_hdf5.loc[sample_id]]
        return x, y

    def __len__(self):
        """Return the number of samples (one label per sample)."""
        return len(self.label_hdf5)

    def transform(self, x):
        """Resample ``x`` with adaptive average pooling; 1-D input gets a leading axis."""
        if x.dim() == 1:
            x = x[None, :]
        return self.adaptive_average_pooling1d(x)

    def concat_datasets(self):
        """
        Merge all loaded datasets into one data frame and one label series.

        The ``id`` column of each dataset is shifted so that ids stay unique
        across datasets. The input frames are not modified.

        Returns:
            ``(data, labels)``: the concatenated frame and a ``pd.Series`` of
            labels.
        """
        frames = []
        labels = []
        id_offset = 0
        for data, label in self.list_of_datasets:
            shifted = data.copy()
            shifted["id"] += id_offset
            frames.append(shifted)
            labels.extend(label)
            # An empty dataset contributes no ids and must not move the offset.
            if not shifted.empty:
                id_offset = shifted["id"].max() + 1

        # One concat at the end instead of re-copying the frame per dataset.
        merged = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame([])
        merged_labels = pd.Series(labels) if labels else pd.Series(labels, dtype=object)
        return merged, merged_labels

    def rename_idxs(self, array):
        """
        Re-number the ``id`` column after some classes were removed.

        Runs of equal ids that are separated from the previous id by a gap
        larger than one are pulled down so the gap becomes exactly one, and the
        result is shifted to start at zero.

        Returns:
            A new NumPy array; the input is left untouched.
        """
        ids = np.array(array)  # np.array copies, the caller's data stays intact
        if ids.size == 0:
            return ids

        i = 1
        while i < len(ids):
            if ids[i] - ids[i - 1] > 1:
                run_value = ids[i]
                new_value = ids[i - 1] + 1
                j = i
                # Rewrite the whole run that shares the old id.
                while j < len(ids) and ids[j] == run_value:
                    ids[j] = new_value
                    j += 1
                i = j
            else:
                i += 1
        if ids[0] > 0:
            ids -= ids[0]
        return ids

    def drop_classes(self, labels_to_delete: list[str]):
        """Remove, in place, every sample whose label is in ``labels_to_delete``."""
        drop_mask = self.label_hdf5.isin(labels_to_delete)
        # The label index doubles as the sample id in the data frame.
        drop_ids = self.label_hdf5[drop_mask].index

        self.data_hdf5.drop(
            index=self.data_hdf5[self.data_hdf5["id"].isin(drop_ids)].index,
            inplace=True
        )
        self.label_hdf5.drop(index=drop_ids, inplace=True)

    def rename_classes(self, dict_to_rename: dict[str, str]):
        """Apply the ``{old: new}`` label replacements one after another, in place."""
        for old_name, new_name in dict_to_rename.items():
            self.label_hdf5.replace(old_name, new_name, inplace=True)

    def truncate_classes(self):
        """
        Balance the classes by keeping only the first ``k`` samples of each,
        where ``k`` is the size of the smallest class.
        """
        if self.label_hdf5.empty:
            return
        per_class = self.label_hdf5.value_counts().min()

        # Keep the index's native integer width: a narrow cast such as uint16
        # would silently wrap sample ids above 65535.
        kept = np.sort(np.concatenate([
            self.label_hdf5.index[self.label_hdf5 == name].to_numpy()[:per_class]
            for name in self.label_hdf5.unique()
        ]))
        self.label_hdf5 = self.label_hdf5.loc[kept]
        self.data_hdf5 = self.data_hdf5[self.data_hdf5["id"].isin(kept)]


        

In [202]:
# Build the dataset used by the rest of the notebook:
#   * four directories are passed as `paths_to_nested` and one as `old_paths`
#     (the class reads these two groups with different loader methods);
#   * samples labelled "unknown" or "hit_series" are dropped;
#   * "hit_g" and "hit_z" are merged into a single "hit" class;
#   * classes are left unbalanced;
#   * every signal is resampled to 1000 points.
# NOTE(review): the paths are absolute and machine-specific — adjust them
# before running this cell elsewhere.
torchDataset = TorchDataset(
    paths_to_nested=["/home/drozdovmk/Projects/ZB/data/data_markup/cesis_nabor_0",
                     "/home/drozdovmk/Projects/ZB/data/data_markup/demostend_0",
                     "/home/drozdovmk/Projects/ZB/data/data_markup/samara_wind",
                     "/home/drozdovmk/Projects/ZB/data/data_markup/Kashira_13may"],
    old_paths=["/home/drozdovmk/Projects/ZB/data/data_markup/hdf5_adaptive"],
    labels_to_delete=["unknown", "hit_series"],
    dict_to_rename={"hit_g" : "hit", "hit_z": "hit"},
    balanced_classes = False,
    output_size=1000
);



Found unknown label in zone 455 
date: 23/12/2024 14:57:52
Found unknown label in zone 737 
date: 24/12/2024 05:47:32
Found unknown label in zone 737 
date: 24/12/2024 05:51:41
Found unknown label in zone 737 
date: 24/12/2024 14:41:29
Found unknown label in zone 737 
date: 24/12/2024 14:41:39
Found unknown label in zone 737 
date: 24/12/2024 06:08:46
Found unknown label in zone 737 
date: 24/12/2024 16:17:34
Found unknown label in zone 737 
date: 24/12/2024 16:17:49
Found unknown label in zone 775 
date: 13/05/2025 11:39:49
Found unknown label in zone 775 
date: 13/05/2025 12:40:28
Found unknown label in zone 775 
date: 13/05/2025 12:40:34
Found unknown label in zone 775 
date: 13/05/2025 12:40:40
Found unknown label in zone 775 
date: 13/05/2025 12:40:52
Found unknown label in zone 610 
date: 13/05/2025 05:57:58
Found unknown label in zone 610 
date: 13/05/2025 14:04:13
Found unknown label in zone 610 
date: 13/05/2025 14:23:49
Found unknown label in zone 610 
date: 13/05/2025 14:25:

In [216]:
# Split the dataset 70 / 15 / 15 into train / validation / test parts.
# The test part takes the remainder so the three sizes always sum to the
# dataset length despite the integer truncation above it.
train_size = int(0.7 * len(torchDataset))  # 70% for training
val_size = int(0.15 * len(torchDataset))   # 15% for validation
test_size = len(torchDataset) - train_size - val_size  # the rest for testing

# A fixed seed keeps the split identical between notebook runs; without it
# samples move between train and test every time the cell is re-executed.
train_dataset, val_dataset, test_dataset = random_split(
    torchDataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

In [220]:
# Training batches are reshuffled every epoch so the optimizer does not see
# the samples in the same fixed order each time; validation order is kept
# deterministic.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)

In [206]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader

# Training-loop template. The model, the loss and the optimizer are still
# placeholders and must be filled in before this cell can run.
model = ...  # TODO: your model (nn.Module)
criterion = ...  # TODO: loss function (e.g. nn.CrossEntropyLoss())
optimizer = ...  # TODO: optimizer (e.g. optim.Adam(model.parameters()))

# Use the loaders built earlier in the notebook under the names the loop
# expects, so training, validation and the epoch summary all agree.
train_loader = train_dataloader
val_loader = val_dataloader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 10
log_every = 100  # print a progress line every `log_every` batches

for epoch in range(num_epochs):
    # Training mode
    model.train()
    running_loss = 0.0  # loss over the current logging window
    epoch_loss = 0.0    # loss over the whole epoch

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        # Move the batch to the selected device (GPU/CPU)
        inputs, targets = inputs.to(device), targets.to(device)

        # Reset the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        # Statistics
        batch_loss = loss.item()
        running_loss += batch_loss
        epoch_loss += batch_loss

        # Progress report
        if batch_idx % log_every == log_every - 1:
            print(f'Epoch: {epoch+1}, Batch: {batch_idx+1}, Loss: {running_loss / log_every:.4f}')
            running_loss = 0.0

    # Validation after the epoch
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, targets).item()

    # Average over the real number of batches; the window accumulator above is
    # reset during the epoch and cannot be used for the epoch mean.
    train_batches = max(len(train_loader), 1)
    val_batches = max(len(val_loader), 1)
    print(f'Epoch {epoch+1} completed. Train Loss: {epoch_loss / train_batches:.4f}, Val Loss: {val_loss / val_batches:.4f}')

print('Training finished!')

device(type='cuda')

In [17]:
import torch

# Scratch check: extend a row on the right by one element, filling the new
# position with the row's current last value.
x = torch.Tensor([[1, 2, 3, 4, 5]])
torch.nn.functional.pad(x, pad=(0, 1), value=x[0, -1])

tensor([[1., 2., 3., 4., 5., 5.]])

In [14]:
# Last element of the first row, as a 0-dim tensor.
x[0, -1]

tensor(5.)

In [None]:
ф