In [1]:

libpath = "../../scripts"

import os
import sys
sys.path.append(libpath)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tsfresh import (extract_features, select_features)
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import settings
from tsfresh.transformers import RelevantFeatureAugmenter, FeatureAugmenter
from tsfresh.feature_extraction.settings import MinimalFCParameters, EfficientFCParameters, ComprehensiveFCParameters
from tsfresh.feature_extraction.settings import from_columns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

from preprocessing import TsfreshDatasetTransformer


In [2]:
def concat_dasasets(set_of_datasets):
    """Concatenate several datasets into one frame whose ids never collide.

    The ``id`` column of every dataset is shifted by a running offset so that
    ids coming from different datasets stay distinct after concatenation.

    Parameters
    ----------
    set_of_datasets : iterable of (name, data, label)
        ``name`` is not used here; ``data`` is a DataFrame with an ``id``
        column; ``label`` is an iterable of labels belonging to ``data``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        All data rows with a fresh 0..n-1 row index, and all labels in the
        order the datasets were given. The input frames are not modified.
    """
    shifted_frames = []
    concated_labels = []
    current_id = 0
    for _name, data, label in set_of_datasets:
        data_copy = data.copy()
        data_copy["id"] += current_id
        shifted_frames.append(data_copy)
        if len(data_copy) > 0:
            # Use the largest id seen so far (not the last row's id), so the
            # next dataset cannot collide even if ids are not sorted.
            current_id = max(current_id, data_copy["id"].max() + 1)
        concated_labels.extend(label)

    # Concatenate once instead of growing the frame inside the loop.
    if shifted_frames:
        concated_dataframe = pd.concat(shifted_frames, ignore_index=True)
    else:
        concated_dataframe = pd.DataFrame([])
    return concated_dataframe, pd.Series(concated_labels)

def rename_idxs(array):
    """Renumber an id column so that gaps left by removed ids are closed.

    The values are scanned left to right. Whenever a value exceeds its
    (already renumbered) predecessor by more than 1, the whole run of equal
    values starting there is rewritten to ``predecessor + 1``. Finally, if the
    first value is positive, the array is shifted so that it starts at 0.

    Parameters
    ----------
    array : array-like with a ``copy`` method (e.g. pandas.Series, numpy.ndarray)
        Id of every row. The input itself is not modified.

    Returns
    -------
    numpy.ndarray
        The renumbered ids; an empty array for empty input.
    """
    a = np.array(array.copy())
    if len(a) == 0:
        # Nothing to renumber; also avoids indexing a[0] on an empty array.
        return a
    i = 1
    while i < len(a):
        if a[i] - a[i-1] > 1:
            # Gap found: collapse the whole run of this id onto predecessor + 1.
            j = i
            constant = a[j]
            replacement = a[i-1] + 1
            while j < len(a) and a[j] == constant:
                a[j] = replacement
                j += 1
            i = j
        else:
            i += 1
    if a[0] > 0:
        a -= a[0]
    return a


In [3]:

# Load the labelled data of several sites; they are concatenated into one dataset below.

# Root folder of the labelled data. Set the ZB_DATA_MARKUP_DIR environment
# variable to run the notebook on another machine; the default keeps the
# original location.
DATA_MARKUP_DIR = os.environ.get(
    "ZB_DATA_MARKUP_DIR",
    "/home/drozdovmk/Projects/ZB/zb-classification/data/data_markup",
)

dataset_path_1 = os.path.join(DATA_MARKUP_DIR, "Classifier_cesis_12_11")
dataset_path_2 = os.path.join(DATA_MARKUP_DIR, "Classifier_demostend_18_11")
dataset_path_3 = os.path.join(DATA_MARKUP_DIR, "hdf5_adaptive")
dataset_path_4 = os.path.join(DATA_MARKUP_DIR, "Classifier_demostend_10_12_24")
data_getter = TsfreshDatasetTransformer()

# dataset_path_3 is read with the "simple directory" reader, the others with the "nested directory" reader.
data_hdf5_long_1, label_hdf5_1 = data_getter.make_tsfresh_structure_from_nested_directory(dataset_path_1)
data_hdf5_long_2, label_hdf5_2 = data_getter.make_tsfresh_structure_from_nested_directory(dataset_path_2)
data_hdf5_long_3, label_hdf5_3 = data_getter.make_tsfresh_structure_from_simple_directory(dataset_path_3)
data_hdf5_long_4, label_hdf5_4 = data_getter.make_tsfresh_structure_from_nested_directory(dataset_path_4)

# (name, data, labels) triples in the form expected by concat_dasasets.
set_of_datasets = (
    ("cesis_data_nested", data_hdf5_long_1, label_hdf5_1),
    ("demostend_data_nested", data_hdf5_long_2, label_hdf5_2),
    ("demostend_data_simple", data_hdf5_long_3, label_hdf5_3),
    ("demostend_device", data_hdf5_long_4, label_hdf5_4)
)

Found unknown label in zone 455 
date: 22/11/2024 14:02:36
Found unknown label in zone 455 
date: 09/12/2024 14:29:34
Found unknown label in zone 455 
date: 09/12/2024 09:49:55


In [4]:
# Merge all datasets into one and show how many signals each label has
data_hdf5, label_hdf5 = concat_dasasets(set_of_datasets)
label_hdf5.value_counts()

wind          698
device        542
hit           370
unknown       236
perelaz       186
saw           151
hit_z         109
hit_g         109
hit_series      9
Name: count, dtype: int64

In [5]:
# Merge the hit sub-types into the single "hit" label
label_hdf5 = label_hdf5.replace(["hit_g", "hit_z"], "hit")

# Labels to remove from the dataset
TO_DROP = ["unknown", "hit_series",]
mask_drop = label_hdf5.isin(TO_DROP)
# The label index is used as the signal id here
# (assumes ids run 0..n-1 in label order — TODO confirm against the loaders).
drop_idxs = label_hdf5[mask_drop].index  # ids of the dropped labels

# Rebind instead of mutating in place, so the lineage of each name stays explicit.
data_hdf5 = data_hdf5.drop(
    index=data_hdf5[data_hdf5["id"].isin(drop_idxs)].index
)
label_hdf5 = label_hdf5.drop(index=drop_idxs)

# Optional class balancing (disabled):
# min_count = label_hdf5.value_counts().min()
# balanced_indexes = (label_hdf5.groupby(label_hdf5).
#                    apply(lambda x: x.sample(min_count))).reset_index()["level_1"]
# data_hdf5 = data_hdf5[data_hdf5["id"].isin(balanced_indexes)]
# label_hdf5 = label_hdf5.loc[balanced_indexes]

# Renumber the id column (dropping labels left gaps in it)
data_hdf5 = data_hdf5.assign(id=rename_idxs(data_hdf5["id"]))

data_hdf5 = data_hdf5.reset_index(drop=True)
label_hdf5 = label_hdf5.reset_index(drop=True)

# Later cells pair labels with ids, so there must be exactly one label per id.
assert data_hdf5["id"].nunique() == len(label_hdf5), (
    "number of signal ids does not match number of labels"
)



In [53]:

class FilterColumnCreator(BaseEstimator, TransformerMixin):
    """Add derived columns computed per signal from the ``signal_raw`` column.

    Parameters
    ----------
    transformations : iterable of (name, transformation) pairs, optional
        For each pair a column ``name`` is added, holding the result of
        ``groupby("id")["signal_raw"].transform(transformation)``.
        With ``None`` (the default) the input is returned unchanged.
    """

    def __init__(self, transformations=None):
        self.transformations = transformations

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X:pd.DataFrame, transformations=None):
        """Return ``X`` with one extra column per configured transformation.

        ``X`` needs ``id`` and ``signal_raw`` columns. The ``transformations``
        argument is accepted for interface compatibility but is not used; the
        value given to the constructor applies. When no transformations are
        configured the very same ``X`` object is returned (no copy).
        """
        # Identity check: `== None` would misbehave if an array-like were configured.
        if self.transformations is None:
            return X
        X_transformed = X.copy()
        for name, transformation in self.transformations:
            X_transformed[name] = (
                X_transformed.
                groupby("id")["signal_raw"].
                transform(transformation)
            )
        return X_transformed

class FourierColumnCreator(BaseEstimator, TransformerMixin):
    """Turn every signal into a log-binned amplitude spectrum in long format.

    Parameters
    ----------
    augmenter : object, optional
        Stored as ``self.augmenter``; not used by this class itself.
    n_bins : int, default 100
        Number of log2-spaced frequency bins used by ``transform``.
    fs : float, default 1000
        Sampling frequency passed to the FFT frequency grid in ``transform``.
    """

    def __init__(self, augmenter=None, n_bins=100, fs=1000):
        self.augmenter = augmenter
        self.n_bins = n_bins
        self.fs = fs

    def binned_fourier_transform(self, signal, n_bins=70, fs=1000):
        """Return the mean single-sided FFT amplitude in log2-spaced bins.

        The bins span from the first non-zero FFT frequency to the highest
        kept frequency. Bins that contain no FFT frequency are skipped, so the
        result may hold fewer than ``n_bins`` values. The signal must be long
        enough to have a second FFT frequency (``fftfreqz[1]`` is read).

        Returns
        -------
        numpy.ndarray
            One mean amplitude per non-empty bin, in increasing frequency order.
        """
        N = len(signal)
        Ts = 1/fs
        fftfreqz = np.fft.fftfreq(N, Ts)[:N//2]
        signal_fft = 2/N * np.abs(np.fft.fft(signal))[:N//2]
        bin_edges = np.logspace(start=np.log2(fftfreqz[1]),
                                stop=np.log2(fftfreqz[-1]),
                                num=n_bins+1,
                                base=2)
        binned_fft = []
        for i in range(n_bins):
            # NOTE(review): bins are half-open [edge_i, edge_i+1), so the top
            # frequency is included only if rounding puts the last edge above
            # it — confirm whether that is intended.
            bin_mask = (fftfreqz >= bin_edges[i]) & (fftfreqz < bin_edges[i + 1])
            if np.any(bin_mask):
                binned_fft.append(np.mean(signal_fft[bin_mask]))
        return np.array(binned_fft)

    def fit(self, X=None, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X:pd.DataFrame):
        """Build a long table with columns ``id``, ``freq_num``, ``signal_binned_fft``.

        ``X`` needs ``id`` and ``signal_raw`` columns. For each id (in order of
        first appearance) the binned spectrum of its ``signal_raw`` values is
        computed; ``freq_num`` numbers the bins of that spectrum from 0.
        """
        columns = ["id", "freq_num", "signal_binned_fft"]
        indexes = X["id"].unique()
        if len(indexes) == 0:
            return pd.DataFrame(columns=columns)

        result = (
            X.groupby("id")
            ["signal_raw"].
            apply(func=lambda x : self.binned_fourier_transform(
                signal=x,
                n_bins=self.n_bins,
                fs=self.fs,
                ))
            )
        # Collect the per-id frames and concatenate once; growing a frame
        # inside the loop copies all previous rows on every iteration.
        frames = []
        for idx in indexes:
            temp_data = result[idx]
            frames.append(pd.DataFrame({
                'id': [int(idx)] * len(temp_data),
                'freq_num': range(len(temp_data)),
                'signal_binned_fft': temp_data
            }))
        return pd.concat(frames, ignore_index=True)
    
class FeaturesToTsfresh():
    """Supplies the tsfresh feature-calculator settings used for feature extraction."""

    def __init__(self, ):
        pass

    def get_time_domain_features(self):
        """Return per-column calculator settings for the time-domain columns.

        The result maps each column name to a dict of calculator name ->
        parameters (``None`` for calculators configured without parameters).
        """
        spectral_moments = [
            {"aggtype": moment}
            for moment in ("centroid", "variance", "skew", "kurtosis")
        ]
        raw_settings = {
            "skewness": None,
            "standard_deviation": None,
            "length": None,
            "fft_aggregated": spectral_moments,
        }
        std32_settings = {"standard_deviation": None}
        mean256_settings = {"standard_deviation": None, "kurtosis": None}
        return {
            "signal_raw": raw_settings,
            "signal_std32": std32_settings,
            "signal_mean256": mean256_settings,
        }

    def get_freq_domain_features(self):
        """Return ``EfficientFCParameters`` with a fixed set of calculators removed."""
        excluded_calculators = (
            "fft_coefficient",
            "cwt_coefficients",
            "fft_aggregated",
            "number_cwt_peaks",
            "spkt_welch_density",
            "fourier_entropy",
            "query_similarity_count",
            "symmetry_looking",
            "large_standard_deviation",
        )
        freq_settings = EfficientFCParameters()
        for calculator_name in excluded_calculators:
            # A default is given to pop, so an absent calculator is not an error.
            freq_settings.pop(calculator_name, None)
        return freq_settings
        

In [20]:
# NOTE(review): this import rebinds FeaturesToTsfresh, so after this cell the name
# refers to preprocessing.FeaturesToTsfresh and no longer to the class defined
# earlier in this notebook.
from preprocessing import FeaturesToTsfresh, Preprocessor

In [54]:

# Build the time-domain and frequency-domain tables with the transformers defined in this notebook.
# FilterColumnCreator is created without transformations, so its transform returns data_hdf5 as is.
filter_column_transformer = FilterColumnCreator()
fourier_column_transformer = FourierColumnCreator()

df_ts_time = filter_column_transformer.transform(data_hdf5)
df_ts_freq = fourier_column_transformer.transform(data_hdf5)

In [9]:
# Run the project's Preprocessor on the cleaned data.
# Note: this assigns df_ts_freq and df_ts_time, replacing any earlier values of those names.
pr = Preprocessor()
long_signal, df_ts_freq, df_ts_time = pr.transform(data_hdf5, from_numpy=False)

In [17]:
# One (feature-less) row per signal id; the pipeline's augmenters attach the features.
X_idxs = pd.DataFrame(index=df_ts_freq["id"].unique())

# Labels are paired with ids by position, and later cells select time-series rows
# through y_train.index / y_test.index, so the ids must equal the label index.
assert np.array_equal(X_idxs.index, label_hdf5.index), (
    "signal ids and label index are not aligned"
)

# Fixed seed so that the split (and the reported metrics) can be reproduced.
X_idxs_train, X_idxs_test, y_train, y_test = train_test_split(
    X_idxs, label_hdf5, random_state=42
)

In [21]:
def create_pipeline(n_significant=None, random_state=42):
    """Build the feature-extraction + random-forest classification pipeline.

    The pipeline unites two tsfresh augmenters — a ``RelevantFeatureAugmenter``
    over the frequency-domain series (sorted by ``freq_num``) and a
    ``FeatureAugmenter`` over the time-domain series (sorted by ``time``) —
    and feeds their features to a ``RandomForestClassifier``.

    The augmenters are created without a time-series container; set
    ``augmenter__timeseries_container`` on each extractor before calling
    ``fit`` or ``predict``.

    Parameters
    ----------
    n_significant : int, optional
        Passed to ``RelevantFeatureAugmenter``. When omitted, the number of
        distinct labels in the notebook-level ``label_hdf5`` is used.
    random_state : int or None, default 42
        Seed of the random forest, so repeated runs build the same model.
        Pass ``None`` for an unseeded forest.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``feature_extraction`` and ``model``.
    """
    if n_significant is None:
        # Fall back to the notebook-level label series.
        n_significant = label_hdf5.nunique()

    fft_features_extractor = Pipeline([
            (
                  'augmenter', RelevantFeatureAugmenter(
                        column_id='id',
                        column_sort='freq_num',
                        default_fc_parameters= FeaturesToTsfresh().get_freq_domain_features(),
                        multiclass=True,
                        n_significant=n_significant,
                        disable_progressbar=True
                        )
            ),
      ])
    time_features_extractor = Pipeline([
                (
                    'augmenter', FeatureAugmenter(
                            column_id='id',
                            column_sort='time',
                            kind_to_fc_parameters= FeaturesToTsfresh().get_time_domain_features(),
                            disable_progressbar=True
                            )
                ),
        ])

    feature_union = FeatureUnion([
        ("fft_features_extractor", fft_features_extractor),
        ("time_features_extractor", time_features_extractor)
        ])

    result_ppl = Pipeline([
        ("feature_extraction", feature_union),
        ("model", RandomForestClassifier(
                class_weight = "balanced_subsample",
                n_estimators = 100,
                max_depth = 12,
                min_samples_leaf=5,
                random_state=random_state))
        ])
    return result_ppl

In [22]:

# Keep only the time-series rows of the training signals
# (the index values of y_train are used as signal ids).
df_ts_freq_train = df_ts_freq[df_ts_freq["id"].isin(y_train.index)]
df_ts_time_train = df_ts_time[df_ts_time["id"].isin(y_train.index)]


# Fresh, unfitted pipeline
result_ppl = create_pipeline()

# Point each augmenter at the training time series it extracts features from
result_ppl["feature_extraction"]["fft_features_extractor"].set_params(
      augmenter__timeseries_container=df_ts_freq_train
      );
result_ppl["feature_extraction"]["time_features_extractor"].set_params(
      augmenter__timeseries_container=df_ts_time_train
      );


In [23]:
# Fit on the training ids, then predict the same ids to measure the training-set fit
result_ppl.fit(X_idxs_train, y_train)
train_pred = result_ppl.predict(X_idxs_train)


In [25]:
# Per-class metrics on the training set
print(classification_report(y_train, train_pred))

              precision    recall  f1-score   support

      device       1.00      1.00      1.00       407
         hit       0.99      0.98      0.99       442
     perelaz       0.93      0.95      0.94       135
         saw       0.99      0.98      0.99       105
        wind       0.98      0.99      0.99       534

    accuracy                           0.99      1623
   macro avg       0.98      0.98      0.98      1623
weighted avg       0.99      0.99      0.99      1623



In [26]:

# Keep only the time-series rows of the test signals
# (the index values of y_test are used as signal ids).
df_ts_freq_test = df_ts_freq[df_ts_freq["id"].isin(y_test.index)]
df_ts_time_test = df_ts_time[df_ts_time["id"].isin(y_test.index)]

# Switch both augmenters of the fitted pipeline to the test time series before predicting
result_ppl["feature_extraction"]["fft_features_extractor"].set_params(augmenter__timeseries_container=df_ts_freq_test);
result_ppl["feature_extraction"]["time_features_extractor"].set_params(augmenter__timeseries_container=df_ts_time_test);


In [27]:

# Predict the held-out ids with the pipeline fitted on the training ids
test_pred = result_ppl.predict(X_idxs_test)

In [28]:
# NOTE(review): roc_auc_score is imported here but not used in the cells visible in this notebook.
from sklearn.metrics import roc_auc_score

# Per-class metrics on the held-out test set
print(classification_report(y_test, test_pred))

              precision    recall  f1-score   support

      device       0.99      1.00      0.99       135
         hit       0.95      0.96      0.96       146
     perelaz       0.86      0.86      0.86        51
         saw       0.93      0.89      0.91        46
        wind       0.96      0.95      0.95       164

    accuracy                           0.95       542
   macro avg       0.94      0.93      0.94       542
weighted avg       0.95      0.95      0.95       542



## Обучаем итоговую модель

In [30]:

# Final model: a fresh pipeline fitted on all signals (train and test together)
result_ppl = create_pipeline()
result_ppl["feature_extraction"]["fft_features_extractor"].set_params(
      augmenter__timeseries_container=df_ts_freq
      );
result_ppl["feature_extraction"]["time_features_extractor"].set_params(
      augmenter__timeseries_container=df_ts_time
      );

data_idxs = pd.DataFrame(index=data_hdf5["id"].unique())
result_ppl.fit(data_idxs, label_hdf5)

# Replace the training time series held by BOTH augmenters with a tiny placeholder,
# so the fitted pipeline does not carry the whole training data around
# (previously only the FFT augmenter was reset and the time augmenter kept df_ts_time).
# Real containers must be set on both augmenters again before predicting.
result_ppl["feature_extraction"]["fft_features_extractor"].set_params(
      augmenter__timeseries_container=pd.DataFrame(index=[0])
      );
result_ppl["feature_extraction"]["time_features_extractor"].set_params(
      augmenter__timeseries_container=pd.DataFrame(index=[0])
      );

In [32]:
import os
import joblib

# Where the fitted pipeline is stored (relative to the notebook's folder)
MODEL_PATH = '../../RnD/classifier_v3_rf/pipeline2.pkl'

# Make sure the target folder exists so that the dump cannot fail on a fresh checkout
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(result_ppl, MODEL_PATH)

['../../RnD/classifier_v3_rf/pipeline2.pkl']