In [1]:
import os

fault_dir = os.path.join('.', 'datasets','fault')
normal_dir = os.path.join('.', 'datasets','datanormal')

fault_folders = [os.path.join(fault_dir,folder) for folder in os.listdir(fault_dir)]
normal_folders = [os.path.join(normal_dir,folder) for folder in os.listdir(normal_dir)]

In [2]:
import pandas as pd
import numpy as np

fault_files = [os.path.join(files_folder, file)
               for files_folder in fault_folders
               for file in os.listdir(files_folder)]

normal_files = [os.path.join(files_folder, file)
                for files_folder in normal_folders
                for file in os.listdir(files_folder)]


### Load data to DataFrame

In [3]:
from scipy import signal

b, a = signal.butter(3, 0.05)
cc_df = pd.DataFrame(columns=['values', 'sequence', 'step'])

for i, file in enumerate(fault_files):
    df = pd.read_csv(file, header=None)
    df.columns = ['values']
    df['values'] = signal.lfilter(b,a, df['values'])
    df['step'] = np.arange(df.shape[0])
    df['sequence'] = i
    cc_df = pd.concat([cc_df, df])

for i, file in enumerate(normal_files):
    df = pd.read_csv(file, header=None)
    df.columns = ['values']
    df['values'] = signal.lfilter(b,a, df['values'])
    df['step'] = np.arange(df.shape[0])
    df['sequence'] = i + 87
    cc_df = pd.concat([cc_df, df])


### Set Labels

In [4]:
labels = pd.Series(data=[0 for i, _ in enumerate(fault_files)]
                   + [1 for i, _ in enumerate(normal_files)])

sequence_ids = labels.index.tolist()

### Split train and test data

In [5]:
from sklearn.model_selection import train_test_split

train_ids, test_ids, y_train, y_test = train_test_split(sequence_ids, labels)

In [6]:
X_train = cc_df[cc_df['sequence'].isin(train_ids)]
X_test =  cc_df[cc_df['sequence'].isin(test_ids)]


### Feature extraction and selection

In [7]:
from tsfresh import extract_features

extracted_features = extract_features(X_train, column_id='sequence', column_sort='step')
extracted_features.head()

  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
  from pandas import (to_datetime, Int64Index, DatetimeIndex, Period,
Feature Extraction: 100%|██████████| 26/26 [00:22<00:00,  1.15it/s]


Unnamed: 0,values__variance_larger_than_standard_deviation,values__has_duplicate_max,values__has_duplicate_min,values__has_duplicate,values__sum_values,values__abs_energy,values__mean_abs_change,values__mean_change,values__mean_second_derivative_central,values__median,...,values__permutation_entropy__dimension_5__tau_1,values__permutation_entropy__dimension_6__tau_1,values__permutation_entropy__dimension_7__tau_1,values__query_similarity_count__query_None__threshold_0.0,"values__matrix_profile__feature_""min""__threshold_0.98","values__matrix_profile__feature_""max""__threshold_0.98","values__matrix_profile__feature_""mean""__threshold_0.98","values__matrix_profile__feature_""median""__threshold_0.98","values__matrix_profile__feature_""25""__threshold_0.98","values__matrix_profile__feature_""75""__threshold_0.98"
0,0.0,0.0,0.0,0.0,13.297689,78.604465,0.010433,-0.001517,5e-06,-0.00046,...,1.331891,1.549795,1.780047,,2.333063,10.947348,4.144545,3.38393,2.6603,4.480345
1,0.0,0.0,0.0,0.0,-0.84897,224.944531,0.014534,0.000326,-2e-05,-0.008763,...,0.90589,0.970307,1.034836,,3.930029,25.147982,19.583067,23.94168,15.423417,24.201467
2,0.0,0.0,0.0,0.0,-8.463227,180.900401,0.015639,0.001193,1.4e-05,-0.003426,...,0.847259,0.89887,0.950584,,3.304086,22.378055,16.54471,21.030483,10.764408,22.110014
6,0.0,0.0,0.0,0.0,13.309109,79.2001,0.010806,-0.001525,5e-06,-0.004952,...,1.332425,1.553203,1.784105,,2.247519,11.782639,4.4044,3.81054,3.027567,4.875977
8,0.0,0.0,0.0,0.0,-8.197441,180.000343,0.015642,0.001196,1.4e-05,-0.002433,...,0.909991,0.987113,1.064338,,3.243687,22.37283,16.526528,21.022365,10.734034,22.104686


In [8]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(extracted_features)
features_filtered = select_features(extracted_features, y_train)



In [9]:
features_filtered.head(6)

Unnamed: 0,values__spkt_welch_density__coeff_5,values__spkt_welch_density__coeff_8,values__number_cwt_peaks__n_1,values__number_cwt_peaks__n_5,"values__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.2","values__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.2","values__change_quantiles__f_agg_""var""__isabs_True__qh_0.8__ql_0.4","values__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.4",values__large_standard_deviation__r_0.30000000000000004,values__number_peaks__n_3,...,"values__fft_coefficient__attr_""abs""__coeff_48","values__fft_coefficient__attr_""abs""__coeff_52","values__fft_coefficient__attr_""abs""__coeff_46","values__fft_coefficient__attr_""abs""__coeff_42",values__autocorrelation__lag_3,values__quantile__q_0.4,"values__fft_coefficient__attr_""abs""__coeff_43","values__fft_coefficient__attr_""abs""__coeff_40","values__fft_coefficient__attr_""real""__coeff_13","values__fft_coefficient__attr_""real""__coeff_6"
0,0.311048,0.094585,6.0,11.0,9.8e-05,0.000161,0.0001,0.000187,0.0,11.0,...,1.342036,1.250398,1.41714,1.565385,0.985689,-0.030574,1.468636,1.597462,-0.324657,1.165283
1,0.057516,0.040841,6.0,12.0,5.2e-05,7.1e-05,7.4e-05,8.9e-05,1.0,3.0,...,0.258972,0.228371,0.303268,0.323699,1.002735,-0.271557,0.331839,0.329199,-3.328736,-1.457848
2,0.057125,0.04134,8.0,17.0,4.8e-05,7.2e-05,5.8e-05,9e-05,0.0,2.0,...,1.067767,0.969817,1.13546,1.240905,0.998683,-0.24977,1.233607,1.309746,-4.328706,-8.698899
6,0.339865,0.1424,9.0,14.0,0.000108,0.000174,8.5e-05,0.000188,0.0,11.0,...,1.360918,1.26769,1.423255,1.570038,0.985212,-0.031629,1.479629,1.613241,-0.672128,1.199498
8,0.04244,0.054145,14.0,19.0,4.5e-05,7.1e-05,5.8e-05,9e-05,0.0,3.0,...,1.052695,0.968542,1.143582,1.252438,0.998607,-0.25103,1.236666,1.297375,-4.246093,-8.604534
9,0.388963,0.24766,9.0,14.0,0.000152,0.000203,0.000116,0.000201,0.0,10.0,...,1.404032,1.267939,1.419842,1.582131,0.984429,-0.038443,1.460065,1.601313,-1.365755,1.090681


### Fit model

In [10]:
from sklearn.ensemble import RandomForestClassifier

# rc = KNeighborsClassifier(n_neighbors= 6, algorithm='kd_tree', metric='infinity')
rc = RandomForestClassifier(n_estimators=500)
rc.fit(features_filtered, y_train)

RandomForestClassifier(n_estimators=500)

### Feature extraction for test data

In [11]:
test_features_extracted = extract_features(X_test, column_id='sequence', column_sort='step')
impute(test_features_extracted)
test_features_filtered = test_features_extracted[features_filtered.columns]

Feature Extraction: 100%|██████████| 22/22 [00:11<00:00,  1.84it/s]


### Check model score

In [12]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, rc.predict(test_features_filtered))

0.4090909090909091

In [13]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(iterations=100,
                            depth=6,
                            learning_rate=0.01,
                            loss_function='CrossEntropy',
                            verbose=False)

model.fit(features_filtered, y_train)

<catboost.core.CatBoostClassifier at 0x142c7f1a6a0>

In [14]:
from catboost import Pool

test_data = Pool(test_features_filtered, y_test)

accuracy_score(y_test, model.predict(test_features_filtered))

0.4318181818181818