<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Structure-cheat-sheet" data-toc-modified-id="Structure-cheat-sheet-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Structure cheat sheet</a></span></li><li><span><a href="#Data-structure" data-toc-modified-id="Data-structure-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Data structure</a></span></li><li><span><a href="#get-features" data-toc-modified-id="get-features-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>get features</a></span></li></ul></div>

# Basics

## Structure cheat sheet

1. func: train data load (in the following order)
    1. read the descriptive dataframe from the feature-pipeline
    2. extract the features from the feature objects that are labeled as train dataset in the dataframe
    3. create numpy feature array for the processing pipeline
2. preprocessing
    1. Transformation (any combination of the following)
        + log-transform
        + PCA
        + others
    2. Scaling (one of the following)
        + StandardScaler
        + MinMaxScaler
3. Unsupervised Clustering
    1. Estimate initial hyperparameter
    2. Create grid over various hyperparameters
    3. Train all and choose the best according to metric
    
    
In all steps, the cluster-recorder object (possibly a dataframe row) will record all meta-information, such as the hyperparameters.

## Data structure

There are multiple degrees of freedom in the data:

1. Signal to noise ratio (SNR)
2. Machine type
    1. pump
    2. fan
    3. valve (solenoid)
    4. slider
3. Machine ID
    1. four different machine IDs
    
The pipeline will be applied to fixed SNR, fixed machine type and fixed ID

## get features

Get the descriptive dataframe for the features.

The descriptive dataframe contains all IDs of the pump. We will focus on ID '00' for now, since the modeling phase is separated per SNR, per machine and per ID anyway.

class: 
+ uni\_\<model\>
attributes:
+ default threshold
+ roc_auc
methods:
+ fit
+ predict
+ predict_score
+ eval_roc_auc

In [3]:

#===============================================
# Basic Imports
import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from tqdm import tqdm
# Apply seaborn's default plot styling.
sns.set()

# Relative path prefix; it is forwarded to the loaders via the task dicts
# (key 'BASE_FOLDER') and used to build the dataset paths.
BASE_FOLDER = '../../'
# IPython magics: run the helper scripts inside this notebook's namespace (-i),
# so names they define become available to the cells below.
# NOTE(review): `load_data`, used by Pipe.get_data, is presumably defined by one
# of these scripts — confirm.
# NOTE(review): the paths use Windows-style backslashes while BASE_FOLDER uses
# forward slashes — verify these magics resolve on non-Windows hosts.
%run -i ..\..\utility\feature_extractor\JupyterLoad_feature_extractor.py
%run -i ..\..\utility\modeling\JupyterLoad_modeling.py


#===============================================
# Define the Model classes
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

from sklearn.metrics import roc_auc_score

class uni_EllipticEnvelope(EllipticEnvelope):
    """EllipticEnvelope exposing the unified ``uni_<model>`` interface.

    On top of the inherited ``fit`` and ``predict`` this adds
    ``predict_score`` and ``eval_roc_auc``.

    Parameters
    ----------
    store_precision, assume_centered, support_fraction, contamination, random_state
        Forwarded unchanged to ``EllipticEnvelope.__init__``.
    def_threshold : default 0
        Default decision threshold. It is only stored on the instance; the
        inherited ``predict`` is not modified to use it.

    Attributes
    ----------
    roc_auc
        ``None`` until ``eval_roc_auc`` has been called, afterwards the most
        recently computed ROC AUC.
    """

    def __init__(self,
                 store_precision=True,
                 assume_centered=False,
                 support_fraction=None,
                 contamination=0.1,
                 random_state=None,
                 def_threshold=0):

        super().__init__(store_precision=store_precision,
                         assume_centered=assume_centered,
                         support_fraction=support_fraction,
                         contamination=contamination,
                         random_state=random_state)

        self.def_threshold = def_threshold
        self.roc_auc = None

    # fit inherited
    # predict inherited

    def predict_score(self, data):
        """Return the continuous scores ``self.decision_function(data)``."""
        return self.decision_function(data)

    def eval_roc_auc(self, data_test, y_true):
        """Compute, store and return the ROC AUC of the scores on ``data_test``.

        Parameters
        ----------
        data_test
            Samples to score with ``predict_score``.
        y_true
            Ground-truth labels passed as first argument to ``roc_auc_score``.

        Returns
        -------
        The value of ``roc_auc_score(y_true, scores)``; the same value is kept
        in ``self.roc_auc`` so the attribute reflects the latest evaluation.
        """
        self.roc_auc = roc_auc_score(y_true, self.predict_score(data_test))
        return self.roc_auc

class uni_IsolationForest(IsolationForest):
    """IsolationForest exposing the unified ``uni_<model>`` interface.

    ``fit`` and ``predict`` are inherited; the constructor is inherited
    unchanged. ``predict_score`` and ``eval_roc_auc`` are added so that the
    model can be evaluated the same way as ``uni_EllipticEnvelope``.
    """

    # Default decision threshold; stored only, the inherited predict ignores it.
    def_threshold = 0
    # Most recent ROC AUC computed by eval_roc_auc; None until evaluated.
    roc_auc = None

    def predict_score(self, data):
        """Return the continuous scores ``self.decision_function(data)``."""
        return self.decision_function(data)

    def eval_roc_auc(self, data_test, y_true):
        """Compute, store (``self.roc_auc``) and return the ROC AUC on ``data_test``.

        ``y_true`` is passed as first argument to ``roc_auc_score`` and the
        scores from ``predict_score(data_test)`` as second.
        """
        self.roc_auc = roc_auc_score(y_true, self.predict_score(data_test))
        return self.roc_auc

class uni_OneClassSVM(OneClassSVM):
    """OneClassSVM exposing the unified ``uni_<model>`` interface.

    ``fit`` and ``predict`` are inherited; the constructor is inherited
    unchanged. ``predict_score`` and ``eval_roc_auc`` are added so that the
    model can be evaluated the same way as ``uni_EllipticEnvelope``.
    """

    # Default decision threshold; stored only, the inherited predict ignores it.
    def_threshold = 0
    # Most recent ROC AUC computed by eval_roc_auc; None until evaluated.
    roc_auc = None

    def predict_score(self, data):
        """Return the continuous scores ``self.decision_function(data)``."""
        return self.decision_function(data)

    def eval_roc_auc(self, data_test, y_true):
        """Compute, store (``self.roc_auc``) and return the ROC AUC on ``data_test``.

        ``y_true`` is passed as first argument to ``roc_auc_score`` and the
        scores from ``predict_score(data_test)`` as second.
        """
        self.roc_auc = roc_auc_score(y_true, self.predict_score(data_test))
        return self.roc_auc


#===============================================
# define the Pipeline Class
import pickle
from datetime import datetime
import time

class Pipe(object):
    """Pipeline object: load data, preprocess, fit a model, evaluate, pickle.

    Parameters
    ----------
    preprocessing_steps : iterable of (callable, dict) or None
        Each pair is instantiated as ``callable(**dict)``; the resulting
        objects must offer ``fit_transform`` and ``transform``. ``None``
        means "no preprocessing".
    modeling_step : (callable, dict)
        The model is instantiated as ``callable(**dict)``; it must offer
        ``fit``, ``predict``, ``predict_score`` and ``eval_roc_auc``.

    Attributes
    ----------
    roc_auc
        ``None`` until ``evaluate`` has run, then the value returned by the
        model's ``eval_roc_auc``.
    preproc_steps
        List of instantiated preprocessing objects.
    model
        The instantiated model.
    filepath
        Target of ``to_pickle``; only set once ``update_filepath`` has run.
    """

    def __init__(self, preprocessing_steps=None, modeling_step=None):
        # instantiate evaluating parameters
        self.roc_auc = None

        # instantiate the preprocessing steps (None -> empty chain)
        if preprocessing_steps is None:
            preprocessing_steps = []
        self.preproc_steps = [step(**kwargs) for step, kwargs in preprocessing_steps]

        # create the predictive model
        if modeling_step is None:
            raise TypeError(
                "modeling_step must be a (model_class, kwargs) pair, got None")
        self._mdl, self.model_args = modeling_step  # model class and its kwargs
        # model instance
        self.model = self._mdl(**self.model_args)

    def to_pickle(self, filepath=None):
        """Pickle the whole pipe object to ``self.filepath``.

        ``filepath`` is handed to ``update_filepath`` first, so passing
        nothing derives a fresh name from ``self.task``.
        """
        self.update_filepath(filepath)

        with open(self.filepath, 'wb') as f:
            pickle.dump(self, f)

    def update_filepath(self, path=None):
        """Set ``self.filepath``.

        Parameters
        ----------
        path : str, dict or None
            * non-empty, non-dict value: used verbatim as the file path;
            * non-empty dict: treated as a task description from which the
              file name is generated;
            * falsy value (``None``, ``''``, ``{}``): the file name is
              generated from ``self.task``.

        Raises
        ------
        AttributeError
            If a falsy ``path`` is given before ``self.task`` has been set.
        KeyError
            If the task lacks one of the keys 'feat_col', 'feat', 'SNR',
            'machine' or 'ID'.
        """
        # explicit path given: take it as is
        if path and not isinstance(path, dict):
            self.filepath = path
            return

        # otherwise build the name from the given task or the stored one
        task = path if path else self.task
        self.filepath = '.\\pipes\\' + '_'.join([
            task['feat_col'],
            ''.join([str(i) for i in task['feat'].values()]),
            task['SNR'],
            task['machine'],
            'ID' + task['ID'],
            # timestamp of the moment the name is generated
            datetime.now().strftime("%Y%m%d_%H%M%S")
        ]) + '.pkl'

    def get_data(self, task):
        """Load the train and test data for ``task`` and derive the ground truth.

        ``task`` is expanded as keyword arguments into ``load_data``, once
        with ``train_set=True`` and once with ``train_set=False``. The
        returned dataframes are kept as ``self.df_train`` / ``self.df_test``.
        ``self.ground_truth`` maps ``df_test.abnormal == 0`` to ``1`` and
        everything else to ``-1``. Also regenerates ``self.filepath``.

        Returns
        -------
        (data_train, data_test) as returned by ``load_data``.
        """
        # NOTE(review): purpose of this pause is not visible here; presumably
        # it staggers concurrent workers — confirm before removing.
        time.sleep(.5)
        self.df_train, data_train = load_data(train_set=True, **task)
        self.df_test, data_test = load_data(train_set=False, **task)
        self.ground_truth = self.df_test.abnormal.apply(lambda x: 1 if x == 0 else -1)

        # update filepath accordingly to task
        self.update_filepath(task)

        return data_train, data_test

    def preprocess(self, data_train, data_test):
        """Run both data sets through all preprocessing steps, in order.

        Every step is fitted on the train data only (``fit_transform``) and
        then applied to the test data (``transform``).

        Returns
        -------
        (data_train, data_test) after the last step.
        """
        # run through all the preprocessing steps
        for step in self.preproc_steps:
            data_train = step.fit_transform(data_train)
            data_test = step.transform(data_test)

        # return preprocessed data
        return data_train, data_test

    def fit_model(self, data_train):
        """Fit ``self.model`` on ``data_train``."""
        self.model.fit(data_train)

    def evaluate(self, data_test, ground_truth):
        """Score ``data_test`` and store the results.

        Writes the model's scores and labels into the ``pred_scores`` and
        ``pred_labels`` columns of ``self.df_test`` and the ROC AUC against
        ``ground_truth`` into ``self.roc_auc``.
        """
        self.df_test['pred_scores'] = self.model.predict_score(data_test)
        self.df_test['pred_labels'] = self.model.predict(data_test)
        self.roc_auc = self.model.eval_roc_auc(data_test, ground_truth)

    def run_pipe(self, task):
        """Execute the full pipeline for ``task`` and pickle the result.

        Steps: load data, preprocess, fit, evaluate, save. Progress is
        printed after each step.

        Returns
        -------
        True once all steps have completed.
        """
        self.task = task
        # get the data
        print('...loading data')
        data_train, data_test = self.get_data(task)
        print('data loading completed\n\n...preprocessing data')

        # preprocessing
        data_train, data_test = self.preprocess(data_train, data_test)
        print('data preprocessing finished\n\n...fitting the model')

        # fitting the model
        self.fit_model(data_train)
        print('model fitted successfully\n\n...evaluating model')

        # evaluating over ground truth
        self.evaluate(data_test, self.ground_truth)
        print('evaluation successfull, roc_auc:', self.roc_auc)

        # saving to pickle
        self.to_pickle()
        print('pipe saved to pickle')
        return True

importing numpy on engine(s)
importing scipy on engine(s)
importing pandas on engine(s)
importing matplotlib.pyplot on engine(s)
importing seaborn on engine(s)
importing pickle on engine(s)
importing tqdm from tqdm on engine(s)
importing cycler from cycler on engine(s)
importing librosa on engine(s)
importing librosa.display on engine(s)
importing os on engine(s)
importing sys on engine(s)
load feature_extractor_mother
importing Enum from enum on engine(s)
load feature_extractor_mel_spectra
load feature_extractor_psd
load feature_extractore_pre_nnFilterDenoise
importing calinski_harabasz_score,davies_bouldin_score from sklearn.metrics on engine(s)
importing train_test_split from sklearn.model_selection on engine(s)
importing argrelextrema from scipy.signal on engine(s)
importing EllipticEnvelope from sklearn.covariance on engine(s)
importing IsolationForest from sklearn.ensemble on engine(s)
importing OneClassSVM from sklearn.svm on engine(s)
importing roc_auc_score from sklearn.metric

In [4]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FastICA

# Machine IDs and machine types spanning the experiment grid.
IDs = ['00', '02', '04', '06']
machines = ['pump', 'slider', 'fan', 'valve']

# Preprocessing chain as (class, constructor-kwargs) pairs, applied in order.
preprocessing = [
    (FastICA, {'n_components': 40, 'algorithm': 'parallel'}),
    (StandardScaler, {}),
]

# Predictive model as a (class, constructor-kwargs) pair.
modeling = (uni_EllipticEnvelope, {'random_state': 42})

# One descriptive-dataframe pickle per machine type, same order as `machines`.
paths = [
    BASE_FOLDER + 'dataset/MEL_to_Pandas/data_6dB_{}/FEpandas_MELv1_nm80_ch0.pkl'.format(machine_name)
    for machine_name in machines
]

# One task per (ID, machine) combination; the ID varies slowest.
tasks = [
    {
        'path_descr': descr_path,
        'feat': {'function': 'frame', 'frames': 5},
        'feat_col': 'MELv1_nm80_ch0',
        'SNR': '6dB',
        'machine': machine_name,
        'ID': machine_id,
        'BASE_FOLDER': BASE_FOLDER,
    }
    for machine_id in IDs
    for machine_name, descr_path in zip(machines, paths)
]

# One independent pipeline instance per task.
pipes = [Pipe(preprocessing, modeling) for _ in tasks]

# Worker entry point: every call builds its own pipeline for a single task.
def build_run_pipe(task, preprocessing, modeling):
    """Create a ``Pipe`` from the given steps and run it on ``task``.

    Returns whatever ``Pipe.run_pipe`` returns (``True`` on completion).
    """
    return Pipe(preprocessing, modeling).run_pipe(task)

importing StandardScaler from sklearn.preprocessing on engine(s)
importing PCA,FastICA from sklearn.decomposition on engine(s)


In [None]:
# Comparison of three outlier detectors on one data set `X`, plotted side by
# side in a 1x3 grid.
# NOTE(review): this cell was never executed (no execution count) and relies on
# names that are not defined in the visible cells: `svm`, `rng`, `X`, `stats`,
# `ground_truth`, `xx`, `yy`, `n_outliers`, `matplotlib`. They may come from the
# %run-loaded helper scripts — confirm, otherwise the cell raises NameError.
# Example settings
n_samples = 200
outliers_fraction = 0.25
# NOTE(review): `clusters_separation` is not used anywhere in this cell.
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
# NOTE(review): the dict actually holds three detectors. The visible imports
# bring in `OneClassSVM` directly, not the `svm` module used below.
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "Robust covariance": EllipticEnvelope(contamination=outliers_fraction),
    "Isolation Forest": IsolationForest(max_samples=n_samples,
                                        contamination=outliers_fraction,
                                        random_state=rng)}

for i, (clf_name, clf) in enumerate(classifiers.items()):
    # fit the data and tag outliers
    clf.fit(X)
    scores_pred = clf.decision_function(X)
    # score at the `outliers_fraction` percentile; used below as the contour
    # level that is drawn as the decision boundary
    threshold = stats.scoreatpercentile(scores_pred,
                                        100 * outliers_fraction)
    y_pred = clf.predict(X)
    # number of samples whose predicted label differs from the ground truth
    n_errors = (y_pred != ground_truth).sum()
    # plot the levels lines and the points
    # evaluate the decision function on the flattened (xx, yy) grid and bring
    # it back to the grid's shape
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    subplot = plt.subplot(1, 3, i + 1)
    # filled contours between the minimum score and the threshold
    subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                        cmap=plt.cm.Blues_r)
    # the threshold level itself as a red line
    a = subplot.contour(xx, yy, Z, levels=[threshold],
                        linewidths=2, colors='red')
    # region between the threshold and the maximum score
    subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                        colors='orange')
    # the last `n_outliers` rows of X are plotted as outliers, the rest as inliers
    b = subplot.scatter(X[:-n_outliers, 0], X[:-n_outliers, 1], c='white')
    c = subplot.scatter(X[-n_outliers:, 0], X[-n_outliers:, 1], c='black')
    subplot.axis('tight')
    # NOTE(review): `a.collections` may not exist on recent matplotlib
    # versions — verify against the installed version.
    subplot.legend(
        [a.collections[0], b, c],
        ['learned decision function', 'true inliers', 'true outliers'],
        prop=matplotlib.font_manager.FontProperties(size=11),
        loc='lower right')
    subplot.set_title("%d. %s (errors: %d)" % (i + 1, clf_name, n_errors))
    subplot.set_xlim((-7, 7))
    subplot.set_ylim((-7, 7))
plt.subplots_adjust(0.04, 0.1, 0.96, 0.92, 0.1, 0.26)

plt.show()