In [1]:
# UCL = Upper Control Limit (in this notebook: a high quantile of the training residuals)

### Imports

In [2]:
import os
# Show the CPU count reported by the OS (12 on the recorded run); main() below
# defaults to num_of_cores=11 worker processes.
print(os.cpu_count())

12


In [3]:
# libraries importing
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import math

# additional modules
import sys
import time
from pathlib import Path

from IPython.display import display, HTML
from IPython.display import clear_output

sys.path.append('../utils')

import numpy as np
from tensorflow import keras
from tensorflow.keras import losses
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense, BatchNormalization 
from tensorflow.keras.layers import  Activation, Dropout, TimeDistributed
from tensorflow.keras.layers import LSTM, RepeatVector, Flatten, Lambda
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau


from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression
import scipy
from scipy.signal import medfilt
from sklearn import decomposition
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from itertools import product

import multiprocessing

from tqdm import tqdm

In [4]:
import logging

In [5]:
import basemodels
import anomalyutils
import utils
import modeltasks_with_scores
import modeltasks_rotation_first_with_scores

### Random Seed

In [6]:
# Seed through the project helper with the fixed value 0.
# NOTE(review): presumably this seeds numpy / TensorFlow / `random` for reproducibility —
# confirm in utils.set_random which generators are actually covered.
utils.set_random(0)

### Load Data

In [7]:
# Collect the benchmark files and their names (not used again in the visible cells).
all_files, all_names = utils.get_files_and_names()
# Load the datasets that contain anomalies; main() iterates over these two lists in parallel.
list_of_df, list_of_names = utils.get_anomaly_data_and_names()
# Load the anomaly-free dataset (not used again in the visible cells).
anomaly_free_df, anomaly_free_name = utils.get_anomaly_free_data_and_names()

In [8]:
def driver_func(mmodel, df, n_cores, name_of_model, estimators):
    """Run one bagging ensemble in a process pool and gather every member's output.

    Each ensemble member receives its own column subset of ``df`` (always
    extended with the ``anomaly`` and ``changepoint`` columns) and is executed
    as ``mmodel(subset_df, name_of_model, str(member_index))`` in a worker.

    Parameters
    ----------
    mmodel : callable
        Task function run in the workers, e.g. one of
        ``modeltasks_with_scores.ensemble_autoencoder``, ``ensemble_conv_ae``,
        ``ensemble_lstm``, ``ensemble_lstm_ae`` or ``ensemble_lstm_vae``.
        Whatever it returns must be joinable onto a frame indexed like ``df``.
    df : pandas.DataFrame
        Raw dataset; must contain ``anomaly`` and ``changepoint`` columns.
    n_cores : int
        Number of worker processes.
    name_of_model : str
        Label forwarded unchanged to ``mmodel``.
    estimators : int
        Number of feature subsets requested from
        ``utils.get_bagg_features_random``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``: the joined outputs of all members, with missing
        values replaced by 0.
    """
    with multiprocessing.Pool(n_cores) as pool:
        # The subsets are sampled only after the pool has been created, which
        # keeps the ordering between worker start-up and sampling as it was.
        sampled_subsets = utils.get_bagg_features_random(estimators, True)

        member_columns = []
        for subset in sampled_subsets:
            columns = list(subset)
            # Every member needs the label columns next to its features.
            for label in ("anomaly", "changepoint"):
                if label not in columns:
                    columns.append(label)
            member_columns.append(columns)

        # Slice all member frames first, then submit them in member order.
        member_frames = [df[columns] for columns in member_columns]
        pending = []
        for member_id, member_df in enumerate(member_frames):
            task_args = (member_df, name_of_model, str(member_id))
            pending.append(pool.apply_async(mmodel, task_args))

        # A throw-away column gives the joins a frame carrying df's index.
        collected = pd.Series(0, index=df.index, name="initialize").to_frame()
        for job in pending:
            collected = collected.join(job.get())

        collected = collected.fillna(0).drop(columns=["initialize"])

    return collected

In [9]:
def driver_func_rot(mmodel, df, n_cores, name_of_model, estimators, K, fraction):
    """Run one rotation-based ensemble in a process pool and gather the outputs.

    Every ensemble member gets the full ``df`` plus its own list of feature
    names and is executed as
    ``mmodel(df, feature_names, name_of_model, str(member_index), K, fraction)``
    in a worker.

    Parameters
    ----------
    mmodel : callable
        Task function run in the workers, e.g. one of
        ``modeltasks_rotation_first_with_scores.ensemble_autoencoder``,
        ``ensemble_conv_ae``, ``ensemble_lstm``, ``ensemble_lstm_ae`` or
        ``ensemble_lstm_vae``. Whatever it returns must be joinable onto a
        frame indexed like ``df``.
    df : pandas.DataFrame
        Raw dataset, passed to each worker unchanged.
    n_cores : int
        Number of worker processes.
    name_of_model : str
        Label forwarded unchanged to ``mmodel``.
    estimators : int
        Number of feature subsets requested from
        ``utils.get_bagg_features_random``.
    K, fraction
        Forwarded unchanged to ``mmodel``; their meaning is defined by the
        task function (presumably rotation settings; verify there).

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``: the joined outputs of all members, with missing
        values replaced by 0.
    """
    with multiprocessing.Pool(n_cores) as pool:
        # The subsets are sampled only after the pool has been created, which
        # keeps the ordering between worker start-up and sampling as it was.
        sampled_subsets = utils.get_bagg_features_random(estimators, True)
        member_features = [list(subset) for subset in sampled_subsets]

        pending = []
        for member_id, feature_names in enumerate(member_features):
            task_args = (df, feature_names, name_of_model, str(member_id), K, fraction)
            pending.append(pool.apply_async(mmodel, task_args))

        # A throw-away column gives the joins a frame carrying df's index.
        collected = pd.Series(0, index=df.index, name="initialize").to_frame()
        for job in pending:
            collected = collected.join(job.get())

        collected = collected.fillna(0).drop(columns=["initialize"])

    return collected

### Main Run Loop

In [10]:
%%time
def main(num_of_cores = 11, SAVE_FOLDER = "./my_tmp", n_est=21, K=4, fraction=0.65):
    """Score every benchmark dataset with the basic and ensemble models and save the result.

    For each (DataFrame, name) pair of the module-level ``list_of_df`` /
    ``list_of_names`` (positions 4, 6 and 8 are skipped) the function

    1. runs five bagging ensembles through ``driver_func`` and five rotation
       ensembles through ``driver_func_rot``;
    2. builds the five basic models (autoencoder, conv AE, LSTM, LSTM AE,
       LSTM VAE) from the first 400 rows, scaled with a ``StandardScaler``
       fitted on those rows;
    3. derives an upper control limit (UCL) per basic model as a high quantile
       of its residuals on those 400 rows;
    4. recomputes the residuals on the whole dataset and joins them, the UCL
       columns and the ensemble outputs next to the true ``anomaly`` label.

    The per-dataset frames are concatenated, written to
    ``<SAVE_FOLDER>/df_values_scores.csv`` (without the index) and returned.
    The root logger is reconfigured to write to ``<SAVE_FOLDER>/logs_info.log``.

    Parameters
    ----------
    num_of_cores : int
        Worker processes used for every ensemble run.
    SAVE_FOLDER : str
        Output directory for the CSV and the log file; created if missing.
    n_est : int
        Number of ensemble members (feature subsets) per ensemble.
    K, fraction
        Forwarded to ``driver_func_rot`` only. Inside this function ``K``
        shadows the Keras backend alias imported as ``K`` at module level.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-dataset result frames.
    """
    start_of_the_whole_process = time.perf_counter()

    Path(SAVE_FOLDER).mkdir(parents=True, exist_ok=True)
    # Window length handed to utils.create_sequences for the conv autoencoder.
    X_conv_ae_window = 60
    final_aggr_pd = None
    # NOTE(review): final_aggr_pd_general is never read or updated below.
    final_aggr_pd_general = None
    start_of_loop = time.perf_counter()
    times_of_ds = []
    ESTIMATORS = n_est
    LOGS_SAVE_FOLDER = SAVE_FOLDER
    LOGS_FILENAME = "logs_info.log"
    
    # Drop handlers left on the root logger so that basicConfig below takes
    # effect again on every call (it is a no-op while handlers are present).
    log = logging.getLogger()
    for hdlr in log.handlers[:]:
        log.removeHandler(hdlr)
    
    logging.basicConfig(level=logging.INFO, filename=f"{LOGS_SAVE_FOLDER}/{LOGS_FILENAME}",
                        format="CREATED_AT: %(asctime)s - MESSAGE: %(message)s")
    
    
    logging.info(f"'Params: num_of_cores={num_of_cores}, SAVE_FOLDER={SAVE_FOLDER}, n_est={n_est}, K={K}, fraction={fraction}'")
    for df_index, df_t in enumerate(zip(list_of_df, list_of_names)):
        # NOTE(review): datasets at positions 4, 6 and 8 are skipped; the reason is not visible here.
        if df_index in [4,6,8]:
            continue
        
        start_ds = time.perf_counter()
        df = df_t[0]
        # NOTE(review): `name` is not used in the rest of the loop body.
        name = df_t[1]

        
        # Bagging ensembles (one process pool per call), then the rotation-based ones.
        ensemble_autoencoder_result_general = driver_func(modeltasks_with_scores.ensemble_autoencoder, df, num_of_cores, "ensemble_autoencoder", ESTIMATORS)
        ensemble_conv_ae_result_general = driver_func(modeltasks_with_scores.ensemble_conv_ae, df, num_of_cores, "ensemble_conv_ae", ESTIMATORS)
        ensemble_lstm_result_general = driver_func(modeltasks_with_scores.ensemble_lstm, df, num_of_cores, "ensemble_lstm", ESTIMATORS)
        ensemble_lstm_ae_result_general = driver_func(modeltasks_with_scores.ensemble_lstm_ae, df, num_of_cores, "ensemble_lstm_ae", ESTIMATORS)
        ensemble_lstm_vae_result_general = driver_func(modeltasks_with_scores.ensemble_lstm_vae, df, num_of_cores, "ensemble_lstm_vae", ESTIMATORS)
        ensemble_autoencoder_result_rot_general = driver_func_rot(modeltasks_rotation_first_with_scores.ensemble_autoencoder, df, num_of_cores, "ensemble_rotation_autoencoder", ESTIMATORS, K, fraction)
        ensemble_conv_ae_result_rot_general = driver_func_rot(modeltasks_rotation_first_with_scores.ensemble_conv_ae, df, num_of_cores, "ensemble_rotation_conv_ae", ESTIMATORS, K, fraction)
        ensemble_lstm_result_rot_general = driver_func_rot(modeltasks_rotation_first_with_scores.ensemble_lstm, df, num_of_cores, "ensemble_rotation_lstm", ESTIMATORS, K, fraction)
        ensemble_lstm_ae_result_rot_general = driver_func_rot(modeltasks_rotation_first_with_scores.ensemble_lstm_ae, df, num_of_cores, "ensemble_rotation_lstm_ae", ESTIMATORS, K, fraction)
        ensemble_lstm_vae_result_rot_general = driver_func_rot(modeltasks_rotation_first_with_scores.ensemble_lstm_vae, df, num_of_cores, "ensemble_rotation_lstm_vae", ESTIMATORS, K, fraction)
                                      
        # Use the label column only as an index-carrying base for the joins; it is dropped again below.
        general_joined = pd.DataFrame(pd.Series(data=df["anomaly"].astype(int), index=df.index))
        general_joined = general_joined.join(ensemble_autoencoder_result_general)
        general_joined = general_joined.join(ensemble_conv_ae_result_general)
        general_joined = general_joined.join(ensemble_lstm_result_general)
        general_joined = general_joined.join(ensemble_lstm_ae_result_general)
        general_joined = general_joined.join(ensemble_lstm_vae_result_general)
        general_joined = general_joined.join(ensemble_autoencoder_result_rot_general)
        general_joined = general_joined.join(ensemble_conv_ae_result_rot_general)
        general_joined = general_joined.join(ensemble_lstm_result_rot_general)
        general_joined = general_joined.join(ensemble_lstm_ae_result_rot_general)
        general_joined = general_joined.join(ensemble_lstm_vae_result_rot_general)

        general_joined = general_joined.drop(['anomaly'], axis=1)

        # Training slice: the first 400 rows without the label columns
        # (presumably an anomaly-free start of each dataset — TODO confirm).
        X_train = df[:400].drop(['anomaly', 'changepoint'], axis=1)
        
        # Fit the scaler on the training slice only; it is reused for the full dataset below.
        StSc = StandardScaler()
        StSc.fit(X_train)

        # Per-model inputs built from the scaled training slice (window / step sizes: 60, 5, 10, 5).
        X_ae = StSc.transform(X_train)
        X_conv_ae = utils.create_sequences(StSc.transform(X_train), X_conv_ae_window)
        X_lstm, y_lstm = utils.split_sequences(StSc.transform(X_train), 5)
        X_lstm_ae = utils.create_sequences(StSc.transform(X_train), 10)
        X_lstm_vae = utils.create_sequences(StSc.transform(X_train), 5)

        # Build the basic models (the original note says these helpers also fit them).
        
        autoencoder_model = basemodels.autoencoder(X_ae)
        conv_ae_model = basemodels.conv_ae(X_conv_ae)
        lstm_model = basemodels.lstm(X_lstm, y_lstm)
        lstm_ae_model = basemodels.lstm_ae(X_lstm_ae)
        lstm_vae_model = basemodels.lstm_vae(X_lstm_vae)

        # NOTE(review): only the LSTM reloads weights, from a path relative to the working
        # directory; presumably a checkpoint written by basemodels.lstm — confirm.
        lstm_model.load_weights("lstm.h5")
        
        # Residuals on the training slice; they define the control limits.
        predictions_ae = anomalyutils.get_ae_predicts(autoencoder_model, X_ae)
        predictions_conv_ae = anomalyutils.get_conv_ae_predicts(conv_ae_model, X_conv_ae)
        predictions_lstm = anomalyutils.get_lstm_predicts(lstm_model, X_lstm)
        predictions_lstm_ae = anomalyutils.get_lstm_ae_predicts(lstm_ae_model, X_lstm_ae)
        predictions_lstm_vae = anomalyutils.get_lstm_vae_predicts(lstm_vae_model, X_lstm_vae)

        residuals_autoencoder = anomalyutils.get_ae_residuals(X_ae, predictions_ae)
        residuals_conv_ae = anomalyutils.get_conv_ae_residuals(X_conv_ae, predictions_conv_ae)
        residuals_lstm = anomalyutils.get_lstm_residuals(y_lstm, predictions_lstm)
        residuals_lstm_ae = anomalyutils.get_lstm_ae_residuals(X_lstm_ae, predictions_lstm_ae)
        residuals_lstm_vae = anomalyutils.get_lstm_vae_residuals(X_lstm_vae, predictions_lstm_vae)

        # UCL = high quantile of the training residuals (0.999 for conv AE and LSTM VAE, 0.99 otherwise).
        UCL_autoencoder = residuals_autoencoder.quantile(0.99)
        UCL_conv_ae = residuals_conv_ae.quantile(0.999)
        UCL_lstm = residuals_lstm.quantile(0.99)
        UCL_lstm_ae = residuals_lstm_ae.quantile(0.99)
        UCL_lstm_vae = residuals_lstm_vae.quantile(0.999)

        # Same preprocessing for the whole dataset; the training-slice variables are overwritten from here on.
        X_all = df.drop(['anomaly','changepoint'], axis=1)
        X_ae = StSc.transform(X_all)
        X_conv_ae = utils.create_sequences(StSc.transform(X_all), X_conv_ae_window)
        X_lstm, y_lstm = utils.split_sequences(StSc.transform(X_all), 5)
        X_lstm_ae = utils.create_sequences(StSc.transform(X_all), 10)
        X_lstm_vae = utils.create_sequences(StSc.transform(X_all), 5)
        
        predictions_ae = anomalyutils.get_ae_predicts(autoencoder_model, X_ae)
        predictions_conv_ae = anomalyutils.get_conv_ae_predicts(conv_ae_model, X_conv_ae)
        predictions_lstm = anomalyutils.get_lstm_predicts(lstm_model, X_lstm)
        predictions_lstm_ae = anomalyutils.get_lstm_ae_predicts(lstm_ae_model, X_lstm_ae)
        predictions_lstm_vae = anomalyutils.get_lstm_vae_predicts(lstm_vae_model, X_lstm_vae)

        residuals_autoencoder = anomalyutils.get_ae_residuals(X_ae, predictions_ae)
        residuals_conv_ae = anomalyutils.get_conv_ae_residuals(X_conv_ae, predictions_conv_ae)
        residuals_lstm = anomalyutils.get_lstm_residuals(y_lstm, predictions_lstm)
        residuals_lstm_ae = anomalyutils.get_lstm_ae_residuals(X_lstm_ae, predictions_lstm_ae)
        residuals_lstm_vae = anomalyutils.get_lstm_vae_residuals(X_lstm_vae, predictions_lstm_vae)


        # Autoencoder: one residual per row; the stored limit is 1.5 x the quantile.
        prediction_labels_autoencoder = pd.DataFrame(pd.Series(residuals_autoencoder.values, index=df.index).fillna(0)).rename(columns={0:f"anomaly_by_autoencoder_basic_score"})
        prediction_labels_autoencoder[f"anomaly_by_autoencoder_basic_ucl"] = 3/2*UCL_autoencoder
        
        # Window-based model: the project helper turns window residuals into score / UCL columns.
        # NOTE(review): the literal 60 duplicates X_conv_ae_window.
        prediction_labels_conv_ae = utils.get_actual_scores_for_windows(residuals_conv_ae, df, X_conv_ae, 60, UCL_conv_ae, "anomaly_by_conv_ae_basic_score", "anomaly_by_conv_ae_basic_ucl")
        
        # LSTM residuals are aligned to rows 5 onward; the first five rows get a score of 0.
        prediction_labels_lstm = pd.DataFrame(pd.Series(residuals_lstm.values, index=df[5:].index).fillna(0)).rename(columns={0:f"anomaly_by_lstm_basic_score"})
        df_to_append = pd.DataFrame(pd.Series(0, index=df[:5].index).fillna(0)).rename(columns={0:f"anomaly_by_lstm_basic_score"})
        prediction_labels_lstm = pd.concat([df_to_append, prediction_labels_lstm], ignore_index=False)
        prediction_labels_lstm[f"anomaly_by_lstm_basic_ucl"] = 3/2*UCL_lstm

        prediction_labels_lstm_ae = utils.get_actual_scores_for_windows(residuals_lstm_ae, df, X_lstm_ae, 10, UCL_lstm_ae, "anomaly_by_lstm_ae_basic_score", "anomaly_by_lstm_ae_basic_ucl")
        prediction_labels_lstm_vae = utils.get_actual_scores_for_windows(residuals_lstm_vae, df, X_lstm_vae, 5, UCL_lstm_vae, "anomaly_by_lstm_vae_basic_score", "anomaly_by_lstm_vae_basic_ucl")

        # Final per-dataset frame: true label, basic-model columns, then all ensemble columns.
        voting_prediction = pd.DataFrame(pd.Series(data=df.anomaly.astype(int), index=df.index))
        voting_prediction = voting_prediction.join(prediction_labels_autoencoder)
        voting_prediction = voting_prediction.join(prediction_labels_conv_ae)
        voting_prediction = voting_prediction.join(prediction_labels_lstm)
        voting_prediction = voting_prediction.join(prediction_labels_lstm_ae)
        voting_prediction = voting_prediction.join(prediction_labels_lstm_vae)
        voting_prediction = voting_prediction.join(general_joined)
        
        
        # Stack this dataset under the previous ones (on the first pass final_aggr_pd is still None).
        final_aggr_pd = pd.concat([final_aggr_pd, voting_prediction], ignore_index=False)

        end_ds = time.perf_counter()

        total_time_in_seconds = end_ds-start_ds
        total_time_in_minutes = total_time_in_seconds/60
        total_time_in_hours = total_time_in_minutes/60
        print(f"SECONDS: {total_time_in_seconds} - MINUTES: {total_time_in_minutes} - HOURS: {total_time_in_hours}")

        times_of_ds.append(total_time_in_seconds)
    end_of_loop = time.perf_counter()

    # Per-dataset durations in seconds, followed by the duration of the whole loop.
    for i in range(0,len(times_of_ds)):
        print(f"{i}){times_of_ds[i]}")
    print(end_of_loop-start_of_loop)

    end_of_the_whole_process = time.perf_counter()
    final_of_the_whole_process = end_of_the_whole_process - start_of_the_whole_process
    logging.info(f"'Params: num_of_cores={num_of_cores}, SAVE_FOLDER={SAVE_FOLDER}, n_est={n_est}, K={K}, fraction={fraction}, time={final_of_the_whole_process}'")
    
    # NOTE(review): index=False drops the row index from the CSV, and this line raises
    # AttributeError if no dataset was processed (final_aggr_pd would still be None).
    final_aggr_pd.to_csv(f"{SAVE_FOLDER}/df_values_scores.csv", index=False, header=True)
    
    return final_aggr_pd

Wall time: 0 ns


### Summary

In [11]:
%%time
if __name__ == '__main__':
    # Repeat the same configuration (n_est=17, K=2, fraction=0.75) 13 times;
    # each run writes into its own folder, numbered from 03 upward.
    fold_count = 2
    t_param_2 = 2
    for t_param_1, t_param_3 in product([17] * 13, [0.75]):
        fold_count += 1
        SAVE_FOLDER = f"./fastfolder_{str(fold_count).rjust(2, '0')}"
        final_aggr_pd = main(
            num_of_cores=11,
            SAVE_FOLDER=SAVE_FOLDER,
            n_est=t_param_1,
            K=t_param_2,
            fraction=t_param_3,
        )

        print(f"fold_count={fold_count}, t_param_1={t_param_1}, t_param_2={t_param_2}, t_param_3={t_param_3}")

SECONDS: 780.0699885 - MINUTES: 13.001166475 - HOURS: 0.21668610791666668
SECONDS: 787.4243123 - MINUTES: 13.123738538333333 - HOURS: 0.21872897563888888
SECONDS: 779.1677313999999 - MINUTES: 12.986128856666664 - HOURS: 0.2164354809444444
SECONDS: 746.6323614999997 - MINUTES: 12.44387269166666 - HOURS: 0.20739787819444436
SECONDS: 730.757611 - MINUTES: 12.179293516666666 - HOURS: 0.20298822527777777
SECONDS: 782.5747178000001 - MINUTES: 13.042911963333335 - HOURS: 0.2173818660555556
SECONDS: 729.6335739999995 - MINUTES: 12.160559566666658 - HOURS: 0.20267599277777762
SECONDS: 737.5999901000005 - MINUTES: 12.29333316833334 - HOURS: 0.204888886138889
SECONDS: 740.0687921999997 - MINUTES: 12.334479869999996 - HOURS: 0.20557466449999992
SECONDS: 740.4993306999995 - MINUTES: 12.341655511666659 - HOURS: 0.20569425852777765
SECONDS: 811.7987354999996 - MINUTES: 13.529978924999993 - HOURS: 0.2254996487499999
SECONDS: 791.2414211000014 - MINUTES: 13.187357018333357 - HOURS: 0.21978928363888928


KeyboardInterrupt: 