In [1]:
import report_utils
import pandas as pd
import numpy as np
import os
from sklearn.metrics import roc_auc_score, f1_score
import time
from tqdm import tqdm
import ready_pipelines
import tensorflow as tf
import random 
import multiprocessing
import logging
from pathlib import Path
from IPython.display import clear_output
import itertools

In [2]:
def get_pn(y_actual, y_hat):
    """Count the confusion-matrix cells for binary labels.

    Parameters
    ----------
    y_actual : sequence
        Ground-truth labels, where 1 marks the positive class and 0 the
        negative class.
    y_hat : sequence
        Predicted labels, paired with ``y_actual`` by position.

    Returns
    -------
    dict
        ``{"TN": int, "FP": int, "FN": int, "TP": int}``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Materialise both inputs as plain lists so elements are paired strictly
    # by position. ``series[i]`` on a pandas Series is a label lookup, which
    # raises KeyError (or pairs the wrong rows) when the index is not 0..n-1.
    actual_values = list(y_actual)
    predicted_values = list(y_hat)
    if len(actual_values) != len(predicted_values):
        raise ValueError(
            f"y_actual and y_hat must have the same length, "
            f"got {len(actual_values)} and {len(predicted_values)}"
        )

    TP = FP = TN = FN = 0
    for actual, predicted in zip(actual_values, predicted_values):
        if predicted == 1:
            # Predicted positive: correct only when the truth is also 1.
            if actual == 1:
                TP += 1
            else:
                FP += 1
        elif predicted == 0:
            # Predicted negative: correct only when the truth is also 0.
            if actual == 0:
                TN += 1
            else:
                FN += 1
        # Predictions other than 0/1 are not counted in any cell.

    return {"TN": TN, "FP": FP, "FN": FN, "TP": TP}

In [3]:
def set_random(seed_value):
    """Seed every pseudo-random source this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    built-in ``random`` module, NumPy's global generator and TensorFlow's
    global generator with the same ``seed_value``.
    """
    # Environment variables hold text, hence the str() conversion.
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # affects interpreters launched after this call rather than the hashing
    # of the already-running process.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # Built-in, NumPy and TensorFlow generators, seeded in that order.
    for seed_generator in (random.seed, np.random.seed, tf.random.set_seed):
        seed_generator(seed_value)

The 14 waveforms (features) reported in the dataset, in order, are listed below (source: M.I. Radaideh, C. Pappas and S. Cousineau, Data in Brief 43 (2022) 108473):
1. A+ IGBT-I: The current passing through the IGBT switch of phase A+ in Qa1 in Fig. 2 (unit: A).
2. A+∗ IGBT-I: The current passing through the IGBT switch of phase A+∗ in Qa3 in Fig. 2 (unit: A).
3. B+ IGBT-I: The current passing through the IGBT switch of phase B+ in Qb1 in Fig. 2 (unit: A).
4. B+∗ IGBT-I: The current passing through the IGBT switch of phase B+∗ in Qb3 in Fig. 2 (unit: A).
5. C+ IGBT-I: The current passing through the IGBT switch of phase C+ in Qc1 in Fig. 2 (unit: A).
6. C+∗ IGBT-I: The current passing through the IGBT switch of phase C+∗ in Qc3 in Fig. 2 (unit: A).
7. A-Flux: Magnetic flux density for phase A in transformer XA in Fig. 2 (unit: -).
8. B-Flux: Magnetic flux density for phase B in transformer XB in Fig. 2 (unit: -).
9. C-Flux: Magnetic flux density for phase C in transformer XC in Fig. 2 (unit: -).
10. Mod-V: Modulator voltage (unit: V).
11. Mod-I: Modulator current (unit: A).
12. CB-I: Cap bank current (unit: -).
13. CB-V: Cap bank voltage (unit: V).
14. DV/DT: Time derivative change of the Mod-V voltage (unit: -).

In [4]:
def create_dfs(X, Y):
    """Flatten per-pulse waveform arrays into one long frame plus a label frame.

    Parameters
    ----------
    X : iterable of 2-D array-like
        One entry per pulse. Each entry must have 14 columns, ordered as in
        ``original_features`` below.
    Y : iterable of 3-item records
        One ``(name, status, reason)`` record per pulse, paired with ``X`` by
        position. ``status`` must be ``"Fault"`` or ``"Run"``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df_all``: the rows of every pulse stacked in order, with the 14
        feature columns plus an ``anomaly`` column (1 for "Fault" pulses,
        0 for "Run" pulses) and a fresh 0..n-1 index.
        ``df_Y``: one row per pulse with columns ``name``, ``status``,
        ``reason`` and ``anomaly``.

    Raises
    ------
    ValueError
        If a pulse has a status other than "Fault" or "Run".
    """
    original_features = ["A_IGBT_I", "A_starIGBT_I", "B_IGBT_I", "B_starIGBT_I",
                         "C_IGBT_I", "C_starIGBT_I", "A_Flux", "B_Flux", "C_Flux",
                         "Mod_V", "Mod_I", "CB_I", "CB_V", "DV_DT"]
    status_to_anomaly = {"Fault": 1, "Run": 0}

    df_list = []
    for pulse_index, (i_x, i_y) in enumerate(zip(X, Y)):
        status = i_y[1]
        if status not in status_to_anomaly:
            # Fail loudly and say which pulse carries the unexpected label.
            raise ValueError(
                f"Unknown pulse status {status!r} at pulse {pulse_index}; "
                f"expected 'Fault' or 'Run'."
            )
        tmp_df = pd.DataFrame(i_x, columns=original_features)
        # The pulse-level label is broadcast to every sample of the pulse.
        tmp_df["anomaly"] = status_to_anomaly[status]
        df_list.append(tmp_df)

    df_all = pd.concat(df_list, axis=0, ignore_index=True)
    df_Y = pd.DataFrame(Y, columns=["name", "status", "reason"])
    # Vectorised equivalent of a row-wise "1 if status == 'Fault' else 0".
    df_Y["anomaly"] = (df_Y["status"] == "Fault").astype("int64")
    return df_all, df_Y

In [5]:
# Folder holding the dataset and the names of the four systems stored in it.
path_to_new_data = "./data/Real_Electronic_Signal_Data_from_Particle_Accelerator_Power_Systems"
system_1, system_2, system_3, system_4 = "CCL", "DTL", "RFQ", "SCL"

# Loading pattern for any one system (only RFQ is loaded below; CCL, DTL and
# SCL follow the same three steps with system_1, system_2 and system_4):
#   X = np.load(f'{path_to_new_data}/{system}.npy')
#       ---> X array has shape: (pulses, times, features)
#   Y = np.load(f'{path_to_new_data}/{system}_labels.npy', allow_pickle=True)
#       ---> Y array has shape: (pulses, labels) --> labels are: index, state, type
#   time_steps = np.arange(X.shape[1]) * 400e-9
#       ---> time axis based on the 400 ns sampling rate (for plotting purposes)
# The 14 feature names, in dataset order, are listed inside create_dfs.

# SECURITY: allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only load label files that come from a trusted source.
RFQ_X = np.load(f'{path_to_new_data}/{system_3}.npy')
RFQ_Y = np.load(f'{path_to_new_data}/{system_3}_labels.npy', allow_pickle=True)
RFQ_df, RFQ_df_Y = create_dfs(RFQ_X, RFQ_Y)

# Keep only the first 4500*20 and the last 4500*5 sample rows, together with
# the first 20 and last 5 label rows.
# NOTE(review): presumably 4500 is the number of samples per pulse, so both
# frames keep the same 25 pulses -- confirm against RFQ_X.shape[1].
RFQ_df = pd.concat([RFQ_df.iloc[:4500 * 20], RFQ_df.iloc[-4500 * 5:]],
                   axis=0, ignore_index=True)
RFQ_df_Y = pd.concat([RFQ_df_Y.iloc[:20], RFQ_df_Y.iloc[-5:]],
                     axis=0, ignore_index=True)

In [6]:
def driver_func(runnable_pipe_func, SAVE_FOLDER, name_of_model, nr_subsets, nr_percentage, window_size, ucl_percentile, ucl_multiplier, family_descr, N_ESTIMATORS, N_CORES, list_of_df, random_seeds):
    """Run ``runnable_pipe_func`` once per estimator in a process pool.

    One task is submitted for each ``(task index, seed)`` pair produced by
    ``zip(range(N_ESTIMATORS), random_seeds)``, so the number of tasks is the
    smaller of ``N_ESTIMATORS`` and ``len(random_seeds)``. Each task receives
    ``(SAVE_FOLDER, name_of_model, nr_subsets, nr_percentage, window_size,
    ucl_percentile, ucl_multiplier, str(family_descr), task name,
    list_of_df, seed)``, where the task name is ``task_model_NNN``.

    Parameters
    ----------
    runnable_pipe_func : callable
        Function executed in the worker processes; it must be picklable.
    N_CORES : int
        Number of worker processes in the pool.
    (remaining parameters are forwarded to ``runnable_pipe_func`` unchanged,
    except ``family_descr`` which is converted with ``str``.)

    Returns
    -------
    list
        The return value of every task, in submission order.

    Raises
    ------
    Exception
        Whatever a worker raised; ``AsyncResult.get()`` re-raises it here.
    """
    with multiprocessing.Pool(N_CORES) as pool:
        # Submit everything up front so the pool can work in parallel while
        # the results are collected in order below.
        pending = []
        for task_ind, seed in zip(range(N_ESTIMATORS), random_seeds):
            task_name = f"task_model_{str(task_ind).rjust(3,'0')}"
            async_result = pool.apply_async(
                runnable_pipe_func,
                (SAVE_FOLDER, name_of_model, nr_subsets, nr_percentage,
                 window_size, ucl_percentile, ucl_multiplier,
                 str(family_descr), task_name, list_of_df, seed),
            )
            pending.append((task_name, async_result))

        all_seeds = []
        progress = tqdm(pending)
        for task_name, async_result in progress:
            # Refresh the label for each task; a description passed to the
            # tqdm constructor is fixed at creation time and would always
            # show the first task's name.
            progress.set_description(task_name)
            all_seeds.append(async_result.get())

    return all_seeds

In [7]:
def get_predictd_df_Y(df, check_col, pulse_length=4500):
    """Collapse per-sample flags into one flag per fixed-length pulse.

    The rows of ``df`` are split into consecutive chunks of ``pulse_length``
    rows (the last chunk may be shorter). A chunk is flagged 1 when any value
    of ``check_col`` in it is truthy, otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the per-sample column ``check_col``.
    check_col : str
        Name of the column to aggregate; also the name of the output column.
    pulse_length : int, optional
        Number of rows per pulse. Defaults to 4500, the value this notebook
        uses everywhere else.

    Returns
    -------
    pandas.DataFrame
        One row per chunk with a single column ``check_col`` of 0/1 values.

    Raises
    ------
    ValueError
        If ``pulse_length`` is not positive.
    """
    if pulse_length <= 0:
        raise ValueError(f"pulse_length must be a positive integer, got {pulse_length!r}")

    # Convert once instead of once per chunk; the result is the same.
    flags = df[check_col].astype(bool)
    data_Y_list = [
        (int(flags.iloc[start:start + pulse_length].any()),)
        for start in range(0, len(df), pulse_length)
    ]
    return pd.DataFrame(data_Y_list, columns=[check_col])

In [8]:
# Waveform frames passed on to the pipeline. Only RFQ is active; CCL_df,
# DTL_df and SCL_df can be listed here once their loading code is enabled.
list_of_df = [RFQ_df]

# Per-pulse label frames for the same systems (CCL_df_Y, DTL_df_Y and
# SCL_df_Y are the counterparts of the disabled ones). custom_run scores its
# predictions against the first entry of this list.
list_of_df_Y = [RFQ_df_Y]

In [9]:
# Reset the root logger before configuring it: logging.basicConfig() does
# nothing when the root logger already has handlers, so handlers left over
# from an earlier run of this cell must be detached first.
log = logging.getLogger()
for hdlr in log.handlers[:]:  # iterate over a copy; the list is mutated below
    log.removeHandler(hdlr)
    # Close as well as detach, otherwise every re-run of this cell leaves the
    # previous log file handle open.
    hdlr.close()

# Send INFO-and-above records to ./logs_info.log with a timestamped format.
logging.basicConfig(level=logging.INFO, filename="./logs_info.log",
                    format="CREATED_AT: %(asctime)s - MESSAGE: %(message)s")

In [10]:
def custom_run(N_ESTIMATORS, tmp_seed_number, SAVE_FOLDER, N_CORES,
               name_of_model, nr_subsets, nr_percentage, window_size,
               ucl_percentile, ucl_multiplier):
    """Run one model family, majority-vote its output and log pulse-level scores.

    Steps, in order:
      1. Seed all generators with ``tmp_seed_number`` and create ``SAVE_FOLDER``.
      2. Draw ``N_ESTIMATORS`` distinct per-estimator seeds from ``range(9999)``.
      3. Run ``ready_pipelines.run_pipeline_custom_electr`` through
         ``driver_func`` on the module-level ``list_of_df``.
      4. Read the majority-voted predictions back with
         ``report_utils.family_majority_voting`` and reduce them to one flag
         per pulse with ``get_predictd_df_Y``.
      5. Compare against the ``anomaly`` column of ``list_of_df_Y[0]`` and
         log ROC AUC, F1 and the confusion-matrix counts.

    The function returns ``None``; its results are the files written under
    ``SAVE_FOLDER`` and the log records.

    Notes
    -----
    * Depends on the module-level ``list_of_df`` and ``list_of_df_Y``.
    * The family name looked up in step 4 always starts with
      ``family_conv_ae_``, regardless of ``name_of_model``.
    """
    prediction_col = "predicted_anomaly_majority_voting"

    # Seed before anything random happens: the per-estimator seeds drawn
    # below come from NumPy's global generator.
    set_random(tmp_seed_number)
    Path(SAVE_FOLDER).mkdir(parents=True, exist_ok=True)

    random_seeds = np.random.choice(9999, N_ESTIMATORS, replace=False).tolist()
    log_var_tmp = dict(
        N_ESTIMATORS=N_ESTIMATORS,
        tmp_seed_number=tmp_seed_number,
        SAVE_FOLDER=SAVE_FOLDER,
        N_CORES=N_CORES,
        name_of_model=name_of_model,
        nr_subsets=nr_subsets,
        nr_percentage=nr_percentage,
        window_size=window_size,
        ucl_percentile=ucl_percentile,
        ucl_multiplier=ucl_multiplier,
    )
    logging.info(f"'Params: general_seed: {str(tmp_seed_number)} other_seeds: [{str(random_seeds)}]' All params: [{log_var_tmp}]")

    # The returned per-task values are not used here; the predictions are
    # read back from SAVE_FOLDER below.
    driver_func(ready_pipelines.run_pipeline_custom_electr, SAVE_FOLDER,
                name_of_model, nr_subsets, nr_percentage, window_size,
                ucl_percentile, ucl_multiplier, str(tmp_seed_number),
                N_ESTIMATORS, N_CORES, list_of_df, random_seeds)

    # Fixed pause before the results are read back.
    # NOTE(review): presumably gives the workers' output time to land on
    # disk -- confirm whether it is still needed.
    time.sleep(10)

    voted = report_utils.family_majority_voting(top_level_folder=SAVE_FOLDER, model_family=f"family_conv_ae_{str(tmp_seed_number)}")
    df_predicted = get_predictd_df_Y(voted[[prediction_col]], prediction_col)
    df_original = list_of_df_Y[0][["anomaly"]]

    # Side-by-side frame: one row per pulse, predicted flag next to the label.
    final_df = pd.concat([df_predicted, df_original], ignore_index=False, axis=1)
    y_true = final_df["anomaly"]
    y_pred = final_df[prediction_col]
    roc_number = roc_auc_score(y_true, y_pred)
    F1 = f1_score(y_true, y_pred)
    tpfptnfn = get_pn(y_true, y_pred)

    clear_output(wait=True)

    logging.info(f"'AUC: {roc_number} F1: {F1}")
    logging.info(f"Measures: {[tpfptnfn]}")

In [11]:
def custom_grid_search(grid_values, **kwargs):
    """Call ``custom_run`` once for every combination of the grid values.

    Parameters
    ----------
    grid_values : dict or None
        Maps a parameter name to the list of values to try. Every combination
        (Cartesian product, in the dict's key order) triggers one
        ``custom_run`` call. ``None`` or an empty dict triggers a single run
        with the fixed parameters only. Keys that are not one of the
        parameter names listed below are ignored.
    **kwargs
        Fixed values for parameters that are not swept. Recognised names and
        their defaults: ``N_ESTIMATORS`` (1), ``tmp_seed_number`` (0),
        ``SAVE_FOLDER`` (``"./tmp_res_<N_ESTIMATORS>"``), ``N_CORES`` (1),
        ``name_of_model`` ("Nothing"), ``nr_subsets`` (2),
        ``nr_percentage`` (0.75), ``window_size`` (None),
        ``ucl_percentile`` (None), ``ucl_multiplier`` (None).

    Notes
    -----
    ``SAVE_FOLDER`` is resolved per run with this precedence: value from the
    grid, then value from ``kwargs``, then ``"./tmp_res_<N_ESTIMATORS>"``
    using the run's ``N_ESTIMATORS``. The precedence does not depend on the
    order of the keys in ``grid_values``.
    """
    # Positional order expected by custom_run.
    param_order = ("N_ESTIMATORS", "tmp_seed_number", "SAVE_FOLDER", "N_CORES",
                   "name_of_model", "nr_subsets", "nr_percentage", "window_size",
                   "ucl_percentile", "ucl_multiplier")
    # SAVE_FOLDER has no static default; it is derived per run below.
    defaults = {
        "N_ESTIMATORS": 1,
        "tmp_seed_number": 0,
        "N_CORES": 1,
        "name_of_model": "Nothing",
        "nr_subsets": 2,
        "nr_percentage": 0.75,
        "window_size": None,
        "ucl_percentile": None,
        "ucl_multiplier": None,
    }
    base_params = {name: kwargs.get(name, default) for name, default in defaults.items()}

    def _launch(overrides):
        """Merge one grid combination into the fixed parameters and run it."""
        params = dict(base_params)
        params.update({name: value for name, value in overrides.items()
                       if name in defaults})

        if "SAVE_FOLDER" in overrides:
            params["SAVE_FOLDER"] = overrides["SAVE_FOLDER"]
        elif "SAVE_FOLDER" in kwargs:
            params["SAVE_FOLDER"] = kwargs["SAVE_FOLDER"]
        else:
            params["SAVE_FOLDER"] = f"./tmp_res_{str(params['N_ESTIMATORS'])}"

        custom_run(*(params[name] for name in param_order))

    if not grid_values:
        # None or empty grid: a single run with the fixed parameters.
        _launch({})
        return

    grid_keys = list(grid_values.keys())
    for combination in itertools.product(*grid_values.values()):
        _launch(dict(zip(grid_keys, combination)))

In [12]:
# Seeds to sweep: 0..49 followed by 100, 150, ..., 950.
my_list_seed = list(range(50)) + list(range(100, 1000, 50))

# Every combination of the swept values below is run once; the keyword
# arguments stay fixed for all runs.
custom_grid_search(
    {
        "N_ESTIMATORS": [1],
        "tmp_seed_number": my_list_seed,
        "window_size": list(range(8, 61, 4)),  # 8, 12, ..., 60
        "nr_percentage": [0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95],
    },
    N_CORES=6,
    nr_subsets=2,
    ucl_percentile=0.999,
    ucl_multiplier=3/2,
    name_of_model="CONV_AE",
)

task_model_000:   0%|          | 0/1 [01:07<?, ?it/s]


TypeError: power() takes from 2 to 3 positional arguments but 1 were given