# Iniciando configuraciones:

### <font color='orange'>*AVISO*:</font>

#### Para correr este código con normalidad es necesario tener instalado: nodejs y ipywidgets. <br> Para instalar ejecute (una sola vez):

> conda install -c conda-forge notebook ipyparallel <br>
> ipcluster nbextension enable <br>
> jupyter nbextension install --sys-prefix --py ipyparallel <br>
> jupyter nbextension enable --sys-prefix --py ipyparallel <br>
> jupyter serverextension enable --sys-prefix --py ipyparallel <br>
> conda install -c conda-forge nodejs <br>
> conda install -c conda-forge ipywidgets <br>
> conda install -c conda-forge tqdm <br>
> jupyter labextension install @jupyter-widgets/jupyterlab-manager@0.34

### <font color='green'>Sección: Import  </font>

In [17]:
# revisar el archivo ( si fuera necesario): 
# C:\ProgramData\Anaconda3\share\jupyter\nbextensions\ipyparallel\main.js
# cambiar : $("#tabs").find('[href=#clusters]').hide()
# por esto: $("#tabs").find('[href="#clusters"]').hide()

import os
import ipyparallel as ipp
import time
import datetime
import pandas as pd
from my_lib.hmm import hmm_util as hmm_u
from my_lib.hmm import hmm_training as hmm_t
import warnings
# Interact libraries
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual, Layout
import ipywidgets as widgets
# Progress bar
from tqdm import tqdm
cwd = os.getcwd()
# print(cwd)
print("Todo fue importado correctamente")

Todo fue importado correctamente


### <font color='green'> Parámetros por defecto: </font>

In [2]:
# Default values:
DataPath = "./data/"
ModelPath = "./model/"
LogPath = "./log/"
n_col_p = 24    # allow pivot if the number of columns is less than 'n_col_p'

# DIRECTORIOS
hmm_u.printmd("**Directorios:** <br> <ul> <li>[Datos: {0}] </li> <li>[Modelos: {1}] </li> <li>[Logs: {2}] </li></ul> ".format(DataPath, ModelPath, LogPath))
data_set_file_names = os.listdir(DataPath)
data_set_file_names = [x for x in data_set_file_names if ".pkl" in x]

# default_settings:
d_set = hmm_u.open_pickle_file("default_settings.pkl")
if d_set is None:
    d_set = dict()
    d_set["n_min"] = 55
    d_set["n_max"] = 110
    d_set["n_iter"] = 5
    d_set["models_to_train"] = [data_set_file_names[0]]

# NÚMERO DE PERFILES DESEADO:
n_range = widgets.IntRangeSlider(min=40,max=160, value=[d_set["n_min"], d_set["n_max"]], continuous_update=True)
# hmm_u.printmd("**Selección del número de perfiles deseado: **")
ui_range = widgets.HBox([n_range])
def f(n_range):
    d_set["n_min"], d_set["n_max"] = n_range[0], n_range[1] 
    hmm_u.save_pickle_file("default_settings.pkl", d_set)
    print("Se encontrará el mejor modelo con K estados en el rango: [{0}, {1}]".format(n_range[0], n_range[1]))

out_range = widgets.interactive_output(f, {'n_range': n_range})
# display(ui_range, out_range)

# NÚMERO DE ITERACCIONES:
n_iter = widgets.IntSlider(min=1, max=10, value=d_set["n_iter"])
# hmm_u.printmd("<hr/> **Selección del número de iteracciones: **")
ui_iter = widgets.HBox([n_iter])
def g(n_iter):
    d_set["n_iter"] = n_iter 
    hmm_u.save_pickle_file("default_settings.pkl", d_set)
    print("Se realizará {0} iteracciones".format(n_iter))
out_iter = widgets.interactive_output(g, {'n_iter': n_iter})
#display(ui_iter, out_iter)

# SELECT MODELS TO TRAIN
set_file_names = os.listdir(DataPath)
set_file_names = [x for x in data_set_file_names if ".pkl" in x]
if  len(set_file_names) > 10 : n_rows_l = 10
else: n_rows_l = len(data_set_file_names)

# create the select list widget:
models_to_train = widgets.SelectMultiple( options=set_file_names, value=d_set["models_to_train"],
    rows=n_rows_l, description='Modelos: ', disabled=False)

def l(models_to_train):
    to_tr = [x for x in models_to_train if x in set_file_names]
    d_set["models_to_train"] = to_tr 
    hmm_u.save_pickle_file("default_settings.pkl", d_set)  
    md = pd.DataFrame(to_tr, columns=["Modelos seleccionados"])
    md.index.name = "ID"
    print(md)

ui_models = widgets.HBox([models_to_train])
out_models = widgets.interactive_output(l, {'models_to_train': models_to_train})
# display(ui_models, out_models)


**Directorios:** <br> <ul> <li>[Datos: ./data/] </li> <li>[Modelos: ./model/] </li> <li>[Logs: ./log/] </li></ul> 

#### Creando motores para correr el entrenamiento en parallelo

In [3]:
n_comp_min = n_range.value[0]
n_comp_max = n_range.value[1]
n_interaction = n_iter.value

rc = hmm_t.get_ipp_client() 
dview = rc[:]

""" ________________________________________________________
Setting engines:  
Set CWD space to work
Scatter and push variable to work with
"""
v = rc.load_balanced_view()
v.block = True
n_engines = len(v)
v.map(os.chdir, [os.getcwd()] * n_engines)
""" os.getcwd(): Get the current CWD, ex: 'C:\\Repositorios\\parallel_programing' 
    os.chdir() : Set the current CWD """

"""  Scattering the list of nComponents in current engines """
dview.scatter('var_list_nComp', list(range(n_comp_min, n_comp_max + 1)))
"""  make sure that all process contains a scattered list of nComp """
make_sure = dview.pull('var_list_nComp').get()  # important

Engines running for this client: [0, 1, 2, 3]


#### Definiendo la función a correr en parallelo

In [4]:
"""__________________________________________________________
START: Defining the parallel function for training process 
"""

#@v.parallel(block=True)
@v.parallel(block=False)
def hmm_model_training(idp):
    """
        :return the best model that was found in the training process using:
        idp, identify the interaction that is running (if is needed)
        @var_var_dataSet:       is a numpy array of shape (m x n) m features, n samples
                                1 samples contains m features
        @var_var_list_nComp:    list of components to test and evaluate
        @var_index:             list of index that defines the evaluating dataset
        Note: To pass values to this function use: push and scatter methods.
    """
    from my_lib.hmm import hmm_util as hmm_u
    import numpy as np

    # Shared variables:
    global var_dataSet  # dview.push({'var_dataSet': dataset})
    global var_list_nComp  # dview.scatter('var_list_nComp', list(range(n_comp_min,n_comp_max+1)))
    global var_index  # dview.scatter('var_index', list(len(df.index)))

    """ Taking a slide of the hole data set for validating purposes """
    validating_set = var_dataSet[var_index]
    """ Taking the rest of the dataSet for training purposes """
    ini, end = var_index[0], var_index[-1]
    training_set = np.concatenate((var_dataSet[0:ini], var_dataSet[end:-1]))

    """ Training a list of best models """
    best_model, log_register = hmm_u.select_best_HMM(training_set, validating_set, var_list_nComp, seed=idp)

    """ Send the best model and a register/log of the training process """
    return {'model': best_model, "log_register": log_register}
    # return len(var_dataSet)

print("Función de entrenamiento lista.")

Función de entrenamiento lista.


# Entrenando los modelos: 

In [5]:
hmm_u.printmd("**Seleccione los modelos a entrenar:**")
hmm_u.printmd("*> Presione (ctrl + shift) para seleccionar más de un item:*")
display(ui_models, out_models)
hmm_u.printmd("**Selección del número de perfiles deseado**")
display(ui_range, out_range)
hmm_u.printmd("**Selección del número de iteracciones**")
display(ui_iter, out_iter)

**Seleccione los modelos a entrenar:**

*> Presione (ctrl + shift) para seleccionar más de un item:*

HBox(children=(SelectMultiple(description='Modelos: ', index=(0,), options=('Bornes_P_Total.pkl', 'EE. Quito.p…

Output()

**Selección del número de perfiles deseado**

HBox(children=(IntRangeSlider(value=(116, 160), max=160, min=40),))

Output()

**Selección del número de iteracciones**

HBox(children=(IntSlider(value=2, max=10, min=1),))

Output()

In [14]:
print("Función y parámetros listos para el entrenamiento... ")
def training_models_from_file_names(data_set_file_names, time_range):

    tr = time.time()
    for fileName in data_set_file_names:

        """ Reading and preparing dataset from DataPath"""
        df = hmm_u.read_dfx_from(DataPath + fileName)
        mask = df.index.isin(time_range)
        df = df[mask]
        dataSet = df.values
        
        if len(dataSet) == 0:
            print("[{0: <21s}] Check information for {1}".format(hmm_u.time_now(), fileName))
            continue
        
        print("[{0: <21s}] Empezando el entrenamiento HMM para: \t\t{1}".format(hmm_u.time_now(), fileName))
        print("[{0: <21s}] Entrenamiento desde: \t\t\t\t{1} hasta {2}".format(hmm_u.time_now(), 
                                                                              df.index[0].strftime("%Y-%m-%d"),
                                                                              df.index[-1].strftime("%Y-%m-%d")))
        
        
        """ Setting engines for the training process"""
        # push df.values in all engines to start training process:
        dview.push({'var_dataSet': dataSet})
        # scattering indexes for validating purposes:
        dview.scatter('var_index', list(range(len(df.index))))
        dview.gather('var_index').get()  # make sure "var_index" is in engines.
        dview.pull('var_dataSet').get()

        """ Run the training process: (n_interaction) times in parallel fashion: """
        best_model_list = list()
        progress_bar = tqdm(range(n_interaction), desc="\t -> Espere... ", ncols=100) 
        for it in progress_bar:
            model_list = hmm_model_training(range(n_engines*it, n_engines*(it+1)))
            best_model_list += model_list
            progress_bar.set_description("[{0: <8s} + {1: <5} min.] Entrenamiento".format(
                hmm_u.time_now(),
                round(hmm_u.d_time(tr)/60,1)
                )
            )
            tr = time.time()
            
        final_model, log_register = hmm_u.select_best_model_from_list(best_model_list, dataSet)

        """ Ordering the best model according to a Hierarchical Clustering """
        ordered_model = hmm_u.ordered_hmm_model(final_model, method='average', metric='euclidean')

        """ Saving the best model and his log_register for posterior analysis """
        print("\tNumber of trained models: \t\t {0}".format((n_comp_max - n_comp_min)*n_interaction))
        print("\tFinal list of best trained models: \t {0}".format(len(best_model_list)))
        hmm_u.save_model_and_log(ordered_model, log_register, ModelPath, LogPath, 'hmm_' + fileName)

    hmm_u.printmd("** [{0: <21s}] Fin del entrenamiento **".format(hmm_u.time_now()))

Función y parámetros listos para el entrenamiento... 


In [18]:
start_button = widgets.Button(
    description='Click aquí para empezar entrenamiento',
    disabled=False,
    button_style='warning', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Empieza el entrenamiento de acuerdo a los parámetros seleccionados',
    layout=Layout(width='50%')
)

def tr_f(b):
    d_set = hmm_u.open_pickle_file("default_settings.pkl")
    if d_set is None:
        hmm_u.printmd("*El archivo default_settings no existe. Seleccione nuevamente*")
        return None
    
    n_comp_min = d_set["n_min"] 
    n_comp_max = d_set["n_max"] 
    n_interaction = d_set["n_iter"]
    data_set_file_names = d_set["models_to_train"]
    # print(chr(27) + "[2J")
    hmm_u.printmd("** [{0: <21s}] Procediendo con el entrenamiento: **".format(hmm_u.time_now()))
    for d in data_set_file_names:
        hmm_u.printmd("* " + d)
    #time_range = pd.date_range("2014-01-01", "2017-11-30", freq="1D")
    time_range = pd.date_range("2014-01-01", datetime.datetime.now(), freq="1D")
    training_models_from_file_names(data_set_file_names, time_range)
    

ui_start = widgets.HBox([start_button])
display(ui_start)
start_button.on_click(tr_f)



** [08:32:21             ] Procediendo con el entrenamiento: **

* Bornes_P_Total.pkl

[08:32:21             ] Empezando el entrenamiento HMM para: 		Bornes_P_Total.pkl
[08:32:21             ] Entrenamiento desde: 				2014-01-01 hasta 2018-05-22



	 -> Espere... :   0%|                                                        | 0/2 [00:00<?, ?it/s]
[08:38:44 + 6.4   min.] Entrenamiento:   0%|                                  | 0/2 [06:22<?, ?it/s]
[08:38:44 + 6.4   min.] Entrenamiento:  50%|████████████▌            | 1/2 [06:22<06:22, 382.88s/it]
[08:45:09 + 6.4   min.] Entrenamiento:  50%|████████████▌            | 1/2 [12:48<12:48, 768.21s/it]
[08:45:09 + 6.4   min.] Entrenamiento: 100%|█████████████████████████| 2/2 [12:48<00:00, 384.11s/it]


	Best model: 				nComp=151, score=0.9982, log_prob=-437159.76
	Number of trained models: 		 88
	Final list of best trained models: 	 8
	Best model saved in: 			 ./model/hmm_Bornes_P_Total.pkl
	Log register in: 			 ./log/hmm_Bornes_P_Total.json


** [08:45:20             ] Fin del entrenamiento **

</hbr>
### Ejemplo:  <br> 
A continuación se incluye un ejemplo del proceso de entrenamiento:

<font color='gray'> 
Se encontrará el mejor modelo con K estados en el rango: **[70, 85]** con **2** iteraciones <br>
<br>
[**0.0** &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;]   &nbsp;&nbsp; Starting the HMM training process for: &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;		EE. Quito.pkl <br>

Trainnig process: 80%|████████████████████ &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;| 2/2  [03:41 < 00:00, 110.73s/it] <br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;	Best model: 				           nComp=75, score=0.9926, log_prob=-302653.05 <br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;	Number of trained models: 		       30 <br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;	Final list of best trained models: 	   8  <br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;	Best model saved in: 			       ./model/hmm_EE. Quito.pkl  <br>
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;	Log register in: 			           ./log/hmm_EE. Quito.json  <br>
</font>

</hbr>
### References:  <br>
> R. G. Sánchez, “A proposed method for unsupervised anomaly detection for a multivariate building dataset”, M.S. thesis, Dep. of informatics, Fribourg University., Fribourg, Suiza, 2017.

> https://github.com/Borreguin/Master_thesis