# Notebook exploration données : metrics

Ce notebook génère deux fichiers CSV :

- un fichier complet, avec les données metrics nettoyées et les colonnes JSON fractionnées ;

- un fichier partiel, avec les données metrics, qui ne conserve que les identifiants des events.

## Imports

In [None]:
import os, json
import pandas as pd
import numpy as np

from azure_blob import download_blob_file
from preprocess_functions import convert_str_to_list, get_keys, add_key_column, convert_str_to_int_list

import matplotlib.pyplot as plt
from datetime import datetime

### Data

In [None]:
# Name of the CSV file to fetch.
filename = 'metrics.csv'
# Local directory the file is downloaded into and read from.
path = '../data/'

In [None]:
# Download the CSV file from the blob into the local 'data' directory
# (download_blob_file is a project helper; its exact behavior is not visible here).
download_blob_file(file_name=filename, local_path=path)
# Path of the CSV file, built from the directory and the file name.
metrics = os.path.join(path, filename)

In [None]:
# Load the CSV data into a DataFrame, then sort the rows by 'created_at'.
metrics_df = pd.read_csv(metrics)
metrics_df = metrics_df.sort_values(by='created_at')
# Print the column summary of the loaded data.
metrics_df.info()

In [None]:
# Drop every column that contains at least one missing value.
metrics_df.dropna(axis='columns', inplace=True)
# Remove the 'machineId' column.
metrics_df.drop(columns=['machineId'], inplace=True)

In [None]:
# Print the column summary of metrics_df (run here after the column clean-up).
metrics_df.info()

In [None]:
# Preview the first three rows.
metrics_df.head(3)

## Dataframes

### Column "connected_operators"

In [None]:
# Disabled earlier variant, kept for reference:
#metrics_df.connected_operators = metrics_df.connected_operators.apply(lambda x :json.loads(x)[0])
# Parse the 'connected_operators' column with the project helper convert_str_to_list.
# The result is used below as a DataFrame exposing a 'connected_operators' column.
connected_operators_df = convert_str_to_list(metrics_df.connected_operators)

In [None]:
# Read the dictionary keys from the first element '[0]' of the list stored at index label 0 ('.loc[0]').
# NOTE(review): '.loc[0]' is label-based; since the data was sorted by 'created_at',
# label 0 may not be the first row — confirm this is what is wanted.
connected_operators_keys = get_keys(connected_operators_df.connected_operators.loc[0][0])

In [None]:
# Intended to add one column per key of the operator dictionary.
# NOTE(review): the loop variable `key` is never used; every iteration passes the
# full `connected_operators_keys` list to add_key_column. Check the helper's
# signature to confirm whether a single call (or passing `key`) is intended.
for key in connected_operators_keys:
    connected_operators_df = add_key_column(connected_operators_df, 'connected_operators', connected_operators_keys,0)

In [None]:
# Remove the original column now that it has been split.
connected_operators_df.drop(columns=['connected_operators'], inplace=True)

In [None]:
# Give shorter names to the two operator columns.
operator_column_names = {
    'connected_operators_name_0': 'operators_name',
    'connected_operators_level_0': 'operators_level',
}
connected_operators_df.rename(columns=operator_column_names, inplace=True)

In [None]:
# Preview the first three rows of the operators DataFrame.
connected_operators_df.head(3)

### Column "modules"

In [None]:
# Parse the 'modules' column with the project helper convert_str_to_list.
# The result is used below as a DataFrame exposing a 'modules' column.
modules_df = convert_str_to_list(metrics_df.modules)

In [None]:
# Read the dictionary keys from the first element of the list stored at index label 0.
# NOTE(review): '.loc[0]' is label-based, not positional — confirm label 0 is representative.
modules_keys = get_keys(modules_df.modules.loc[0][0])

In [None]:
# Split the 'modules' column for list positions 0 and 1 (last argument of add_key_column).
# NOTE(review): the loop variable `key` is never used; every iteration passes the
# full `modules_keys` list. Check add_key_column's signature to confirm whether
# the loops are needed at all.
for key in modules_keys:
    modules_df = add_key_column(modules_df, 'modules', modules_keys,0)
for key in modules_keys:
    modules_df = add_key_column(modules_df, 'modules', modules_keys,1)

In [None]:
# Preview the first three rows after the split of 'modules'.
modules_df.head(3)

In [None]:
# Remove the original 'modules' column now that it has been split.
modules_df.drop(columns=['modules'], inplace=True)

In [None]:
# Print the column summary of modules_df.
modules_df.info()

#### Column "modules counters"

In [None]:
# Read the dictionary keys from the first counter of the first module at index label 0.
# NOTE(review): '.loc[0]' is label-based, not positional — confirm label 0 is representative.
modules_counters_keys = get_keys(modules_df.modules_counters_0.loc[0][0])

In [None]:
# Split the counters columns of both modules, using list position 0 in each case.
# NOTE(review): the loop variable `key` is never used; every iteration passes the
# full `modules_counters_keys` list. Check add_key_column's signature to confirm
# whether the loops are needed at all.
for key in modules_counters_keys:
    modules_df = add_key_column(modules_df, 'modules_counters_0', modules_counters_keys,0)
for key in modules_counters_keys:
    modules_df = add_key_column(modules_df, 'modules_counters_1', modules_counters_keys,0)

In [None]:
# Remove the two original counters columns now that they have been split.
counters_columns = ['modules_counters_0', 'modules_counters_1']
modules_df.drop(columns=counters_columns, inplace=True)

In [None]:
# Print the column summary of modules_df after the counters split.
modules_df.info()

In [None]:
# Drop every column that contains at least one missing value.
modules_df.dropna(axis='columns', how='any', inplace=True)

In [None]:
# For each column, print its number of distinct values when there is more than
# one, otherwise print its array of unique values.
for name in modules_df.columns:
    series = modules_df[name]
    distinct_count = series.nunique()
    if distinct_count <= 1:
        print(name, series.unique())
    else:
        print(name, 'nombre de valeurs unique : %d' % distinct_count)

In [None]:
# Handle the columns holding a single distinct value:
# - a constant empty string: the column is dropped right away;
# - any other constant: the column name is collected in `columns_to_drop`.
columns_to_drop = []
for name in modules_df.columns:
    series = modules_df[name]
    if series.nunique() != 1:
        continue
    if series.unique().item() == '':
        modules_df.drop(columns=[name], inplace=True)
    else:
        columns_to_drop.append(name)

In [None]:
# Rename the two counter value columns.
counter_column_names = {
    'modules_counters_0_value_0': 'VarnishPrinter_3DVarnishCounter',
    'modules_counters_1_value_0': 'iFoil_TotalPagesCounter',
}
modules_df.rename(columns=counter_column_names, inplace=True)
# Remove the constant columns collected in `columns_to_drop`.
modules_df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Preview the first three rows of the cleaned modules DataFrame.
modules_df.head(3)

### Column "events"

In [None]:
# Convert the string values of 'events' to lists (project helper convert_str_to_list).
# The result is used below as a DataFrame exposing an 'events' column.
events_df = convert_str_to_list(metrics_df.events)

#### Split events in dataframe

In [None]:
# Build a flat DataFrame with one row per event and one column per event field.
import ast
event_fields = ('source', 'message', 'timestamp', 'criticality', 'identification')
d = {field: [] for field in event_fields}
for values in events_df.events.values:
    # rows without events contribute nothing
    if len(values) == 0:
        continue
    for event in values:
        # a missing field is stored as None (dict.get default)
        for field in event_fields:
            d[field].append(event.get(field))
df = pd.DataFrame(data=d)


In [None]:
# Distinct identification codes found in the events.
identification_codes_list = df['identification'].unique()
# Display them sorted (the sorted array is not stored).
np.sort(identification_codes_list)

In [None]:
# Display the index of the flat events DataFrame.
df.index

In [None]:
# Map each identification code to the message of its first occurrence.
# Iterating the two columns in lockstep avoids shadowing the builtin `id`,
# the quadratic list membership test and the dependency on a RangeIndex.
identification_dict = {}
for code, message in zip(df['identification'], df['message']):
    # setdefault keeps the message already stored for a code seen before
    identification_dict.setdefault(code, message)
# Distinct codes in order of first appearance.
id_list = list(identification_dict)
identification_dict

In [None]:
# Distinct event sources.
source_list = df['source'].unique()
# Display them sorted (the sorted array is not stored).
np.sort(source_list)

In [None]:
# Distinct event criticality values.
criticality_list = df['criticality'].unique()
# Display them sorted (the sorted array is not stored).
np.sort(criticality_list)

In [None]:
# Update the JSON file of event metadata with the identification codes,
# the criticality values and the sources found in the data.
from pathlib import Path
filepath = Path('../data/metrics/metrics_events_dict.json')
# 'r+' requires the file to exist: its current content is loaded, updated and rewritten.
with open(file=filepath, mode="r+", encoding='utf-8') as jsonFile:
    data = json.load(jsonFile)
    data['identification'] = identification_dict
    # .tolist() yields plain Python values, which json can always serialise.
    # No trailing comma here: it would wrap the list in a tuple and nest it in the JSON.
    data['criticality'] = np.sort(criticality_list).tolist()
    data['source'] = np.sort(source_list).tolist()
    # Rewrite from the start of the file...
    jsonFile.seek(0)
    json.dump(data, jsonFile, indent=4, ensure_ascii=False)
    # ...and cut whatever remains of the previous content when the new one is shorter.
    jsonFile.truncate()

In [None]:
# Collect the distinct key sets used by the first event of each non-empty row.
# Rows are visited by index label 0..len-1; `event_keys` keeps the keys of the
# last non-empty row visited and is read by a later cell.
events_keys = []
for label in range(len(events_df)):
    row_events = events_df.events.loc[label]
    if len(row_events) == 0:
        continue
    event_keys = row_events[0].keys()
    if event_keys in events_keys:
        continue
    events_keys.append(event_keys)
# Display the collected key sets.
events_keys

In [None]:
# Empty event: same keys as `event_keys`, every value set to NaN.
events_dict = dict.fromkeys(event_keys, np.nan)

In [None]:
# Add a 'Length' column holding the number of events per row.
events_df['Length'] = events_df.events.map(len)

In [None]:
# Preview the first three rows of the events DataFrame.
events_df.head(3)

In [None]:
# Split the events: each row's list is spread over columns, one event per column,
# keeping the index of events_df.
events_split_df = pd.DataFrame(events_df.events.to_list(), dtype='object', index=events_df.index)
# Preview the first three rows.
events_split_df.head(3)

In [None]:
# Replace the None cells with the empty event dictionary `events_dict`
# (the same dictionary object is shared by every replaced cell).
# Each column is rebuilt and assigned as a whole: the chained form
# `df.loc[i][col] = value` may write to a temporary copy, in which case the
# replacement is silently lost.
for col in events_split_df.columns:
    events_split_df[col] = pd.Series(
        [events_dict if cell is None else cell for cell in events_split_df[col]],
        index=events_split_df.index,
        dtype='object',
    )

In [None]:
# Rename the columns of the events DataFrame: <n> becomes 'event_<n>'.
i = 0
first_col = events_split_df.columns.start
last_col = events_split_df.columns.stop
col_names = {position: 'event_' + str(position) for position in range(first_col, last_col)}
events_split_df.rename(columns=col_names, inplace=True)

In [None]:
# Preview the first three rows of the events DataFrame.
events_split_df.head(3)

In [None]:
# Preview: expand the values of 'event_0' with pd.Series (result displayed, not stored).
events_split_df['event_0'].apply(pd.Series)

In [None]:
# Split every event column into one column per event field, named '<field>_<event column>'.
event_fields = ['source', 'message', 'timestamp', 'criticality', 'identification']
events_dict_split_df = pd.DataFrame()
for col in events_split_df.columns:
    expanded = pd.DataFrame(events_split_df[col].tolist(), index=events_split_df.index)
    # Pick the fields by name: the assignment below is positional, so relying on
    # the key order of the dictionaries could put values under the wrong label.
    # A field missing from every dictionary becomes an all-NaN column.
    expanded = expanded.reindex(columns=event_fields)
    events_dict_split_df[[field + '_' + str(col) for field in event_fields]] = expanded

In [None]:
# Preview the first three rows of the fully split events DataFrame.
events_dict_split_df.head(3)

#### Identification

In [None]:
# Build a DataFrame holding only the 'identification_' columns.
identification_data = {
    name: events_dict_split_df[name].values
    for name in events_dict_split_df.columns
    if 'identification_' in name
}
identification_df = pd.DataFrame(identification_data, index=events_dict_split_df.index)

In [None]:
# Merge the identification values of each row into one value, using the
# project helper convert_str_to_int_list, then keep only that 'events_id' column.
events_id_per_row = identification_df.apply(convert_str_to_int_list, axis=1)
identification_df['events_id'] = events_id_per_row
identification_df = identification_df['events_id']

#### Criticality

In [None]:
# Build a DataFrame holding only the 'criticality_' columns, missing values set to 0.
criticality_data = {
    name: events_dict_split_df[name].values
    for name in events_dict_split_df.columns
    if 'criticality_' in name
}
criticality_df = pd.DataFrame(criticality_data, index=events_dict_split_df.index)
criticality_df = criticality_df.fillna(0)

In [None]:
# Print the distinct 'criticality' values found in each column.
for name in criticality_df.columns:
    distinct_values = criticality_df[name].unique()
    print(name, distinct_values)

## Fusion dataframes

In [None]:
# Merge the DataFrames of the split columns, one at a time, on their index.
merge_df = pd.merge(metrics_df, connected_operators_df, left_index=True, right_index=True)
merge_df = pd.merge(merge_df, modules_df, left_index=True, right_index=True)
merge_df = pd.merge(merge_df, events_dict_split_df, left_index=True, right_index=True)
# Remove the original columns that have been split.
merge_df.drop(columns=['connected_operators', 'modules', 'events'], inplace=True)
# Convert 'created_at' to datetime and use it as the index (the column itself is removed).
merge_df['created_at'] = pd.to_datetime(merge_df['created_at'])
merge_df.set_index('created_at', inplace=True)
# Save the dataset as CSV.
merge_df.to_csv('../data/metrics/metrics_df.csv')

### dataset ciblage events 'identification'

In [None]:
# Temporary dataset: metrics, operators and modules plus the 'events_id' values,
# merged one at a time on their index.
temp_df = pd.merge(metrics_df, connected_operators_df, left_index=True, right_index=True)
temp_df = pd.merge(temp_df, modules_df, left_index=True, right_index=True)
temp_df = pd.merge(temp_df, identification_df, left_index=True, right_index=True)
# Remove the original columns that have been split ('events' is kept here).
temp_df.drop(columns=['connected_operators', 'modules'], inplace=True)
# Convert 'created_at' to datetime.
temp_df['created_at'] = pd.to_datetime(temp_df['created_at'])
# Save the dataset as CSV.
temp_df.to_csv('../data/metrics/temp_metrics_df.csv')

## Data exploration

In [None]:
# Preview the first five rows of the temporary dataset.
temp_df.head(5)

In [None]:
def plot_timeseries(dataframe, output_path='../data/metrics/img/metrics_df.png', figsize=(15, 6)):
    """Plot *dataframe*, save the figure to an image file and display it.

    Args:
        dataframe: Object exposing a pandas-style ``plot(figsize=...)`` method.
        output_path: Destination of the saved image. Defaults to the path
            this notebook has always written to.
        figsize: Figure size forwarded to ``dataframe.plot``.
    """
    dataframe.plot(figsize=figsize)
    # The figure is saved before being shown.
    plt.savefig(output_path)
    plt.show()

In [None]:
# creation d'une copie du dataframe avec normalisation
# copy the data
df_max_scaled = temp_df[['created_at','varnishLevelsTargetvolume','varnishLevelsTotalvolume','VarnishPrinter_3DVarnishCounter','iFoil_TotalPagesCounter']].copy(deep=True)
df_max_scaled.index = df_max_scaled['created_at']
del df_max_scaled['created_at']
# apply normalization techniques
for column in df_max_scaled.columns:
    df_max_scaled[column] = df_max_scaled[column]  / df_max_scaled[column].abs().max()
      
# view normalized data
display(df_max_scaled)

In [None]:
# Plot the normalised series (the figure is also saved by plot_timeseries).
plot_timeseries(df_max_scaled)