# Notebook d'exploration des données : événements les plus fréquents

## Imports

### Librairies

In [35]:
import os, json, ast
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from datetime import datetime

### Data

In [36]:
# File name and directory (relative to the notebook) of the metrics export
filename = 'temp_metrics_df.csv'
path = '../data/metrics/'
# Load the CSV, using its first column as the index
metrics_df = pd.read_csv(os.path.join(path, filename), index_col=0)
# Order the rows by their 'created_at' value
metrics_df = metrics_df.sort_values(by='created_at')
metrics_df.head(2)

Unnamed: 0,id,status,created_at,varnishLevelsTargetvolume,varnishLevelsTotalvolume,events,operators_name,operators_level,VarnishPrinter_3DVarnishCounter,iFoil_TotalPagesCounter,events_id
307539,4169748,WARNING,2022-04-15 05:55:06.678,36192.322612,100000,[],Viktor,Operator,1792992,22881,[]
307540,4169749,WARNING,2022-04-15 05:55:06.829,36192.322612,100000,"[{""source"": ""PLC"", ""message"": "" JV-Ti non prêt...",Viktor,Operator,1792992,22881,[391]


## Nettoyage

In [37]:
# Clean the metrics frame by rebinding `metrics_df` instead of mutating it in
# place, so that re-running this cell does not fail.
metrics_df = (
    metrics_df
    # renumber the rows 0..n-1; the previous index is discarded
    .reset_index(drop=True)
    # Drop the irrelevant columns BEFORE dropna(): if one of them holds a NaN,
    # dropna() would remove it first and drop() would then raise KeyError.
    # errors='ignore' additionally turns a re-run of this cell into a no-op.
    .drop(
        columns=['status', 'created_at', 'operators_name', 'operators_level', 'events_id'],
        errors='ignore',
    )
    # drop every remaining column that contains at least one NaN
    .dropna(axis=1)
)

metrics_df.head(2)

Unnamed: 0,id,varnishLevelsTargetvolume,varnishLevelsTotalvolume,events,VarnishPrinter_3DVarnishCounter,iFoil_TotalPagesCounter
0,4169748,36192.322612,100000,[],1792992,22881
1,4169749,36192.322612,100000,"[{""source"": ""PLC"", ""message"": "" JV-Ti non prêt...",1792992,22881


## Préparation

### Fractionnement de la colonne "events"

In [38]:
# fonction retournant un dataframe des events fractionnés
def create_split_event_dataframe(df, code=None):
    """Explode the JSON ``events`` column into one row per event.

    Each cell of ``df['events']`` is expected to be a JSON-encoded list of
    event objects. Every event becomes one output row made of the event's own
    fields plus the metrics of the row it came from.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``events``, ``id``,
        ``varnishLevelsTargetvolume``, ``varnishLevelsTotalvolume``,
        ``VarnishPrinter_3DVarnishCounter`` and ``iFoil_TotalPagesCounter``.
        Any index is accepted; ``df`` itself is not modified.
    code : optional
        When not ``None``, only events whose ``identification`` equals
        ``code`` are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``message``, ``timestamp``, ``criticality``,
        ``identification`` followed by the five metrics columns, with
        ``timestamp`` converted by ``pd.to_datetime``. Event fields missing
        from an event are filled with ``None``.

    Raises
    ------
    json.JSONDecodeError
        If an ``events`` cell is not valid JSON.
    KeyError
        If one of the required columns is absent from ``df``.
    """
    event_fields = ('source', 'message', 'timestamp', 'criticality', 'identification')
    row_fields = (
        'id',
        'varnishLevelsTargetvolume',
        'varnishLevelsTotalvolume',
        'VarnishPrinter_3DVarnishCounter',
        'iFoil_TotalPagesCounter',
    )

    # one list per output column, in the final column order
    d = {name: [] for name in event_fields + row_fields}

    # Iterate positionally over the needed columns: this works for any index
    # (filtered, sorted, non-unique), not only a unit-step RangeIndex.
    row_columns = [df[name] for name in row_fields]
    for raw_events, *row_values in zip(df['events'], *row_columns):
        # The cells are JSON, so parse them as JSON: ast.literal_eval rejects
        # valid JSON tokens such as null / true / false.
        for event in json.loads(raw_events):
            if code is not None and event.get('identification') != code:
                continue
            for name in event_fields:
                d[name].append(event.get(name))
            for name, value in zip(row_fields, row_values):
                d[name].append(value)

    result = pd.DataFrame(data=d)
    # convert the 'timestamp' values to datetime
    result['timestamp'] = pd.to_datetime(result['timestamp'])
    return result

In [39]:
# Explode every event of every metrics row into its own row (code=None: no filtering)
df_split = create_split_event_dataframe(metrics_df)

In [40]:
# Preview the first five split events
df_split.head(5)

Unnamed: 0,source,message,timestamp,criticality,identification,id,varnishLevelsTargetvolume,varnishLevelsTotalvolume,VarnishPrinter_3DVarnishCounter,iFoil_TotalPagesCounter
0,PLC,JV-Ti non prêt : impression impossible,2022-04-15 05:55:23.462000+00:00,INFO,391,4169749,36192.322612,100000,1792992,22881
1,iFoil,JV-Ti non prêt : impression impossible,2022-04-15 06:06:56.278000+00:00,INFO,391,4170152,36192.322612,100000,1792992,22881
2,PLC,En attente,2022-04-15 06:06:56.418000+00:00,INFO,330,4170152,36192.322612,100000,1792992,22881
3,PLC,Disponible,2022-04-15 06:07:28.326000+00:00,INFO,332,4170167,36192.322612,100000,1792992,22881
4,PLC,Chargeur: mode auto non activé,2022-04-15 06:07:37.675000+00:00,WARNING,377,4170172,36192.322612,100000,1792992,22881


### Encodage des identifications

In [41]:
# Give every non-numeric identification (a text label) an integer code.
# Codes are allocated from 1000 upward, in order of first appearance.
# NOTE(review): a numeric identification >= 1000 would collide with these
# codes — confirm that the numeric range of the machine stays below 1000.
events_id = []
str_code_dict = {}
str_code = 1000
# The loop variable is deliberately not called `id`: at notebook top level
# that would overwrite the builtin id() for the rest of the kernel session.
for identification in df_split['identification'].unique():
    if pd.isna(identification):
        # event without identification: nothing to encode (int(None) would
        # raise TypeError, int(nan) would wrongly register NaN as a label)
        continue
    try:
        events_id.append(int(identification))
    except ValueError:
        # not an integer literal: register the label under the next free code
        str_code_dict[identification] = str_code
        events_id.append(str_code)
        str_code += 1
str_code_dict

{'Kernel_Error': 1000,
 'ICB communication error': 1001,
 'RCB communication error': 1002,
 'iFoil communication error': 1003,
 'Pilot communication error': 1004}

In [49]:
# Persist the encoding (code -> label) under the 'identification encoded' key
# of metrics_events_dict.json, keeping the other keys of that file.
from pathlib import Path
filepath = Path('../data/metrics/metrics_events_dict.json')
inv_str_code_dict = {v: k for k, v in str_code_dict.items()}

with open(file=filepath, mode="r+", encoding='utf-8') as jsonFile:
    data = json.load(jsonFile)
    data['identification encoded'] = inv_str_code_dict
    # Serialize before touching the file so a serialization error cannot
    # leave it half-written.
    serialized = json.dumps(data, indent=4, ensure_ascii=False)
    jsonFile.seek(0)
    jsonFile.write(serialized)
    # Cut off leftovers of the previous content: without this, a new document
    # shorter than the old one leaves trailing bytes and invalid JSON.
    jsonFile.truncate()
    # no explicit close(): the with-statement closes the file

In [43]:
# Substitute each text identification in the frame with its integer code
encoded_identification = df_split['identification'].replace(str_code_dict)
df_split['identification'] = encoded_identification
# Show the distinct identification values after substitution
df_split['identification'].unique()

In [45]:
# Preview the first five rows of the split events frame
df_split.head(5)

Unnamed: 0,source,message,timestamp,criticality,identification,id,varnishLevelsTargetvolume,varnishLevelsTotalvolume,VarnishPrinter_3DVarnishCounter,iFoil_TotalPagesCounter
0,PLC,JV-Ti non prêt : impression impossible,2022-04-15 05:55:23.462000+00:00,INFO,391,4169749,36192.322612,100000,1792992,22881
1,iFoil,JV-Ti non prêt : impression impossible,2022-04-15 06:06:56.278000+00:00,INFO,391,4170152,36192.322612,100000,1792992,22881
2,PLC,En attente,2022-04-15 06:06:56.418000+00:00,INFO,330,4170152,36192.322612,100000,1792992,22881
3,PLC,Disponible,2022-04-15 06:07:28.326000+00:00,INFO,332,4170167,36192.322612,100000,1792992,22881
4,PLC,Chargeur: mode auto non activé,2022-04-15 06:07:37.675000+00:00,WARNING,377,4170172,36192.322612,100000,1792992,22881


In [46]:
# Write the split events to CSV (pandas default: the index is written as the first column)
df_split.to_csv('../data/metrics/metrics_events_split_df.csv')