# 02 - Création d'un dataset des données nettoyées de metrics (après fractionnement)

Ce notebook génère :

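- 2 fichiers csv : "clean_merge_metrics_dataset.csv" (avant encodage de la criticité) et "encoded_clean_merge_metrics_dataset.csv" (après encodage de la criticité)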

Etapes de nettoyage :

- Suppression des colonnes n'ayant que des valeurs nulles

- Suppression des colonnes avec informations redondantes (identification=message_events) ou inutiles (id de message)

- Conversion du type des colonnes selon le type des valeurs qu'elles contiennent

- Remplacement des valeurs nulles

- Encodage des codes d'identification en chaîne de caractères (mise à jour du metrics_events_dict.json) et de la criticité

# A. Imports

## Librairies

In [1]:
import os, json, ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from pathlib import Path

## Données

In [2]:
# --- input ---
# raw merged metrics dataset read by this notebook
source_csv = '../data/metrics/raw_merge_metrics_dataset.csv'

# --- outputs ---
# JSON dictionary of metrics events, updated with the encodings built below
save_json = '../data/metrics/metrics_events_dict.json'
# cleaned merged dataset, written before the criticality encoding
save_csv = '../data/metrics/clean_merge_metrics_dataset.csv'
# cleaned merged dataset, written after the criticality encoding
encoded_save_csv = '../data/metrics/encoded_clean_merge_metrics_dataset.csv'

# B. Dataframe

## a) Import dataset

In [3]:
# Build the working dataframe from the raw csv; the first csv column is read
# as the index and immediately discarded in favour of a fresh 0-based index.
df = pd.read_csv(Path(source_csv), index_col=0).reset_index(drop=True)
# overview of the columns and dtypes as loaded
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2182421 entries, 0 to 2182420
Data columns (total 18 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         int64  
 1   source_events              object 
 2   message_events             object 
 3   timestamp_events           object 
 4   criticality_events         object 
 5   identification_events      object 
 6   sn_modules                 float64
 7   name_modules               object 
 8   type_modules               object 
 9   generation_modules         object 
 10  name_counters_modules      object 
 11  value_counters_modules     int64  
 12  name_connected_operators   object 
 13  level_connected_operators  object 
 14  status                     object 
 15  created_at                 object 
 16  varnishLevelsTargetvolume  float64
 17  varnishLevelsTotalvolume   int64  
dtypes: float64(2), int64(3), object(13)
memory usage: 299.7+ MB


## b) Sélection des colonnes

In [4]:
# drop every column that holds nothing but missing values
df = df.dropna(axis='columns', how='all')

In [5]:
# drop the message id and 'message_events', which duplicates
# 'identification_events'
df = df.drop(columns=['id', 'message_events'])

In [6]:
# cast the float column to 64-bit integers
df['varnishLevelsTargetvolume'] = (
    pd.to_numeric(df['varnishLevelsTargetvolume']).astype('int64')
)

In [7]:
# show the remaining columns and their dtypes after the selection and cast above
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2182421 entries, 0 to 2182420
Data columns (total 15 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   source_events              object
 1   timestamp_events           object
 2   criticality_events         object
 3   identification_events      object
 4   name_modules               object
 5   type_modules               object
 6   generation_modules         object
 7   name_counters_modules      object
 8   value_counters_modules     int64 
 9   name_connected_operators   object
 10  level_connected_operators  object
 11  status                     object
 12  created_at                 object
 13  varnishLevelsTargetvolume  int64 
 14  varnishLevelsTotalvolume   int64 
dtypes: int64(3), object(12)
memory usage: 249.8+ MB


### 1) Colonne 'timestamp'

In [8]:
# 'timestamp_events' becomes 'timestamp'
df = df.rename(columns={'timestamp_events': 'timestamp'})
# missing timestamps fall back to 'created_at'; the result is parsed as
# timezone-aware (UTC) datetimes
df['timestamp'] = pd.to_datetime(
    df['timestamp'].fillna(df['created_at']),
    utc=True,
)
# 'created_at' is treated as a duplicate of 'timestamp' and removed
df = df.drop(columns=['created_at'])

### 2) Colonne 'identification'

In [9]:
# distinct raw identification values (before cleaning)
df['identification_events'].unique()

array(['358', '391', nan, '330', '332', '331', '454', '333', '352', '334',
       'Kernel_Error', '386', '407', '381', '356',
       'iFoil communication error', '479', '329', '344', '440', '406',
       '388', '313', '357', '355', '430', '373', '447', '383', '359',
       'ICB communication error', '345', '444', '324', '445', '405',
       '371', '354', '343', '311', '465', '466', '374', '321', '327',
       '322', '382', '350', '325', '451', 'RCB communication error',
       '349', '372', '392', '320', '417', '351', '476', '387', '389', '0',
       '328', '408', '452', '446', '418'], dtype=object)

In [10]:
# missing identification values become 0
df['identification_events'] = df['identification_events'].replace(
    to_replace=np.nan, value=0
)

In [11]:
# Assign an integer code to every identification value that is not already
# an integer-like value (e.g. a text label).
events_id = []      # integer form of each distinct identification value
str_code_dict = {}  # text label -> integer code assigned to it
# first code handed out; the numeric identifiers displayed above are all
# below 1000 — presumably chosen to avoid collisions, verify on new data
str_code = 1000
# the loop variable is deliberately not called `id`: at notebook top level
# that name would shadow the builtin id() for every later cell
for raw_id in df['identification_events'].unique():
    try:
        events_id.append(int(raw_id))
    except ValueError:
        # not parseable as an integer: give it the next free code
        str_code_dict[raw_id] = str_code
        events_id.append(str_code)
        str_code += 1
str_code_dict

{'Kernel_Error': 1000,
 'iFoil communication error': 1001,
 'ICB communication error': 1002,
 'RCB communication error': 1003}

In [12]:
# on sauvegarde l'encodage dans metrics_events_dict
inv_str_code_dict = {v: k for k, v in str_code_dict.items()}
with open(file=Path(save_json), mode="r+", encoding='utf-8') as jsonFile:
    data = json.load(jsonFile)
    data['identification encoded'] = inv_str_code_dict
    jsonFile.seek(0)
    json.dump(data, jsonFile, indent=4, ensure_ascii=False)
    jsonFile.close()

In [13]:
# on remplace dans le dataframe les valeurs du type 'str' avec un code
df.identification_events = df.identification_events.replace(str_code_dict)
# on converti toutes les valeurs en entier
df.identification_events = pd.to_numeric(df.identification_events).astype('int64')

In [14]:
# distinct identification values after encoding
df['identification_events'].unique()

array([ 358,  391,    0,  330,  332,  331,  454,  333,  352,  334, 1000,
        386,  407,  381,  356, 1001,  479,  329,  344,  440,  406,  388,
        313,  357,  355,  430,  373,  447,  383,  359, 1002,  345,  444,
        324,  445,  405,  371,  354,  343,  311,  465,  466,  374,  321,
        327,  322,  382,  350,  325,  451, 1003,  349,  372,  392,  320,
        417,  351,  476,  387,  389,  328,  408,  452,  446,  418],
      dtype=int64)

### 3) Encodage des labels 'criticality'

In [15]:
# independent copy that receives the criticality encoding; `df` keeps the labels
encoded_df = df.copy(deep=True)

In [16]:
# Encode the criticality labels as integers in the encoded dataframe.
# label -> code; 'UNDEFINED' stands in for missing values
criticality = {'UNDEFINED': 0, 'INFO': 1, 'WARNING': 2, 'ERROR': 3}
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on the column accessor: that form acts on an
# intermediate Series and, with pandas Copy-on-Write, leaves `encoded_df`
# unchanged, so the integer cast below would fail on the text labels.
encoded_df['criticality_events'] = (
    encoded_df['criticality_events']
    .fillna("UNDEFINED")
    .replace(criticality)
)
encoded_df['criticality_events'] = (
    pd.to_numeric(encoded_df['criticality_events']).astype('int64')
)

In [17]:
# Save the criticality encoding (code -> label) into metrics_events_dict.
inv_criticality = {v: k for k, v in criticality.items()}
with open(file=Path(save_json), mode="r+", encoding='utf-8') as jsonFile:
    data = json.load(jsonFile)
    # json.dump writes the integer codes as string keys
    data['criticality encoded'] = inv_criticality
    # rewrite the file from the beginning ...
    jsonFile.seek(0)
    json.dump(data, jsonFile, indent=4, ensure_ascii=False)
    # ... and cut off whatever remains of the previous content: without this,
    # a new document shorter than the old one leaves trailing bytes behind
    # and the file is no longer valid JSON
    jsonFile.truncate()

## c) Output csv

In [18]:
# write the cleaned dataframe (criticality still as text labels)
df.to_csv(Path(save_csv))

In [19]:
# preview the first three rows of the cleaned dataframe
df.head(n=3)

Unnamed: 0,source_events,timestamp,criticality_events,identification_events,name_modules,type_modules,generation_modules,name_counters_modules,value_counters_modules,name_connected_operators,level_connected_operators,status,varnishLevelsTargetvolume,varnishLevelsTotalvolume
0,iFoil,2022-11-02 08:43:46.921000+00:00,INFO,358,iFoil L,iFoil,Gen. 2,Total Pages Counter,25411,User,Operator,ERR,12766,18000
1,iFoil,2022-11-02 08:43:46.921000+00:00,INFO,358,iFoil L,iFoil,Gen. 2,Foiled Pages Counter,670871,User,Operator,ERR,12766,18000
2,PLC,2022-11-02 08:43:46.845000+00:00,ERROR,358,Print Engine 1,Varnish Printer,,3D Varnish Counter,3359237,User,Operator,ERR,12766,18000


In [20]:
# write the dataframe with criticality encoded as integers
encoded_df.to_csv(Path(encoded_save_csv))