# 2) Création du dataset merge clean metrics

Ce notebook génère :

- 1 fichier csv "encoded_merge_clean_metrics_dataset.csv"

## Imports

In [240]:
import os, json, ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from pathlib import Path

## 1. Création clean dataset metrics

Preprocessing du dataset pour l'entraînement :

- Centralisation des informations temporelles dans la colonne 'timestamp'

- Transformation des colonnes de type 'object' avec encodage

- Sélection des colonnes (avec suppression)

- Encodage des catégories

- Normalisation des valeurs

In [241]:
# Input: raw merged metrics dataset (CSV) read by this notebook.
source_csv = '../data/metrics/merge_raw_metrics_dataset.csv'

# Existing JSON dictionary of metrics events; the identification encoding is
# added to it further down.
save_json = '../data/metrics/metrics_events_dict.json'

# Output: encoded, cleaned metrics dataset (CSV) written at the end.
save_csv = '../data/metrics/encoded_merge_clean_metrics_dataset.csv'

### a) Import dataset

In [242]:
# Load the raw merged metrics CSV (its first column is consumed as the index
# on read), then discard that index in favour of a fresh 0-based RangeIndex.
# NOTE(review): the recorded output echoes this read_csv line, which looks like
# a pandas warning (possibly mixed column dtypes) — confirm and consider
# passing explicit dtypes.
df = pd.read_csv(Path(source_csv), index_col=0).reset_index(drop=True)
# Summarise columns, dtypes and memory usage.
df.info()

  df = pd.read_csv(Path(source_csv), index_col=0)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3546276 entries, 0 to 3546275
Data columns (total 17 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         int64  
 1   sn                         float64
 2   name                       object 
 3   type                       object 
 4   generation                 object 
 5   name_counter               object 
 6   value                      int64  
 7   name_op                    object 
 8   level                      object 
 9   source                     object 
 10  message                    object 
 11  timestamp                  object 
 12  criticality                object 
 13  identification             object 
 14  created_at                 object 
 15  varnishLevelsTargetvolume  float64
 16  varnishLevelsTotalvolume   int64  
dtypes: float64(2), int64(3), object(12)
memory usage: 460.0+ MB


### b) Sélection des colonnes

In [243]:
# Remove every column whose values are all missing.
df = df.dropna(axis='columns', how='all')

In [244]:
# Drop 'id' and 'message'. Per the original author's note, 'message' is a
# duplicate of 'identification'.
df = df.drop(columns=['id', 'message'])

In [245]:
# Convert the float column 'varnishLevelsTargetvolume' to 64-bit integers.
df['varnishLevelsTargetvolume'] = pd.to_numeric(df['varnishLevelsTargetvolume']).astype('int64')

In [246]:
# Check the remaining columns and their dtypes after the column selection.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3546276 entries, 0 to 3546275
Data columns (total 14 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   name                       object
 1   type                       object
 2   generation                 object
 3   name_counter               object
 4   value                      int64 
 5   name_op                    object
 6   level                      object
 7   source                     object
 8   timestamp                  object
 9   criticality                object
 10  identification             object
 11  created_at                 object
 12  varnishLevelsTargetvolume  int64 
 13  varnishLevelsTotalvolume   int64 
dtypes: int64(3), object(11)
memory usage: 378.8+ MB


### c) Colonne 'timestamp'

In [247]:
# Use 'created_at' wherever 'timestamp' is missing, then parse the result as
# timezone-aware (UTC) datetimes.
df['timestamp'] = pd.to_datetime(df['timestamp'].fillna(df['created_at']), utc=True)
# 'created_at' is now redundant with 'timestamp', so remove it.
df = df.drop(columns=['created_at'])

### d) Colonne 'identification'

In [248]:
# Inspect the distinct raw values of 'identification' before cleaning.
df.identification.unique()

array([nan, '391', '330', '332', '377', '333', '334', '331',
       'Kernel_Error', '315', '417', '406', '407', '352', '344',
       'ICB communication error', '376', '445', '325', '343', '345',
       '358', '453', '381', '354', '313', '447', '454', '387', '386',
       '372', '371', '323', '480', '311', '479', '351', '440', '324',
       '321', '0', '349', 'RCB communication error', '385', '357', '418',
       '446', '355', '389', '476', '356', 'iFoil communication error',
       '460', '472', '405', '380', '388', '408', 445.0, 391.0, 330.0,
       333.0, 408.0, 407.0, 406.0, 332.0, 334.0, 472.0, 331.0, 352.0,
       '320', '329', '350', '475', '466', '416', '411', '346', '471',
       '327', 430.0, '430', '444', '2', '326', '419',
       'Pilot communication error', '359', 313.0, 377.0, 453.0, 376.0,
       344.0, 325.0, 454.0, 315.0, 417.0, '322', 385.0, 371.0, 386.0,
       '384'], dtype=object)

In [249]:
# Missing identifications become 0.
df['identification'] = df['identification'].replace(to_replace=np.nan, value=0)

In [250]:
# Assign an integer code to every 'identification' value that is not numeric.
# Values that int() accepts (e.g. '391' or 391.0) keep their own number; any
# other label gets the next sequential code, starting at 1000.
# NOTE(review): codes start at 1000 on the assumption that no numeric
# identifier reaches that value — confirm against the data.
events_id = []       # integer code of each distinct value, in order of appearance
str_code_dict = {}   # text label -> assigned integer code
str_code = 1000
# The loop variable is deliberately not called `id`: at notebook scope that
# would shadow the builtin id() for every later cell.
for raw_id in df['identification'].unique():
    try:
        events_id.append(int(raw_id))
    except ValueError:
        str_code_dict[raw_id] = str_code
        events_id.append(str_code)
        str_code += 1
str_code_dict

{'Kernel_Error': 1000,
 'ICB communication error': 1001,
 'RCB communication error': 1002,
 'iFoil communication error': 1003,
 'Pilot communication error': 1004}

In [251]:
# Store the code -> label mapping under the 'identification encoded' key of
# the existing metrics events dictionary (JSON file at `save_json`).
inv_str_code_dict = {v: k for k, v in str_code_dict.items()}
with open(file=Path(save_json), mode="r+", encoding='utf-8') as jsonFile:
    data = json.load(jsonFile)
    data['identification encoded'] = inv_str_code_dict
    # Rewrite the file from the start...
    jsonFile.seek(0)
    json.dump(data, jsonFile, indent=4, ensure_ascii=False)
    # ...and cut off whatever remains of the previous content: without this,
    # a shorter new document would be followed by stale bytes and the file
    # would no longer be valid JSON.
    jsonFile.truncate()
    # No explicit close(): the `with` block closes the file.

In [252]:
# Swap each text label for its integer code, then cast the whole column
# (numeric strings and floats included) to int64.
df['identification'] = pd.to_numeric(
    df['identification'].replace(str_code_dict)
).astype('int64')

In [253]:
# Check the distinct values of 'identification' after encoding.
df.identification.unique()

array([   0,  391,  330,  332,  377,  333,  334,  331, 1000,  315,  417,
        406,  407,  352,  344, 1001,  376,  445,  325,  343,  345,  358,
        453,  381,  354,  313,  447,  454,  387,  386,  372,  371,  323,
        480,  311,  479,  351,  440,  324,  321,  349, 1002,  385,  357,
        418,  446,  355,  389,  476,  356, 1003,  460,  472,  405,  380,
        388,  408,  320,  329,  350,  475,  466,  416,  411,  346,  471,
        327,  430,  444,    2,  326,  419, 1004,  359,  322,  384],
      dtype=int64)

### e) Encodage des labels 'criticality'

In [254]:
# Encode the 'criticality' labels as integers; missing values map to 0.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.0.
criticality = {np.nan: 0, 'INFO': 1, 'WARNING': 2, 'ERROR': 3}
# Assign the result back instead of calling replace(..., inplace=True) on the
# column: that chained in-place form does not update `df` when pandas
# Copy-on-Write is active.
df['criticality'] = df['criticality'].replace(criticality)
# Any label missing from the mapping is left as text, so to_numeric raises
# here rather than letting it through silently.
df['criticality'] = pd.to_numeric(df['criticality']).astype('int64')

### f) Sélection des colonnes de type 'object'

In [255]:
# Names of the columns that still have the 'object' dtype.
object_columns = df.select_dtypes(include='object').columns.tolist()

In [256]:
# Print the distinct values of each remaining 'object' column.
for col in object_columns :
    print(col, df[col].unique())

name ['Print Engine 1' 'iFoil L']
type ['Varnish Printer' 'iFoil']
generation [nan 'Gen. 2']
name_counter ['3D Varnish Counter' 'Total Pages Counter' 'Foiled Pages Counter']
name_op ['Viktor' 'JAN' 'Micka' 'Distributor']
level ['Operator' 'Distributor']
source [nan 'PLC' 'iFoil' 'Kernel' 'ICB n°5' 'RCB n°1' 'RCB n°2' 'RCB n°3'
 'ICB n°4' 'ICB n°7' 'ICB n°8' 'ICB n°2' 'ICB n°1' 'ICB n°6' 'Pilot']


In [257]:
# 'name' and 'generation' are judged irrelevant: remove them from the frame
# and from the list of columns still to be encoded.
# This cell is not re-runnable: a second run raises because the columns and
# list entries are already gone.
df = df.drop(columns=['name', 'generation'])
object_columns.remove('name')
object_columns.remove('generation')

#### Encodage des variables

In [258]:
# One-hot encode the remaining 'object' columns and append the indicator
# columns to the frame, then remove the original text columns.
# The dtype is pinned to 'uint8' (0/1): since pandas 2.0 the default is bool,
# which would write True/False to the CSV and cannot be divided in the
# normalisation step. On older pandas 'uint8' is already the default.
encoded_df = pd.concat(
    objs=[df, pd.get_dummies(df[object_columns], dtype='uint8')],
    axis=1,
)
encoded_df = encoded_df.drop(labels=object_columns, axis=1)

In [259]:
# Summarise columns and dtypes.
# NOTE(review): this inspects `df`, which still holds the un-encoded 'object'
# columns; `encoded_df.info()` may have been intended — confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3546276 entries, 0 to 3546275
Data columns (total 11 columns):
 #   Column                     Dtype              
---  ------                     -----              
 0   type                       object             
 1   name_counter               object             
 2   value                      int64              
 3   name_op                    object             
 4   level                      object             
 5   source                     object             
 6   timestamp                  datetime64[ns, UTC]
 7   criticality                int64              
 8   identification             int64              
 9   varnishLevelsTargetvolume  int64              
 10  varnishLevelsTotalvolume   int64              
dtypes: datetime64[ns, UTC](1), int64(5), object(5)
memory usage: 297.6+ MB


### g) Normalisation

In [262]:
# Build a max-abs scaled copy of the encoded dataset; `encoded_df` itself is
# left untouched. The 'timestamp' column is removed from the copy and
# 'criticality' is kept unscaled.
df_max_scaled = encoded_df.copy(deep=True)
del df_max_scaled['timestamp']
# Divide every other column by its largest absolute value.
for column in df_max_scaled.columns.drop('criticality'):
    # Cast first: boolean indicator columns do not support '/'.
    values = df_max_scaled[column].astype('float64')
    max_abs = values.abs().max()
    # Skip the division when the maximum is 0 (an all-zero column would turn
    # into NaN through 0/0) or NaN (nothing to scale).
    if max_abs > 0:
        values = values / max_abs
    df_max_scaled[column] = values
# view normalized data
display(df_max_scaled)

Unnamed: 0,value,criticality,identification,varnishLevelsTargetvolume,varnishLevelsTotalvolume,type_Varnish Printer,type_iFoil,name_counter_3D Varnish Counter,name_counter_Foiled Pages Counter,name_counter_Total Pages Counter,...,source_ICB n°6,source_ICB n°7,source_ICB n°8,source_Kernel,source_PLC,source_Pilot,source_RCB n°1,source_RCB n°2,source_RCB n°3,source_iFoil
0,0.703951,0,0.000000,0.36192,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.008983,0,0.000000,0.36192,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.012207,0,0.000000,0.36192,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.703951,1,0.389442,0.36192,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.008983,1,0.389442,0.36192,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3546271,0.021746,0,0.000000,0.88999,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3546272,0.030009,0,0.000000,0.88999,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3546273,1.000000,0,0.000000,0.88999,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3546274,0.021746,0,0.000000,0.88999,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Output

In [261]:
# Write the encoded (not normalised) dataset to the output CSV.
encoded_df.to_csv(Path(save_csv))