# 02 - Creation de merge clean metrics dataset 

Ce notebook génère :

- 1 fichier csv "merge_clean_metrics_dataset.csv"

# A. Imports

## Librairies

In [1]:
import os, json, ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from pathlib import Path

## Données

In [2]:
# source path to raw metrics dataset
source_csv = '../data/metrics/raw_merge_metrics_dataset.csv'
# target path to save metrics dictionnaire
save_json ='../data/metrics/metrics_events_dict.json'
# target path to save merge raw metrics dataset
save_csv = '../data/metrics/clean_merge_metrics_dataset.csv'

# B. Dataframe

## a) Import dataset

In [3]:
# création d'un dataframe à partir du csv de données
df = pd.read_csv(Path(source_csv), index_col=0)
# réindexation à 0
df.reset_index(level=None, drop=True, inplace=True, col_level=0, col_fill='')
df.info()

  df = pd.read_csv(Path(source_csv), index_col=0)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3510431 entries, 0 to 3510430
Data columns (total 17 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   id                         int64  
 1   source_events              object 
 2   message_events             object 
 3   timestamp_events           object 
 4   criticality_events         object 
 5   identification_events      object 
 6   sn_modules                 float64
 7   name_modules               object 
 8   type_modules               object 
 9   generation_modules         object 
 10  name_counters_modules      object 
 11  value_counters_modules     int64  
 12  name_connected_operators   object 
 13  level_connected_operators  object 
 14  created_at                 object 
 15  varnishLevelsTargetvolume  float64
 16  varnishLevelsTotalvolume   int64  
dtypes: float64(2), int64(3), object(12)
memory usage: 455.3+ MB


## b) Selection des colonnes

In [4]:
# suppression des colonnes ne contenant que des valeurs nulles
df = df.dropna(axis=1, how='all')

In [5]:
# on supprime les colonnes doublons (message=identification)
df = df.drop(['id', 'message_events'], axis=1)

In [6]:
# on converti les float en entier 64
df.varnishLevelsTargetvolume = pd.to_numeric(df.varnishLevelsTargetvolume).astype('int64')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3510431 entries, 0 to 3510430
Data columns (total 14 columns):
 #   Column                     Dtype 
---  ------                     ----- 
 0   source_events              object
 1   timestamp_events           object
 2   criticality_events         object
 3   identification_events      object
 4   name_modules               object
 5   type_modules               object
 6   generation_modules         object
 7   name_counters_modules      object
 8   value_counters_modules     int64 
 9   name_connected_operators   object
 10  level_connected_operators  object
 11  created_at                 object
 12  varnishLevelsTargetvolume  int64 
 13  varnishLevelsTotalvolume   int64 
dtypes: int64(3), object(11)
memory usage: 375.0+ MB


### 1) Colonne 'timestamp'

In [8]:
# on renomme la colonne timestamp_events
df = df.rename(columns={'timestamp_events':'timestamp'})
# on remplace des valeurs maquantes de timestamp par celle de created_at
df.timestamp = df.timestamp.fillna(df['created_at'])
# on converti les valeur en datetim
df.timestamp = pd.to_datetime(df.timestamp, utc=True)
# on supprime la colonne doublon (created_at=tiemstamp)
df = df.drop(['created_at'], axis=1)

### 2) Colonne 'identification'

In [9]:
df.identification_events.unique()

array(['391', '330', '377', nan, '407', '332', '331', '313', '333', '376',
       '454', '323', '480', '386', '479', '372', 'Kernel_Error', '344',
       '343', '445', '352', '324', '321', '385', '354', '355', '440',
       '371', '358', '357', '334', '381', '311', '453',
       'iFoil communication error', '472', '356', '405', '387', '447',
       '0', 'RCB communication error', '380', '388', '460',
       'ICB communication error', '411', '408', '471', '325', '320',
       '430', '406', '417', '444', '329', '446',
       'Pilot communication error', '359', '315', '384', '322', '345',
       '351', '349', '418', '389', '476', '350', '475', '466', '416',
       '346', 445.0, 391.0, '327', 430.0, '2', '326', '419', 333.0, 332.0,
       330.0, 334.0, 331.0, 377.0, 315.0, 406.0, 407.0, 376.0, 325.0,
       454.0, 313.0, 352.0, 344.0, 385.0, 371.0, 386.0], dtype=object)

In [10]:
# on remplace les valeurs nulles par des 0
df.identification_events = df.identification_events.replace(np.nan, 0)

In [11]:
# on encode les valeurs du type 'str' avec un code
events_id = []
str_code_dict = {}
str_code = 1000
for id in list(df['identification_events'].unique()) :
    try:
        events_id.append(int(id))
    except ValueError:
        str_code_dict[id] = str_code
        events_id.append(str_code)
        str_code += 1
str_code_dict

{'Kernel_Error': 1000,
 'iFoil communication error': 1001,
 'RCB communication error': 1002,
 'ICB communication error': 1003,
 'Pilot communication error': 1004}

In [12]:
# on sauvegarde l'encodage dans metrics_events_dict
inv_str_code_dict = {v: k for k, v in str_code_dict.items()}
with open(file=Path(save_json), mode="r+", encoding='utf-8') as jsonFile:
    data = json.load(jsonFile)
    data['identification encoded'] = inv_str_code_dict
    jsonFile.seek(0)
    json.dump(data, jsonFile, indent=4, ensure_ascii=False)
    jsonFile.close()

In [13]:
# on remplace dans le dataframe les valeurs du type 'str' avec un code
df.identification_events = df.identification_events.replace(str_code_dict)
# on converti toutes les valeurs en entier
df.identification_events = pd.to_numeric(df.identification_events).astype('int64')

In [14]:
df.identification_events.unique()

array([ 391,  330,  377,    0,  407,  332,  331,  313,  333,  376,  454,
        323,  480,  386,  479,  372, 1000,  344,  343,  445,  352,  324,
        321,  385,  354,  355,  440,  371,  358,  357,  334,  381,  311,
        453, 1001,  472,  356,  405,  387,  447, 1002,  380,  388,  460,
       1003,  411,  408,  471,  325,  320,  430,  406,  417,  444,  329,
        446, 1004,  359,  315,  384,  322,  345,  351,  349,  418,  389,
        476,  350,  475,  466,  416,  346,  327,    2,  326,  419],
      dtype=int64)

### 3) Encodage des labels 'criticality'

In [15]:
# on remplace dans le dataframe les valeurs du type 'str' avec un code
df.criticality_events = df.criticality_events.fillna("UNDEFINED")
criticality = {'UNDEFINED': 0, 'INFO': 1, 'WARNING': 2, 'ERROR':3}
df.criticality_events.replace(criticality, inplace=True)
df.criticality_events = pd.to_numeric(df.criticality_events).astype('int64')

In [16]:
# on sauvegarde l'encodage dans metrics_events_dict
inv_criticality = {v: k for k, v in criticality.items()}
with open(file=Path(save_json), mode="r+", encoding='utf-8') as jsonFile:
    data = json.load(jsonFile)
    data['criticality encoded'] = inv_criticality
    jsonFile.seek(0)
    json.dump(data, jsonFile, indent=4, ensure_ascii=False)
    jsonFile.close()

## c) Output csv

In [17]:
# sauvegarde du dataframe avant encodage
df.to_csv(path_or_buf=Path(save_csv))