# 04 - Création d'un dataset des données nettoyées de job_events (après fractionnement)

Ce notebook génère 1 csv :

- clean_merge_job_events_dataset.csv

Etapes :

- Suppression des lignes avec les valeurs timestamp_start manquantes

- Suppression des colonnes non pertinentes (`path`, `image`)

- Remplacement des données manquantes

- Conversion des types des séries en fonction des types des valeurs

# A. Imports

## a) Librairies

In [1]:
import os, math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

## b) Données

In [2]:
# Input: the raw merged job-events CSV, located by directory + file name.
path = '../data/jobs'
filename = 'raw_merge_job_events_dataset.csv'
job_events = os.path.join(path, filename)

# Output: where the cleaned job-events dataset is written at the end of the notebook.
save_csv = '../data/jobs/clean_merge_job_events_dataset.csv'

# B. Dataframe

## a) Création

In [3]:
# Build the dataframe from the raw job-events CSV; the first CSV column is used as the index.
# low_memory=False makes pandas infer each column's dtype from the whole file in one pass
# instead of chunk by chunk, which avoids inconsistent mixed-type columns and the
# warning that read_csv emits for them.
df = pd.read_csv(job_events, index_col=0, low_memory=False)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47852 entries, 0 to 47851
Data columns (total 46 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   jobId                                                  47852 non-null  int64  
 1   timestamp_start                                        47837 non-null  object 
 2   totalCopies_start                                      47837 non-null  float64
 3   jsonVersion_x                                          34053 non-null  float64
 4   LED_iper                                               47837 non-null  float64
 5   bars_iper                                              47837 non-null  object 
 6   drops_iper                                             47837 non-null  float64
 7   dithering_iper                                         47837 non-null  object 
 8   deadPixelsOffset_iper                         

  df = pd.read_csv(job_events, index_col=0)


In [4]:
# Re-index: discard the index read from the CSV and replace it with a fresh 0..n-1 range.
df = df.reset_index(drop=True)
df.head(5)

Unnamed: 0,jobId,timestamp_start,totalCopies_start,jsonVersion_x,LED_iper,bars_iper,drops_iper,dithering_iper,deadPixelsOffset_iper,level_user,...,greenScore_fullScannerMode_remoteScannerRegistration,enable_specialSubstrate_remoteScannerRegistration,mode_remoteScannerRegistration,jobState,timestamp_end,totalCopies_end,consumption_operatorSideTanks_varnishConsumption,path,image,jsonVersion_y
0,1645522984,2022-02-22T09:43:04.4879318Z,40.0,,50.0,"[1, 2]",4.0,False,1.0,Operator,...,20.0,False,3.0,SUCCESS,2022-02-22T09:46:07.9463151Z,40.0,4.414782,D:/IMAGES/Standard/1502734#1/0000001.tif,/9j/4AAQSkZJRgABAQEASABIAAD/4gxYSUNDX1BST0ZJTE...,
1,1645523240,2022-02-22T09:47:20.6730834Z,29.0,,50.0,"[1, 2]",4.0,False,1.0,Operator,...,20.0,False,3.0,SUCCESS,2022-02-22T09:48:57.4742108Z,18.0,3.004043,D:/IMAGES/Standard/1496447#1/0000001.tif,/9j/4AAQSkZJRgABAQEASABIAAD/4gxYSUNDX1BST0ZJTE...,
2,1645523414,2022-02-22T09:50:14.5555941Z,15.0,,50.0,"[1, 2]",4.0,False,1.0,Operator,...,20.0,False,3.0,SUCCESS,2022-02-22T09:51:50.5518728Z,15.0,2.503964,D:/IMAGES/Standard/1496447#1/0000001.tif,/9j/4AAQSkZJRgABAQEASABIAAD/4gxYSUNDX1BST0ZJTE...,
3,1645523573,2022-02-22T09:52:53.4501528Z,44.0,,50.0,"[1, 2]",4.0,False,1.0,Operator,...,20.0,False,3.0,SUCCESS,2022-02-22T09:54:26.4285018Z,17.0,1.564754,D:/IMAGES/Standard/1498393#1/0000001.tif,/9j/4AAQSkZJRgABAQEASABIAAD/4gxYSUNDX1BST0ZJTE...,
4,1645523780,2022-02-22T09:56:21.1203237Z,31.0,,50.0,"[1, 2]",4.0,False,1.0,Operator,...,20.0,False,3.0,SUCCESS,2022-02-22T09:58:44.8190330Z,31.0,2.967451,D:/IMAGES/Standard/1498393#1/0000001.tif,/9j/4AAQSkZJRgABAQEASABIAAD/4gxYSUNDX1BST0ZJTE...,


## b) Nettoyage

### 1. Suppression de lignes

In [5]:
# Collect the index labels of the rows that have no start timestamp.
missing_start_mask = df['timestamp_start'].isna()
no_start_row = df.index[missing_start_mask].tolist()
no_start_row

[47837,
 47838,
 47839,
 47840,
 47841,
 47842,
 47843,
 47844,
 47845,
 47846,
 47847,
 47848,
 47849,
 47850,
 47851]

In [6]:
# Remove the rows that have no start timestamp.
# dropna(subset=...) selects those rows by condition rather than by previously collected
# labels, so re-running this cell is a no-op instead of raising KeyError on labels that
# were already removed.
df = df.dropna(subset=['timestamp_start'])

### 2. Suppression de colonnes

In [7]:
# list the column labels of the dataframe
df.columns

Index(['jobId', 'timestamp_start', 'totalCopies_start', 'jsonVersion_x',
       'LED_iper', 'bars_iper', 'drops_iper', 'dithering_iper',
       'deadPixelsOffset_iper', 'level_user', 'operator_user', 'speed_ifoil',
       'enabled_ifoil', 'optifoil_ifoil', 'stampAreas_ifoil',
       'heater1Enabled_ifoil', 'speedTensionIn_ifoil', 'speedTensionOut_ifoil',
       'heater1Temperature_ifoil', 'x_imageLayout_layout',
       'y_imageLayout_layout', 'name_paperFormat_layout',
       'width_paperFormat_layout', 'height_paperFormat_layout', 'speed_layout',
       'power_irDryers', 'power_uvDryers',
       'redScore_gridMode_remoteScannerRegistration',
       'redScore_cropmarksMode_remoteScannerRegistration',
       'x_cropmark1_cropmarksMode_remoteScannerRegistration',
       'y_cropmark1_cropmarksMode_remoteScannerRegistration',
       'x_cropmark2_cropmarksMode_remoteScannerRegistration',
       'y_cropmark2_cropmarksMode_remoteScannerRegistration',
       'exposureTime_manualLighting_remote

In [8]:
# list the distinct dtypes found among the columns
df.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [9]:
# Drop the columns considered irrelevant for the cleaned dataset.
col_to_drop = ['path', 'image']
df = df.drop(columns=col_to_drop)

### 3. Données manquantes

In [10]:
# List the columns that contain at least one null value.
# For each of them, print the distinct values when there are at most 10,
# otherwise print the number of distinct values and the dtype.
col_with_nan = []
for col in df.columns:
    if not df[col].isnull().any():
        continue
    col_with_nan.append(col)
    if df[col].nunique() <= 10:
        print(f'{col} : {df[col].unique()}')
    else:
        print(f'{col} : {df[col].nunique()} - dtype : {df[col].dtype}')

jsonVersion_x : [nan  2.]
name_paperFormat_layout : [nan 'calque' 'B2']
jobState : ['SUCCESS' 'ERROR' 'CANCELED' nan]
timestamp_end : 47791 - dtype : object
totalCopies_end : 219 - dtype : float64
consumption_operatorSideTanks_varnishConsumption : 45846 - dtype : float64
jsonVersion_y : [nan  2.]


In [11]:
# Missing paper-format names are replaced by the placeholder 'UNDEFINED'.
df = df.fillna({'name_paperFormat_layout': 'UNDEFINED'})

In [12]:
# Missing job states are replaced by the placeholder 'UNDEFINED'.
df = df.fillna({'jobState': 'UNDEFINED'})

In [13]:
# Where the end timestamp is missing, reuse the start timestamp of the same row;
# existing end timestamps are kept unchanged.
has_end = df['timestamp_end'].notna()
df['timestamp_end'] = df['timestamp_end'].where(has_end, df['timestamp_start'])

In [14]:
# Null 'totalCopies_end' values are replaced by 0.
df.loc[df['totalCopies_end'].isna(), 'totalCopies_end'] = 0.0

In [15]:
# Null varnish-consumption values are replaced by 0.
varnish_col = 'consumption_operatorSideTanks_varnishConsumption'
df.loc[df[varnish_col].isna(), varnish_col] = 0.0

### 4. Conversion de type

In [16]:
# Convert float columns whose values are all whole numbers to 'int64'.
# The check is vectorised instead of calling math.modf on every distinct value.
# Columns holding NaN or +/-inf are left as float: they cannot be represented as
# int64, and astype would raise on non-finite values.
for col in df.select_dtypes(include=[np.floating]).columns:
    series = df[col]
    if np.isfinite(series).all() and (series % 1 == 0).all():
        df[col] = series.astype('int64')

In [17]:
# For every object-dtype column, print its distinct values when there are at most 5,
# otherwise print only how many distinct values it has.
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    if df[col].nunique() <= 5:
        print(f"{col} : {df[col].unique()}")
    else:
        print(f"{col} : {df[col].nunique()} values")

timestamp_start : 47837 values
bars_iper : ['[1, 2]' '[1]' '[2]']
dithering_iper : [False True]
level_user : ['Operator' 'Distributor']
operator_user : ['User' 'Distributor']
enabled_ifoil : [True False]
optifoil_ifoil : [False True]
stampAreas_ifoil : 11 values
heater1Enabled_ifoil : [True False]
name_paperFormat_layout : ['UNDEFINED' 'calque' 'B2']
enable_specialSubstrate_remoteScannerRegistration : [False True]
jobState : ['SUCCESS' 'ERROR' 'CANCELED' 'UNDEFINED']
timestamp_end : 47837 values


In [18]:
# Parse both timestamp columns into timezone-aware (UTC) datetimes.
df = df.assign(
    timestamp_start=pd.to_datetime(df['timestamp_start'], utc=True),
    timestamp_end=pd.to_datetime(df['timestamp_end'], utc=True),
)

In [19]:
# Convert object-dtype columns to 'bool' when they only hold boolean values.
# Every distinct value must be an actual boolean: a column such as [True, nan] or
# [True, 'x'] is left untouched, because astype('bool') would silently turn the
# non-boolean value into True.
for col in df.select_dtypes(include=['object']).columns:
    values = df[col].unique()
    if len(values) > 0 and all(isinstance(value, (bool, np.bool_)) for value in values):
        df[col] = df[col].astype('bool')

In [20]:
# For every remaining object-dtype column, print its distinct values when there are
# at most 5, otherwise print only how many distinct values it has.
remaining_object_columns = df.select_dtypes(include=['object']).columns
for col in remaining_object_columns:
    if df[col].nunique() <= 5:
        print(f"{col} : {df[col].unique()}")
    else:
        print(f"{col} : {df[col].nunique()} values")

bars_iper : ['[1, 2]' '[1]' '[2]']
level_user : ['Operator' 'Distributor']
operator_user : ['User' 'Distributor']
stampAreas_ifoil : 11 values
name_paperFormat_layout : ['UNDEFINED' 'calque' 'B2']
jobState : ['SUCCESS' 'ERROR' 'CANCELED' 'UNDEFINED']


### 5. Output csv

In [21]:
# display the column labels of the dataframe just before it is written to CSV
df.columns

Index(['jobId', 'timestamp_start', 'totalCopies_start', 'jsonVersion_x',
       'LED_iper', 'bars_iper', 'drops_iper', 'dithering_iper',
       'deadPixelsOffset_iper', 'level_user', 'operator_user', 'speed_ifoil',
       'enabled_ifoil', 'optifoil_ifoil', 'stampAreas_ifoil',
       'heater1Enabled_ifoil', 'speedTensionIn_ifoil', 'speedTensionOut_ifoil',
       'heater1Temperature_ifoil', 'x_imageLayout_layout',
       'y_imageLayout_layout', 'name_paperFormat_layout',
       'width_paperFormat_layout', 'height_paperFormat_layout', 'speed_layout',
       'power_irDryers', 'power_uvDryers',
       'redScore_gridMode_remoteScannerRegistration',
       'redScore_cropmarksMode_remoteScannerRegistration',
       'x_cropmark1_cropmarksMode_remoteScannerRegistration',
       'y_cropmark1_cropmarksMode_remoteScannerRegistration',
       'x_cropmark2_cropmarksMode_remoteScannerRegistration',
       'y_cropmark2_cropmarksMode_remoteScannerRegistration',
       'exposureTime_manualLighting_remote

In [22]:
# Write the cleaned dataframe (index included) to the target CSV path.
output_path = Path(save_csv)
df.to_csv(output_path)