# Notebook d'exploration des données jobs_events

Ce notebook génère 1 csv :

- clean_merge_job_events_dataset.csv

Etapes :

- Suppression des lignes avec les valeurs timestamp_start manquantes

- Suppression des colonnes non pertinentes (`path`, `image`)

- Remplacement des données manquantes

- Conversion des types des séries en fonction des types des valeurs

# A. Imports

## a) Librairies

In [1]:
import os, math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

## b) Données

In [2]:
# Input: raw merged job-events CSV, located in the jobs data directory.
path = '../data/jobs'
filename = 'raw_merge_job_events_dataset.csv'
job_events = os.path.join(path, filename)

# Output: destination of the cleaned job-events CSV written by the last cell.
save_csv = '../data/jobs/clean_merge_job_events_dataset.csv'

# B. Dataframe

## a) Création

In [3]:
# Build the DataFrame from the raw CSV, using the first CSV column as the index.
df = pd.read_csv(filepath_or_buffer=job_events, index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16297 entries, 0 to 16296
Data columns (total 39 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   jobId                                                 16297 non-null  int64  
 1   timestamp_start                                       16295 non-null  object 
 2   totalCopies_start                                     16295 non-null  float64
 3   LED_iper                                              16295 non-null  float64
 4   bars_iper                                             16295 non-null  object 
 5   drops_iper                                            16295 non-null  float64
 6   dithering_iper                                        16295 non-null  object 
 7   deadPixelsOffset_iper                                 16295 non-null  float64
 8   level_user                                            16

In [4]:
# Re-index: discard the index loaded from the CSV and number the rows from 0.
df = df.reset_index(drop=True)
df.head(5)

Unnamed: 0,jobId,timestamp_start,totalCopies_start,LED_iper,bars_iper,drops_iper,dithering_iper,deadPixelsOffset_iper,level_user,operator_user,...,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,mode_remoteScannerRegistration,jobState,timestamp_end,totalCopies_end,consumption_operatorSideTanks_varnishConsumption,path,image
0,1624008166,2021-06-18T09:22:46.8661892Z,3.0,10.0,"[1, 2]",3.0,False,0.0,Distributor,Distributor,...,1500.0,16.0,16.0,1.0,,,,,D:/IMAGES/Standard/3040459-52x74-350 CM-18-06-...,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAgGBgcGBQgHBw...
1,1624010893,2021-06-18T10:08:13.7169739Z,100.0,30.0,"[1, 2]",4.0,False,0.0,Operator,Micka,...,1500.0,16.0,10.0,1.0,ERROR,2021-06-18T10:10:22.2570778Z,3.0,1.440239,D:/IMAGES/Standard/3040459-52x74-350 CM-18-06-...,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAgGBgcGBQgHBw...
2,1624011111,2021-06-18T10:11:52.1659526Z,100.0,30.0,"[1, 2]",4.0,False,0.0,Operator,Micka,...,1500.0,16.0,10.0,1.0,CANCELED,2021-06-18T10:18:20.2945923Z,70.0,33.607494,D:/IMAGES/Standard/3040459-52x74-350 CM-18-06-...,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAgGBgcGBQgHBw...
3,1624012222,2021-06-18T10:30:23.0496491Z,50.0,10.0,[2],1.0,False,0.0,Operator,Micka,...,1500.0,16.0,10.0,1.0,,,,,D:/IMAGES/Standard/3035811-5coul#1/0000001 V01...,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAgGBgcGBQgHBw...
4,1624019711,2021-06-18T12:35:11.8641173Z,1.0,10.0,[2],1.0,False,0.0,Operator,Viktor,...,1500.0,16.0,16.0,1.0,,,,,D:/IMAGES/Standard/3037332-vernis/3037332-vern...,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAgGBgcGBQgHBw...


## b) Nettoyage

### 1. Suppression de lignes

In [5]:
# Collect the index labels of the rows that have no start timestamp.
missing_start_mask = df['timestamp_start'].isna()
no_start_row = list(df.loc[missing_start_mask].index)
no_start_row

[16295, 16296]

In [6]:
# Remove the rows that have no start timestamp.
df = df.drop(index=no_start_row)

### 2. Suppression de colonnes

In [7]:
# Display the column labels of the DataFrame.
df.columns

Index(['jobId', 'timestamp_start', 'totalCopies_start', 'LED_iper',
       'bars_iper', 'drops_iper', 'dithering_iper', 'deadPixelsOffset_iper',
       'level_user', 'operator_user', 'speed_ifoil', 'enabled_ifoil',
       'optifoil_ifoil', 'stampAreas_ifoil', 'heater1Enabled_ifoil',
       'speedTensionIn_ifoil', 'heater1Temperature_ifoil',
       'x_imageLayout_layout', 'y_imageLayout_layout',
       'name_paperFormat_layout', 'width_paperFormat_layout',
       'height_paperFormat_layout', 'speed_layout', 'power_irDryers',
       'power_uvDryers', 'topMargin_registration_remoteScannerRegistration',
       'leftMargin_registration_remoteScannerRegistration',
       'redScore_gridMode_remoteScannerRegistration',
       'redScore_cropmarksMode_remoteScannerRegistration',
       'redScore_fullScannerMode_remoteScannerRegistration',
       'blueScore_fullScannerMode_remoteScannerRegistration',
       'greenScore_fullScannerMode_remoteScannerRegistration',
       'mode_remoteScannerRegistra

In [8]:
# Display the distinct dtypes found among the columns.
df.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [9]:
# Drop the columns that are not relevant for the rest of the analysis.
col_to_drop = ['path', 'image']
df = df.drop(columns=col_to_drop)

### 3. Données manquantes

In [10]:
# List the columns that contain at least one null value.
col_with_nan = [col for col in df.columns if df[col].isnull().any()]

# For each of them, show the distinct values when there are at most 10,
# otherwise only the number of distinct values and the dtype.
for col in col_with_nan:
    series = df[col]
    if series.nunique() <= 10:
        print(f'{col} : {series.unique()}')
    else:
        print(f'{col} : {series.nunique()} - dtype : {series.dtype}')

name_paperFormat_layout : [nan 'B2']
jobState : [nan 'ERROR' 'CANCELED' 'SUCCESS']
timestamp_end : 16201 - dtype : object
totalCopies_end : 681 - dtype : float64
consumption_operatorSideTanks_varnishConsumption : 14242 - dtype : float64


In [11]:
# Paper formats without a name are labelled UNDEFINED.
df = df.fillna({'name_paperFormat_layout': 'UNDEFINED'})

In [12]:
# Jobs whose state is missing are labelled UNDEFINED.
df = df.fillna({'jobState': 'UNDEFINED'})

In [13]:
# Rows with a missing 'timestamp_end' receive their own 'timestamp_start' value.
start_times = df['timestamp_start']
df['timestamp_end'] = df['timestamp_end'].fillna(start_times)

In [14]:
# Null 'totalCopies_end' values are replaced by 0.
copies_col = 'totalCopies_end'
df[copies_col] = df[copies_col].fillna(0.0)

In [15]:
# Null varnish-consumption values are replaced by 0.
varnish_col = 'consumption_operatorSideTanks_varnishConsumption'
df[varnish_col] = df[varnish_col].fillna(0.0)

### 4. Conversion de type

In [16]:
# Convert float columns whose values are all whole numbers to 'int64'.
# Only float columns are inspected: integer columns need no conversion, and
# other dtypes (e.g. datetime, once the timestamp cell has run) cannot be
# checked for a fractional part, so the cell stays safe to re-run.
for col in df.select_dtypes(include=[np.floating]).columns:
    values = df[col]
    # NaN and +/-inf have no int64 representation; leave such columns as float.
    if not np.isfinite(values).all():
        continue
    # A value is whole when its remainder modulo 1 is exactly 0.
    if (values % 1 == 0).all():
        df[col] = values.astype('int64')

In [17]:
# For every object column, print its distinct values when there are at most 5,
# otherwise only how many distinct values it has.
for col in df.select_dtypes(include=['object']).columns:
    distinct_count = df[col].nunique()
    if distinct_count <= 5:
        print(f"{col} : {df[col].unique()}")
    else:
        print(f"{col} : {distinct_count} values")

timestamp_start : 16295 values
bars_iper : ['[1, 2]' '[2]' '[1]']
dithering_iper : [False True]
level_user : ['Distributor' 'Operator']
operator_user : ['Distributor' 'Micka' 'Viktor' 'JAN']
enabled_ifoil : [False True]
optifoil_ifoil : [True False]
stampAreas_ifoil : 24 values
heater1Enabled_ifoil : [False True]
name_paperFormat_layout : ['UNDEFINED' 'B2']
jobState : ['UNDEFINED' 'ERROR' 'CANCELED' 'SUCCESS']
timestamp_end : 16295 values


In [18]:
# Parse both timestamp columns as timezone-aware (UTC) datetimes.
for ts_col in ('timestamp_start', 'timestamp_end'):
    df[ts_col] = pd.to_datetime(df[ts_col], utc=True)

In [19]:
# Convert object columns that hold nothing but booleans to the 'bool' dtype.
# Every distinct value must be an actual boolean: testing `value == True`
# would also match 1 / 1.0, and checking a single value would let a column
# such as [True, 'x'] be cast, silently turning 'x' into True.
for col in df.select_dtypes(include=['object']).columns:
    values = df[col].unique()
    # Skip empty columns: there is nothing to infer a boolean type from.
    if len(values) == 0:
        continue
    if all(isinstance(value, (bool, np.bool_)) for value in values):
        df[col] = df[col].astype('bool')

In [20]:
# Same listing as before the conversions: distinct values of each remaining
# object column when there are at most 5, otherwise just their count.
for col in df.select_dtypes(include=['object']).columns:
    distinct_count = df[col].nunique()
    if distinct_count <= 5:
        print(f"{col} : {df[col].unique()}")
    else:
        print(f"{col} : {distinct_count} values")

bars_iper : ['[1, 2]' '[2]' '[1]']
level_user : ['Distributor' 'Operator']
operator_user : ['Distributor' 'Micka' 'Viktor' 'JAN']
stampAreas_ifoil : 24 values
name_paperFormat_layout : ['UNDEFINED' 'B2']
jobState : ['UNDEFINED' 'ERROR' 'CANCELED' 'SUCCESS']


### 5. Output csv

In [21]:
# Write the cleaned DataFrame (index included) to the target CSV.
output_path = Path(save_csv)
df.to_csv(output_path)