# 05 - Création du dataset des données nettoyées de jobs

Ce notebook génère 1 csv :

- raw_jobs_dataset.csv

Etapes :

- Suppression des lignes dont la valeur `started_at` (timestamp de début) est manquante

- Suppression des colonnes non pertinentes et des colonnes ne contenant que des valeurs nulles

- Remplacement des données manquantes

- Conversion des types des séries en fonction des types des valeurs

# A. Imports

## a) Librairies

In [1]:
import os, math
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

## b) Données

In [2]:
# Location of the raw input: the directory, then the file name inside it.
path = '../data/raw/'
filename = 'jobs.csv'
# Destination of the CSV written by the last cell of this notebook.
save_csv = '../data/jobs/raw_jobs_dataset.csv'

In [3]:
# # téléchargement du fichier 'csv' depuis le blob vers le répertoire 'data'
# from azure_blob import download_blob_file
# download_blob_file(file_name=filename, local_path=path)

# B. Dataframe

## a) Création

In [4]:
# Load the raw jobs extract into a dataframe.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
# NOTE(review): the saved output of this cell shows a warning raised on the
# read_csv line - presumably a mixed-type DtypeWarning; confirm it is gone on
# the next run.
jobs_df = pd.read_csv(os.path.join(path, filename), low_memory=False)
# Summary of the columns, their non-null counts and dtypes.
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37299 entries, 0 to 37298
Data columns (total 26 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                37299 non-null  int64  
 1   thumbnail                         0 non-null      float64
 2   total_copies                      37222 non-null  float64
 3   started_at                        37299 non-null  object 
 4   ended_at                          37222 non-null  object 
 5   machineId                         37299 non-null  int64  
 6   speed                             37299 non-null  int64  
 7   operator                          37299 non-null  object 
 8   operator_level                    37299 non-null  object 
 9   first_page_image_path_on_machine  37272 non-null  object 
 10  paperHeight                       37299 non-null  int64  
 11  paperWidth                        37299 non-null  int64  
 12  pape

  jobs_df = pd.read_csv(os.path.join(path, filename))


In [5]:
# Preview the first five rows of the freshly loaded dataframe.
jobs_df.head(n=5)

Unnamed: 0,id,thumbnail,total_copies,started_at,ended_at,machineId,speed,operator,operator_level,first_page_image_path_on_machine,paperHeight,paperWidth,paperName,paperThickness,id_on_machine,total_copies_requested,job_thumbnail_id,uses_ifoil,uses_iper,scanner_mode,iper_bvar_count,varnishConsumptionVarnish_3d,varnishConsumptionVarnish_2d,run,total_run,copies_per_run
0,28292,,6.0,2022-02-22 09:43:18.116000,2022-02-22 09:44:33.389000,18,313,User,Operator,D:/IMAGES/Standard/1504750#1/0000001.tif,483,330,,0,1645522997,6,25464.0,True,True,3,2,4.585923,,,,
1,28293,,11.0,2022-02-22 09:45:01.304000,2022-02-22 09:46:34.929000,18,313,User,Operator,D:/IMAGES/Standard/1504749#1/0000001.tif,483,330,,0,1645523101,11,25465.0,True,True,3,2,2.917403,,,,
2,28295,,7.0,2022-02-22 09:47:30.319000,2022-02-22 09:48:37.554000,18,313,User,Operator,D:/IMAGES/Standard/1505959#1/0000001 V01.tif,483,330,,0,1645523250,7,25467.0,True,True,3,2,0.423666,,,,
3,28296,,11.0,2022-02-22 09:49:56.298000,2022-02-22 09:51:14.406000,18,313,User,Operator,D:/IMAGES/Standard/1505734#1/0000001 V01.tif,483,330,,0,1645523396,11,25468.0,True,True,3,2,1.100145,,,,
4,28299,,47.0,2022-02-22 09:52:57.305000,2022-02-22 09:55:59.993000,18,313,User,Operator,D:/IMAGES/Standard/1505736#1/0000001 V01.tif,483,330,,0,1645523577,47,25471.0,True,True,3,2,4.70161,,,,


## b) Nettoyage

### 1. Suppression de lignes

In [6]:
# Index labels of the rows that have no start timestamp.
no_start_row = jobs_df.index[jobs_df['started_at'].isna()].tolist()
# Remove those rows so that every remaining job has a start timestamp
# (a no-op when the list is empty).
jobs_df = jobs_df.drop(index=no_start_row)
# Display the labels of the removed rows.
no_start_row

[]

### 2. Suppression de colonnes

In [7]:
# Columns judged irrelevant for the cleaned dataset; they are removed as a group.
col_to_drop = [
    'id', 'thumbnail', 'machineId',
    'first_page_image_path_on_machine', 'job_thumbnail_id',
]
jobs_df = jobs_df.drop(columns=col_to_drop)

In [8]:
# Print the columns that hold nothing but null values, then drop every column
# that is entirely null; the result is rebound to jobs_df instead of mutating
# the frame in place.
print(jobs_df.columns[jobs_df.isnull().all()].tolist())
jobs_df = jobs_df.dropna(axis='columns', how='all')

['varnishConsumptionVarnish_2d']


### 3. Données manquantes

In [9]:
# Collect the columns that contain at least one null value. For each of them,
# print its distinct values when there are at most 10, otherwise print the
# number of distinct values together with the column dtype.
col_with_nan = []
for col in jobs_df.columns:
    if not jobs_df[col].isnull().any():
        continue
    col_with_nan.append(col)
    if jobs_df[col].nunique() <= 10:
        print(f'{col} : {jobs_df[col].unique()}')
    else:
        print(f'{col} : {jobs_df[col].nunique()} - dtype : {jobs_df[col].dtype}')

total_copies : 175 - dtype : float64
ended_at : 37222 - dtype : object
paperName : [nan 'B1' 'A4']
varnishConsumptionVarnish_3d : 34125 - dtype : float64
run : [nan  1.]
total_run : [nan  1.]
copies_per_run : 107 - dtype : float64


In [10]:
# Missing 'total_copies' values are replaced with 0.0.
jobs_df['total_copies'] = jobs_df['total_copies'].fillna(0.0)

In [11]:
# Where the end timestamp is missing, reuse the start timestamp of the same row.
jobs_df['ended_at'] = jobs_df['ended_at'].where(
    jobs_df['ended_at'].notna(),
    jobs_df['started_at'],
)

In [12]:
# Rows without a paper name get the explicit placeholder 'UNDEFINED'.
jobs_df['paperName'] = jobs_df['paperName'].fillna(value='UNDEFINED')

In [13]:
# Missing 'scanner_mode' values are replaced with 0 (only that column is filled).
jobs_df = jobs_df.fillna({'scanner_mode': 0})

In [14]:
# Missing 'iper_bvar_count' values are replaced with 0.
jobs_df['iper_bvar_count'] = jobs_df['iper_bvar_count'].mask(
    jobs_df['iper_bvar_count'].isna(),
    0,
)

In [15]:
# Missing 'varnishConsumptionVarnish_3d' values are replaced with 0.0.
jobs_df = jobs_df.assign(
    varnishConsumptionVarnish_3d=jobs_df['varnishConsumptionVarnish_3d'].fillna(0.0)
)

### 4. Conversion des types

In [16]:
# Show the distinct dtypes present among the dataframe's columns; the
# following sections review the object and float64 columns.
jobs_df.dtypes.unique()

array([dtype('float64'), dtype('O'), dtype('int64'), dtype('bool')],
      dtype=object)

#### Object

In [17]:
# For every object-typed column, print its distinct values when there are at
# most 5 of them, otherwise only the number of distinct values.
object_columns = jobs_df.select_dtypes(include=['object']).columns.tolist()
for col in object_columns:
    if jobs_df[col].nunique() <= 5:
        print(f"{col} : {jobs_df[col].unique()}")
    else:
        print(f"{col} : {jobs_df[col].nunique()} values")

started_at : 37299 values
ended_at : 37299 values
operator : ['User' 'Distributor']
operator_level : ['Operator' 'Distributor']
paperName : ['UNDEFINED' 'B1' 'A4']


In [18]:
# The two timestamp columns are parsed into timezone-aware (UTC) datetimes.
for col in ('started_at', 'ended_at'):
    jobs_df[col] = pd.to_datetime(jobs_df[col], utc=True)

#### Float64

In [19]:
# For every float64 column, print its distinct values when there are at most
# 5 of them, otherwise only the number of distinct values.
float_columns = jobs_df.select_dtypes(include=['float64']).columns.tolist()
for col in float_columns:
    if jobs_df[col].nunique() <= 5:
        print(f"{col} : {jobs_df[col].unique()}")
    else:
        print(f"{col} : {jobs_df[col].nunique()} values")

total_copies : 175 values
varnishConsumptionVarnish_3d : 34125 values
run : [nan  1.]
total_run : [nan  1.]
copies_per_run : 107 values


In [20]:
# Fill the remaining nulls of every float column with 0, then store the column
# as int64 when none of its values has a fractional part.
for col in float_columns:
    filled = jobs_df[col].fillna(0)
    jobs_df[col] = filled
    # Vectorised integrality test. The finiteness guard leaves a column holding
    # +/-inf as float, since such values cannot be cast to int64.
    if np.isfinite(filled).all() and (filled % 1 == 0).all():
        jobs_df[col] = filled.astype('int64')

In [21]:
# Preview the first five rows after cleaning and type conversion.
jobs_df.head()

Unnamed: 0,total_copies,started_at,ended_at,speed,operator,operator_level,paperHeight,paperWidth,paperName,paperThickness,id_on_machine,total_copies_requested,uses_ifoil,uses_iper,scanner_mode,iper_bvar_count,varnishConsumptionVarnish_3d,run,total_run,copies_per_run
0,6,2022-02-22 09:43:18.116000+00:00,2022-02-22 09:44:33.389000+00:00,313,User,Operator,483,330,UNDEFINED,0,1645522997,6,True,True,3,2,4.585923,0,0,0
1,11,2022-02-22 09:45:01.304000+00:00,2022-02-22 09:46:34.929000+00:00,313,User,Operator,483,330,UNDEFINED,0,1645523101,11,True,True,3,2,2.917403,0,0,0
2,7,2022-02-22 09:47:30.319000+00:00,2022-02-22 09:48:37.554000+00:00,313,User,Operator,483,330,UNDEFINED,0,1645523250,7,True,True,3,2,0.423666,0,0,0
3,11,2022-02-22 09:49:56.298000+00:00,2022-02-22 09:51:14.406000+00:00,313,User,Operator,483,330,UNDEFINED,0,1645523396,11,True,True,3,2,1.100145,0,0,0
4,47,2022-02-22 09:52:57.305000+00:00,2022-02-22 09:55:59.993000+00:00,313,User,Operator,483,330,UNDEFINED,0,1645523577,47,True,True,3,2,4.70161,0,0,0


In [22]:
# Display the columns that remain in the cleaned dataframe.
jobs_df.columns

Index(['total_copies', 'started_at', 'ended_at', 'speed', 'operator',
       'operator_level', 'paperHeight', 'paperWidth', 'paperName',
       'paperThickness', 'id_on_machine', 'total_copies_requested',
       'uses_ifoil', 'uses_iper', 'scanner_mode', 'iper_bvar_count',
       'varnishConsumptionVarnish_3d', 'run', 'total_run', 'copies_per_run'],
      dtype='object')

### 5. Output csv

In [23]:
# Make sure the destination folder exists so the export also works when the
# output directory has not been created yet, then write the cleaned dataset.
# The dataframe index is written too, as the first (unnamed) CSV column.
output_dir = os.path.dirname(save_csv)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
jobs_df.to_csv(save_csv)