# Notebook jobs

Ce notebook génère 1 csv :

- raw_jobs_dataset.csv

Etapes :

- Suppression des lignes dont la valeur `started_at` (timestamp de début) est manquante

- Suppression des colonnes non pertinentes et des colonnes ne contenant que des valeurs nulles

- Remplacement des données manquantes

- Conversion des types des séries en fonction des types des valeurs

# A. Imports

## a) Librairies

In [68]:
import os, math
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

## b) Données

In [69]:
# Raw input location: the directory, then the name of the CSV file inside it.
path = '../data/raw/'
filename = 'jobs.csv'
# Destination file for the cleaned jobs dataset produced by this notebook.
save_csv = '../data/jobs/raw_jobs_dataset.csv'

In [70]:
# Fetch the source CSV from the blob storage into the local raw-data directory.
# NOTE(review): judging by the recorded cell output, the helper appears to skip
# the download when the file is already present locally — confirm in azure_blob.
from azure_blob import download_blob_file
download_blob_file(file_name=filename, local_path=path)

jobs.csv already in path ../data/raw/.


# B. Dataframe

## a) Création

In [71]:
# Load the source CSV into a DataFrame, then print its schema summary
# (column names, non-null counts and dtypes).
csv_path = os.path.join(path, filename)
jobs_df = pd.read_csv(csv_path)
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16295 entries, 0 to 16294
Data columns (total 23 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                16295 non-null  int64  
 1   thumbnail                         1497 non-null   object 
 2   total_copies                      16201 non-null  float64
 3   started_at                        16295 non-null  object 
 4   ended_at                          16201 non-null  object 
 5   machineId                         16295 non-null  int64  
 6   speed                             16295 non-null  int64  
 7   operator                          16295 non-null  object 
 8   operator_level                    16295 non-null  object 
 9   first_page_image_path_on_machine  16289 non-null  object 
 10  paperHeight                       16295 non-null  int64  
 11  paperWidth                        16295 non-null  int64  
 12  pape

In [72]:
# Preview the first five rows of the raw data.
jobs_df.head(n=5)

Unnamed: 0,id,thumbnail,total_copies,started_at,ended_at,machineId,speed,operator,operator_level,first_page_image_path_on_machine,paperHeight,paperWidth,paperName,paperThickness,id_on_machine,total_copies_requested,job_thumbnail_id,uses_ifoil,uses_iper,scanner_mode,iper_bvar_count,varnishConsumptionVarnish_3d,varnishConsumptionVarnish_2d
0,5021,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAgGBgcGBQgHBw...,,2021-06-18 09:22:46.866000,,14,418,Distributor,Distributor,D:/IMAGES/Standard/3040459-52x74-350 CM-18-06-...,520,740,,0,1624008166,3,2383.0,False,True,,,,
1,5034,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAgGBgcGBQgHBw...,3.0,2021-06-18 10:08:13.716000,2021-06-18 10:10:22.257000,14,313,Micka,Operator,D:/IMAGES/Standard/3040459-52x74-350 CM-18-06-...,740,520,,0,1624010893,100,2391.0,False,True,,,,
2,5037,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAgGBgcGBQgHBw...,70.0,2021-06-18 10:11:52.165000,2021-06-18 10:18:20.294000,14,313,Micka,Operator,D:/IMAGES/Standard/3040459-52x74-350 CM-18-06-...,740,520,,0,1624011111,100,2393.0,False,True,,,,
3,5042,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAgGBgcGBQgHBw...,,2021-06-18 10:30:23.049000,,14,700,Micka,Operator,D:/IMAGES/Standard/3035811-5coul#1/0000001 V01...,450,320,,0,1624012222,50,2397.0,False,True,,,,
4,5062,/9j/4AAQSkZJRgABAQEAYABgAAD/2wBDAAgGBgcGBQgHBw...,,2021-06-18 12:35:11.864000,,14,700,Viktor,Operator,D:/IMAGES/Standard/3037332-vernis/3037332-vern...,450,320,,0,1624019711,1,2416.0,False,True,,,,


## b) Nettoyage

### 1. Suppression de lignes

In [51]:
# on cherche les index des lignes sans timestamp de départ
no_start_row = list(jobs_df[jobs_df['started_at'].isna()].index)
no_start_row

[]

### 2. Suppression de colonnes

In [52]:
# Columns considered irrelevant for the cleaned dataset are removed here.
col_to_drop = [
    'id', 'thumbnail', 'machineId',
    'first_page_image_path_on_machine', 'job_thumbnail_id',
]
jobs_df = jobs_df.drop(columns=col_to_drop)

In [53]:
# Remove every column that holds nothing but null values.
jobs_df = jobs_df.dropna(axis='columns', how='all')

### 3. Données manquantes

In [54]:
# List the columns that still contain at least one null value.
col_with_nan = [name for name in jobs_df.columns if jobs_df[name].isnull().any()]

# For each of them, show the distinct values when there are at most 10,
# otherwise only the number of distinct values and the dtype.
for name in col_with_nan:
    series = jobs_df[name]
    if series.nunique() <= 10:
        print(f'{name} : {series.unique()}')
    else:
        print(f'{name} : {series.nunique()} - dtype : {series.dtype}')

total_copies : 681 - dtype : float64
ended_at : 16201 - dtype : object
paperName : [nan 'B2']
scanner_mode : [nan  1.  3.  0.]
iper_bvar_count : [nan  1.  2.]
varnishConsumptionVarnish_3d : 13886 - dtype : float64


In [55]:
# Missing 'total_copies' values are replaced by 0.0 (the column stays float here).
copies = jobs_df['total_copies']
jobs_df['total_copies'] = copies.fillna(value=0.0)

In [56]:
# A missing end timestamp is replaced by the start timestamp of the same row
# (fillna with a Series aligns on the index).
start_times = jobs_df['started_at']
jobs_df['ended_at'] = jobs_df['ended_at'].fillna(value=start_times)

In [57]:
# Undefined paper names get the explicit placeholder 'UNDEFINED'.
paper_names = jobs_df['paperName']
jobs_df['paperName'] = paper_names.fillna(value='UNDEFINED')

In [58]:
# Missing 'scanner_mode' values are replaced by 0.
scanner_modes = jobs_df['scanner_mode']
jobs_df['scanner_mode'] = scanner_modes.fillna(value=0)

In [59]:
# Missing 'iper_bvar_count' values are replaced by 0.
bvar_counts = jobs_df['iper_bvar_count']
jobs_df['iper_bvar_count'] = bvar_counts.fillna(value=0)

In [60]:
# Missing 'varnishConsumptionVarnish_3d' values are replaced by 0.0.
varnish_3d = jobs_df['varnishConsumptionVarnish_3d']
jobs_df['varnishConsumptionVarnish_3d'] = varnish_3d.fillna(value=0.0)

### 4. Conversion des types

In [61]:
# Show which distinct dtypes are present among the columns.
column_dtypes = jobs_df.dtypes
column_dtypes.unique()

array([dtype('float64'), dtype('O'), dtype('int64'), dtype('bool')],
      dtype=object)

#### Object

In [62]:
# Inspect the object-typed columns: print the distinct values when there are
# at most 5 of them, otherwise only how many distinct values exist.
object_columns = list(jobs_df.select_dtypes(include=['object']).columns)
for name in object_columns:
    n_distinct = jobs_df[name].nunique()
    if n_distinct <= 5:
        print(f"{name} : {jobs_df[name].unique()}")
    else:
        print(f"{name} : {n_distinct} values")

started_at : 16295 values
ended_at : 16295 values
operator : ['Distributor' 'Micka' 'Viktor' 'JAN']
operator_level : ['Distributor' 'Operator']
paperName : ['UNDEFINED' 'B2']


In [63]:
# The two timestamp columns hold time values stored as text: parse them into
# timezone-aware (UTC) datetimes.
for time_col in ('started_at', 'ended_at'):
    jobs_df[time_col] = pd.to_datetime(jobs_df[time_col], utc=True)

#### Float64

In [64]:
# Inspect the float64 columns: print the distinct values when there are at
# most 5 of them, otherwise only how many distinct values exist.
float_columns = list(jobs_df.select_dtypes(include=['float64']).columns)
for name in float_columns:
    n_distinct = jobs_df[name].nunique()
    if n_distinct <= 5:
        print(f"{name} : {jobs_df[name].unique()}")
    else:
        print(f"{name} : {n_distinct} values")

total_copies : 681 values
scanner_mode : [0. 1. 3.]
iper_bvar_count : [0. 1. 2.]
varnishConsumptionVarnish_3d : 13886 values


In [65]:
# Cast to int64 every float column whose values are all whole numbers.
for col in float_columns:
    # Defensive: int64 cannot hold NaN, so make sure none is left in the column.
    jobs_df[col] = jobs_df[col].fillna(0)
    # Vectorized whole-number test: `x % 1 == 0` holds only for finite whole
    # values (inf % 1 is NaN), so a column containing +/-inf is left as float
    # instead of making astype('int64') raise.
    if jobs_df[col].mod(1).eq(0).all():
        jobs_df[col] = jobs_df[col].astype('int64')

In [66]:
# Preview the first five rows of the cleaned data.
jobs_df.head()

Unnamed: 0,total_copies,started_at,ended_at,speed,operator,operator_level,paperHeight,paperWidth,paperName,paperThickness,id_on_machine,total_copies_requested,uses_ifoil,uses_iper,scanner_mode,iper_bvar_count,varnishConsumptionVarnish_3d
0,0,2021-06-18 09:22:46.866000+00:00,2021-06-18 09:22:46.866000+00:00,418,Distributor,Distributor,520,740,UNDEFINED,0,1624008166,3,False,True,0,0,0.0
1,3,2021-06-18 10:08:13.716000+00:00,2021-06-18 10:10:22.257000+00:00,313,Micka,Operator,740,520,UNDEFINED,0,1624010893,100,False,True,0,0,0.0
2,70,2021-06-18 10:11:52.165000+00:00,2021-06-18 10:18:20.294000+00:00,313,Micka,Operator,740,520,UNDEFINED,0,1624011111,100,False,True,0,0,0.0
3,0,2021-06-18 10:30:23.049000+00:00,2021-06-18 10:30:23.049000+00:00,700,Micka,Operator,450,320,UNDEFINED,0,1624012222,50,False,True,0,0,0.0
4,0,2021-06-18 12:35:11.864000+00:00,2021-06-18 12:35:11.864000+00:00,700,Viktor,Operator,450,320,UNDEFINED,0,1624019711,1,False,True,0,0,0.0


### 5. Output csv

In [67]:
# Save the cleaned dataset as CSV (the DataFrame index is written as the first
# column, as before).
# to_csv does not create missing folders, so make sure the destination
# directory exists before writing.
target_dir = os.path.dirname(save_csv)
if target_dir:
    os.makedirs(target_dir, exist_ok=True)
jobs_df.to_csv(save_csv)