# Notebook de création du dataset des données jobs et job events fusionnées

# A. Imports

## a) Librairies

In [5]:
import os, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## b) Données

In [6]:
# Input: raw jobs dataset (CSV).
jobs = '../data/jobs/raw_jobs_dataset.csv'

# Input: cleaned and merged job-events dataset (CSV).
events = '../data/jobs/clean_merge_job_events_dataset.csv'

# Output: where the fused jobs + job-events dataset is written.
save_csv = '../data/jobs/merge_raw_jobs_and_clean_jobevents_dataset.csv'

# B. Dataframe

## a) Création des dataframes jobs et events

### 1. jobs

In [7]:
# Build the jobs dataframe from its CSV; the first CSV column is used as the index.
jobs_df = pd.read_csv(jobs, index_col=0)
# Print the column list, non-null counts and dtypes as a quick sanity check.
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16295 entries, 0 to 16294
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   total_copies                  16295 non-null  int64  
 1   started_at                    16295 non-null  object 
 2   ended_at                      16295 non-null  object 
 3   speed                         16295 non-null  int64  
 4   operator                      16295 non-null  object 
 5   operator_level                16295 non-null  object 
 6   paperHeight                   16295 non-null  int64  
 7   paperWidth                    16295 non-null  int64  
 8   paperName                     16295 non-null  object 
 9   paperThickness                16295 non-null  int64  
 10  id_on_machine                 16295 non-null  int64  
 11  total_copies_requested        16295 non-null  int64  
 12  uses_ifoil                    16295 non-null  bool   
 13  u

### 2. job events

In [8]:
# Build the job-events dataframe from its CSV; the first CSV column is used as the index.
events_df = pd.read_csv(events, index_col=0)
# Print the column list, non-null counts and dtypes as a quick sanity check.
events_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16295 entries, 0 to 16294
Data columns (total 37 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   jobId                                                 16295 non-null  int64  
 1   timestamp_start                                       16295 non-null  object 
 2   totalCopies_start                                     16295 non-null  int64  
 3   LED_iper                                              16295 non-null  int64  
 4   bars_iper                                             16295 non-null  object 
 5   drops_iper                                            16295 non-null  int64  
 6   dithering_iper                                        16295 non-null  bool   
 7   deadPixelsOffset_iper                                 16295 non-null  int64  
 8   level_user                                            16

## b) Création d'un dataframe fusionné

### 1. Suppression de colonnes

In [9]:
# Remove the jobs columns that carry a single distinct value: they are constant
# across all rows. nunique() does not count missing values.
constant_jobs_cols = [name for name in jobs_df.columns if jobs_df[name].nunique() == 1]
jobs_df.drop(columns=constant_jobs_cols, inplace=True)

In [10]:
# Remove the job-events columns that carry a single distinct value: they are
# constant across all rows. nunique() does not count missing values.
constant_events_cols = [name for name in events_df.columns if events_df[name].nunique() == 1]
events_df.drop(columns=constant_events_cols, inplace=True)

### 2. Concordance des colonnes entre dataframes

In [11]:
# Rename jobs columns so that they use the same names as the job-events columns.
jobs_column_aliases = {
    'uses_ifoil': 'ifoil',
    'iper_bvar_count': 'bars',
}
jobs_df = jobs_df.rename(columns=jobs_column_aliases)

In [12]:
# Rename job-events columns: strip the source suffixes/prefixes and align the
# names with the ones used in the jobs dataframe.
events_column_aliases = {
    'totalCopies_start': 'total_copies_requested',
    'LED_iper': 'LED',
    'bars_iper': 'bars',
    'drops_iper': 'drops',
    'dithering_iper': 'dithering',
    'deadPixelsOffset_iper': 'deadPixelsOffset',
    'level_user': 'operator_level',
    'operator_user': 'operator',
    'enabled_ifoil': 'ifoil',
    'x_imageLayout_layout': 'x_imageLayout',
    'y_imageLayout_layout': 'y_imageLayout',
    'name_paperFormat_layout': 'paperName',
    'width_paperFormat_layout': 'paperWidth',
    'height_paperFormat_layout': 'paperHeight',
    'speed_layout': 'speed',
    'topMargin_registration_remoteScannerRegistration': 'topMargin_remoteScannerRegistration',
    'leftMargin_registration_remoteScannerRegistration': 'leftMargin_remoteScannerRegistration',
    'totalCopies_end': 'total_copies',
    'consumption_operatorSideTanks_varnishConsumption': 'varnishConsumptionVarnish_3d',
}
events_df = events_df.rename(columns=events_column_aliases)

In [13]:
# Columns present in both dataframes, listed in the order they appear in jobs_df.
events_col_names = set(events_df.columns)
common_cols = [name for name in jobs_df.columns if name in events_col_names]

### 3. Concordance des valeurs entre colonnes communes

In [14]:
# Among the shared columns, keep those whose values are identical in both
# dataframes. The comparison is positional (row i against row i) on the raw arrays.
col_with_duplicates = [
    name
    for name in common_cols
    if (events_df[name].values == jobs_df[name].values).all()
]
col_with_duplicates

['total_copies',
 'speed',
 'operator',
 'operator_level',
 'paperName',
 'total_copies_requested',
 'ifoil']

In [15]:
# Drop from jobs_df the columns listed in col_with_duplicates; events_df keeps its copy.
jobs_df = jobs_df.drop(columns=col_with_duplicates)

### 4. Fusion des dataframes

In [16]:
# Join jobs and job events on the job identifier (id_on_machine == jobId), then
# drop id_on_machine since jobId carries the same value in the merged frame.
merge_df = (
    jobs_df
    .merge(events_df, left_on='id_on_machine', right_on='jobId')
    .drop(columns='id_on_machine')
)

### 5. Vérification

In [17]:
# Check that no jobId appears more than once in the merged frame (expected: False).
merge_df['jobId'].duplicated().any()

False

In [18]:
# Job id used as a witness to compare the three dataframes by eye.
check_jobid = 1624008166
# Rows matching the witness id in the jobs, job-events and merged dataframes.
jobs_check_line = jobs_df.loc[jobs_df['id_on_machine'] == check_jobid]
events_check_line = events_df.loc[events_df['jobId'] == check_jobid]
merge_check_line = merge_df.loc[merge_df['jobId'] == check_jobid]

In [19]:
# Display the witness row selected from the jobs dataframe.
jobs_check_line

Unnamed: 0,started_at,ended_at,paperHeight,paperWidth,id_on_machine,scanner_mode,bars,varnishConsumptionVarnish_3d
0,2021-06-18 09:22:46.866000+00:00,2021-06-18 09:22:46.866000+00:00,520,740,1624008166,0,0,0.0


In [20]:
# Display the witness row selected from the job-events dataframe.
events_check_line

Unnamed: 0,jobId,timestamp_start,total_copies_requested,LED,bars,drops,dithering,deadPixelsOffset,operator_level,operator,...,redScore_gridMode_remoteScannerRegistration,redScore_cropmarksMode_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,mode_remoteScannerRegistration,jobState,timestamp_end,total_copies,varnishConsumptionVarnish_3d
0,1624008166,2021-06-18 09:22:46.866189200+00:00,3,10,"[1, 2]",3,False,0,Distributor,Distributor,...,1500,1500,1500,16,16,1,UNDEFINED,2021-06-18 09:22:46.866189200+00:00,0,0.0


In [21]:
# Display the witness row selected from the merged dataframe.
merge_check_line

Unnamed: 0,started_at,ended_at,paperHeight_x,paperWidth_x,scanner_mode,bars_x,varnishConsumptionVarnish_3d_x,jobId,timestamp_start,total_copies_requested,...,redScore_gridMode_remoteScannerRegistration,redScore_cropmarksMode_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,mode_remoteScannerRegistration,jobState,timestamp_end,total_copies,varnishConsumptionVarnish_3d_y
0,2021-06-18 09:22:46.866000+00:00,2021-06-18 09:22:46.866000+00:00,520,740,0,0,0.0,1624008166,2021-06-18 09:22:46.866189200+00:00,3,...,1500,1500,1500,16,16,1,UNDEFINED,2021-06-18 09:22:46.866189200+00:00,0,0.0


In [22]:
# Keep started_at / ended_at as the job time window, widened so that it also
# covers the job-events timestamps:
#   - started_at becomes timestamp_start when timestamp_start is smaller,
#   - ended_at becomes timestamp_end when timestamp_end is greater.
#
# This replaces a `for index, row in merge_df.iterrows()` loop that assigned to
# `row.started_at` / `row.ended_at`. iterrows() yields a copy of each row, so
# those assignments never reached merge_df and the columns were left unchanged.
#
# NOTE(review): the values are compared exactly as loaded from the CSV files.
# started_at, ended_at and timestamp_start are reported as object dtype by
# .info(), i.e. the comparison is lexicographic on strings; timestamp_end is
# presumably the same - confirm. Parse with pd.to_datetime(..., utc=True)
# beforehand if a true chronological comparison is required.
start_is_earlier = merge_df['timestamp_start'] < merge_df['started_at']
merge_df['started_at'] = merge_df['started_at'].mask(
    start_is_earlier, merge_df['timestamp_start']
)

end_is_later = merge_df['timestamp_end'] > merge_df['ended_at']
merge_df['ended_at'] = merge_df['ended_at'].mask(
    end_is_later, merge_df['timestamp_end']
)

In [23]:
# The job-events timestamps are no longer needed in the merged frame.
merge_df = merge_df.drop(columns=['timestamp_start', 'timestamp_end'])

In [24]:
# # conversion des colonnes contenant des valeurs de temps au format datetime
# jobs_df['started_at'] = pd.to_datetime(jobs_df['started_at'], utc=True)
# jobs_df['ended_at'] = pd.to_datetime(jobs_df['ended_at'], utc=True)
# events_df['timestamp_start'] = pd.to_datetime(events_df['timestamp_start'], utc=True)
# events_df['timestamp_end'] = pd.to_datetime(events_df['timestamp_end'], utc=True)

### 6. Output csv

In [25]:
# Write the merged dataset to the CSV target path (the dataframe index is written too).
merge_df.to_csv(save_csv)