# 06 - Création du dataset des données de jobs et de job_events fusionnées

Ce notebook génère 1 csv :

- merge_raw_jobs_and_clean_jobevents_dataset.csv

Etapes :

- Suppression des colonnes doublon dans les 2 datasets

- Fusion des datasets

# A. Imports

## a) Librairies

In [81]:
import os, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## b) Données

In [82]:
# source path
jobs = '../data/jobs/raw_jobs_dataset.csv'

# source path
events = '../data/jobs/clean_merge_job_events_dataset.csv'

# save target path
save_csv = '../data/jobs/merge_raw_jobs_and_clean_jobevents_dataset.csv'

# B. Dataframe

## a) Création des dataframes jobs et events

### 1. jobs

In [83]:
# création d'un dataframe à partir du csv de données
jobs_df = pd.read_csv(jobs, index_col=0)
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37299 entries, 0 to 37298
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   total_copies                  37299 non-null  int64  
 1   started_at                    37299 non-null  object 
 2   ended_at                      37299 non-null  object 
 3   speed                         37299 non-null  int64  
 4   operator                      37299 non-null  object 
 5   operator_level                37299 non-null  object 
 6   paperHeight                   37299 non-null  int64  
 7   paperWidth                    37299 non-null  int64  
 8   paperName                     37299 non-null  object 
 9   paperThickness                37299 non-null  int64  
 10  id_on_machine                 37299 non-null  int64  
 11  total_copies_requested        37299 non-null  int64  
 12  uses_ifoil                    37299 non-null  bool   
 13  u

### 2. job events

In [84]:
# création d'un dataframe à partir du csv de données
events_df = pd.read_csv(events, index_col=0)
events_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37398 entries, 0 to 37397
Data columns (total 46 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   jobId                                                  37398 non-null  int64  
 1   timestamp_start                                        37398 non-null  object 
 2   totalCopies_start                                      37398 non-null  int64  
 3   jsonVersion_x                                          22919 non-null  float64
 4   LED_iper                                               37398 non-null  int64  
 5   bars_iper                                              37398 non-null  object 
 6   drops_iper                                             37398 non-null  int64  
 7   dithering_iper                                         37398 non-null  bool   
 8   deadPixelsOffset_iper                         

## b) Création d'un dataframe fusionné

### 1. Suppression de colonnes

In [85]:
# suppression des colonnes avec des valeurs uniques
for col in list(jobs_df.columns):
    if jobs_df[col].nunique() == 1:
        jobs_df.drop(col, axis=1, inplace=True)

In [86]:
# suppression des colonnes avec des valeurs uniques
for col in list(events_df.columns) :
    if events_df[col].nunique() == 1:
        events_df.drop(col, axis=1, inplace=True)

### 2. Concordance des colonnes entre dataframes

In [87]:
# on renomme des colonnes de jobs
jobs_df = jobs_df.rename(columns={
 'uses_ifoil' : 'ifoil',
 'iper_bvar_count': 'bars'
 })

In [88]:
# on renomme des colonnes de events
events_df = events_df.rename(columns={
    'totalCopies_start':'total_copies_requested',
    'LED_iper' : 'LED',
    'bars_iper' : 'bars',
    'drops_iper': 'drops',
    'dithering_iper' : 'dithering',
    'deadPixelsOffset_iper' : 'deadPixelsOffset',
    'level_user' : 'operator_level',
    'operator_user':'operator',
    'enabled_ifoil' : 'ifoil',
    'x_imageLayout_layout' : 'x_imageLayout',
    'y_imageLayout_layout' : 'y_imageLayout',
    'name_paperFormat_layout' : 'paperName',
    'width_paperFormat_layout' : 'paperWidth',
    'height_paperFormat_layout' : 'paperHeight',
    'speed_layout' : 'speed',
    'topMargin_registration_remoteScannerRegistration' : 'topMargin_remoteScannerRegistration',
    'leftMargin_registration_remoteScannerRegistration' : 'leftMargin_remoteScannerRegistration',
    'totalCopies_end' : 'total_copies',
    'consumption_operatorSideTanks_varnishConsumption' : 'varnishConsumptionVarnish_3d'
    })

In [89]:
# Comparer les valeurs de la colonne "bars" entre les dataframes "events_df" et "jobs_df"
events_df['bars'].isin(jobs_df['bars']).any()

False

In [90]:
# liste des colonnes communes
common_cols = [col for col in jobs_df.columns.to_list() if col in events_df.columns.to_list()]

### 3. Concordance des valeurs entre colonnes communes

In [91]:
jobs_df.shape

(37299, 18)

In [92]:
events_df.shape

(37398, 44)

In [93]:
events_df.columns

Index(['jobId', 'timestamp_start', 'total_copies_requested', 'LED', 'bars',
       'drops', 'dithering', 'deadPixelsOffset', 'operator_level', 'operator',
       'speed_ifoil', 'ifoil', 'optifoil_ifoil', 'vacuumIn_ifoil',
       'vacuumOut_ifoil', 'stampAreas_ifoil', 'heater1Enabled_ifoil',
       'speedTensionIn_ifoil', 'speedTensionOut_ifoil',
       'heater1Temperature_ifoil', 'x_imageLayout', 'y_imageLayout',
       'paperName', 'paperWidth', 'paperHeight', 'speed', 'power_irDryers',
       'power_uvDryers', 'redScore_gridMode_remoteScannerRegistration',
       'redScore_cropmarksMode_remoteScannerRegistration',
       'x_cropmark1_cropmarksMode_remoteScannerRegistration',
       'y_cropmark1_cropmarksMode_remoteScannerRegistration',
       'x_cropmark2_cropmarksMode_remoteScannerRegistration',
       'y_cropmark2_cropmarksMode_remoteScannerRegistration',
       'exposureTime_manualLighting_remoteScannerRegistration',
       'redScore_fullScannerMode_remoteScannerRegistration',
   

In [94]:
# Colonnes de events_df_17
cols_events_df_17 = ['jobId', 'timestamp_start', 'total_copies_requested', 'LED', 'bars',
                     'drops', 'dithering', 'deadPixelsOffset', 'operator_level', 'operator',
                     'speed_ifoil', 'ifoil', 'optifoil_ifoil', 'stampAreas_ifoil',
                     'heater1Enabled_ifoil', 'speedTensionIn_ifoil', 'speedTensionOut_ifoil',
                     'heater1Temperature_ifoil', 'x_imageLayout', 'y_imageLayout',
                     'paperName', 'paperWidth', 'paperHeight', 'speed', 'power_irDryers',
                     'power_uvDryers', 'redScore_gridMode_remoteScannerRegistration',
                     'redScore_cropmarksMode_remoteScannerRegistration',
                     'x_cropmark1_cropmarksMode_remoteScannerRegistration',
                     'y_cropmark1_cropmarksMode_remoteScannerRegistration',
                     'x_cropmark2_cropmarksMode_remoteScannerRegistration',
                     'y_cropmark2_cropmarksMode_remoteScannerRegistration',
                     'exposureTime_manualLighting_remoteScannerRegistration',
                     'redScore_fullScannerMode_remoteScannerRegistration',
                     'blueScore_fullScannerMode_remoteScannerRegistration',
                     'greenScore_fullScannerMode_remoteScannerRegistration',
                     'enable_specialSubstrate_remoteScannerRegistration',
                     'mode_remoteScannerRegistration', 'jobState', 'timestamp_end',
                     'total_copies', 'varnishConsumptionVarnish_3d']

# Colonnes de events_df_14
cols_events_df_14 = ['jobId', 'timestamp_start', 'total_copies_requested', 'LED', 'bars',
                     'drops', 'dithering', 'deadPixelsOffset', 'operator_level', 'operator',
                     'speed_ifoil', 'ifoil', 'optifoil_ifoil', 'stampAreas_ifoil',
                     'heater1Enabled_ifoil', 'speedTensionIn_ifoil',
                     'heater1Temperature_ifoil', 'x_imageLayout', 'y_imageLayout',
                     'paperName', 'paperWidth', 'paperHeight', 'speed', 'power_irDryers',
                     'power_uvDryers', 'topMargin_remoteScannerRegistration',
                     'leftMargin_remoteScannerRegistration',
                     'redScore_gridMode_remoteScannerRegistration',
                     'redScore_cropmarksMode_remoteScannerRegistration',
                     'redScore_fullScannerMode_remoteScannerRegistration',
                     'blueScore_fullScannerMode_remoteScannerRegistration',
                     'greenScore_fullScannerMode_remoteScannerRegistration',
                     'mode_remoteScannerRegistration', 'jobState', 'timestamp_end',
                     'total_copies', 'varnishConsumptionVarnish_3d']

# Trouver les colonnes uniques à events_df_17
unique_cols_to_df_17 = set(cols_events_df_17) - set(cols_events_df_14)

print(unique_cols_to_df_17)

{'speedTensionOut_ifoil', 'exposureTime_manualLighting_remoteScannerRegistration', 'x_cropmark2_cropmarksMode_remoteScannerRegistration', 'x_cropmark1_cropmarksMode_remoteScannerRegistration', 'enable_specialSubstrate_remoteScannerRegistration', 'y_cropmark2_cropmarksMode_remoteScannerRegistration', 'y_cropmark1_cropmarksMode_remoteScannerRegistration'}


In [95]:
jobs_df.head()

Unnamed: 0,total_copies,started_at,ended_at,speed,operator,operator_level,paperHeight,paperWidth,paperName,id_on_machine,total_copies_requested,ifoil,scanner_mode,bars,varnishConsumptionVarnish_3d,run,total_run,copies_per_run
0,6,2022-02-22 09:43:18.116000+00:00,2022-02-22 09:44:33.389000+00:00,313,User,Operator,483,330,UNDEFINED,1645522997,6,True,3,2,4.585923,0,0,0
1,11,2022-02-22 09:45:01.304000+00:00,2022-02-22 09:46:34.929000+00:00,313,User,Operator,483,330,UNDEFINED,1645523101,11,True,3,2,2.917403,0,0,0
2,7,2022-02-22 09:47:30.319000+00:00,2022-02-22 09:48:37.554000+00:00,313,User,Operator,483,330,UNDEFINED,1645523250,7,True,3,2,0.423666,0,0,0
3,11,2022-02-22 09:49:56.298000+00:00,2022-02-22 09:51:14.406000+00:00,313,User,Operator,483,330,UNDEFINED,1645523396,11,True,3,2,1.100145,0,0,0
4,47,2022-02-22 09:52:57.305000+00:00,2022-02-22 09:55:59.993000+00:00,313,User,Operator,483,330,UNDEFINED,1645523577,47,True,3,2,4.70161,0,0,0


In [96]:
events_df.head()

Unnamed: 0,jobId,timestamp_start,total_copies_requested,LED,bars,drops,dithering,deadPixelsOffset,operator_level,operator,...,exposureTime_manualLighting_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,enable_specialSubstrate_remoteScannerRegistration,mode_remoteScannerRegistration,jobState,timestamp_end,total_copies,varnishConsumptionVarnish_3d
0,1645522997,2022-02-22 09:43:18.116647800+00:00,6,50,"[1, 2]",4,False,0,Operator,User,...,0,1500,24,25,False,3,SUCCESS,2022-02-22 09:44:33.389402800+00:00,6,4.585923
1,1645523101,2022-02-22 09:45:01.304103300+00:00,11,50,"[1, 2]",4,False,0,Operator,User,...,0,1500,24,25,False,3,SUCCESS,2022-02-22 09:46:34.929092800+00:00,11,2.917403
2,1645523250,2022-02-22 09:47:30.319733400+00:00,7,50,"[1, 2]",4,False,0,Operator,User,...,0,1500,24,25,False,3,SUCCESS,2022-02-22 09:48:37.554887700+00:00,7,0.423666
3,1645523396,2022-02-22 09:49:56.298588500+00:00,11,50,"[1, 2]",4,False,0,Operator,User,...,0,1500,24,25,False,3,SUCCESS,2022-02-22 09:51:14.406099700+00:00,11,1.100145
4,1645523577,2022-02-22 09:52:57.305045100+00:00,47,50,"[1, 2]",4,False,0,Operator,User,...,0,1500,24,25,False,3,SUCCESS,2022-02-22 09:55:59.993608800+00:00,47,4.70161


In [97]:
events_df.shape

(37398, 44)

In [98]:
jobs_df.shape

(37299, 18)

In [99]:
# on supprime les lignes dans events_df qui n'ont pas de JobId qui coïncident avec 'id_on_machine' de jobs_df
events_df = events_df[events_df['jobId'].isin(jobs_df['id_on_machine'])]

In [100]:
events_df.shape

(37299, 44)

In [101]:
# # on liste les colonnes des deux dataframes contenant toutes leurs valeurs identiques
# col_with_duplicates = []
# for col in common_cols :
#     if (events_df[col].values == jobs_df[col].values).all():
#         col_with_duplicates.append(col)
# col_with_duplicates
col_with_duplicates = []
for col in common_cols:
    # Vérifier si les deux colonnes ont la même longueur
    if len(events_df[col]) == len(jobs_df[col]):
        # Si elles ont la même longueur, effectuer la comparaison
        if (events_df[col].values == jobs_df[col].values).all():
            col_with_duplicates.append(col)
    else:
        print(f"Les colonnes {col} n'ont pas la même longueur.")
col_with_duplicates

['operator', 'operator_level', 'paperName']

In [102]:
# suppression des colonnes de jobs en doublons dans events
jobs_df = jobs_df.drop(col_with_duplicates, axis=1)

### 4. Fusion des dataframes

In [103]:
merge_df = pd.merge(jobs_df, events_df, left_on='id_on_machine', right_on='jobId', suffixes=['_job', '_event'])
merge_df = merge_df.drop('id_on_machine', axis=1)

### 5. Vérification

In [104]:
# on verifie que les jobId sont uniques
merge_df.jobId.duplicated().any()

False

In [105]:
merge_df.head()

Unnamed: 0,total_copies_job,started_at,ended_at,speed_job,paperHeight_job,paperWidth_job,total_copies_requested_job,ifoil_job,scanner_mode,bars_job,...,exposureTime_manualLighting_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,enable_specialSubstrate_remoteScannerRegistration,mode_remoteScannerRegistration,jobState,timestamp_end,total_copies_event,varnishConsumptionVarnish_3d_event
0,6,2022-02-22 09:43:18.116000+00:00,2022-02-22 09:44:33.389000+00:00,313,483,330,6,True,3,2,...,0,1500,24,25,False,3,SUCCESS,2022-02-22 09:44:33.389402800+00:00,6,4.585923
1,11,2022-02-22 09:45:01.304000+00:00,2022-02-22 09:46:34.929000+00:00,313,483,330,11,True,3,2,...,0,1500,24,25,False,3,SUCCESS,2022-02-22 09:46:34.929092800+00:00,11,2.917403
2,7,2022-02-22 09:47:30.319000+00:00,2022-02-22 09:48:37.554000+00:00,313,483,330,7,True,3,2,...,0,1500,24,25,False,3,SUCCESS,2022-02-22 09:48:37.554887700+00:00,7,0.423666
3,11,2022-02-22 09:49:56.298000+00:00,2022-02-22 09:51:14.406000+00:00,313,483,330,11,True,3,2,...,0,1500,24,25,False,3,SUCCESS,2022-02-22 09:51:14.406099700+00:00,11,1.100145
4,47,2022-02-22 09:52:57.305000+00:00,2022-02-22 09:55:59.993000+00:00,313,483,330,47,True,3,2,...,0,1500,24,25,False,3,SUCCESS,2022-02-22 09:55:59.993608800+00:00,47,4.70161


In [106]:
merge_df.columns

Index(['total_copies_job', 'started_at', 'ended_at', 'speed_job',
       'paperHeight_job', 'paperWidth_job', 'total_copies_requested_job',
       'ifoil_job', 'scanner_mode', 'bars_job',
       'varnishConsumptionVarnish_3d_job', 'run', 'total_run',
       'copies_per_run', 'jobId', 'timestamp_start',
       'total_copies_requested_event', 'LED', 'bars_event', 'drops',
       'dithering', 'deadPixelsOffset', 'operator_level', 'operator',
       'speed_ifoil', 'ifoil_event', 'optifoil_ifoil', 'vacuumIn_ifoil',
       'vacuumOut_ifoil', 'stampAreas_ifoil', 'heater1Enabled_ifoil',
       'speedTensionIn_ifoil', 'speedTensionOut_ifoil',
       'heater1Temperature_ifoil', 'x_imageLayout', 'y_imageLayout',
       'paperName', 'paperWidth_event', 'paperHeight_event', 'speed_event',
       'power_irDryers', 'power_uvDryers',
       'redScore_gridMode_remoteScannerRegistration',
       'redScore_cropmarksMode_remoteScannerRegistration',
       'x_cropmark1_cropmarksMode_remoteScannerRegistrati

In [107]:
# id temoin
check_jobid = 1645522984
# détail de la ligne témoin dans chaque dataset
jobs_check_line = jobs_df[jobs_df.id_on_machine == check_jobid]
events_check_line = events_df[events_df.jobId == check_jobid]
merge_check_line = merge_df[merge_df.jobId == check_jobid]

In [108]:
jobs_check_line

Unnamed: 0,total_copies,started_at,ended_at,speed,paperHeight,paperWidth,id_on_machine,total_copies_requested,ifoil,scanner_mode,bars,varnishConsumptionVarnish_3d,run,total_run,copies_per_run


In [109]:
events_check_line

Unnamed: 0,jobId,timestamp_start,total_copies_requested,LED,bars,drops,dithering,deadPixelsOffset,operator_level,operator,...,exposureTime_manualLighting_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,enable_specialSubstrate_remoteScannerRegistration,mode_remoteScannerRegistration,jobState,timestamp_end,total_copies,varnishConsumptionVarnish_3d


In [110]:
merge_check_line

Unnamed: 0,total_copies_job,started_at,ended_at,speed_job,paperHeight_job,paperWidth_job,total_copies_requested_job,ifoil_job,scanner_mode,bars_job,...,exposureTime_manualLighting_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,enable_specialSubstrate_remoteScannerRegistration,mode_remoteScannerRegistration,jobState,timestamp_end,total_copies_event,varnishConsumptionVarnish_3d_event


In [111]:
# on conserve les colonnes started_at et ended_at
for index, row in merge_df.iterrows():
    # on verifie que started_at à le datetime le plus petit
    if row.timestamp_start < row.started_at :
        row.started_at = row.timestamp_start
    # on verifie que ended_at à le datetime le plus grand
    if row.timestamp_end > row.ended_at :
        row.ended_at = row.timestamp_end

In [112]:
merge_df = merge_df.drop(['timestamp_start','timestamp_end'], axis=1)

In [113]:
# # conversion des colonnes contenant des valeurs de temsps au format datetime
# jobs_df['started_at'] = pd.to_datetime(jobs_df['started_at'], utc=True)
# jobs_df['ended_at'] = pd.to_datetime(jobs_df['ended_at'], utc=True)
# events_df['timestamp_start'] = pd.to_datetime(events_df['timestamp_start'], utc=True)
# events_df['timestamp_end'] = pd.to_datetime(events_df['timestamp_end'], utc=True)

### 6. Output csv

In [114]:
# sauvegarde du dataset en csv
merge_df.to_csv(save_csv)

In [115]:
merge_df.head(3)

Unnamed: 0,total_copies_job,started_at,ended_at,speed_job,paperHeight_job,paperWidth_job,total_copies_requested_job,ifoil_job,scanner_mode,bars_job,...,y_cropmark2_cropmarksMode_remoteScannerRegistration,exposureTime_manualLighting_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,enable_specialSubstrate_remoteScannerRegistration,mode_remoteScannerRegistration,jobState,total_copies_event,varnishConsumptionVarnish_3d_event
0,6,2022-02-22 09:43:18.116000+00:00,2022-02-22 09:44:33.389000+00:00,313,483,330,6,True,3,2,...,0,0,1500,24,25,False,3,SUCCESS,6,4.585923
1,11,2022-02-22 09:45:01.304000+00:00,2022-02-22 09:46:34.929000+00:00,313,483,330,11,True,3,2,...,0,0,1500,24,25,False,3,SUCCESS,11,2.917403
2,7,2022-02-22 09:47:30.319000+00:00,2022-02-22 09:48:37.554000+00:00,313,483,330,7,True,3,2,...,0,0,1500,24,25,False,3,SUCCESS,7,0.423666


In [116]:
for col in merge_df.columns:
    unique_values = merge_df[col].nunique()
    print(f"Column {col} has {unique_values} unique values.")


Column total_copies_job has 175 unique values.
Column started_at has 37299 unique values.
Column ended_at has 37299 unique values.
Column speed_job has 56 unique values.
Column paperHeight_job has 11 unique values.
Column paperWidth_job has 11 unique values.
Column total_copies_requested_job has 179 unique values.
Column ifoil_job has 2 unique values.
Column scanner_mode has 4 unique values.
Column bars_job has 2 unique values.
Column varnishConsumptionVarnish_3d_job has 34125 unique values.
Column run has 2 unique values.
Column total_run has 2 unique values.
Column copies_per_run has 107 unique values.
Column jobId has 37299 unique values.
Column total_copies_requested_event has 179 unique values.
Column LED has 20 unique values.
Column bars_event has 3 unique values.
Column drops has 8 unique values.
Column dithering has 2 unique values.
Column deadPixelsOffset has 5 unique values.
Column operator_level has 2 unique values.
Column operator has 2 unique values.
Column speed_ifoil has

In [117]:
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37299 entries, 0 to 37298
Data columns (total 56 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   total_copies_job                                       37299 non-null  int64  
 1   started_at                                             37299 non-null  object 
 2   ended_at                                               37299 non-null  object 
 3   speed_job                                              37299 non-null  int64  
 4   paperHeight_job                                        37299 non-null  int64  
 5   paperWidth_job                                         37299 non-null  int64  
 6   total_copies_requested_job                             37299 non-null  int64  
 7   ifoil_job                                              37299 non-null  bool   
 8   scanner_mode                                  

# C. Visualisation

In [118]:
df = merge_df.copy()

In [119]:
df.jobState.unique()

array(['SUCCESS', 'ERROR', 'CANCELED', 'UNDEFINED'], dtype=object)

In [120]:
df.columns

Index(['total_copies_job', 'started_at', 'ended_at', 'speed_job',
       'paperHeight_job', 'paperWidth_job', 'total_copies_requested_job',
       'ifoil_job', 'scanner_mode', 'bars_job',
       'varnishConsumptionVarnish_3d_job', 'run', 'total_run',
       'copies_per_run', 'jobId', 'total_copies_requested_event', 'LED',
       'bars_event', 'drops', 'dithering', 'deadPixelsOffset',
       'operator_level', 'operator', 'speed_ifoil', 'ifoil_event',
       'optifoil_ifoil', 'vacuumIn_ifoil', 'vacuumOut_ifoil',
       'stampAreas_ifoil', 'heater1Enabled_ifoil', 'speedTensionIn_ifoil',
       'speedTensionOut_ifoil', 'heater1Temperature_ifoil', 'x_imageLayout',
       'y_imageLayout', 'paperName', 'paperWidth_event', 'paperHeight_event',
       'speed_event', 'power_irDryers', 'power_uvDryers',
       'redScore_gridMode_remoteScannerRegistration',
       'redScore_cropmarksMode_remoteScannerRegistration',
       'x_cropmark1_cropmarksMode_remoteScannerRegistration',
       'y_cropmark1_c

In [121]:
# Colonnes de events_df_17
df17 = ['total_copies_job', 'started_at', 'ended_at', 'paperHeight_job',
       'paperWidth_job', 'scanner_mode', 'bars_job',
       'varnishConsumptionVarnish_3d_job', 'run', 'total_run',
       'copies_per_run', 'jobId', 'total_copies_requested', 'LED',
       'bars_event', 'drops', 'dithering', 'deadPixelsOffset',
       'operator_level', 'operator', 'speed_ifoil', 'ifoil', 'optifoil_ifoil',
       'stampAreas_ifoil', 'heater1Enabled_ifoil', 'speedTensionIn_ifoil',
       'speedTensionOut_ifoil', 'heater1Temperature_ifoil', 'x_imageLayout',
       'y_imageLayout', 'paperName', 'paperWidth_event', 'paperHeight_event',
       'speed', 'power_irDryers', 'power_uvDryers',
       'redScore_gridMode_remoteScannerRegistration',
       'redScore_cropmarksMode_remoteScannerRegistration',
       'x_cropmark1_cropmarksMode_remoteScannerRegistration',
       'y_cropmark1_cropmarksMode_remoteScannerRegistration',
       'x_cropmark2_cropmarksMode_remoteScannerRegistration',
       'y_cropmark2_cropmarksMode_remoteScannerRegistration',
       'exposureTime_manualLighting_remoteScannerRegistration',
       'redScore_fullScannerMode_remoteScannerRegistration',
       'blueScore_fullScannerMode_remoteScannerRegistration',
       'greenScore_fullScannerMode_remoteScannerRegistration',
       'enable_specialSubstrate_remoteScannerRegistration',
       'mode_remoteScannerRegistration', 'jobState', 'total_copies_event',
       'varnishConsumptionVarnish_3d_event']

# Colonnes de events_df_18
df18 = ['total_copies_job', 'started_at', 'ended_at', 'speed_job',
       'paperHeight_job', 'paperWidth_job', 'total_copies_requested_job',
       'ifoil_job', 'scanner_mode', 'bars_job',
       'varnishConsumptionVarnish_3d_job', 'run', 'total_run',
       'copies_per_run', 'jobId', 'total_copies_requested_event', 'LED',
       'bars_event', 'drops', 'dithering', 'deadPixelsOffset',
       'operator_level', 'operator', 'speed_ifoil', 'ifoil_event',
       'optifoil_ifoil', 'vacuumIn_ifoil', 'vacuumOut_ifoil',
       'stampAreas_ifoil', 'heater1Enabled_ifoil', 'speedTensionIn_ifoil',
       'speedTensionOut_ifoil', 'heater1Temperature_ifoil', 'x_imageLayout',
       'y_imageLayout', 'paperName', 'paperWidth_event', 'paperHeight_event',
       'speed_event', 'power_irDryers', 'power_uvDryers',
       'redScore_gridMode_remoteScannerRegistration',
       'redScore_cropmarksMode_remoteScannerRegistration',
       'x_cropmark1_cropmarksMode_remoteScannerRegistration',
       'y_cropmark1_cropmarksMode_remoteScannerRegistration',
       'x_cropmark2_cropmarksMode_remoteScannerRegistration',
       'y_cropmark2_cropmarksMode_remoteScannerRegistration',
       'exposureTime_manualLighting_remoteScannerRegistration',
       'redScore_fullScannerMode_remoteScannerRegistration',
       'blueScore_fullScannerMode_remoteScannerRegistration',
       'greenScore_fullScannerMode_remoteScannerRegistration',
       'enable_specialSubstrate_remoteScannerRegistration',
       'mode_remoteScannerRegistration', 'jobState', 'total_copies_event',
       'varnishConsumptionVarnish_3d_event']

In [122]:
# df14 et df17 sont vos listes de colonnes

# Convertir les listes en ensembles
colonnes_df17 = set(df17)
colonnes_df18 = set(df18)

# Trouver les colonnes uniques à df17
colonnes_uniques_df17 = colonnes_df17 - colonnes_df18
print("Colonnes uniques à df17 (machine 17) :")
print(colonnes_uniques_df17)

# Trouver les colonnes uniques à df14
colonnes_uniques_df18 = colonnes_df18 - colonnes_df17
print("\nColonnes uniques à df18 (machine 18) :")
print(colonnes_uniques_df18)


Colonnes uniques à df17 (machine 17) :
{'ifoil', 'total_copies_requested', 'speed'}

Colonnes uniques à df18 (machine 18) :
{'speed_event', 'ifoil_job', 'total_copies_requested_job', 'vacuumIn_ifoil', 'vacuumOut_ifoil', 'total_copies_requested_event', 'ifoil_event', 'speed_job'}
