# 08 - Création du dataset de données traitées pour l'entraînement <br> avec filtre des évènements par id
(sans encodage, ni normalisation)

Ce notebook génère 1 csv :

- dataset_for_training_id_events_filtered_08.csv : analyse et nettoyage des variables explicatives

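Étapes :

- A) Imports et chargement des données
- B) Analyse des variables
- C) Nettoyage
- D) Output (export du csv)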




## A) Imports

In [95]:
import os
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
# from ydata_profiling import ProfileReport # pip install ydata_profiling ipywidgets
import matplotlib.pyplot as plt

In [96]:
# Data directory, input file name and output file path.
# The output path is derived from `path` so that both files always live in
# the same directory if `path` is ever changed.
path = '../data/'
data = 'dataset_for_preprocess_id_events_filtered_07.csv'
save_csv = os.path.join(path, 'dataset_for_training_id_events_filtered_08.csv')

In [97]:
# Load the input CSV into a DataFrame; the first CSV column is used as index
df = pd.read_csv(
    os.path.join(path, data),
    header=0,
    index_col=0,
    parse_dates=True,
)
# Column names, non-null counts and dtypes of the loaded data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2336 entries, 0 to 2335
Data columns (total 60 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   total_copies_job                                       2336 non-null   int64  
 1   started_at                                             2336 non-null   object 
 2   ended_at                                               2336 non-null   object 
 3   speed_job                                              2336 non-null   int64  
 4   paperHeight_job                                        2336 non-null   int64  
 5   paperWidth_job                                         2336 non-null   int64  
 6   total_copies_requested_job                             2336 non-null   int64  
 7   ifoil_job                                              2336 non-null   bool   
 8   scanner_mode                                    

In [98]:
# List the column names of the loaded DataFrame
df.columns

Index(['total_copies_job', 'started_at', 'ended_at', 'speed_job',
       'paperHeight_job', 'paperWidth_job', 'total_copies_requested_job',
       'ifoil_job', 'scanner_mode', 'bars_job',
       'varnishConsumptionVarnish_3d_job', 'run', 'total_run',
       'copies_per_run', 'jobId', 'total_copies_requested_event', 'LED',
       'bars_event', 'drops', 'dithering', 'deadPixelsOffset',
       'operator_level', 'operator', 'speed_ifoil', 'ifoil_event',
       'optifoil_ifoil', 'vacuumIn_ifoil', 'vacuumOut_ifoil',
       'stampAreas_ifoil', 'heater1Enabled_ifoil', 'speedTensionIn_ifoil',
       'speedTensionOut_ifoil', 'heater1Temperature_ifoil', 'x_imageLayout',
       'y_imageLayout', 'paperName', 'paperWidth_event', 'paperHeight_event',
       'speed_event', 'power_irDryers', 'power_uvDryers',
       'redScore_gridMode_remoteScannerRegistration',
       'redScore_cropmarksMode_remoteScannerRegistration',
       'x_cropmark1_cropmarksMode_remoteScannerRegistration',
       'y_cropmark1_c

In [99]:
# Columns whose values we want to inspect
colonnes_a_verifier = [
    'x_cropmark2_cropmarksMode_remoteScannerRegistration',
    'speedTensionOut_ifoil',
    'y_cropmark1_cropmarksMode_remoteScannerRegistration',
    'enable_specialSubstrate_remoteScannerRegistration',
    'x_cropmark1_cropmarksMode_remoteScannerRegistration',
    'y_cropmark2_cropmarksMode_remoteScannerRegistration',
    'exposureTime_manualLighting_remoteScannerRegistration',
]

# For each column: the distinct values, then how often each value occurs
for col in colonnes_a_verifier:
    print(
        f"Valeurs uniques pour {col}: {df[col].unique()}",
        f"Répartition des valeurs pour {col}:",
        df[col].value_counts(),
        "\n",
        sep="\n",
    )

Valeurs uniques pour x_cropmark2_cropmarksMode_remoteScannerRegistration: [   0 4499 4492 4507 4415 4496]
Répartition des valeurs pour x_cropmark2_cropmarksMode_remoteScannerRegistration:
0       1519
4492     672
4507     122
4496      21
4499       1
4415       1
Name: x_cropmark2_cropmarksMode_remoteScannerRegistration, dtype: int64


Valeurs uniques pour speedTensionOut_ifoil: [1.  0.5 0.6 0.2]
Répartition des valeurs pour speedTensionOut_ifoil:
1.0    1877
0.2     438
0.6      12
0.5       9
Name: speedTensionOut_ifoil, dtype: int64


Valeurs uniques pour y_cropmark1_cropmarksMode_remoteScannerRegistration: [  0 141 145 149 129]
Répartition des valeurs pour y_cropmark1_cropmarksMode_remoteScannerRegistration:
0      1519
145     693
149     120
141       3
129       1
Name: y_cropmark1_cropmarksMode_remoteScannerRegistration, dtype: int64


Valeurs uniques pour enable_specialSubstrate_remoteScannerRegistration: [False]
Répartition des valeurs pour enable_specialSubstrate_remoteSca

## B) Analyse des variables

In [100]:
# Replace the current index with a fresh 0..n-1 range (old index discarded)
df = df.reset_index(drop=True)

In [101]:
# Remove the metrics columns.
# A longer candidate list was left commented out in the original cell:
#   status, source_events, timestamp, criticality_events, name_modules,
#   type_modules, generation_modules, value_counters_modules,
#   name_counters_modules, name_connected_operators,
#   level_connected_operators, varnishLevelsTargetvolume,
#   varnishLevelsTotalvolume
metrics_cols = ['timestamp', 'criticality_events', 'index']
# `columns=` already selects the axis, so no `axis` argument is passed
df = df.drop(columns=metrics_cols)

In [102]:
# Remove the job-event columns that duplicate columns coming from jobs
job_events_cols = [
    'bars_event',
    'paperWidth_event',
    'paperHeight_event',
    'varnishConsumptionVarnish_3d_event',
]
# `columns=` already selects the axis, so no `axis` argument is passed
df = df.drop(columns=job_events_cols)

In [103]:
# Remove the categorical columns
cat_cols = ['operator_level', 'operator', 'paperName', 'jobState', 'jobId']
# `columns=` already selects the axis, so no `axis` argument is passed
df = df.drop(columns=cat_cols)

In [104]:
# Replace the two timestamp columns with a single `duration` column:
# the elapsed time between `started_at` and `ended_at`, in seconds.
df = (
    df.assign(
        duration=lambda frame: (
            pd.to_datetime(frame['ended_at']) - pd.to_datetime(frame['started_at'])
        ).dt.total_seconds()
    )
    .drop(columns=['started_at', 'ended_at'])
)

In [105]:
# Preview the first 3 rows
df.head(3)

Unnamed: 0,total_copies_job,speed_job,paperHeight_job,paperWidth_job,total_copies_requested_job,ifoil_job,scanner_mode,bars_job,varnishConsumptionVarnish_3d_job,run,total_run,copies_per_run,total_copies_requested_event,LED,drops,dithering,deadPixelsOffset,speed_ifoil,ifoil_event,optifoil_ifoil,vacuumIn_ifoil,vacuumOut_ifoil,stampAreas_ifoil,heater1Enabled_ifoil,speedTensionIn_ifoil,speedTensionOut_ifoil,heater1Temperature_ifoil,x_imageLayout,y_imageLayout,speed_event,power_irDryers,power_uvDryers,redScore_gridMode_remoteScannerRegistration,redScore_cropmarksMode_remoteScannerRegistration,x_cropmark1_cropmarksMode_remoteScannerRegistration,y_cropmark1_cropmarksMode_remoteScannerRegistration,x_cropmark2_cropmarksMode_remoteScannerRegistration,y_cropmark2_cropmarksMode_remoteScannerRegistration,exposureTime_manualLighting_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,enable_specialSubstrate_remoteScannerRegistration,mode_remoteScannerRegistration,total_copies_event,identification_events,duration
0,24,313,483,330,26,True,3,2,1.549901,0,0,0,26,40,4,False,1,20.0,True,False,100,100,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,False,3,24,454,112.035
1,6,313,483,330,26,True,3,2,0.350555,0,0,0,26,40,4,False,1,20.0,True,False,100,100,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,False,3,6,454,54.043
2,3,313,483,330,26,True,3,2,0.192394,0,0,0,26,40,4,False,1,20.0,True,False,100,100,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,False,3,3,454,64.875


### a) Profiling report

In [106]:
# profile = ProfileReport(df, title="Profiling Report")
# profile.to_notebook_iframe()

## C) Nettoyage

In [107]:
# Preview the first rows at the start of the cleaning section
df.head()

Unnamed: 0,total_copies_job,speed_job,paperHeight_job,paperWidth_job,total_copies_requested_job,ifoil_job,scanner_mode,bars_job,varnishConsumptionVarnish_3d_job,run,total_run,copies_per_run,total_copies_requested_event,LED,drops,dithering,deadPixelsOffset,speed_ifoil,ifoil_event,optifoil_ifoil,vacuumIn_ifoil,vacuumOut_ifoil,stampAreas_ifoil,heater1Enabled_ifoil,speedTensionIn_ifoil,speedTensionOut_ifoil,heater1Temperature_ifoil,x_imageLayout,y_imageLayout,speed_event,power_irDryers,power_uvDryers,redScore_gridMode_remoteScannerRegistration,redScore_cropmarksMode_remoteScannerRegistration,x_cropmark1_cropmarksMode_remoteScannerRegistration,y_cropmark1_cropmarksMode_remoteScannerRegistration,x_cropmark2_cropmarksMode_remoteScannerRegistration,y_cropmark2_cropmarksMode_remoteScannerRegistration,exposureTime_manualLighting_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,enable_specialSubstrate_remoteScannerRegistration,mode_remoteScannerRegistration,total_copies_event,identification_events,duration
0,24,313,483,330,26,True,3,2,1.549901,0,0,0,26,40,4,False,1,20.0,True,False,100,100,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,False,3,24,454,112.035
1,6,313,483,330,26,True,3,2,0.350555,0,0,0,26,40,4,False,1,20.0,True,False,100,100,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,False,3,6,454,54.043
2,3,313,483,330,26,True,3,2,0.192394,0,0,0,26,40,4,False,1,20.0,True,False,100,100,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,False,3,3,454,64.875
3,8,418,483,330,14,True,3,2,1.265435,0,0,0,14,50,3,False,0,26.0,True,False,100,100,"[{'id': 1, 'end': 483, 'start': 1, 'height': 4...",True,-0.6,1.0,90,1492,-16,418,20,54,1500,1500,0,0,0,0,0,1500,24,25,False,3,8,454,62.011
4,2,418,483,330,5,True,3,2,0.319629,0,0,0,5,50,3,False,0,24.0,True,False,100,100,"[{'id': 1, 'end': 483, 'start': 1, 'height': 4...",True,-0.6,1.0,90,1492,-16,418,20,54,1500,1500,0,0,0,0,0,1500,24,25,False,3,2,454,64.731


In [108]:
# Columns to check
colonnes_a_verifier = ['dithering', 'deadPixelsOffset', 'speedTensionIn_ifoil']

# Number of distinct values in each listed column
nombres_uniques = df.loc[:, colonnes_a_verifier].nunique()
print(nombres_uniques)

dithering               2
deadPixelsOffset        2
speedTensionIn_ifoil    9
dtype: int64


In [109]:
# Columns whose distinct values we want to see
colonnes_a_verifier = [
    'dithering',
    'deadPixelsOffset',
    'speedTensionIn_ifoil',
]

# One line per column, listing its distinct values
for col in colonnes_a_verifier:
    print(f"Valeurs uniques pour {col}: {df.loc[:, col].unique()}")


Valeurs uniques pour dithering: [False  True]
Valeurs uniques pour deadPixelsOffset: [1 0]
Valeurs uniques pour speedTensionIn_ifoil: [-0.6  0.   0.9 -1.   0.5 -0.4 -0.2 -0.5  0.2]


In [110]:
# Columns whose value distribution we want to see
colonnes_a_verifier = [
    'dithering',
    'deadPixelsOffset',
    'speedTensionIn_ifoil',
]

# For each column: the distinct values, then how often each value occurs
for col in colonnes_a_verifier:
    print(
        f"Valeurs uniques pour {col}: {df[col].unique()}",
        f"Répartition des valeurs pour {col}:",
        df[col].value_counts(),
        "\n",
        sep="\n",
    )


Valeurs uniques pour dithering: [False  True]
Répartition des valeurs pour dithering:
False    2118
True      218
Name: dithering, dtype: int64


Valeurs uniques pour deadPixelsOffset: [1 0]
Répartition des valeurs pour deadPixelsOffset:
1    1224
0    1112
Name: deadPixelsOffset, dtype: int64


Valeurs uniques pour speedTensionIn_ifoil: [-0.6  0.   0.9 -1.   0.5 -0.4 -0.2 -0.5  0.2]
Répartition des valeurs pour speedTensionIn_ifoil:
-0.6    1761
-0.2     437
-1.0      95
-0.4      25
 0.5       8
 0.0       6
 0.9       2
-0.5       1
 0.2       1
Name: speedTensionIn_ifoil, dtype: int64




In [111]:
# Remove the columns that hold a single distinct value.
# The list is computed from the data rather than written by hand.
constant_cols = [
    name for name, n_unique in df.nunique().items() if n_unique == 1
]
print(constant_cols)
# `columns=` already selects the axis, so no `axis` argument is passed
df = df.drop(columns=constant_cols)

['optifoil_ifoil', 'vacuumIn_ifoil', 'vacuumOut_ifoil', 'enable_specialSubstrate_remoteScannerRegistration']


In [112]:
# Preview the first 5 rows after the constant columns were removed
df.head(5)

Unnamed: 0,total_copies_job,speed_job,paperHeight_job,paperWidth_job,total_copies_requested_job,ifoil_job,scanner_mode,bars_job,varnishConsumptionVarnish_3d_job,run,total_run,copies_per_run,total_copies_requested_event,LED,drops,dithering,deadPixelsOffset,speed_ifoil,ifoil_event,stampAreas_ifoil,heater1Enabled_ifoil,speedTensionIn_ifoil,speedTensionOut_ifoil,heater1Temperature_ifoil,x_imageLayout,y_imageLayout,speed_event,power_irDryers,power_uvDryers,redScore_gridMode_remoteScannerRegistration,redScore_cropmarksMode_remoteScannerRegistration,x_cropmark1_cropmarksMode_remoteScannerRegistration,y_cropmark1_cropmarksMode_remoteScannerRegistration,x_cropmark2_cropmarksMode_remoteScannerRegistration,y_cropmark2_cropmarksMode_remoteScannerRegistration,exposureTime_manualLighting_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,mode_remoteScannerRegistration,total_copies_event,identification_events,duration
0,24,313,483,330,26,True,3,2,1.549901,0,0,0,26,40,4,False,1,20.0,True,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,3,24,454,112.035
1,6,313,483,330,26,True,3,2,0.350555,0,0,0,26,40,4,False,1,20.0,True,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,3,6,454,54.043
2,3,313,483,330,26,True,3,2,0.192394,0,0,0,26,40,4,False,1,20.0,True,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,3,3,454,64.875
3,8,418,483,330,14,True,3,2,1.265435,0,0,0,14,50,3,False,0,26.0,True,"[{'id': 1, 'end': 483, 'start': 1, 'height': 4...",True,-0.6,1.0,90,1492,-16,418,20,54,1500,1500,0,0,0,0,0,1500,24,25,3,8,454,62.011
4,2,418,483,330,5,True,3,2,0.319629,0,0,0,5,50,3,False,0,24.0,True,"[{'id': 1, 'end': 483, 'start': 1, 'height': 4...",True,-0.6,1.0,90,1492,-16,418,20,54,1500,1500,0,0,0,0,0,1500,24,25,3,2,454,64.731


In [113]:
# True only if 'total_copies_job' and 'total_copies_event' match on every row
same_col = df['total_copies_job'].eq(df['total_copies_event']).all()
print("Les colonnes sont-elles identiques ?", same_col)


Les colonnes sont-elles identiques ? True


In [114]:
# (rows, columns) before 'total_copies_event' is dropped
df.shape

(2336, 43)

In [115]:
# 'total_copies_event' may only be dropped while it duplicates
# 'total_copies_job'. Enforce that here so that a future input file where the
# two columns differ fails loudly instead of silently losing information.
if not (df['total_copies_job'] == df['total_copies_event']).all():
    raise ValueError(
        "'total_copies_event' differs from 'total_copies_job'; refusing to drop it"
    )
df = df.drop(columns='total_copies_event')

# Strip the `_job` suffix from these four job-level columns
df = df.rename(
    columns={
        'total_copies_job': 'total_copies',
        'ifoil_job': 'ifoil',
        'speed_job': 'speed',
        'total_copies_requested_job': 'total_copies_requested',
    }
)




In [116]:
# Preview the first rows after the drop and the renames
df.head()

Unnamed: 0,total_copies,speed,paperHeight_job,paperWidth_job,total_copies_requested,ifoil,scanner_mode,bars_job,varnishConsumptionVarnish_3d_job,run,total_run,copies_per_run,total_copies_requested_event,LED,drops,dithering,deadPixelsOffset,speed_ifoil,ifoil_event,stampAreas_ifoil,heater1Enabled_ifoil,speedTensionIn_ifoil,speedTensionOut_ifoil,heater1Temperature_ifoil,x_imageLayout,y_imageLayout,speed_event,power_irDryers,power_uvDryers,redScore_gridMode_remoteScannerRegistration,redScore_cropmarksMode_remoteScannerRegistration,x_cropmark1_cropmarksMode_remoteScannerRegistration,y_cropmark1_cropmarksMode_remoteScannerRegistration,x_cropmark2_cropmarksMode_remoteScannerRegistration,y_cropmark2_cropmarksMode_remoteScannerRegistration,exposureTime_manualLighting_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,mode_remoteScannerRegistration,identification_events,duration
0,24,313,483,330,26,True,3,2,1.549901,0,0,0,26,40,4,False,1,20.0,True,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,3,454,112.035
1,6,313,483,330,26,True,3,2,0.350555,0,0,0,26,40,4,False,1,20.0,True,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,3,454,54.043
2,3,313,483,330,26,True,3,2,0.192394,0,0,0,26,40,4,False,1,20.0,True,"[{'id': 1, 'end': 483, 'start': 0, 'height': 4...",True,-0.6,1.0,95,1487,-69,313,20,60,1500,1500,0,0,0,0,100,1500,24,25,3,454,64.875
3,8,418,483,330,14,True,3,2,1.265435,0,0,0,14,50,3,False,0,26.0,True,"[{'id': 1, 'end': 483, 'start': 1, 'height': 4...",True,-0.6,1.0,90,1492,-16,418,20,54,1500,1500,0,0,0,0,0,1500,24,25,3,454,62.011
4,2,418,483,330,5,True,3,2,0.319629,0,0,0,5,50,3,False,0,24.0,True,"[{'id': 1, 'end': 483, 'start': 1, 'height': 4...",True,-0.6,1.0,90,1492,-16,418,20,54,1500,1500,0,0,0,0,0,1500,24,25,3,454,64.731


In [117]:
# (rows, columns) after 'total_copies_event' was dropped
df.shape

(2336, 42)

Données disponibles au démarrage d'un job :
- 1  thumbnail* : chemin vignette projet
- 2  total_copies : nombre de copies imprimées
- 3  started_at : date de début
- 4  ended_at : date de fin
- 5  machineId* : identifiant machine
- 6  speed : vitesse d'impression
- 7  operator : opérateur
- 8  operator_level : niveau de l'opérateur
- 9  first_page_image_path_on_machine* : chemin de l'image
- 10 paperHeight : hauteur du substrat
- 11 paperWidth : largeur du substrat
- 12 paperName : appellation du substrat
- 13 paperThickness* : épaisseur du substrat (valeur unique à 0)
- 14 id_on_machine : identifiant unique du travail d'impression
- 15 total_copies_requested : nombre de copies demandées
- 16 job_thumbnail_id* : identifiant de vignette image
- 17 uses_ifoil : impression utilisant de la dorure
- 18 uses_iper* : impression nécessitant l'iper (valeur unique à True)
- 19 scanner_mode : niveau de config du scanner
- 20 iper_bvar_count : compteur # TODO vérifier la fusion notebook 06
- 21 varnishConsumptionVarnish_3d : consommation de vernis en 3d
- 22 varnishConsumptionVarnish_2d* : consommation de vernis en 2d

*\*variables déjà supprimées dans le notebook 05 ou 06*

In [118]:
# Columns to keep that are available in jobs
jobs_cols_to_keep = [
    'duration', 'total_copies', 'speed',
    'paperHeight_job', 'paperWidth_job',
    'total_copies_requested', 'ifoil', 'scanner_mode',
    'bars_job', 'varnishConsumptionVarnish_3d_job',
]

# Columns to keep that are available in job events with the "start" tag
jobevents_cols_to_keep = [
    'LED', 'drops',
    'speed_ifoil', 'stampAreas_ifoil',
    'heater1Enabled_ifoil', 'heater1Temperature_ifoil',
    'x_imageLayout', 'y_imageLayout',
    'power_irDryers', 'power_uvDryers',
    'redScore_gridMode_remoteScannerRegistration',
    'redScore_cropmarksMode_remoteScannerRegistration',
    'redScore_fullScannerMode_remoteScannerRegistration',
    'blueScore_fullScannerMode_remoteScannerRegistration',
    'greenScore_fullScannerMode_remoteScannerRegistration',
    'mode_remoteScannerRegistration',
]

# Column to keep for the prediction
metrics_cols_to_keep = ['identification_events']

# All retained columns, in the order: jobs, job events, prediction
cols_to_keep = [
    *jobs_cols_to_keep,
    *jobevents_cols_to_keep,
    *metrics_cols_to_keep,
]

# Independent copy of the data restricted to the retained columns
df_to_encode = df.loc[:, cols_to_keep].copy()

In [119]:
# Column names, non-null counts and dtypes of the retained columns
df_to_encode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2336 entries, 0 to 2335
Data columns (total 27 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   duration                                              2336 non-null   float64
 1   total_copies                                          2336 non-null   int64  
 2   speed                                                 2336 non-null   int64  
 3   paperHeight_job                                       2336 non-null   int64  
 4   paperWidth_job                                        2336 non-null   int64  
 5   total_copies_requested                                2336 non-null   int64  
 6   ifoil                                                 2336 non-null   bool   
 7   scanner_mode                                          2336 non-null   int64  
 8   bars_job                                              2336

In [120]:
# Remove 'stampAreas_ifoil' from the retained columns.
# NOTE(review): presumably because it holds serialized lists rather than a
# scalar value (see the head() previews) — confirm.
df_to_encode = df_to_encode.drop(columns='stampAreas_ifoil')

In [121]:
# cols = df_to_encode.columns
# for col in cols:
#     if df_to_encode[col].nunique() > 50 :
#         print(col, df_to_encode[col].nunique(), 'min', df_to_encode[col].min(), 'max', df_to_encode[col].max())
#     else :
#         print(col, df_to_encode[col].unique())

In [122]:
# Distinct values of the 'identification_events' column
df_to_encode.identification_events.unique()

array([ 454,  355,  359,  386,  356, 1003,  357, 1002, 1000,  383,  354,
        387,  325,  451,  445,  328,  351,  446,  382,  476], dtype=int64)

## D) Output

In [123]:
# Write the retained columns to the output CSV.
# index=True is the pandas default, spelled out here because the row index
# becomes the first CSV column.
df_to_encode.to_csv(save_csv, index=True)