# 07 - Notebook de fusion des dataset metrics et jobs/job_events

# A. Imports

## a) Librairies

In [140]:
import os, json, ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## b) Datasets

In [141]:
# source path to datasets
path = '../data/'
metrics = 'metrics/clean_merge_metrics_dataset.csv'
jobs = 'jobs/merge_raw_jobs_and_clean_jobevents_dataset.csv'

# B. Jeux de données

### a) Metrics

In [142]:
# création d'un dataframe à partir du csv de données
metrics_df = pd.read_csv(os.path.join(path, metrics), index_col=0)
print(f'metrics dataset shape {metrics_df.shape}')

metrics dataset shape (3510431, 14)


In [143]:
# la colonne timestamp contient-elle des valeurs en double ?
metrics_df['timestamp'].duplicated().any()

True

In [144]:
metrics_df['timestamp'].min()

'2022-04-15 05:55:06.678000+00:00'

In [145]:
# Grouper les lignes par la colonne "timestamp" et obtenir les index correspondants
groupes = metrics_df.groupby('timestamp').groups
print(len(groupes.keys()))

1242037


In [146]:
metrics_df.head(3)

Unnamed: 0,source_events,timestamp,criticality_events,identification_events,name_modules,type_modules,generation_modules,name_counters_modules,value_counters_modules,name_connected_operators,level_connected_operators,status,varnishLevelsTargetvolume,varnishLevelsTotalvolume
0,iFoil,2022-04-15 06:06:56.278000+00:00,INFO,391,iFoil L,iFoil,Gen. 2,Total Pages Counter,22881,Viktor,Operator,IDLE,36192,100000
1,iFoil,2022-04-15 06:06:56.278000+00:00,INFO,391,iFoil L,iFoil,Gen. 2,Foiled Pages Counter,31092,Viktor,Operator,IDLE,36192,100000
2,PLC,2022-04-15 06:06:56.418000+00:00,INFO,330,Print Engine 1,Varnish Printer,,3D Varnish Counter,1792992,Viktor,Operator,IDLE,36192,100000


### b) Jobs

In [147]:
# création d'un dataframe à partir du csv de données
jobs_df = pd.read_csv(os.path.join(path, jobs), index_col=0)
print(f'jobs dataset shape {jobs_df.shape}')

jobs dataset shape (16295, 42)


In [148]:
# la colonne started_at contient-elle des valeurs en double ?
jobs_df['started_at'].duplicated().any()

False

In [149]:
jobs_df['started_at'].min()

'2021-06-18 09:22:46.866000+00:00'

In [150]:
jobs_df.head(3)

Unnamed: 0,started_at,ended_at,paperHeight_job,paperWidth_job,scanner_mode,bars_job,varnishConsumptionVarnish_3d_job,jobId,total_copies_requested,LED,...,leftMargin_remoteScannerRegistration,redScore_gridMode_remoteScannerRegistration,redScore_cropmarksMode_remoteScannerRegistration,redScore_fullScannerMode_remoteScannerRegistration,blueScore_fullScannerMode_remoteScannerRegistration,greenScore_fullScannerMode_remoteScannerRegistration,mode_remoteScannerRegistration,jobState,total_copies,varnishConsumptionVarnish_3d_event
0,2021-06-18 09:22:46.866000+00:00,2021-06-18 09:22:46.866000+00:00,520,740,0,0,0.0,1624008166,3,10,...,0,1500,1500,1500,16,16,1,UNDEFINED,0,0.0
1,2021-06-18 10:08:13.716000+00:00,2021-06-18 10:10:22.257000+00:00,740,520,0,0,0.0,1624010893,100,30,...,0,1500,1500,1500,16,10,1,ERROR,3,1.440239
2,2021-06-18 10:11:52.165000+00:00,2021-06-18 10:18:20.294000+00:00,740,520,0,0,0.0,1624011111,100,30,...,0,1500,1500,1500,16,10,1,CANCELED,70,33.607494


Les tailles des datasets sont déséquilibrés :

- 3510431 lignes pour metrics

- 16295 lignes pour jobs

Les dates de début sont différentes :

- '2022-04-15 05:55:06.678000+00:00' pour metrics

- '2021-06-18 09:22:46.866000+00:00' pour jobs

Le dataset metrics compte 1242037 doublons pour la colonne timestamp

# C. Equilibrage des jeux de données

## a) Réduction de jobs

In [151]:
# concordance des données de temps dans un même cadre
jobs_reduced = jobs_df[jobs_df.started_at > metrics_df.timestamp.min()]
jobs_reduced.shape

(10516, 42)

## b) Réduction de metrics

#### Par source d'évènement

In [152]:
metrics_df['source_events'].unique()

array(['iFoil', 'PLC', nan, 'Kernel', 'RCB n°1', 'RCB n°2', 'ICB n°1',
       'RCB n°3', 'ICB n°5', 'ICB n°7', 'ICB n°8', 'Pilot', 'ICB n°4',
       'ICB n°2', 'ICB n°6'], dtype=object)

In [153]:
# on se concentre sur les données de metrics qui ont une source d'évènement
metrics_by_source = metrics_df[metrics_df['source_events'].notna()]
metrics_by_source.shape

(94166, 14)

In [154]:
metrics_by_source['criticality_events'].value_counts()

INFO       68501
ERROR      12817
Name: criticality_events, dtype: int64

#### Par identifiant d'évènement

In [155]:
# lignes avec des évènements identifiant une maintenance
id_to_drop_1 = [391, 330, 377, 407, 332, 331, 313, 333, 376, 372, 344, 343, 371, 358, 334, 311, 472, 0, 408, 406, 350, 2, 352, 346]
# lignes avec des évènements identifiant une intervention humaine
id_to_drop_2 = [352, 324, 381, 440, 385, 405, 447, 388, 320, 417, 444, 329, 315, 384, 345, 349, 466, 419]

id_to_drop = id_to_drop_1 + id_to_drop_2
# on supprime les lignes avec des évènements identifiant une maintenance
metrics_by_identification = metrics_by_source[~metrics_by_source['identification_events'].isin(id_to_drop)]
metrics_by_identification.shape

(8062, 14)

In [156]:
metrics_by_identification['criticality_events'].value_counts()

ERROR      5933
INFO       1530
Name: criticality_events, dtype: int64

In [157]:
metrics_by_identification_1 = metrics_by_source[~metrics_by_source['identification_events'].isin([331,330,334,332])]
metrics_by_identification_1['criticality_events'].value_counts()

INFO       13872
ERROR      12357
Name: criticality_events, dtype: int64

In [158]:
# Grouper les lignes par la colonne "timestamp" et obtenir les index correspondants
#len(metrics_by_criticity.groupby('timestamp').groups.keys())

# D. Fusion ou concaténation des jeux de données réduits

In [159]:
metrics_reduced = metrics_by_identification_1.copy()

## a) Comparaison des dates entre dataset

In [160]:
# Vérifier si des dates communes existent
dates_communes_exist = jobs_reduced['started_at'].isin(metrics_reduced['timestamp']).any()

# Afficher le résultat
if dates_communes_exist:
    # Filtrer les dates communes
    dates_communes = jobs_df['started_at'][jobs_df['started_at'].isin(metrics_df['timestamp'])]
    
    # Compter le nombre de dates communes
    nombre_dates_communes = len(dates_communes)
    
    print(f"Des dates communes existent entre les ensembles de données: {nombre_dates_communes}")
else:
    print("Aucune date commune n'a été trouvée entre les ensembles de données.")


Aucune date commune n'a été trouvée entre les ensembles de données.


### Nombre de lignes par mois pour chaque dataset

In [161]:
def compare_datetime_series_shapes(s1, s2):
    s1_serie = {}
    s2_serie = {}
    s1 = pd.to_datetime(s1)
    s2 = pd.to_datetime(s2)
    for i in range(1,13):
        s1_month = s1[s1.dt.month == i]
        s2_month = s2[s2.dt.month == i]
        print(i, s1_month.shape, s2_month.shape)
        s1_serie[i] = s1_month.index
        s2_serie[i] = s2_month.index
    return s1_serie, s2_serie

In [162]:
# on liste les index des lignes par mois
jobs_indexes_by_month, metrics_indexes_by_month = compare_datetime_series_shapes(jobs_reduced['started_at'], metrics_reduced['timestamp'])

1 (0,) (0,)
2 (0,) (0,)
3 (0,) (0,)
4 (728,) (2760,)
5 (1378,) (4920,)
6 (1615,) (5556,)
7 (1290,) (4985,)
8 (973,) (3527,)
9 (1276,) (3740,)
10 (1371,) (4905,)
11 (1239,) (4842,)
12 (646,) (2554,)


### Analyse du nombre de données sur un mois

In [25]:
# month = 5

# Selection des lignes de jobs en Mai
# jobs_df['started_at'] = pd.to_datetime(jobs_df['started_at'])
# jobs_batch = jobs_df[ (jobs_df['started_at'].dt.month == month)]
# jobs_batch = jobs_df.loc[jobs_df.index.isin(jobs_indexes_by_month.get(month))]
# jobs_batch.shape

# Selection des lignes de metrics en Mai
# metrics_df['timestamp'] = pd.to_datetime(metrics_df['timestamp'])
# metrics_batch = metrics_df[ (metrics_df['timestamp'].dt.month == month)]
# metrics_batch = metrics_df.loc[metrics_df.index.isin(metrics_indexes_by_month.get(month))]
# metrics_batch.shape

## b) Concaténation des datasets

In [26]:
# # Conversion des colonnes en datetime en utilisant .loc
# jobs_reduced.loc['started_at'] = pd.to_datetime(jobs_reduced['started_at'])
# jobs_reduced.loc['ended_at'] = pd.to_datetime(jobs_reduced['ended_at'])
# metrics_reduced.loc['timestamp'] = pd.to_datetime(metrics_reduced['timestamp'])
# # matrice de comparaison
# compare = (jobs_reduced['started_at'].values[:, None] < metrics_reduced['timestamp'].values.T) & (jobs_reduced['ended_at'].values[:, None] > metrics_reduced['timestamp'].values.T)
# ## get cell numbers which is in range 0 to matrix size which meets the condition
# ind = np.arange(len(metrics_reduced)*len(jobs_reduced))[compare.ravel()]

# ## calculate row and column index from cell number
# concat_df = pd.concat([jobs_reduced.iloc[ind//len(metrics_reduced)].reset_index(drop=True), metrics_reduced.iloc[ind%len(metrics_reduced)].reset_index(drop=True)], axis=1, sort=False)
# concat_df['timestamp'].duplicated().any()
# duplicate_indices = concat_df[concat_df['timestamp'].duplicated()].index

# duplicate_rows = concat_df[concat_df.timestamp.duplicated()]
# duplicate_rows

In [163]:
# on prépare la concaténation des données de jobs et metrics du mois de juin

month = 0

jobs_reduced['started_at'] = pd.to_datetime(jobs_reduced['started_at'])
metrics_reduced['timestamp'] = pd.to_datetime(metrics_reduced['timestamp'])

if month == 0 :
    # DataFrame 1 avec les intervalles de temps
    dataframe1 = jobs_reduced.copy()
    # DataFrame 2 avec les valeurs de date
    dataframe2 = metrics_reduced.copy()
else :
     # DataFrame 1 avec les intervalles de temps
    dataframe1 = jobs_reduced[(jobs_reduced['started_at'].dt.month == month)].copy()
    # DataFrame 2 avec les valeurs de date
    dataframe2 = metrics_reduced[(metrics_reduced['timestamp'].dt.month == month)].copy()
   

# Conversion des colonnes en format de date
dataframe1['started_at'] = pd.to_datetime(dataframe1['started_at'])
dataframe1['ended_at'] = pd.to_datetime(dataframe1['ended_at'])
dataframe2['timestamp'] = pd.to_datetime(dataframe2['timestamp'])

# Création de la nouvelle colonne dans dataframe2 pour l'association
dataframe2['jobId'] = 0

# Parcours des lignes de metrics
for index2, row2 in dataframe2.iterrows():
    metrics_timestamp = row2['timestamp']
    # Vérification pour chaque ligne de job
    for index1, row1 in dataframe1.iterrows():
        if row2['timestamp'] >= row1['started_at'] and row2['timestamp']  <= row1['ended_at']:
            dataframe2.loc[index2, 'jobId'] = dataframe1.loc[index1, 'jobId']
            break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jobs_reduced['started_at'] = pd.to_datetime(jobs_reduced['started_at'])


In [164]:
dataframe2['criticality_events'].value_counts()

INFO       13872
ERROR      12357
Name: criticality_events, dtype: int64

In [165]:
# on concat les données de metrics et jobs
concat_df = pd.concat([dataframe1, dataframe2]).reset_index(drop=True)
concat_df.shape

(48305, 56)

In [172]:
# Compter le nombre de clés dont len(valeur) > 1
count = 0

for _, group in concat_df.groupby('jobId'):
    if len(group) > 1:
        count += 1

# Afficher le nombre de clés
print("Nombre de clés dont len(valeur) > 1 :", count)


Nombre de clés dont len(valeur) > 1 : 5894


In [173]:
# on ne conserve que les lignes qui ont un jobId
concat_df = concat_df[concat_df.jobId != 0]
concat_df.shape

(24390, 56)

In [174]:
# on remplace les valeurs maquantes des lignes metrics par celles de jobs avec le jobId correspondant
for job_id in concat_df['jobId'].unique():
    if concat_df['jobId'].value_counts().get(job_id, 0) > 1:
        concat_df.loc[concat_df['jobId'] == job_id] = concat_df.loc[concat_df['jobId'] == job_id].fillna(method='ffill')

In [120]:
# # on remplace les valeurs manquantes de criticality et identification par celle de la ligne précédente
# concat_df.sort_values('started_at',ascending=True, inplace=True)
# # on remplace les valeurs nulles de la colonne "criticality_events" par 'UNDEFINED'
# concat_df['criticality_events'] = concat_df['criticality_events'].fillna(method='ffill')
# # on remplace les valeurs nulles de la colonne "criticality_events" par 'UNDEFINED'
# concat_df['identification_events'] = concat_df['identification_events'].fillna(method='ffill')

In [176]:
concat_df_1 = concat_df[~concat_df['timestamp'].isnull()]

In [177]:
concat_df_1

Unnamed: 0,started_at,ended_at,paperHeight_job,paperWidth_job,scanner_mode,bars_job,varnishConsumptionVarnish_3d_job,jobId,total_copies_requested,LED,...,name_modules,type_modules,generation_modules,name_counters_modules,value_counters_modules,name_connected_operators,level_connected_operators,status,varnishLevelsTargetvolume,varnishLevelsTotalvolume
10697,2022-04-20 18:19:10.324000+00:00,2022-04-20 18:22:53.589000+00:00,450.0,320.0,3.0,2.0,5.143471,1650478750,120.0,30.0,...,iFoil L,iFoil,Gen. 2,Total Pages Counter,35755.0,Viktor,Operator,WARNING,29412.0,100000.0
10698,2022-04-20 18:19:10.324000+00:00,2022-04-20 18:22:53.589000+00:00,450.0,320.0,3.0,2.0,5.143471,1650478750,120.0,30.0,...,iFoil L,iFoil,Gen. 2,Foiled Pages Counter,31155.0,Viktor,Operator,WARNING,29412.0,100000.0
10699,2022-04-20 18:19:10.324000+00:00,2022-04-20 18:22:53.589000+00:00,450.0,320.0,3.0,2.0,5.143471,1650478750,120.0,30.0,...,Print Engine 1,Varnish Printer,Gen. 2,3D Varnish Counter,1805874.0,Viktor,Operator,WARNING,29412.0,100000.0
10700,2022-04-20 18:19:10.324000+00:00,2022-04-20 18:22:53.589000+00:00,450.0,320.0,3.0,2.0,5.143471,1650478750,120.0,30.0,...,iFoil L,iFoil,Gen. 2,Total Pages Counter,35755.0,Viktor,Operator,ERR,29407.0,100000.0
10701,2022-04-20 18:19:10.324000+00:00,2022-04-20 18:22:53.589000+00:00,450.0,320.0,3.0,2.0,5.143471,1650478750,120.0,30.0,...,iFoil L,iFoil,Gen. 2,Foiled Pages Counter,31155.0,Viktor,Operator,ERR,29407.0,100000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48293,2022-12-12 08:03:35.216000+00:00,2022-12-12 08:04:30.963000+00:00,450.0,320.0,1.0,2.0,1.227373,1670832215,80.0,30.0,...,Print Engine 1,Varnish Printer,,3D Varnish Counter,2547017.0,Viktor,Operator,ERR,89030.0,100000.0
48296,2022-12-12 08:07:50.660000+00:00,2022-12-12 08:11:26.877000+00:00,450.0,320.0,1.0,2.0,22.095466,1670832470,80.0,30.0,...,Print Engine 1,Varnish Printer,,3D Varnish Counter,2547017.0,Viktor,Operator,IDLE,89029.0,100000.0
48299,2022-12-12 08:13:29.110000+00:00,2022-12-12 08:14:41.168000+00:00,450.0,320.0,1.0,2.0,4.245281,1670832808,80.0,30.0,...,Print Engine 1,Varnish Printer,,3D Varnish Counter,2547039.0,Viktor,Operator,ERR,89002.0,100000.0
48302,2022-12-12 08:17:32.467000+00:00,2022-12-12 08:17:45.689000+00:00,450.0,320.0,1.0,2.0,0.000000,1670833052,3.0,30.0,...,Print Engine 1,Varnish Printer,,3D Varnish Counter,2547039.0,Viktor,Operator,ERR,89002.0,100000.0


In [182]:
concat_df_1['criticality_events'].value_counts()

ERROR      4211
INFO       2774
Name: criticality_events, dtype: int64

### Output

In [179]:
concat_df.to_csv(path_or_buf='../data/training_dataset.csv')

In [180]:
concat_df.shape

(24390, 56)

In [181]:
concat_df['criticality_events'].value_counts()

ERROR      4211
INFO       2774
Name: criticality_events, dtype: int64