In [143]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler , OneHotEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support


from keras.optimizers import Adam

import pickle
import joblib
import mlflow
import mlflow.keras
from mlflow.models.signature import ModelSignature
from mlflow.types import ColSpec, TensorSpec
from mlflow.types import Schema


import datetime
import io

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
from tensorflow.keras.regularizers import L2


In [144]:
# nom de fichier et chemin relatif
filename = 'raw_merge_metrics_dataset.csv'
path = '../../data/metrics/'
# création d'un dataframe à partir du csv de données
df = pd.read_csv(
    path+filename, index_col=0).sort_values(by='created_at', ascending=True)
df.head(2)


  df = pd.read_csv(


Unnamed: 0,id,source_events,message_events,timestamp_events,criticality_events,identification_events,sn_modules,name_modules,type_modules,generation_modules,name_counters_modules,value_counters_modules,name_connected_operators,level_connected_operators,created_at,varnishLevelsTargetvolume,varnishLevelsTotalvolume
35738,4169748,,,,,,,Print Engine 1,Varnish Printer,,3D Varnish Counter,1792992,Viktor,Operator,2022-04-15 05:55:06.678000,36192.322612,100000
35737,4169748,,,,,,,iFoil L,iFoil,Gen. 2,Foiled Pages Counter,31092,Viktor,Operator,2022-04-15 05:55:06.678000,36192.322612,100000


In [145]:
df['created_at'] = pd.to_datetime(df['created_at'])

df = df.sort_values(by='created_at')
df.set_index('created_at', inplace=True)

df.head(5)

Unnamed: 0_level_0,id,source_events,message_events,timestamp_events,criticality_events,identification_events,sn_modules,name_modules,type_modules,generation_modules,name_counters_modules,value_counters_modules,name_connected_operators,level_connected_operators,varnishLevelsTargetvolume,varnishLevelsTotalvolume
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-04-15 05:55:06.678,4169748,,,,,,,Print Engine 1,Varnish Printer,,3D Varnish Counter,1792992,Viktor,Operator,36192.322612,100000
2022-04-15 05:55:06.678,4169748,,,,,,,iFoil L,iFoil,Gen. 2,Foiled Pages Counter,31092,Viktor,Operator,36192.322612,100000
2022-04-15 05:55:06.678,4169748,,,,,,,iFoil L,iFoil,Gen. 2,Total Pages Counter,22881,Viktor,Operator,36192.322612,100000
2022-04-15 05:55:06.829,4169749,,,,,,,iFoil L,iFoil,Gen. 2,Foiled Pages Counter,31092,Viktor,Operator,36192.322612,100000
2022-04-15 05:55:06.829,4169749,,,,,,,iFoil L,iFoil,Gen. 2,Total Pages Counter,22881,Viktor,Operator,36192.322612,100000


In [146]:
df.columns

Index(['id', 'source_events', 'message_events', 'timestamp_events',
       'criticality_events', 'identification_events', 'sn_modules',
       'name_modules', 'type_modules', 'generation_modules',
       'name_counters_modules', 'value_counters_modules',
       'name_connected_operators', 'level_connected_operators',
       'varnishLevelsTargetvolume', 'varnishLevelsTotalvolume'],
      dtype='object')

In [147]:
df.shape

(3510431, 16)

In [148]:
null_columns = df.isna().any()
print("Columns with missing values:")
print(null_columns[null_columns == True])


Columns with missing values:
source_events            True
message_events           True
timestamp_events         True
criticality_events       True
identification_events    True
sn_modules               True
generation_modules       True
dtype: bool


In [149]:
df.drop_duplicates()
df.shape


(3510431, 16)

In [150]:
# cela va supprimer la colonne sn car vide
df.dropna(axis='columns', how='all', inplace=True)

In [151]:
df['varnishLevelsTotalvolume'].nunique()

3

In [152]:
df['generation_modules'].nunique()

1

In [153]:
# on supprime les colonnes :
# message events car redondante avec la colonne identification_events 
# id , name_connected_operators, level_connected_operators car inutiles pour entrainer le model
# timestamp_events : on a déjà une date pour l'index qui servi pour les prediction dans le temps
# A VERIFIER : generation_modules , la colonne dispose d'une seul valeur qui est 'gen2' 
# 'name_modules' information redondante avec 'type_modules'

#df.drop(columns=['message_events','type_modules','id','name_connected_operators','level_connected_operators','generation_modules','timestamp_events'], inplace=True)

# on conserve la colonne 'timestamp_events'
df.drop(columns=['message_events','type_modules','id','name_connected_operators','level_connected_operators','generation_modules'], inplace=True)

In [155]:
df.shape

(3510431, 9)

In [156]:
df['identification_events'].unique()

array([nan, '391', '330', '332', '333', '377', '334', '331',
       'Kernel_Error', '315', '417', '406', '407', '352', '344',
       'ICB communication error', '376', '445', '325', '343', '345',
       '358', '453', '381', '354', '313', '447', '454', '387', '386',
       '372', '371', '480', '323', '311', '479', '351', '440', '324',
       '321', '0', '349', 'RCB communication error', '385', '357', '418',
       '446', '355', '389', '476', '356', 'iFoil communication error',
       '460', '472', '405', '380', '388', '408', '320', '329', '350',
       '475', '466', '416', '411', '346', '471', 445.0, 391.0, '327',
       430.0, '430', '444', '2', '326', '419',
       'Pilot communication error', '359', '322', 333.0, 330.0, 332.0,
       334.0, 331.0, 377.0, 315.0, 406.0, 407.0, 376.0, 325.0, 454.0,
       313.0, 352.0, 344.0, 385.0, 386.0, 371.0, '384'], dtype=object)

In [157]:
# Initialize a dictionary to store the mapping for non-integer strings
non_int_string_mapping = {}
next_mapping_value = 1000
# 'Kernel_Error' = 1000 , 'ICB communication error' = 1001 ; 'RCB communication error' = 1002 , 'iFoil communication error' = 1003 , 'Pilot communication error' = 

# Function to convert the value
def convert_value(value):
    global next_mapping_value

    if pd.isna(value):
        return value

    if isinstance(value, (int, float)):
        return int(value)
    
    if value.isdigit():
        return int(value)
    
    if value not in non_int_string_mapping:
        non_int_string_mapping[value] = next_mapping_value
        next_mapping_value += 1
        
    return non_int_string_mapping[value]

# Apply the conversion function to the 'identification_events' column
df['identification_events'] = df['identification_events'].apply(convert_value)

# Convert the column to integers, keeping NaN values as float
df['identification_events'] = df['identification_events'].astype(pd.Int64Dtype())

In [158]:
df['identification_events'].unique()

<IntegerArray>
[<NA>,  391,  330,  332,  333,  377,  334,  331, 1000,  315,  417,  406,  407,
  352,  344, 1001,  376,  445,  325,  343,  345,  358,  453,  381,  354,  313,
  447,  454,  387,  386,  372,  371,  480,  323,  311,  479,  351,  440,  324,
  321,    0,  349, 1002,  385,  357,  418,  446,  355,  389,  476,  356, 1003,
  460,  472,  405,  380,  388,  408,  320,  329,  350,  475,  466,  416,  411,
  346,  471,  327,  430,  444,    2,  326,  419, 1004,  359,  322,  384]
Length: 77, dtype: Int64

In [159]:
df['source_events'].unique()

array([nan, 'PLC', 'iFoil', 'Kernel', 'ICB n°5', 'RCB n°1', 'RCB n°3',
       'RCB n°2', 'ICB n°4', 'ICB n°7', 'ICB n°8', 'ICB n°2', 'ICB n°1',
       'ICB n°6', 'Pilot'], dtype=object)

In [160]:
# count null values in each column
null_values_count = df.isnull().sum()

for column, value in null_values_count.items():
    if value > 0:
        print(f"{column}: {value} valeurs manquantes sur {df.shape[0]}")


source_events: 3416265 valeurs manquantes sur 3510431
timestamp_events: 3416265 valeurs manquantes sur 3510431
criticality_events: 3416265 valeurs manquantes sur 3510431
identification_events: 3416265 valeurs manquantes sur 3510431


In [161]:
df['criticality_events'].value_counts()

INFO       68501
ERROR      12817
Name: criticality_events, dtype: int64

In [162]:
# Identify categorical columns (excluding the target column 'criticality')
categorical_columns = ['name_modules',  'name_counters_modules', 'source_events']

categorical_columns

['name_modules', 'name_counters_modules', 'source_events']

In [163]:
# Drop rows with 'nan' in 'criticality_events'
df.dropna(subset=['criticality_events'], inplace=True)

In [164]:
df.index

DatetimeIndex(['2022-04-15 05:55:06.829000', '2022-04-15 06:06:35.404000',
               '2022-04-15 06:06:35.404000', '2022-04-15 06:06:35.404000',
               '2022-04-15 06:07:05.443000', '2022-04-15 06:07:15.471000',
               '2022-04-15 06:07:15.471000', '2022-04-15 06:08:25.616000',
               '2022-04-15 06:09:45.826000', '2022-04-15 06:09:55.832000',
               ...
               '2022-12-12 08:17:37.527000', '2022-12-12 08:17:47.537000',
               '2022-12-12 08:18:07.564000', '2022-12-12 08:18:07.564000',
               '2022-12-12 08:18:57.633000', '2022-12-12 08:19:07.632000',
               '2022-12-12 08:19:07.632000', '2022-12-12 08:19:48.688000',
               '2022-12-12 08:20:17.777000', '2022-12-12 08:21:18.076000'],
              dtype='datetime64[ns]', name='created_at', length=94166, freq=None)

## Analyse du temps

In [175]:
from datetime import datetime, timedelta
# conversion du type de la colonne timestamp_events
df['timestamp_events'] = pd.to_datetime(df['timestamp_events'])
# ordonnancement des dates
df.sort_values(by='timestamp_events', ascending=True, inplace=True)

In [184]:
# dict des données d'ecart de temps entre les lignes timestamp_events
time_diff = {'diff': [], 'days': [], 'seconds': []}
for i in range(len(df.timestamp_events) - 1):
    ecart = df.timestamp_events[i+1] - df.timestamp_events[i]
    time_diff['diff'].append(ecart)
    time_diff['days'].append(ecart.days)
    time_diff['seconds'].append(ecart.seconds)

In [185]:
# Dataframe des données d'ecart de temps entre les lignes
df_time_diff = pd.DataFrame(time_diff)

In [192]:
# moyenne d'ecart de temps entre chaque event
df_time_diff.mean()

diff       0 days 00:03:41.219586619
days                        0.000733
seconds                   157.476546
dtype: object

In [193]:
# moyenne d'ecart de temps entre chaque event des events ayant 0 jours d'écart
df_time_diff[df_time_diff.days == 0].mean()

diff       0 days 00:02:25.615173030
days                             0.0
seconds                   145.182316
dtype: object

In [194]:
# moyenne d'ecart de temps entre chaque event des events ayant plus de 0 jours d'écart
df_time_diff[df_time_diff.days != 0].mean()

diff       2 days 05:29:18.847756756
days                        1.864865
seconds                      31434.0
dtype: object

In [195]:
print(f'Nombre de lignes avec un écart de temps supérieur à 0 days : {len(df_time_diff[df_time_diff.days != 0].values)}')

Nombre de lignes avec un ecart de temps supérieur à 0 days : 37


## Encodage

In [24]:
# One-hot encode categorical columns except for 'criticality_events'
cat_columns = ['name_modules', 'name_counters_modules', 'source_events']

# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse=False)

# Fit and transform the categorical columns
cat_data_encoded = ohe.fit_transform(df[cat_columns])

# Convert the encoded data to a DataFrame
cat_data_encoded_df = pd.DataFrame(cat_data_encoded, columns=ohe.get_feature_names_out(cat_columns))

# Drop the original categorical columns from the DataFrame
df_encoded = df.drop(cat_columns, axis=1)

# Reset the index of both DataFrames to avoid index-related issues
df_encoded.reset_index(drop=True, inplace=True)
cat_data_encoded_df.reset_index(drop=True, inplace=True)

# Concatenate the one-hot encoded DataFrame with the original DataFrame
df_encoded = pd.concat([df_encoded, cat_data_encoded_df], axis=1)

# Label encode 'criticality_events' column
le = LabelEncoder()
le.classes_ = np.array(['INFO', 'WARNING', 'ERROR'])
df_encoded['criticality_events'] = le.fit_transform(df_encoded['criticality_events'])




In [25]:
# Normalize numerical columns
numerical_columns = ['value_counters_modules', 'identification_events', 'varnishLevelsTargetvolume', 'varnishLevelsTotalvolume']
scaler = MinMaxScaler()
df_encoded[numerical_columns] = scaler.fit_transform(df_encoded[numerical_columns])

# Split dataset into X and y
X = df_encoded.drop('criticality_events', axis=1)
y = df_encoded['criticality_events']

In [26]:
X.columns

Index(['identification_events', 'value_counters_modules',
       'varnishLevelsTargetvolume', 'varnishLevelsTotalvolume',
       'name_modules_Print Engine 1', 'name_modules_iFoil L',
       'name_counters_modules_3D Varnish Counter',
       'name_counters_modules_Foiled Pages Counter',
       'name_counters_modules_Total Pages Counter', 'source_events_ICB n°1',
       'source_events_ICB n°2', 'source_events_ICB n°4',
       'source_events_ICB n°5', 'source_events_ICB n°6',
       'source_events_ICB n°7', 'source_events_ICB n°8',
       'source_events_Kernel', 'source_events_PLC', 'source_events_Pilot',
       'source_events_RCB n°1', 'source_events_RCB n°2',
       'source_events_RCB n°3', 'source_events_iFoil'],
      dtype='object')

**!!! LA DATE EN INDEX N'EXISTE PLUS DANS X**

In [27]:
X.shape

(94166, 23)

## Validation croisée

In [29]:
# TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Store the original index information for X_test and y_test
X_test_index = X_test.index
y_test_index = y_test.index

## Séquences

In [None]:
# Function to create sequences
def create_sequences(X_data, y_data, seq_length):
    xs, ys = [], []
    for i in range(len(X_data) - seq_length):
        x = X_data.iloc[i:(i + seq_length)].values
        y = y_data.iloc[i + seq_length]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

In [30]:
# Reshape the input data into sequences of the specified length
sequence_length = 1
X_train, y_train = create_sequences(X_train, y_train, sequence_length)
X_test, y_test = create_sequences(X_test, y_test, sequence_length)


In [31]:
# Define the input schema
input_schema = Schema([
    TensorSpec(type=np.dtype(np.float64), shape=(-1, X_train.shape[1], X_train.shape[2]), name='input')
])

# Define the output schema
output_schema = Schema([
    ColSpec(type='integer', name='output')
])

# Create the signature
signature = ModelSignature(inputs=input_schema, outputs=output_schema)

In [32]:
# Reshape for LSTM
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2])
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2])

In [33]:
X_test.shape

(15693, 1, 23)

In [34]:
import mlflow
import mlflow.keras
import pandas as pd

# id du run à charger
model_id = "7e33dd8ce5104fc3b194f2c564f42202"

# spécifier le chemin d'accès au modèle enregistré
model_uri = f"runs:/{model_id}/trained_model"

# charger le modèle enregistré
loaded_model = mlflow.keras.load_model(model_uri)

# faire des prédictions avec le modèle chargé
predictions = loaded_model.predict(X_test)

# afficher les prédictions
print(predictions)

[[0.1402995  0.7024219  0.15727861]
 [0.14033735 0.7023685  0.15729421]
 [0.14007603 0.70273757 0.15718646]
 ...
 [0.13429974 0.71091443 0.15478586]
 [0.13429974 0.71091443 0.15478586]
 [0.13433489 0.710865   0.1548001 ]]


In [35]:
# Print class names and their corresponding numerical values
class_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print("Class mapping:", class_mapping)




In [57]:
for prediction in predictions:
    status_id = list(prediction).index(max(prediction))
    status_name = list(class_mapping.keys())[list(class_mapping.values()).index(status_id)]
    print(status_name)

INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
