# Test d'un model LSTM (run de mlflow)

## Librairies

In [169]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler , OneHotEncoder
from sklearn.model_selection import train_test_split, TimeSeriesSplit

import mlflow
import mlflow.keras
import mlflow.tensorflow

## Fonctions

In [170]:
# fonction retournant le dataframe d'une colonne fractionnée
# col=colonne à fractionner
# df=dataframe source
# data=dict des colonnes du df à conserver dans le df à retourner
def convert_col_to_df(col, df, data=None):
    
    # création du dictionnaire de données vide
    if data == None :
        data = {}
    # ou liste des clés du dictionnaire input
    else :
        data_keys = list(data.keys())

    # on converti le type des valeurs str en list
    if not isinstance(df[col].loc[0], list):
        df[col] = df[col].apply(lambda x : json.loads(x))

    # liste des clés du dictionnaire de la colonne à partir de la première occurence
    # on recherche la première occurence non vide et de type list 
    # pour l'affecter à une variable first
    for i in range(0, (len(df[col]))):
        value = df[col].loc[i]
        if len(value) > 0 and isinstance(value, list):
            first = value[0]
            break

    # on liste les clés du dictionnaire de l'occurence
    col_keys = first.keys()
    for ck in col_keys :
        data[ck+'_'+col] = []

    # on itére dans la serie pour récupérer les valeurs et les stocker dans le dictionnaire data
    for i in range(df.index.start, df.index.stop):
        # evaluation des valeurs 'str' en 'list'
        values = df[col].loc[i]
        if isinstance(values, list) and len(values) > 0 :
            # ajout des valeurs dans le dictionnaire 'd'
            for value in values :
                for k in value.keys():
                    data[k+'_'+col].append(value.get(k))
                for dk in data_keys:
                    data[dk].append(df[dk].loc[i])

    # re-assignation de la variable df
    df = pd.DataFrame(data)

    return df


def create_dataframe_from_file(file):

    dataframe = pd.read_csv(filepath_or_buffer=file, index_col=0).sort_values(by='created_at', ascending=True)
    # reindexation du dataframe
    dataframe.reset_index(level=None, drop=True, inplace=True, col_level=0, col_fill='')
    # suppression des colonnes ne contenant aucune valeurs
    dataframe.dropna(axis=1, inplace=True)
    # suppression de la colonne machineId
    dataframe.drop('machineId', axis=1, inplace=True)

    # creation d'un dataframe connected_operators
    connected_operators_df = convert_col_to_df('connected_operators', dataframe, {'id':[]})
    # creation d'un dataframe events
    events_df = convert_col_to_df('events', dataframe, {'id':[]})
    # creation d'un dataframe modules
    modules_df = convert_col_to_df('modules', dataframe, {'id':[]})
    # creation d'un dataframe counters (~2m7s)
    counters_df = convert_col_to_df('counters_modules', modules_df, {'type_modules':[], 'id': []})

    # fusion du df modue et du df counter
    merge_modules_df = pd.merge(modules_df, counters_df, on=['id','type_modules'])
    # suppression de la colonne fractionnées
    merge_modules_df = merge_modules_df.drop(['counters_modules'], axis=1)
    # on merge avec le df operators en fonction de l'id de message
    merge_operators_df = pd.merge(merge_modules_df, connected_operators_df, on='id', suffixes=['','_op'])

    # on crée un df, à partir du df events, ne contenant que les lignes ayant un évènement de source Ifoil
    events_ifoil = events_df[events_df.source_events == 'iFoil']
    # on crée un df, à partir du df merge, ne contenant que les lignes ayant un module de type Ifoil
    module_ifoil = merge_operators_df[merge_operators_df.type_modules == 'iFoil']
    # on merge les deux df des lignes Ifoil en focntion de l'id de message
    merge_ifoil_df = pd.merge(events_ifoil, module_ifoil, how='outer', on='id', suffixes=['_event','_module'])

    # on crée un df, à partir du df events, ne contenant que les lignes ayant un évènement de source Ifoil
    events_no_ifoil = events_df[events_df.source_events != 'iFoil']
    # on crée un df, à partir du df merge, ne contenant que les lignes ayant un module de type Ifoil
    module_no_ifoil = merge_operators_df[merge_operators_df.type_modules != 'iFoil']
    # on merge les deux df des lignes Ifoil en focntion de l'id de message
    merge_no_ifoil_df = pd.merge(events_no_ifoil, module_no_ifoil, how='outer', on='id', suffixes=['_event','_module'])
    # on concatene les df ifoi et no_ifoil pour ne perdre aucune valeur
    concat_events_df = pd.concat([merge_ifoil_df, merge_no_ifoil_df])
    
    # dernier merge de toutes les colonnes
    merge_dataframe = pd.merge(concat_events_df, dataframe, how='outer', on='id', suffixes=['','_metrics'])
    # suppression des colonnes fractionnées
    merge_dataframe = merge_dataframe.drop(['connected_operators','modules','events'], axis=1)

    return merge_dataframe



## Dataset

In [171]:
# nom de fichier et chemin relatif
filename = '2023_metrics.csv'
path = '../../data/test_model/'
# création d'un dataframe à partir du csv de données
df = create_dataframe_from_file(path+filename)
df.head(2)

Unnamed: 0,id,source_events,message_events,timestamp_events,criticality_events,identification_events,sn_modules,name_modules,type_modules,generation_modules,name_counters_modules,value_counters_modules,name_connected_operators,level_connected_operators,status,created_at,varnishLevelsTargetvolume,varnishLevelsTotalvolume
0,10918491,iFoil,Bourrage : module impression E-0354,2023-01-12T07:49:14.306Z,INFO,354,,iFoil L,iFoil,Gen. 2,Total Pages Counter,55355,Viktor,Operator,ERR,2023-01-12 07:49:26.459000,89047.452779,100000
1,10918491,iFoil,Bourrage : module impression E-0354,2023-01-12T07:49:14.306Z,INFO,354,,iFoil L,iFoil,Gen. 2,Foiled Pages Counter,76433,Viktor,Operator,ERR,2023-01-12 07:49:26.459000,89047.452779,100000


## Encodage

In [172]:
df['created_at'] = pd.to_datetime(df['created_at'])
df = df.sort_values(by='created_at')
df.set_index('created_at', inplace=True)

In [173]:
# on supprime les colonnes :
df.dropna(axis='columns', how='all', inplace=True)
df.drop(columns=['message_events','type_modules','sn_modules','id','name_connected_operators','level_connected_operators','generation_modules','timestamp_events'], inplace=True)
df = df.drop(["status"], axis=1)

In [174]:
df.columns

Index(['source_events', 'criticality_events', 'identification_events',
       'name_modules', 'name_counters_modules', 'value_counters_modules',
       'varnishLevelsTargetvolume', 'varnishLevelsTotalvolume'],
      dtype='object')

In [175]:
# Initialize a dictionary to store the mapping for non-integer strings
non_int_string_mapping = {}
next_mapping_value = 1000
# 'Kernel_Error' = 1000 , 'ICB communication error' = 1001 ; 'RCB communication error' = 1002 , 'iFoil communication error' = 1003 , 'Pilot communication error' = 

# Function to convert the value
def convert_value(value):
    global next_mapping_value

    if pd.isna(value):
        return value

    if isinstance(value, (int, float)):
        return int(value)
    
    if value.isdigit():
        return int(value)
    
    if value not in non_int_string_mapping:
        non_int_string_mapping[value] = next_mapping_value
        next_mapping_value += 1
        
    return non_int_string_mapping[value]

# Apply the conversion function to the 'identification_events' column
df['identification_events'] = df['identification_events'].apply(convert_value)
# Convert the column to integers, keeping NaN values as float
df['identification_events'] = df['identification_events'].astype(pd.Int64Dtype())


# Identify categorical columns (excluding the target column 'criticality')
categorical_columns = ['name_modules',  'name_counters_modules', 'source_events']


# Drop rows with 'nan' in 'criticality_events'
df.dropna(subset=['criticality_events'], inplace=True)


# One-hot encode categorical columns except for 'criticality_events'
cat_columns = ['name_modules', 'name_counters_modules', 'source_events']
# Initialize OneHotEncoder
ohe = OneHotEncoder(sparse=False)
# Fit and transform the categorical columns
cat_data_encoded = ohe.fit_transform(df[cat_columns])
# Convert the encoded data to a DataFrame
cat_data_encoded_df = pd.DataFrame(cat_data_encoded, columns=ohe.get_feature_names_out(cat_columns))
# Drop the original categorical columns from the DataFrame
df_encoded = df.drop(cat_columns, axis=1)
# Reset the index of both DataFrames to avoid index-related issues
df_encoded.reset_index(drop=True, inplace=True)
cat_data_encoded_df.reset_index(drop=True, inplace=True)
# Concatenate the one-hot encoded DataFrame with the original DataFrame
df_encoded = pd.concat([df_encoded, cat_data_encoded_df], axis=1)
# Label encode 'criticality_events' column
le = LabelEncoder()
le.classes_ = np.array(['INFO', 'WARNING', 'ERROR'])
df_encoded['criticality_events'] = le.fit_transform(df_encoded['criticality_events'])


# Normalize numerical columns
numerical_columns = ['value_counters_modules', 'identification_events', 'varnishLevelsTargetvolume', 'varnishLevelsTotalvolume']
scaler = MinMaxScaler()
df_encoded[numerical_columns] = scaler.fit_transform(df_encoded[numerical_columns])



In [176]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69 entries, 0 to 68
Data columns (total 13 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   criticality_events                          69 non-null     int32  
 1   identification_events                       69 non-null     float64
 2   value_counters_modules                      69 non-null     float64
 3   varnishLevelsTargetvolume                   69 non-null     float64
 4   varnishLevelsTotalvolume                    69 non-null     float64
 5   name_modules_Print Engine 1                 69 non-null     float64
 6   name_modules_iFoil L                        69 non-null     float64
 7   name_counters_modules_3D Varnish Counter    69 non-null     float64
 8   name_counters_modules_Foiled Pages Counter  69 non-null     float64
 9   name_counters_modules_Total Pages Counter   69 non-null     float64
 10  source_events_Ke

## Prédiction

In [177]:
# id du run à charger
model_id = "7e33dd8ce5104fc3b194f2c564f42202"
# spécifier le chemin d'accès au modèle enregistré
model_uri = f"runs:/{model_id}/trained_model"
# charger le modèle enregistré
loaded_model = mlflow.tensorflow.load_model(model_uri)
print(type(loaded_model))

  metadata.ParseFromString(file_content)


<class 'tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject'>


In [178]:
# to fetch the model's environment
mlflow.pyfunc.get_model_dependencies(model_uri)

2023/05/12 11:58:10 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r D:\Fabeon_msi\Project JetVarnish3d-ErrorPrediction\Prediction-Erreur-JetVarnish3D\models\LTSM_model_MLFLOW\mlruns\0\7e33dd8ce5104fc3b194f2c564f42202\artifacts\trained_model\requirements.txt'.


'D:\\Fabeon_msi\\Project JetVarnish3d-ErrorPrediction\\Prediction-Erreur-JetVarnish3D\\models\\LTSM_model_MLFLOW\\mlruns\\0\\7e33dd8ce5104fc3b194f2c564f42202\\artifacts\\trained_model\\requirements.txt'

In [179]:
# to install the dependencies that were used to train the model
%pip install -r mlruns/0/7e33dd8ce5104fc3b194f2c564f42202/artifacts/trained_model/requirements.txt --quiet

Note: you may need to restart the kernel to use updated packages.


In [183]:
# Obtenir les informations sur le modèle
predict_fn = loaded_model.signatures["serving_default"]
print(predict_fn)

ConcreteFunction signature_wrapper(*, lstm_input)
  Args:
    lstm_input: float32 Tensor, shape=(None, 1, 23)
  Returns:
    {'dense': <1>}
      <1>: float32 Tensor, shape=(None, 3)


In [192]:
# Convertir le DataFrame en un tableau NumPy
data_array = df_encoded.values
# Réorganiser les dimensions du tableau
input = np.reshape(data_array, (-1, 1, 23)).astype('float32')
# Afficher la forme du tenseur
print(f'input {input.dtype}, shape={input.shape}')

input float32, shape=(39, 1, 23)


*(39, 1, 23) signifie que le DataFrame df_encoded a 39 échantillons, 1 dimension supplémentaire et 23 caractéristiques (ou colonnes).*

*Le tenseur résultant a la forme (39, 1, 23), où :*

*- La première dimension 39 correspond à la taille de l'échantillon, c'est-à-dire le nombre de lignes dans le DataFrame.*

*- La deuxième dimension 1 correspond à la dimension supplémentaire spécifiée lors de la réorganisation du tableau.*

*-La troisième dimension 23 correspond au nombre de caractéristiques (colonnes) dans du DataFrame.*

In [181]:
predictions = predict_fn(lstm_input=input)

InvalidArgumentError: cannot compute __inference_signature_wrapper_305208 as input #0(zero-based) was expected to be a float tensor but is a double tensor [Op:__inference_signature_wrapper_305208]

In [None]:
# faire des prédictions avec le modèle chargé
predictions = loaded_model.predict(lstm_input=input)
# afficher les prédictions
print(predictions)

AttributeError: '_UserObject' object has no attribute 'predict'

In [None]:
# Print class names and their corresponding numerical values
class_mapping = dict(zip(le.classes_, range(len(le.classes_))))
print("Class mapping:", class_mapping)




In [None]:
for prediction in predictions:
    status_id = list(prediction).index(max(prediction))
    status_name = list(class_mapping.keys())[list(class_mapping.values()).index(status_id)]
    print(status_name)

INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
INFO
