In [1]:
import pandas as pd
import numpy as np

import os

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from datetime import datetime
from babel.dates import format_date
import seaborn as sns

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from sklearn.model_selection import train_test_split , TimeSeriesSplit, cross_val_score , GridSearchCV

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

from keras.models import load_model
from keras.callbacks import ModelCheckpoint

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

from pprint import pprint

In [2]:
# Relative directory and file name of the raw metrics export.
path = '../data/metrics/'
filename = 'merge_raw_metrics_dataset.csv'

# Build a DataFrame from the CSV (its first column is used as the row index),
# then sort the rows by `created_at` in ascending order.
df = pd.read_csv(f"{path}{filename}", index_col=0)
df = df.sort_values(by='created_at', ascending=True)
df.head(2)


  df = pd.read_csv(


Unnamed: 0,id,sn_modules,name_modules,type_modules,generation_modules,name_counters_modules,value_counters_modules,name_connected_operators,level_connected_operators,source_events,message_events,timestamp_events,criticality_events,identification_events,created_at,varnishLevelsTargetvolume,varnishLevelsTotalvolume
0,4169748,,Print Engine 1,Varnish Printer,,3D Varnish Counter,1792992,Viktor,Operator,,,,,,2022-04-15 05:55:06.678000,36192.322612,100000
1,4169748,,iFoil L,iFoil,Gen. 2,Total Pages Counter,22881,Viktor,Operator,,,,,,2022-04-15 05:55:06.678000,36192.322612,100000


In [3]:
# Parse `created_at` into datetimes, sort the rows by it and promote it to the
# index of the frame.
df = (
    df.assign(created_at=pd.to_datetime(df['created_at']))
      .sort_values(by='created_at')
      .set_index('created_at')
)

df.head(5)

Unnamed: 0_level_0,id,sn_modules,name_modules,type_modules,generation_modules,name_counters_modules,value_counters_modules,name_connected_operators,level_connected_operators,source_events,message_events,timestamp_events,criticality_events,identification_events,varnishLevelsTargetvolume,varnishLevelsTotalvolume
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2022-04-15 05:55:06.678,4169748,,Print Engine 1,Varnish Printer,,3D Varnish Counter,1792992,Viktor,Operator,,,,,,36192.322612,100000
2022-04-15 05:55:06.678,4169748,,iFoil L,iFoil,Gen. 2,Total Pages Counter,22881,Viktor,Operator,,,,,,36192.322612,100000
2022-04-15 05:55:06.678,4169748,,iFoil L,iFoil,Gen. 2,Foiled Pages Counter,31092,Viktor,Operator,,,,,,36192.322612,100000
2022-04-15 05:55:06.829,4169749,,Print Engine 1,Varnish Printer,,3D Varnish Counter,1792992,Viktor,Operator,PLC,JV-Ti non prêt : impression impossible,2022-04-15T05:55:23.462Z,INFO,391.0,36192.322612,100000
2022-04-15 05:55:06.829,4169749,,iFoil L,iFoil,Gen. 2,Total Pages Counter,22881,Viktor,Operator,PLC,JV-Ti non prêt : impression impossible,2022-04-15T05:55:23.462Z,INFO,391.0,36192.322612,100000


In [4]:
# Display the column labels of the frame.
df.columns

Index(['id', 'sn_modules', 'name_modules', 'type_modules',
       'generation_modules', 'name_counters_modules', 'value_counters_modules',
       'name_connected_operators', 'level_connected_operators',
       'source_events', 'message_events', 'timestamp_events',
       'criticality_events', 'identification_events',
       'varnishLevelsTargetvolume', 'varnishLevelsTotalvolume'],
      dtype='object')

In [5]:
# Display the frame dimensions as (rows, columns).
df.shape

(3546276, 16)

In [6]:
# Boolean flag per column: True when the column holds at least one missing value.
null_columns = df.isna().any()

print("Columns with missing values:")
# The flags double as the selection mask, so only the True entries are printed.
print(null_columns[null_columns])


Columns with missing values:
sn_modules               True
generation_modules       True
source_events            True
message_events           True
timestamp_events         True
criticality_events       True
identification_events    True
dtype: bool


In [7]:
# DataFrame.drop_duplicates() returns a new frame; the previous call discarded
# that result, so no row was ever removed. The index is included in the
# comparison so that identical readings recorded at different index values are
# not collapsed into a single row.
is_duplicate = df.reset_index().duplicated().to_numpy()
df = df.loc[~is_duplicate]
df.shape


(3546276, 16)

In [8]:
# Remove every column that contains no data at all; this is what eliminates
# the empty serial-number column.
df = df.dropna(axis='columns', how='all')

In [9]:
# Count the distinct values found in `varnishLevelsTotalvolume`.
df['varnishLevelsTotalvolume'].nunique()

3

In [10]:
# Count the distinct values found in `generation_modules`.
df['generation_modules'].nunique()

1

In [11]:
# Columns removed before modelling:
# - message_events: redundant with the identification_events column
# - id, name_connected_operators, level_connected_operators: not useful for
#   training the model
# - timestamp_events: the index already carries a date, used for predictions
#   over time
# - generation_modules (TO BE VERIFIED): the column holds a single distinct value
# - type_modules: redundant with name_modules, which is kept
df = df.drop(
    columns=[
        'message_events',
        'type_modules',
        'id',
        'name_connected_operators',
        'level_connected_operators',
        'generation_modules',
        'timestamp_events',
    ]
)

In [12]:
# Display the frame dimensions as (rows, columns).
df.shape

(3546276, 8)

In [13]:
# Display the distinct values of `identification_events`.
df['identification_events'].unique()

array([nan, '391', '330', '332', '377', '333', '334', '331',
       'Kernel_Error', '315', '417', '406', '407', '352', '344',
       'ICB communication error', '376', '445', '325', '343', '345',
       '358', '453', '381', '354', '313', '447', '454', '387', '386',
       '372', '371', '323', '480', '311', '479', '351', '440', '324',
       '321', '0', '349', 'RCB communication error', '385', '357', '418',
       '446', '355', '389', '476', '356', 'iFoil communication error',
       '460', '472', '405', '380', '388', '408', 445.0, 391.0, 330.0,
       333.0, 408.0, 407.0, 406.0, 332.0, 334.0, 472.0, 331.0, 352.0,
       '320', '329', '350', '475', '466', '416', '411', '346', '471',
       '327', 430.0, '430', '444', '2', '326', '419',
       'Pilot communication error', '359', 313.0, 377.0, 453.0, 376.0,
       344.0, 325.0, 454.0, 315.0, 417.0, '322', 385.0, 371.0, 386.0,
       '384'], dtype=object)

In [14]:
# Lookup table giving a synthetic integer code to every identifier that is not
# a plain number; it is filled lazily by convert_value().
non_int_string_mapping = {}
# Next free synthetic code. Codes are handed out in order of first appearance,
# starting at 1000. For the dataset shown in this notebook that yields:
# 'Kernel_Error' = 1000, 'ICB communication error' = 1001,
# 'RCB communication error' = 1002, 'iFoil communication error' = 1003,
# 'Pilot communication error' = 1004.
next_mapping_value = 1000


def convert_value(value):
    """Convert one raw event identifier into an integer code.

    Parameters
    ----------
    value : int, float, str or missing value
        Raw identifier as read from the CSV.

    Returns
    -------
    The value itself when it is missing (so missing entries stay missing),
    ``int(value)`` for numeric inputs and for strings made only of decimal
    digits, otherwise a synthetic code taken from ``non_int_string_mapping``
    (a new code is allocated the first time a label is seen).

    Notes
    -----
    Updates the module-level ``non_int_string_mapping`` and
    ``next_mapping_value`` when a new label is encountered.
    """
    global next_mapping_value

    if pd.isna(value):
        return value

    # NumPy scalars are accepted too: NumPy integers are not `int` instances
    # and would otherwise reach the string branch and fail there.
    if isinstance(value, (int, float, np.integer, np.floating)):
        return int(value)

    label = value if isinstance(value, str) else str(value)

    # isdecimal() only accepts characters that int() can parse; isdigit() also
    # accepts characters such as '²' on which int() raises ValueError.
    if label.isdecimal():
        return int(label)

    if label not in non_int_string_mapping:
        non_int_string_mapping[label] = next_mapping_value
        next_mapping_value += 1

    return non_int_string_mapping[label]

# Run every identifier through convert_value(), then store the result with the
# nullable integer dtype so that missing entries are kept as <NA>.
df['identification_events'] = (
    df['identification_events']
    .apply(convert_value)
    .astype('Int64')
)

In [15]:
# Display the distinct values of `identification_events`.
df['identification_events'].unique()

<IntegerArray>
[<NA>,  391,  330,  332,  377,  333,  334,  331, 1000,  315,  417,  406,  407,
  352,  344, 1001,  376,  445,  325,  343,  345,  358,  453,  381,  354,  313,
  447,  454,  387,  386,  372,  371,  323,  480,  311,  479,  351,  440,  324,
  321,    0,  349, 1002,  385,  357,  418,  446,  355,  389,  476,  356, 1003,
  460,  472,  405,  380,  388,  408,  320,  329,  350,  475,  466,  416,  411,
  346,  471,  327,  430,  444,    2,  326,  419, 1004,  359,  322,  384]
Length: 77, dtype: Int64

In [16]:
# Display the distinct values of `source_events`.
df['source_events'].unique()

array([nan, 'PLC', 'iFoil', 'Kernel', 'ICB n°5', 'RCB n°1', 'RCB n°2',
       'RCB n°3', 'ICB n°7', 'ICB n°4', 'ICB n°8', 'ICB n°2', 'ICB n°1',
       'ICB n°6', 'Pilot'], dtype=object)

In [17]:
# Number of missing values in each column.
null_values_count = df.isna().sum()

# Report only the columns that actually contain missing values.
for column, value in null_values_count.items():
    if value <= 0:
        continue
    print(f"{column}: {value} valeurs manquantes sur {df.shape[0]}")


source_events: 3302193 valeurs manquantes sur 3546276
criticality_events: 3302193 valeurs manquantes sur 3546276
identification_events: 3302193 valeurs manquantes sur 3546276


In [18]:
# Display how many rows carry each `criticality_events` value.
df['criticality_events'].value_counts()

INFO       179610
ERROR       29976
Name: criticality_events, dtype: int64

In [19]:
# Categorical feature columns that are one-hot encoded further down; the
# target column 'criticality_events' is deliberately left out of the list.
categorical_columns = ['name_modules',  'name_counters_modules', 'source_events']

# Show the list as the cell output.
categorical_columns

['name_modules', 'name_counters_modules', 'source_events']

In [20]:
# Encoder configured to return a dense array and to ignore unknown categories.
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit the encoder on the categorical columns and encode them in one pass.
one_hot_encoded_columns = ohe.fit_transform(df[categorical_columns])

# Wrap the encoded array in a DataFrame that shares the index of `df` and is
# labelled with the feature names generated by the encoder.
one_hot_encoded_df = pd.DataFrame(
    one_hot_encoded_columns,
    index=df.index,
    columns=ohe.get_feature_names_out(categorical_columns),
)

# Replace the original categorical columns with their encoded counterparts
# (column-wise concatenation; both frames share the same index).
df = pd.concat(
    [df.drop(columns=categorical_columns), one_hot_encoded_df],
    axis=1,
)




In [21]:
# Encode the target column 'criticality_events' as integer class codes: learn
# the classes first, then map the column onto them.
# NOTE(review): most rows have no criticality value (see the missing-value
# report above) - confirm how LabelEncoder treats them in the installed
# scikit-learn version.
le_criticality = LabelEncoder().fit(df['criticality_events'])
df['criticality_events'] = le_criticality.transform(df['criticality_events'])

In [22]:
# Positional 80/20 split without shuffling: the first 80 % of the rows (in
# their current order) form the training set, the remainder the test set.
train_size = int(0.8 * len(df))
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]

In [23]:
# Separate the features from the target column for both splits.
X_train, y_train = train_df.drop(columns='criticality_events'), train_df['criticality_events']
X_test, y_test = test_df.drop(columns='criticality_events'), test_df['criticality_events']

# Standardise the features: the scaler is fitted on the training set only and
# then applied to both sets.
# NOTE(review): `identification_events` is still among the features and is
# missing on most rows - confirm how those missing values should be handled
# before they reach the network.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
def create_time_series_data(X, y, time_steps=1):
    """Turn aligned feature/target sequences into sliding-window samples.

    Sample ``i`` is made of rows ``i .. i + time_steps - 1`` of ``X`` and its
    target is ``y`` at position ``i + time_steps`` (the row right after the
    window).

    Parameters
    ----------
    X : array-like
        Feature rows, one per time step.
    y : array-like
        Targets aligned with ``X`` by position. A pandas Series is accepted
        whatever its index: values are always read by position.
    time_steps : int, default 1
        Number of consecutive rows in each window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The stacked windows and their targets; both are empty when ``X`` has
        ``time_steps`` rows or fewer.
    """
    # Convert up front so that integer indexing below is positional. Indexing a
    # pandas Series with an integer is label-based and fails (or relies on a
    # deprecated fallback) when the index is not 0..n-1.
    X_values = np.asarray(X)
    y_values = np.asarray(y)

    n_windows = len(X_values) - time_steps
    Xs = [X_values[i:i + time_steps] for i in range(n_windows)]
    ys = [y_values[i + time_steps] for i in range(n_windows)]
    return np.array(Xs), np.array(ys)

time_steps = 30  # You can choose an appropriate number of time steps for your problem

# Replace the flat train/test arrays with windowed samples and their targets.
# NOTE(review): every window copies `time_steps` rows, so the windowed arrays
# are roughly `time_steps` times larger than the inputs - confirm this fits in
# memory for the full dataset.
X_train, y_train = create_time_series_data(X_train, y_train, time_steps)
X_test, y_test = create_time_series_data(X_test, y_test, time_steps)


In [25]:
# Factory for the LSTM network used in this notebook.
def build_model(input_shape, units=50, dropout_rate=0.2):
    """Build and compile a stacked LSTM network with a single linear output.

    Parameters
    ----------
    input_shape : tuple
        Input shape given to the first LSTM layer.
    units : int, default 50
        Number of units in each of the three LSTM layers.
    dropout_rate : float, default 0.2
        Rate of the Dropout layer placed after each LSTM layer.

    Returns
    -------
    The compiled Sequential model (Adam optimizer, mean-squared-error loss,
    MAE reported as an additional metric).
    """
    layers = [
        LSTM(units=units, return_sequences=True, input_shape=input_shape),
        Dropout(dropout_rate),
        LSTM(units=units, return_sequences=True),
        Dropout(dropout_rate),
        # The last recurrent layer returns only its final output.
        LSTM(units=units),
        Dropout(dropout_rate),
        Dense(units=1, activation='linear'),
    ]
    model = Sequential(layers)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return model

In [32]:
# Set hyperparameters manually (you can change these according to your preference).
# They are passed to build_model() as `units` and `dropout_rate` when the model
# is trained.
best_units = 25
best_dropout_rate = 0.2

In [33]:
# Set CUDA_VISIBLE_DEVICES to '-1' for this process - presumably to force
# training on the CPU.
# NOTE(review): this runs after `import tensorflow`; confirm it still takes
# effect at this point, or move it above the imports.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [34]:
# Build the network for windows of `time_steps` rows, each with
# X_train.shape[2] features, using the hyperparameters chosen above.
input_shape = (time_steps, X_train.shape[2])
model = build_model(
    input_shape,
    units=best_units,
    dropout_rate=best_dropout_rate,
)

# Save the model to 'best_model.h5' only when the validation loss improves.
checkpoint = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1,
)

# Train for 10 epochs with batches of 64 samples, holding out 10 % of the
# training data for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    callbacks=[checkpoint],
    verbose=1,
)

Epoch 1/10

In [None]:
# Save the model in its current state to 'lstm_model.h5'.
# NOTE(review): this is a different file from 'best_model.h5', which the
# ModelCheckpoint callback writes during training.
model.save('lstm_model.h5')

In [None]:
# Evaluate the model on the windowed test set; the model was compiled with
# mean-squared-error loss and MAE as metric.
model.evaluate(X_test, y_test, verbose=1)

In [None]:
# Predict the (encoded) target for every test window.
y_pred = model.predict(X_test)

In [None]:
# Map the continuous network outputs back to class labels.
# - ravel(): model.predict() returns one column per output unit, whereas
#   inverse_transform() expects a 1-D array of class codes.
# - np.clip(): a rounded prediction can fall outside the codes learned by the
#   encoder, which would make inverse_transform() raise; such values are mapped
#   to the nearest valid code instead.
y_pred_codes = np.clip(
    y_pred.round().astype(int).ravel(),
    0,
    len(le_criticality.classes_) - 1,
)
y_pred_original = le_criticality.inverse_transform(y_pred_codes)
y_test_original = le_criticality.inverse_transform(y_test)


In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Regression metrics require numeric inputs. The decoded labels
# (`y_test_original` / `y_pred_original`) are strings and cannot be converted
# to numbers, so the metrics are computed on the encoded targets and the raw
# network outputs instead. The predictions are flattened to match the 1-D
# shape of `y_test`.
y_pred_encoded = np.asarray(y_pred).ravel()

mae = mean_absolute_error(y_test, y_pred_encoded)
mse = mean_squared_error(y_test, y_pred_encoded)
r2 = r2_score(y_test, y_pred_encoded)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared Score: {r2}")
