Importer les Freamworks

In [11]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
import mlflow

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, accuracy_score
from sklearn.preprocessing import label_binarize

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier

from keras.models import Sequential
from keras.layers import InputLayer, Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras import backend as K


Division des données pour les models de ML

In [10]:
data_train = pd.read_csv('train_data.csv')
data_test = pd.read_csv('test_data.csv')

X_train = data_train.copy().drop(columns=['class'])
y_train = data_train['class'] 

X_test = data_test.copy().drop(columns=['class'])
y_test = data_test['class'] 

In [18]:
x_50 = X_train.head(50000) # For XGBOOST
y_50 = y_train.head(50000)

x_10 = X_train.head(10000) # For RF
y_10 = y_train.head(10000)

Creation du serveur MLFlow pour le suivis des models

In [3]:
mlflow.set_experiment("Forecaster Models")
mlflow.set_tracking_uri('http://127.0.0.1:5000')

Variable pour la validation croisé

In [4]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

Pipeline LogisticRegression

In [5]:
pipe_lr = Pipeline([
    ('smote', SMOTE(random_state=13, k_neighbors=5)),
    ('model', LogisticRegression())
])

param_grid_lr = {
    'model__C': [0.01, 0.1, 1],
    'model__penalty': ['l2', None],
    'model__solver' : ['lbfgs', 'sag'],
    'smote__k_neighbors': [3, 5, 7, 9]
}

random_search_lr = RandomizedSearchCV(
    estimator=pipe_lr,
    param_distributions=param_grid_lr, 
    cv=cv,
    n_iter=3,
    random_state=13, 
    n_jobs=-1,  
    scoring='f1_macro',
    error_score=np.nan 
)

random_search_lr.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluer le model

In [6]:
y_pred_lr = random_search_lr.best_estimator_.predict(X_test)
y_score = random_search_lr.predict_proba(X_test)

report_lr = classification_report(y_test, y_pred_lr)
report_dict_lr = classification_report(y_test, y_pred_lr, output_dict=True)

roc_auc_lr = roc_auc_score(y_test, y_score, multi_class='ovr', average='macro')
avg_precision_lr = average_precision_score(y_test, y_score)

print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))
print(f"ROC AUC Score: {roc_auc_lr:.2f}")
print(f"Average Precision Score: {avg_precision_lr:.2f}")

Accuracy: 0.5855184461674827
Confusion Matrix:
 [[27949  7302     8     0  1678   225  5270]
 [13620 28057   880    16 11232  2261   521]
 [    0    39  4150   723   315  2006     0]
 [    0     0    40   504     0    17     0]
 [    7   301    25     0  1399   118     0]
 [    0    33   782   204    95  2329     0]
 [  426     0     0     0    20     0  3651]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.67      0.66      0.66     42432
         1.0       0.79      0.50      0.61     56587
         2.0       0.71      0.57      0.63      7233
         3.0       0.35      0.90      0.50       561
         4.0       0.09      0.76      0.17      1850
         5.0       0.33      0.68      0.45      3443
         6.0       0.39      0.89      0.54      4097

    accuracy                           0.59    116203
   macro avg       0.47      0.71      0.51    116203
weighted avg       0.70      0.59      0.61    116203

ROC AUC Score:

Ajouter le model et ses métriques à MlFlow

In [None]:
with mlflow.start_run(run_name= 'Logistic Regression'):
    mlflow.log_params(random_search_lr.best_params_)
    mlflow.log_metrics({
        'accuracy' : report_dict_lr['accuracy'],
        'recall_class_0' : report_dict_lr['0']['recall'],
        'recall_class_1' : report_dict_lr['1']['recall'],
        'recall_class_2' : report_dict_lr['2']['recall'],
        'recall_class_3' : report_dict_lr['3']['recall'],
        'recall_class_4' : report_dict_lr['4']['recall'],
        'recall_class_5' : report_dict_lr['5']['recall'],
        'recall_class_6' : report_dict_lr['6']['recall'],
        'fi_score_macro' : report_dict_lr['macro avg']['f1-score'],

    })
    mlflow.sklearn.log_model(random_search_lr, 'Logistic Regression')

Pipeline XGBoost Classifier

In [14]:
pipe_xgb = Pipeline([
    ('smote', SMOTE(random_state=13, k_neighbors=5)),
    ('model', XGBClassifier(eval_metric='logloss'))
])

param_grid_xgb = {
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'model__n_estimators': np.arange(100, 500, 10),
    'model__max_depth': np.arange(3, 10, 2),
    'model__gamma': np.arange(0,5,1),
    'model__min_child_weight': np.arange(1, 10, 1),
    'model__subsample' : np.arange(0.5, 1.0, 1),
    'model__colsample_bytree' : np.arange(0.5, 1.0, 0.1),
    'model__reg_alpha' : np.arange(0, 10, 2),
    'smote__k_neighbors': [3, 5, 7, 9]
}

random_search_xgb = RandomizedSearchCV(
    estimator=pipe_xgb,
    param_distributions=param_grid_xgb, 
    cv=cv,
    n_iter=3,
    random_state=13, 
    n_jobs=-1,  
    scoring='f1_macro',
    error_score=np.nan #ignorer les combinaisons invalides
)

random_search_xgb.fit(x_50, y_50)

Evaluer le model

In [15]:
y_pred_xgb = random_search_xgb.best_estimator_.predict(X_test)
y_score = random_search_xgb.predict_proba(X_test)

report_xgb = classification_report(y_test, y_pred_xgb)
report_dict_xgb = classification_report(y_test, y_pred_xgb, output_dict=True)

roc_auc_xgb = roc_auc_score(y_test, y_score, multi_class='ovr', average='macro')
avg_precision_xgb = average_precision_score(y_test, y_score)

print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))
print(f"ROC AUC Score: {roc_auc_xgb:.2f}")
print(f"Average Precision Score: {avg_precision_xgb:.2f}")

Accuracy: 0.8181544366324449
Confusion Matrix:
 [[34342  6929    15     0   233    39   874]
 [ 8010 45923   659     6  1100   803    86]
 [    0   179  6328   111    20   595     0]
 [    0     0    67   477     0    17     0]
 [   11   418    40     0  1365    16     0]
 [    6   126   417    32     2  2860     0]
 [  280    40     0     0     0     0  3777]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.81      0.81      0.81     42432
         1.0       0.86      0.81      0.83     56587
         2.0       0.84      0.87      0.86      7233
         3.0       0.76      0.85      0.80       561
         4.0       0.50      0.74      0.60      1850
         5.0       0.66      0.83      0.74      3443
         6.0       0.80      0.92      0.86      4097

    accuracy                           0.82    116203
   macro avg       0.75      0.83      0.78    116203
weighted avg       0.82      0.82      0.82    116203

ROC AUC Score:

Ajouter le model et ses métriques à MlFlow

In [None]:
with mlflow.start_run(run_name= 'XGBoost Classifier'):
    mlflow.log_params(random_search_xgb.best_params_)
    mlflow.log_metrics({
        'accuracy' : report_dict_xgb['accuracy'],
        'recall_class_0' : report_dict_xgb['0']['recall'],
        'recall_class_1' : report_dict_xgb['1']['recall'],
        'recall_class_2' : report_dict_xgb['2']['recall'],
        'recall_class_3' : report_dict_xgb['3']['recall'],
        'recall_class_4' : report_dict_xgb['4']['recall'],
        'recall_class_5' : report_dict_xgb['5']['recall'],
        'recall_class_6' : report_dict_xgb['6']['recall'],
        'fi_score_macro' : report_dict_xgb['macro avg']['f1-score'],

    })
    mlflow.sklearn.log_model(random_search_xgb, 'XGBoost Classifier')

Pipeline RandomForest

In [20]:
pipe_rf = Pipeline([
    ('smote', SMOTE(random_state=13, k_neighbors=5)),
    ('model', RandomForestClassifier(oob_score=True, random_state=13))
])

param_grid_rf = {
    'model__n_estimators': list(range(100, 600, 5)),
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': list(range(5, 25, 1)),
    'model__max_features': ['sqrt', 'log2', None],
    'model__max_samples': [0.1, 0.2, 0.3, 0.5, 1.0],
    'smote__k_neighbors': [3, 5, 7, 9]
}

random_search_rf = RandomizedSearchCV(
    estimator=pipe_rf,
    param_distributions=param_grid_rf, 
    cv=cv,
    n_iter=3,
    random_state=13, 
    n_jobs=2,  # Réduire pour économiser la mémoire
    scoring='f1_macro',
    error_score=np.nan
)

random_search_rf.fit(x_10, y_10)

Evaluer le model

In [21]:
y_pred_rf = random_search_rf.best_estimator_.predict(X_test)
y_score = random_search_rf.predict_proba(X_test)

report_rf = classification_report(y_test, y_pred_rf)
report_dict_rf = classification_report(y_test, y_pred_rf, output_dict=True)

roc_auc_rf = roc_auc_score(y_test, y_score, multi_class='ovr', average='macro')
avg_precision_rf = average_precision_score(y_test, y_score)

print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print(f"ROC AUC Score: {roc_auc_rf:.2f}")
print(f"Average Precision Score: {avg_precision_rf:.2f}")

Accuracy: 0.7974406856965827
Confusion Matrix:
 [[34326  6698    28     0   472    54   854]
 [ 8326 45173   930     3  1322   764    69]
 [    0   249  6023    75    15   871     0]
 [    0     1   154   377     0    29     0]
 [   11   633    71     0  1122    13     0]
 [    3   213   854    41    26  2306     0]
 [  717    42     0     0     0     0  3338]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.79      0.81      0.80     42432
         1.0       0.85      0.80      0.82     56587
         2.0       0.75      0.83      0.79      7233
         3.0       0.76      0.67      0.71       561
         4.0       0.38      0.61      0.47      1850
         5.0       0.57      0.67      0.62      3443
         6.0       0.78      0.81      0.80      4097

    accuracy                           0.80    116203
   macro avg       0.70      0.74      0.72    116203
weighted avg       0.80      0.80      0.80    116203

ROC AUC Score:

Ajouter le model et ses métriques à MlFlow

In [None]:
with mlflow.start_run(run_name= 'RandomForest Classifier'):
    mlflow.log_params(random_search_rf.best_params_)
    mlflow.log_metrics({
        'accuracy' : report_dict_rf['accuracy'],
        'recall_class_0' : report_dict_rf['0']['recall'],
        'recall_class_1' : report_dict_rf['1']['recall'],
        'recall_class_2' : report_dict_rf['2']['recall'],
        'recall_class_3' : report_dict_rf['3']['recall'],
        'recall_class_4' : report_dict_rf['4']['recall'],
        'recall_class_5' : report_dict_rf['5']['recall'],
        'recall_class_6' : report_dict_rf['6']['recall'],
        'fi_score_macro' : report_dict_rf['macro avg']['f1-score'],

    })
    mlflow.sklearn.log_model(random_search_rf, 'Random Forest Classifier')

Préparation des données pour le Reseau de neurones

In [12]:
batch_size = 32
column_names = ['Elevation','Aspect','Slope','Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology','Horizontal_Distance_To_Roadways','Hillshade_9am','Hillshade_Noon','Hillshade_3pm','Horizontal_Distance_To_Fire_Points','Wilderness_Area1','Wilderness_Area2','Wilderness_Area3','Wilderness_Area4','Soil_Type1','Soil_Type2','Soil_Type3','Soil_Type4','Soil_Type5','Soil_Type6','Soil_Type7','Soil_Type8','Soil_Type9','Soil_Type10','Soil_Type11','Soil_Type12','Soil_Type13','Soil_Type14','Soil_Type15','Soil_Type16','Soil_Type17','Soil_Type18','Soil_Type19','Soil_Type20','Soil_Type21','Soil_Type22','Soil_Type23','Soil_Type24','Soil_Type25','Soil_Type26','Soil_Type27','Soil_Type28','Soil_Type29','Soil_Type30','Soil_Type31','Soil_Type32','Soil_Type33','Soil_Type34','Soil_Type35','Soil_Type36','Soil_Type37','Soil_Type38','Soil_Type39','Soil_Type40','class']
label_name = 'class'

full_train_dataset = tf.data.experimental.make_csv_dataset(
    'train_data.csv',
    batch_size=batch_size,
    column_names = column_names,
    label_name = label_name,
    shuffle_buffer_size=10000,
    num_epochs=1,
    shuffle=True,
    shuffle_seed=42
)

test_dataset = tf.data.experimental.make_csv_dataset(
    'test_data.csv',
    batch_size=batch_size,
    column_names = column_names,
    label_name = label_name,
    num_epochs=1,
    shuffle=False,
)

Création de la métrique F1 personalisé

In [13]:
class F1Score(tf.keras.metrics.Metric):
    def __init__(self, name='f1_score', **kwargs):
        super().__init__(name=name, **kwargs)
        self.precision = tf.keras.metrics.Precision()
        self.recall = tf.keras.metrics.Recall()

    def update_state(self, y_true, y_pred, sample_weight=None):
        y_pred = tf.argmax(y_pred, axis=1)  
        self.precision.update_state(y_true, y_pred)
        self.recall.update_state(y_true, y_pred)

    def result(self):
        p = self.precision.result()
        r = self.recall.result()
        return 2 * ((p * r) / (p + r + K.epsilon()))

    def reset_state(self):
        self.precision.reset_state()
        self.recall.reset_state()

Creation de la variable class_weight pour mieux gerer le déséquilibre des données

In [26]:
data_train['class'].value_counts()

class
1.0    226714
0.0    169408
2.0     28521
6.0     16413
5.0     13924
4.0      7643
3.0      2186
Name: count, dtype: int64

In [14]:
class_counts = {1:226714, 0:169408, 2:28521, 6:16413, 5:13924, 4:7643, 3:2186}
total_samples = sum(class_counts.values())
class_weight = {
    cls: (1 / count) * (total_samples / len(class_counts)) 
    for cls, count in class_counts.items()
}

print(class_weight)

{1: 0.2928856873165562, 0: 0.39196074396891356, 2: 2.328154192149143, 6: 4.0456519657762575, 5: 4.768836951614889, 4: 8.687856301751369, 3: 30.37570252254607}


Transformation des differents datasets

In [38]:
len(data_train)

464809

In [15]:
train_size = int(0.8 * 464809)
n_train_batch = train_size // batch_size

train_dataset = full_train_dataset.take(n_train_batch)  
val_dataset = full_train_dataset.skip(n_train_batch)

In [16]:
def pack_features_vector(features, labels):
    features = tf.stack(list(features.values()), axis=1) 
    return features, labels

train_dataset = train_dataset.map(pack_features_vector).cache().prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.map(pack_features_vector).cache().prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.map(pack_features_vector).cache().prefetch(tf.data.AUTOTUNE)

Création du model NN

In [17]:
def model_nn():
    model = Sequential()
    model.add(InputLayer(input_shape=(54,)))

    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(.25))

    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(.25))

    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(.25))

    model.add(Dense(64, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(.25))

    model.add(Dense(32, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(.25))
    
    model.add(Dense(7, activation='softmax'))
    model.compile(
        loss = 'sparse_categorical_crossentropy', 
    metrics=['accuracy', F1Score(name='f1_score')],
    optimizer = Adam(learning_rate=0.0005)
    )
    
    return model

model = model_nn()
model.summary()



Entrainement du NN

In [18]:
early_stop = EarlyStopping(monitor='val_f1_score', mode='max', patience=15, verbose=1) #val _accuracy, patience =10 pour un model de classification équilibré
reduce_lr = ReduceLROnPlateau(monitor='val_loss', mode='min', factor=0.5, patience=3, verbose=1)

history = model.fit(
    train_dataset, 
    epochs=100, 
    verbose=1, 
    validation_data= val_dataset,
    class_weight=class_weight, # Pour re équilibrer 
    callbacks=[early_stop, reduce_lr]
)

test_loss, test_accuracy, test_f1 = model.evaluate(test_dataset, verbose=0)
print(f"Test - Loss: {test_loss}, Accuracy: {test_accuracy}, F1: {test_f1}")

Epoch 1/100
  11612/Unknown [1m52s[0m 4ms/step - accuracy: 0.4535 - f1_score: 0.7815 - loss: 1.0954



[1m11620/11620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 6ms/step - accuracy: 0.4536 - f1_score: 0.7815 - loss: 1.0952 - val_accuracy: 0.6113 - val_f1_score: 0.8195 - val_loss: 0.9070 - learning_rate: 5.0000e-04
Epoch 2/100
[1m11620/11620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4ms/step - accuracy: 0.5997 - f1_score: 0.8108 - loss: 0.6867 - val_accuracy: 0.6462 - val_f1_score: 0.8365 - val_loss: 0.8162 - learning_rate: 5.0000e-04
Epoch 3/100
[1m11620/11620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 4ms/step - accuracy: 0.6298 - f1_score: 0.8211 - loss: 0.6140 - val_accuracy: 0.6780 - val_f1_score: 0.8428 - val_loss: 0.7650 - learning_rate: 5.0000e-04
Epoch 4/100
[1m11620/11620[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 4ms/step - accuracy: 0.6457 - f1_score: 0.8267 - loss: 0.5782 - val_accuracy: 0.6867 - val_f1_score: 0.8486 - val_loss: 0.7301 - learning_rate: 5.0000e-04
Epoch 5/100
[1m11620/11620[0m [32m━━━━━━━━━━━━━━━━━━━━[0

Evaluation du model

In [19]:
y_score_nn = model.predict(test_dataset)
y_pred_nn = np.argmax(y_score_nn, axis=1)
y_true_nn = np.concatenate([y.numpy() for _, y in test_dataset], axis=0)

roc_auc_nn = roc_auc_score(y_true_nn, y_score_nn, multi_class='ovr', average='macro')

y_true_bin_nn = label_binarize(y_true_nn, classes=range(7))
avg_precision_nn = average_precision_score(y_true_bin_nn, y_score_nn, average='macro')

print("Accuracy:", accuracy_score(y_true_nn, y_pred_nn))
print("Confusion Matrix:\n", confusion_matrix(y_true_nn, y_pred_nn))
print("\nClassification Report:\n", classification_report(y_true_nn, y_pred_nn))
print(f"ROC AUC Score: {roc_auc_nn:.2f}")
print(f"Average Precision Score: {avg_precision_nn:.2f}")

[1m3632/3632[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 967us/step




Accuracy: 0.8184728449351566
Confusion Matrix:
 [[34236  6256    26     0   466    47  1401]
 [ 6442 44982   866     4  2833  1238   222]
 [    0     8  6310   151    58   706     0]
 [    0     0    22   525     0    14     0]
 [    0    32    11     0  1793    14     0]
 [    1    13   163    40     9  3217     0]
 [   51     0     0     0     0     0  4046]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.84      0.81      0.82     42432
         1.0       0.88      0.79      0.83     56587
         2.0       0.85      0.87      0.86      7233
         3.0       0.73      0.94      0.82       561
         4.0       0.35      0.97      0.51      1850
         5.0       0.61      0.93      0.74      3443
         6.0       0.71      0.99      0.83      4097

    accuracy                           0.82    116203
   macro avg       0.71      0.90      0.77    116203
weighted avg       0.84      0.82      0.82    116203

ROC AUC Score:

Ajouter le model et ses métriques à MlFlow

In [None]:
with mlflow.start_run(run_name= 'Nerural Network')
    mlflow.log_params(random_search_lr.best_params_)
    mlflow.log_metrics({
        'accuracy' : report_dict_lr['accuracy'],
        'recall_class_0' : report_dict_lr['0']['recall'],
        'recall_class_1' : report_dict_lr['1']['recall'],
        'recall_class_2' : report_dict_lr['2']['recall'],
        'recall_class_3' : report_dict_lr['3']['recall'],
        'recall_class_4' : report_dict_lr['4']['recall'],
        'recall_class_5' : report_dict_lr['5']['recall'],
        'recall_class_6' : report_dict_lr['6']['recall'],
        'fi_score_macro' : report_dict_lr['macro avg']['f1-score'],

    })
    mlflow.sklearn.log_model(random_search_lr, 'Logistic Regression')