# Porto Seguro : équilibrer les échantillons en mini-lots avec Keras

Cet exemple compare deux stratégies d'entraînement d'un réseau de neurones sur le jeu de données Porto Seguro de Kaggle [1]. Ce jeu de données est déséquilibré, et nous montrons qu'équilibrer chaque mini-lot permet d'améliorer les performances et de réduire le temps d'entraînement.

## Références

[1] https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data

In [2]:
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# License: MIT

# Echo the module docstring; in a notebook this is the text IPython
# auto-generates for the interactive module.
print(__doc__)

Automatically created module for IPython interactive environment


### Chargement des données

In [3]:
from collections import Counter

import numpy as np
import pandas as pd

Tout d'abord, téléchargez le jeu de données Porto Seguro depuis Kaggle (voir le lien dans l'introduction) et placez les fichiers `train.csv` et `test.csv` dans le dossier `./input/`.

In [5]:
# Read the Kaggle CSV exports from the local ./input directory.
training_data = pd.read_csv("./input/train.csv")
testing_data = pd.read_csv("./input/test.csv")

# Index every frame by the "id" column. The labels stay a one-column
# DataFrame and the features are every remaining column except "target".
indexed_training = training_data.set_index("id")
y_train = indexed_training[["target"]]
X_train = indexed_training.drop(columns=["target"])
X_test = testing_data.set_index("id")

L'ensemble de données est déséquilibré et cela aura un impact sur l'ajustement.

In [6]:
# Show how many training samples fall in each value of the "target" column.
print(f"The data set is imbalanced: {Counter(y_train['target'])}")

The data set is imbalanced: Counter({0: 573518, 1: 21694})


### Définir la chaîne de prétraitement

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler


def convert_float64(X):
    """Return ``X`` cast to 64-bit floats through its ``astype`` method."""
    target_dtype = np.float64
    return X.astype(target_dtype)

Nous voulons standardiser les caractéristiques numériques et encoder en one-hot les caractéristiques catégorielles. Pour cela, nous utilisons la classe `sklearn.compose.ColumnTransformer`.

In [8]:
# Numerical features: columns whose name contains "_calc_" but not "_bin".
numerical_columns = [
    column
    for column in X_train.columns
    if "_calc_" in column and "_bin" not in column
]
# Cast to float64 first, then apply the standard scaler.
float_caster = FunctionTransformer(func=convert_float64, validate=False)
numerical_pipeline = make_pipeline(float_caster, StandardScaler())

# Categorical features: every column whose name contains "_cat".
categorical_columns = [column for column in X_train.columns if "_cat" in column]
# -1 is treated as the missing-value marker and is replaced by the most
# frequent value before one-hot encoding.
mode_imputer = SimpleImputer(missing_values=-1, strategy="most_frequent")
categorical_pipeline = make_pipeline(mode_imputer, OneHotEncoder(categories="auto"))

# Route each column group to its pipeline; columns in neither group are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_preprocessing", numerical_pipeline, numerical_columns),
        ("categorical_preprocessing", categorical_pipeline, categorical_columns),
    ],
    remainder="drop",
)

# Set CUDA_VISIBLE_DEVICES to "-1" so that the GPU is not used.
# This is optional: change or remove the value to allow GPU training.
# NOTE(review): presumably this has to run before TensorFlow is imported
# (the TensorFlow import follows right below) - confirm.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

from tensorflow.keras.layers import Activation, BatchNormalization, Dense, Dropout

### Créer un réseau neuronal

In [9]:
from tensorflow.keras.models import Sequential


def make_model(n_features):
    """Build and compile the fully connected binary classifier.

    The network stacks four hidden stages (200, 100, 50 and 25 units), each
    made of Dense -> BatchNormalization -> ReLU -> Dropout, followed by a
    single sigmoid output unit.

    Parameters
    ----------
    n_features : int
        Number of input features; used as the input shape of the first layer.

    Returns
    -------
    model
        A ``Sequential`` model compiled with binary cross-entropy loss, the
        Adam optimizer and the accuracy metric.
    """
    # (units, dropout rate, extra Dense keyword arguments) for each hidden
    # stage. Only the first Dense layer declares the input shape and keeps its
    # bias; the following ones are created with use_bias=False.
    hidden_stages = (
        (200, 0.5, {"input_shape": (n_features,)}),
        (100, 0.25, {"use_bias": False}),
        (50, 0.15, {"use_bias": False}),
        (25, 0.1, {"use_bias": False}),
    )

    network = Sequential()
    for units, dropout_rate, dense_kwargs in hidden_stages:
        network.add(Dense(units, kernel_initializer="glorot_normal", **dense_kwargs))
        network.add(BatchNormalization())
        network.add(Activation("relu"))
        network.add(Dropout(dropout_rate))
    network.add(Dense(1, activation="sigmoid"))

    network.compile(
        loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]
    )
    return network

Nous créons un décorateur pour mesurer et afficher le temps de calcul.

In [10]:
import time
from functools import wraps


def timeit(f):
    """Decorate ``f`` so that each call is timed and the duration printed.

    Parameters
    ----------
    f : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper with the same call signature as ``f`` that returns a
        ``(elapsed_time, result)`` tuple, where ``elapsed_time`` is the
        duration of the call in seconds and ``result`` is what ``f`` returned.
    """

    @wraps(f)
    def wrapper(*args, **kwds):
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump when the system clock is
        # adjusted, which could yield wrong or even negative durations.
        start_time = time.perf_counter()
        result = f(*args, **kwds)
        elapsed_time = time.perf_counter() - start_time
        print(f"Elapsed computation time: {elapsed_time:.3f} secs")
        return (elapsed_time, result)

    return wrapper

Le premier modèle sera entraîné en utilisant la méthode `fit` et avec des mini-lots déséquilibrés.

In [11]:
import tensorflow
from sklearn.metrics import roc_auc_score
from sklearn.utils import parse_version

tf_version = parse_version(tensorflow.__version__)


@timeit
def fit_predict_imbalanced_model(X_train, y_train, X_test, y_test):
    """Train on regular (imbalanced) mini-batches and score on the test fold.

    A fresh model is fitted for 2 epochs with batches of 1000 samples taken
    directly from ``X_train`` / ``y_train``, then used to predict ``X_test``.

    Returns
    -------
    The ROC AUC of the predictions against ``y_test``. Because of the
    ``timeit`` decorator, callers receive ``(elapsed_time, roc_auc)``.
    """
    network = make_model(X_train.shape[1])
    network.fit(X_train, y_train, epochs=2, verbose=1, batch_size=1000)
    # predict_proba was removed in tensorflow 2.6, so it is only used with
    # older versions.
    use_legacy_api = tf_version < parse_version("2.6")
    predict = network.predict_proba if use_legacy_api else network.predict
    scores = predict(X_test, batch_size=1000)
    return roc_auc_score(y_test, scores)

À l'inverse, pour le second modèle, nous utilisons imbalanced-learn pour créer un générateur qui produit des mini-lots équilibrés.

In [12]:
from imblearn.keras import BalancedBatchGenerator


@timeit
def fit_predict_balanced_model(X_train, y_train, X_test, y_test):
    """Train on balanced mini-batches and score on the test fold.

    A fresh model is fitted for 5 epochs on batches produced by a
    ``BalancedBatchGenerator`` (batch size 1000, ``random_state=42``) built
    from ``X_train`` / ``y_train``, then used to predict ``X_test``.

    Returns
    -------
    The ROC AUC of the predictions against ``y_test``. Because of the
    ``timeit`` decorator, callers receive ``(elapsed_time, roc_auc)``.
    """
    n_features = X_train.shape[1]
    network = make_model(n_features)
    balanced_batches = BalancedBatchGenerator(
        X_train, y_train, batch_size=1000, random_state=42
    )
    network.fit(balanced_batches, epochs=5, verbose=1)
    scores = network.predict(X_test, batch_size=1000)
    return roc_auc_score(y_test, scores)

### Boucle de classification

Nous effectuons une validation croisée à 10 plis et entraînons le réseau neuronal avec les deux stratégies différentes présentées précédemment.

In [13]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)

# Per-fold ROC AUC and elapsed time for each training strategy.
cv_results_imbalanced, cv_time_imbalanced = [], []
cv_results_balanced, cv_time_balanced = [], []

# Strategies are run in this order on every fold: imbalanced, then balanced.
strategies = (
    (fit_predict_imbalanced_model, cv_time_imbalanced, cv_results_imbalanced),
    (fit_predict_balanced_model, cv_time_balanced, cv_results_balanced),
)

for train_idx, valid_idx in skf.split(X_train, y_train):
    # Fit the preprocessing on the training part of the fold only, then
    # apply it unchanged to the validation part.
    X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx])
    y_local_train = y_train.iloc[train_idx].values.ravel()
    X_local_test = preprocessor.transform(X_train.iloc[valid_idx])
    y_local_test = y_train.iloc[valid_idx].values.ravel()

    fold_data = (X_local_train, y_local_train, X_local_test, y_local_test)
    for fit_predict, fold_times, fold_scores in strategies:
        elapsed_time, roc_auc = fit_predict(*fold_data)
        fold_times.append(elapsed_time)
        fold_scores.append(roc_auc)

Epoch 1/2
Epoch 2/2
Elapsed computation time: 55.142 secs
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Elapsed computation time: 9.535 secs
Epoch 1/2
Epoch 2/2
Elapsed computation time: 51.075 secs
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Elapsed computation time: 11.542 secs
Epoch 1/2
Epoch 2/2
Elapsed computation time: 73.459 secs
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Elapsed computation time: 8.908 secs
Epoch 1/2
Epoch 2/2
Elapsed computation time: 88.158 secs
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Elapsed computation time: 10.044 secs
Epoch 1/2
Epoch 2/2
Elapsed computation time: 87.759 secs
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Elapsed computation time: 17.077 secs
Epoch 1/2
Epoch 2/2
Elapsed computation time: 121.454 secs
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Elapsed computation time: 13.812 secs
Epoch 1/2
Epoch 2/2
Elapsed computation time: 100.057 secs
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Elapsed computat

### Tracé des résultats et du temps de calcul

In [None]:
# Per-fold ROC AUC, one entry per strategy. unstack() + reset_index() reshape
# the frame to long format, with the strategy name in column "level_0" and
# the value in column 0.
roc_auc_by_model = {
    "Balanced model": cv_results_balanced,
    "Imbalanced model": cv_results_imbalanced,
}
df_results = pd.DataFrame(roc_auc_by_model).unstack().reset_index()

# Same layout for the per-fold computation times.
time_by_model = {
    "Balanced model": cv_time_balanced,
    "Imbalanced model": cv_time_imbalanced,
}
df_time = pd.DataFrame(time_by_model).unstack().reset_index()

import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the per-fold computation time, one box per strategy.
plt.figure()
sns.boxplot(y="level_0", x=0, data=df_time)
sns.despine(top=True, right=True, left=True)
plt.xlabel("time [s]")
plt.ylabel("")
plt.title("Computation time difference using a random under-sampling")

# Box plot of the per-fold ROC AUC, one box per strategy.
plt.figure()
sns.boxplot(y="level_0", x=0, data=df_results, whis=10.0)
sns.despine(top=True, right=True, left=True)
ax = plt.gca()
# Show tick values as whole percentages. The ".0%" format rounds to the
# nearest percent, whereas "%i" % (100 * x) truncated: 100 * 0.29 evaluates
# to 28.999..., which was labelled "28%".
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, pos: f"{x:.0%}"))
plt.xlabel("ROC-AUC")
plt.ylabel("")
plt.title("Difference in terms of ROC-AUC using a random under-sampling")