# Import des outils / jeu de données

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTENC
from keras import layers
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier, VotingClassifier
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import BernoulliNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow import keras

In [None]:
# Single seed reused below for NumPy, row sampling, train/test splits and the
# estimators that accept `random_state`.
SEED = 0

In [None]:
# Seed NumPy's global random generator and apply seaborn's default plot theme.
np.random.seed(SEED)
sns.set_theme()

In [None]:
# Load data/data-cleaned-feature-engineering.csv, using the "ID" column as index.
# NOTE(review): parse_dates=True only asks pandas to try parsing the index as
# dates; other columns (e.g. "Dt_Customer") are left as read — confirm intent.
df = pd.read_csv(
    "data/data-cleaned-feature-engineering.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

In [None]:
# Load data/data-transformed.csv with the same options; this frame supplies the
# model features used in the rest of the notebook.
df_transforme = pd.read_csv(
    "data/data-transformed.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

## Variables globales

In [None]:
# Names of the columns handled as numeric variables.
var_numeriques = [
    "Year_Birth",
    "Income",
    "Recency",
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
    "NumDealsPurchases",
    "NumWebPurchases",
    "NumCatalogPurchases",
    "NumStorePurchases",
    "NumWebVisitsMonth",
]

In [None]:
# Names of the columns handled as categorical variables.
# Note: "Response" is also the prediction target used further down.
var_categoriques = [
    "Education",
    "Marital_Status",
    "Kidhome",
    "Teenhome",
    "AcceptedCmp1",
    "AcceptedCmp2",
    "AcceptedCmp3",
    "AcceptedCmp4",
    "AcceptedCmp5",
    "Response",
]

In [None]:
# Class labels of the binary target; LABELS[1] is the class whose F1 is scored.
LABELS = (0, 1)

## Fonctions et variables utiles

In [None]:
# Accumulates [model name, metric name, value] rows; exported to CSV at the end.
score_modeles = []

In [None]:
def ajout_score(model, nom_modele, y_test, y_pred):
    """Append the class-1 F1 score of a set of predictions to ``score_modeles``.

    Args:
        model: Unused; kept so existing call sites keep working.
        nom_modele: Name stored in the first column of the result row.
        y_test: True labels.
        y_pred: Predicted labels.

    The row appended is ``[nom_modele, "score_f1_classe1", <F1 of class 1>]``.
    """
    # Request the positive-class F1 explicitly. Reading cell [1, 2] of the
    # classification report is positional and only lands on class 1 when both
    # classes appear in the report.
    score_f1_classe1 = f1_score(y_test, y_pred, pos_label=LABELS[1])

    score_modeles.append([nom_modele, "score_f1_classe1", score_f1_classe1])

In [None]:
def evaluate_models(models, prefix, X_train, X_test, y_train, y_test):
    """Fit and cross-validate every model, recording its mean class-1 F1 score.

    Args:
        models: Iterable of ``(estimator, model_name)`` pairs.
        prefix: Label of the pre-processing strategy; results are stored under
            ``"<prefix>/<model_name>"`` so that strategies can be told apart.
        X_train: Training features.
        X_test: Unused; kept for interface compatibility.
        y_train: Training target.
        y_test: Unused; kept for interface compatibility.

    Returns:
        A list of ``[name, mean_f1]`` rows, one per model. The same name and
        score are also appended to the global ``score_modeles``.
    """
    # The scorer does not depend on the model, so build it once.
    scorer = make_scorer(f1_score, pos_label=LABELS[1])
    results = []

    for model, model_name in models:
        name = f"{prefix}/{model_name}"

        # cross_val_score works on clones, so fit the original estimator too:
        # it leaves `models` holding fitted estimators for later inspection.
        model.fit(X_train, y_train)

        mean_f1 = cross_val_score(
            model,
            X_train,
            y_train,
            cv=5,
            scoring=scorer,
        ).mean()

        results.append([name, mean_f1])
        # Store the prefixed name; the bare model name would make the rows of
        # different strategies indistinguishable in the exported table.
        score_modeles.append([name, "score_f1_classe1", mean_f1])

    return results

# Liste des modèles

In [None]:
# Candidate classifiers as [estimator, display name] pairs; evaluate_models
# iterates over this list and fits each estimator in place.
models = [
    [DummyClassifier(strategy="uniform", random_state=SEED), "DummyClassifier_Uniform"],
    [
        DummyClassifier(strategy="constant", constant=1, random_state=SEED),
        "DummyClassifier_Constant1",
    ],
    [LogisticRegression(random_state=SEED), "LogisticRegression"],
    [LinearDiscriminantAnalysis(), "LinearDiscriminantAnalysis"],
    [DecisionTreeClassifier(random_state=SEED), "DecisionTreeClassifier"],
    [RandomForestClassifier(random_state=SEED), "RandomForestClassifier"],
    [xgboost.XGBClassifier(random_state=SEED), "XGBClassifier"],
    [CatBoostClassifier(random_state=SEED, verbose=False), "CatBoostClassifier"],
    [LGBMClassifier(random_state=SEED), "LGBMClassifier"],
    [LinearSVC(random_state=SEED), "LinearSVC"],
    [BernoulliNB(), "BernoulliNB"],
    [ComplementNB(), "ComplementNB"],
    [KNeighborsClassifier(), "KNeighborsClassifier"],
    [
        # Soft-voting ensemble of four of the classifiers above.
        VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(random_state=SEED)),
                # NOTE(review): the "dt" key holds a random forest, not a decision tree.
                ("dt", RandomForestClassifier(random_state=SEED)),
                ("lda", LinearDiscriminantAnalysis()),
                ("xgb", xgboost.XGBClassifier(random_state=SEED)),
            ],
            voting="soft",
        ),
        "VotingClassifier",
    ],
]

# Traitement des données

## Pipeline Scaler & OneHotEncoding

In [None]:
# Bare name: the expression has no effect beyond being displayed by the notebook.
OneHotEncoder

In [None]:
# Preview the categorical columns.
df[var_categoriques].head()

In [None]:
# NOTE(review): `standard_transformer` is not defined anywhere in the visible
# source, so this cell raises NameError unless it is defined elsewhere — confirm.
preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        ("std", standard_transformer, var_categoriques),
    ],
)

In [None]:
# Pipeline containing a single RobustScaler step.
pipeline = Pipeline(steps=[("scaler", RobustScaler())])

# NOTE(review): same undefined `standard_transformer` as in the previous cell;
# possibly `pipeline` was meant — confirm. Applies to the column at position 3.
preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        ("std", standard_transformer, [3]),
    ],
)

In [None]:
# NOTE(review): the only step is an empty tuple, whereas Pipeline steps are
# (name, estimator) pairs — this looks like an unfinished placeholder.
under_sampling_manuel = Pipeline(steps=[()])

## Par défaut

In [None]:
# Features: the transformed data without the target and the "Dt_Customer"
# column, one-hot encoded with pandas.
X = pd.get_dummies(df_transforme.drop(columns=["Response", "Dt_Customer"]))
# Target as a one-column integer DataFrame (note the double brackets).
# NOTE(review): y is read from `df` while X comes from `df_transforme`; this
# assumes both frames hold the same rows in the same order — confirm.
y = df[["Response"]].astype(int)

In [None]:
# Hold out 20% of the rows as a test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

In [None]:
# Baseline run: no class re-balancing.
prefix = "défaut"
results = evaluate_models(models, prefix, X_train, X_test, y_train, y_test)

In [None]:
# Rank the models by score, best first.
sorted(results, key=lambda x: x[1], reverse=True)

## Équilibrage des classes

### Under-sampling (manuel)

In [None]:
# Draw 350 rows of class 0 at random (seeded) from the transformed data.
samples0 = df_transforme[df_transforme["Response"] == 0].sample(350, random_state=SEED)

In [None]:
# Combine the sampled class-0 rows with every class-1 row.
X_eq = pd.concat((samples0, df_transforme[df_transforme["Response"] == 1]))

In [None]:
# Visual check of the resulting class balance.
X_eq["Response"].hist()

In [None]:
# Remove the target column from X_eq and keep it as an integer Series.
y_eq = X_eq.pop("Response").astype(int)

In [None]:
# Same feature preparation as for X: drop "Dt_Customer", then one-hot encode.
X_eq = pd.get_dummies(X_eq.drop(columns=["Dt_Customer"]))

In [None]:
# Overwrites the train/test variables of the default split.
X_train, X_test, y_train, y_test = train_test_split(
    X_eq, y_eq, test_size=0.2, random_state=SEED
)

In [None]:
prefix = "éq_classes"
results = evaluate_models(models, prefix, X_train, X_test, y_train, y_test)

In [None]:
# Rank the models by score, best first.
sorted(results, key=lambda x: x[1], reverse=True)

### Over-sampling (SMOTE)

In [None]:
# Columns of the default feature matrix X that are declared categorical to SMOTENC.
cat_cols = list(X.select_dtypes(include=["category", "int", "bool"]).columns)

In [None]:
# SMOTENC is given positional indices, so map each name to its position in X.
cat_cols_index = list(map(lambda c: list(X.columns).index(c), cat_cols))

In [None]:
# Check: display the column names the indices point to.
list(X.iloc[:, cat_cols_index].columns)

In [None]:
sm = SMOTENC(
    categorical_features=cat_cols_index,
    random_state=SEED,
)

In [None]:
# NOTE(review): X_train / y_train were last assigned by the under-sampling
# split (train_test_split on X_eq / y_eq), so the over-sampling is applied to
# the already under-sampled training set, while the categorical indices were
# computed on X. Confirm this is intended and that X_eq and X share the same
# column order.
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

In [None]:
# Class proportions after resampling.
y_train_sm.value_counts(normalize=True)

In [None]:
prefix = "SMOTE"
results = evaluate_models(models, prefix, X_train_sm, X_test, y_train_sm, y_test)

In [None]:
# Rank the models by score, best first.
sorted(results, key=lambda x: x[1], reverse=True)

# Réseau de neurones

In [None]:
# Convert the current split to float32 NumPy arrays for Keras. At this point
# these variables hold the split made on X_eq / y_eq; the conversion overwrites
# them, so the cells below receive arrays rather than DataFrames.
X_train = np.asarray(X_train).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_test = np.asarray(X_test).astype("float32")
y_test = np.asarray(y_test).astype("float32")

In [None]:
X_train.shape

In [None]:
# Seed Python, NumPy and TensorFlow together with the notebook-wide seed:
# seeding NumPy alone does not make Keras weight initialisation reproducible.
keras.utils.set_random_seed(SEED)
# Fully connected binary classifier: four hidden layers of 400 units and a
# single sigmoid output unit.
model = keras.Sequential(
    [
        layers.Dense(400, activation="relu", input_shape=[X_train.shape[1]]),
        layers.Dense(400, activation="relu"),
        layers.Dense(400, activation="sigmoid"),
        layers.Dense(400, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ]
)

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
)

In [None]:
# Stop after 10 epochs without an improvement of at least 0.001 and roll the
# model back to its best weights.
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

In [None]:
# NOTE(review): the test split is passed as validation data, so early stopping
# is driven by the same data that is used for the final report below.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    # validation_split=0.2,
    batch_size=512,
    epochs=1000,
    callbacks=[early_stopping],
    verbose=0,  # hide the output because we have so many epochs
)

In [None]:
# One row per epoch, one column per tracked loss / metric.
history_df = pd.DataFrame(history.history)
# Start the plot at epoch 5
# (.loc slices by label, so rows with index >= 5 are plotted)
history_df.loc[5:, ["loss", "val_loss"]].plot()
history_df.loc[5:, ["binary_accuracy", "val_binary_accuracy"]].plot()

# Report the lowest validation loss and the highest validation accuracy; the
# two values are taken independently and may come from different epochs.
print(
    ("Best Validation Loss: {:0.4f}" + "\nBest Validation Accuracy: {:0.4f}").format(
        history_df["val_loss"].min(), history_df["val_binary_accuracy"].max()
    )
)

In [None]:
# Raw outputs of the final sigmoid unit for the test rows.
y_pred = model.predict(X_test)

In [None]:
# Distribution of the predicted classes at a 0.5 threshold.
sns.histplot(y_pred > 0.5, discrete=True)

In [None]:
# Keep a reference to the raw outputs before they are thresholded.
y_pred_old = y_pred

In [None]:
# Turn the outputs into boolean class predictions.
y_pred = y_pred > 0.5

In [None]:
print(classification_report(y_test, y_pred, labels=LABELS))

In [None]:
# Confusion matrix of raw counts (not normalised).
cm = confusion_matrix(y_test, y_pred, labels=LABELS)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=LABELS)
disp.plot()

In [None]:
# Record the network's class-1 F1 score alongside the other models.
nom_modele = "Réseau de Neurones"
ajout_score(model, nom_modele, y_test, y_pred)

# Feature importance

## XGBoost feature importance

In [None]:
# Look the XGBoost estimator up by its display name rather than by list
# position, so that reordering `models` cannot silently select another model.
xgb = next(estimator for estimator, label in models if label == "XGBClassifier")

In [None]:
# Importances of the estimator as fitted in place by evaluate_models.
fi = xgb.feature_importances_

In [None]:
# Single-row frame with one column per feature, named after the columns of X.
fi = pd.DataFrame(fi.reshape((1, len(fi))), columns=X.columns)

In [None]:
fi = fi.sort_values(
    by=0, axis=1, ascending=False
)  # sort the columns according to row 0

In [None]:
plt.figure(figsize=(5, 12))
sns.barplot(fi, orient="h", color="gray")

## Permutation importance

In [None]:
# Permutation importance with 10 shuffles per feature. X_test / y_test are the
# float32 arrays created in the neural-network section.
result = permutation_importance(xgb, X_test, y_test, n_repeats=10, random_state=SEED)

In [None]:
# Count of features whose importance standard deviation is below / above 0.015.
sns.histplot(result.importances_std < 0.015)

In [None]:
pi_results = result.importances_mean

In [None]:
# Single-row frame, one column per feature, sorted by decreasing mean importance.
pi_results = pd.DataFrame(pi_results.reshape((1, len(pi_results))), columns=X.columns)
pi_results = pi_results.sort_values(by=0, axis=1, ascending=False)

In [None]:
plt.figure(figsize=(5, 12))
sns.barplot(pi_results, orient="h", color="gray")

## Mutual Information

### Sans OneHotEncoding

In [None]:
# Label encoding for categoricals
for colname in df.select_dtypes(["object", "category", "bool"]):
    df[colname], _ = df[colname].factorize()

# All discrete features should now have integer dtypes (double-check this before using MI!)
discrete_features = df.dtypes == int

In [None]:
discrete_features.drop("Response", axis=0, inplace=True)

In [None]:
def make_mi_scores(X, y, discrete_features, discrete_target=True):
    """Return the mutual information between each column of X and the target.

    Args:
        X: Feature DataFrame; its column names index the result.
        y: Target values (Series, one-column DataFrame or array).
        discrete_features: Mask or indices of the discrete columns of X,
            forwarded to scikit-learn.
        discrete_target: When True (default) the target is treated as a class
            label and ``mutual_info_classif`` is used; pass False for a
            continuous target to use ``mutual_info_regression``.

    Returns:
        A Series named "MI Scores", sorted from highest to lowest score.
    """
    # The notebook's target is a class label, for which the classification
    # estimator is the appropriate one; the regression estimator stays
    # available through the flag.
    estimator = mutual_info_classif if discrete_target else mutual_info_regression
    mi_scores = estimator(
        X,
        np.ravel(y),  # flatten one-column targets to the expected 1-D shape
        discrete_features=discrete_features,
        random_state=SEED,
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

In [None]:
# MI of every label-encoded column of df (target excluded) with the target y.
mi_scores = make_mi_scores(df.drop(columns=["Response"]), y, discrete_features)

In [None]:
def plot_mi_scores(scores):
    """Draw the scores as a horizontal bar chart on the current figure.

    Bars are ordered by increasing score from the bottom and labelled with
    the index of ``scores``.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


# Tall figure so that every feature label fits on the vertical axis.
plt.figure(figsize=(5, 12))
plot_mi_scores(mi_scores)

### Avec OneHotEncoding

In [None]:
# Label encoding for categoricals
# Label encoding for categoricals (modifies X in place)
for colname in X.select_dtypes(["object", "category", "bool"]):
    X[colname], _ = X[colname].factorize()

# All discrete features should now have integer dtypes (double-check this before using MI!)
# Test for "any integer dtype" instead of comparing with `int`: that comparison
# matches a single, platform-dependent integer width and misses the others.
discrete_features = X.dtypes.apply(pd.api.types.is_integer_dtype)

In [None]:
# MI of every one-hot-encoded feature of X with the target y.
mi_scores = make_mi_scores(X, y, discrete_features)

In [None]:
def plot_mi_scores(scores):
    """Draw the scores as a horizontal bar chart on the current figure.

    Re-declares the helper of the same name from the previous section with
    the same behavior: bars ordered by increasing score from the bottom and
    labelled with the index of ``scores``.
    """
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


# Tall figure so that every feature label fits on the vertical axis.
plt.figure(figsize=(5, 12))
plot_mi_scores(mi_scores)

### Modèles après MI (avec OneHotEncoding)

In [None]:
# Boolean mask: True for the features with a strictly positive MI score.
positive_mi = mi_scores > 0

In [None]:
# Names of the features whose mask is False (False == 0), i.e. MI not positive.
cols_to_drop = positive_mi[positive_mi == 0].index

In [None]:
# NOTE(review): the scores were computed on the columns of X but the drop is
# applied to X_eq; this assumes X_eq contains every column listed — confirm.
X_positive_mi = X_eq.drop(columns=cols_to_drop)

In [None]:
# Overwrites the train/test variables once more, using the under-sampled data
# restricted to the retained features.
X_train, X_test, y_train, y_test = train_test_split(
    X_positive_mi, y_eq, test_size=0.2, random_state=SEED
)

In [None]:
prefix = "positive_mi"
results = evaluate_models(models, prefix, X_train, X_test, y_train, y_test)

In [None]:
# Rank the models by score, best first.
sorted(results, key=lambda x: x[1], reverse=True)

# Sauvegarde des données

In [None]:
# Turn the accumulated [model, metric, value] rows into a table.
score_modeles_df = pd.DataFrame(score_modeles, columns=["Modèle", "Métrique", "Valeur"])

In [None]:
# Write the table without the row index.
# NOTE(review): presumably the data/results/ directory must already exist — confirm.
score_modeles_df.to_csv("data/results/classifications.csv", index=False)