# Import des outils / jeu de données

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost
from imblearn.over_sampling import SMOTENC
from keras import layers
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier, VotingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import BernoulliNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow import keras

In [None]:
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
np.random.seed(0)
# Apply seaborn's default theme to the figures drawn in this notebook.
sns.set_theme()

In [None]:
# Load data/data-cleaned-feature-engineering.csv, using the "ID" column as index.
# parse_dates=True only asks pandas to try parsing the index as dates.
# NOTE(review): if a date column was meant to be parsed, it has to be named
# explicitly (parse_dates=[...]) — confirm the intent.
df = pd.read_csv(
    "data/data-cleaned-feature-engineering.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

In [None]:
# Load data/data-transformed.csv, using the "ID" column as index. This frame
# supplies the model features below.
# parse_dates=True only asks pandas to try parsing the index as dates.
# NOTE(review): "Dt_Customer" is dropped before modelling, so its dtype does not
# matter here; name it in parse_dates=[...] if it should be a datetime — confirm.
df_transforme = pd.read_csv(
    "data/data-transformed.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

## Variables globales

In [None]:
# Names of the columns treated as numeric variables.
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
var_numeriques = [
    "Year_Birth",
    "Income",
    "Recency",
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
    "NumDealsPurchases",
    "NumWebPurchases",
    "NumCatalogPurchases",
    "NumStorePurchases",
    "NumWebVisitsMonth",
]

In [None]:
# Names of the columns treated as categorical variables (includes the target,
# "Response").
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
var_categoriques = [
    "Education",
    "Marital_Status",
    "Kidhome",
    "Teenhome",
    "AcceptedCmp1",
    "AcceptedCmp2",
    "AcceptedCmp3",
    "AcceptedCmp4",
    "AcceptedCmp5",
    "Response",
]

In [None]:
# Target class labels, in the order passed to the reports and confusion matrices.
LABELS = (0, 1)

## Fonctions et variables utiles

In [None]:
# Accumulates one [model name, metric name, value] row per evaluated model.
# Filled by ajout_score and written to CSV at the end of the notebook.
score_modeles = []

In [None]:
def ajout_score(model, nom_modele, y_test, y_pred):
    """Record the class-1 F1 score of one model in ``score_modeles``.

    Appends ``[nom_modele, "score_f1_classe1", f1]`` to the module-level
    ``score_modeles`` list.

    Args:
        model: Unused; kept so existing callers keep working.
        nom_modele: Name under which the score is stored.
        y_test: True labels.
        y_pred: Predicted labels.
    """
    # Pin the label set with LABELS and look the row up by label name. A
    # positional lookup silently picks the "accuracy" row when one of the two
    # classes is absent from both y_test and y_pred.
    report = classification_report(y_test, y_pred, labels=LABELS, output_dict=True)
    score_f1_classe1 = report[str(LABELS[1])]["f1-score"]

    score_modeles.append([nom_modele, "score_f1_classe1", score_f1_classe1])

In [None]:
def evaluate_models(models, prefix, X_train, X_test, y_train, y_test):
    """Fit and score every model, returning the class-1 F1 score of each.

    Each model is fitted in place on the training data and evaluated on the
    test data. Its score is also recorded through ``ajout_score`` under the
    name ``"{prefix}/{model_name}"``; the prefix distinguishes the different
    data pre-processing strategies.

    Args:
        models: Iterable of ``(estimator, model_name)`` pairs.
        prefix: Label of the pre-processing strategy being evaluated.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A list of ``[name, class-1 F1 score]`` entries, one per model, in the
        order the models were given.
    """
    results = []
    # classification_report keys its per-class rows by the label's string form.
    positive_label = str(LABELS[1])

    for model, model_name in models:
        name = f"{prefix}/{model_name}"

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Look the class-1 row up by label rather than by position, so a class
        # missing from both y_test and y_pred cannot shift the rows.
        report = classification_report(
            y_test, y_pred, labels=LABELS, output_dict=True
        )
        score_f1_classe1 = report[positive_label]["f1-score"]

        results.append([name, score_f1_classe1])
        ajout_score(model, name, y_test, y_pred)

    return results

# Liste des modèles

In [None]:
# Candidate classifiers as [estimator, display name] pairs, consumed by
# evaluate_models. The same estimator instances are refitted for every
# pre-processing strategy.
models = [
    # Two trivial reference classifiers: random guessing and "always class 1".
    [DummyClassifier(strategy="uniform", random_state=0), "DummyClassifier_Uniform"],
    [
        DummyClassifier(strategy="constant", constant=1, random_state=0),
        "DummyClassifier_Constant1",
    ],
    [LogisticRegression(random_state=0), "LogisticRegression"],
    [LinearDiscriminantAnalysis(), "LinearDiscriminantAnalysis"],
    [DecisionTreeClassifier(random_state=0), "DecisionTreeClassifier"],
    [RandomForestClassifier(random_state=0), "RandomForestClassifier"],
    [xgboost.XGBClassifier(random_state=0), "XGBClassifier"],
    [LinearSVC(random_state=0), "LinearSVC"],
    [BernoulliNB(), "BernoulliNB"],
    [ComplementNB(), "ComplementNB"],
    [KNeighborsClassifier(), "KNeighborsClassifier"],
    [
        # Soft-voting ensemble of four of the classifiers above.
        VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(random_state=0)),
                # NOTE(review): keyed "dt" but this is a random forest, not a
                # decision tree.
                ("dt", RandomForestClassifier(random_state=0)),
                ("lda", LinearDiscriminantAnalysis()),
                ("xgb", xgboost.XGBClassifier(random_state=0)),
            ],
            voting="soft",
        ),
        "VotingClassifier",
    ],
]

# Traitement des données

## Par défaut

In [None]:
# Features: every column except the target and the raw date, with categorical
# columns expanded into indicator columns.
X = pd.get_dummies(df_transforme.drop(columns=["Response", "Dt_Customer"]))
# Take the target from the same frame as the features so rows are aligned by
# construction, and keep it 1-D (a Series) rather than an (n, 1) DataFrame.
y = df_transforme["Response"].astype(int)

In [None]:
# Hold out 20% of the rows as a test set, with a fixed seed for repeatability.
# NOTE(review): no stratify= is passed, so the class proportions may differ
# between the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Evaluate every model on the data as-is; scores are stored under "défaut/...".
prefix = "défaut"
results = evaluate_models(models, prefix, X_train, X_test, y_train, y_test)

In [None]:
# Rank the models by class-1 F1 score, best first.
sorted(results, key=lambda x: x[1], reverse=True)

## Équilibrage des classes

### Under-sampling (manuel)

In [None]:
# Randomly keep 350 of the rows whose Response is 0 (fixed seed).
# NOTE(review): 350 is hard-coded, presumably to be close to the number of
# Response == 1 rows — confirm.
samples0 = df_transforme[df_transforme["Response"] == 0].sample(350, random_state=0)

In [None]:
# Stack the sampled class-0 rows with all class-1 rows. Despite its name, X_eq
# still contains the "Response" column at this point.
X_eq = pd.concat((samples0, df_transforme[df_transforme["Response"] == 1]))

In [None]:
# Visual check of the class balance after under-sampling.
X_eq["Response"].hist()

In [None]:
# Split the target off the features; pop removes the column from X_eq in place.
y_eq = X_eq.pop("Response").astype(int)

In [None]:
# Drop the raw date column and expand categorical columns into indicator columns.
X_eq = pd.get_dummies(X_eq.drop(columns=["Dt_Customer"]))

In [None]:
# 80/20 split of the under-sampled data. This overwrites the X_train / X_test /
# y_train / y_test produced for the default run.
X_train, X_test, y_train, y_test = train_test_split(
    X_eq, y_eq, test_size=0.2, random_state=0
)

In [None]:
# Evaluate every model on the under-sampled data; scores are stored under
# "éq_classes/...".
prefix = "éq_classes"
results = evaluate_models(models, prefix, X_train, X_test, y_train, y_test)

In [None]:
# Rank the models by class-1 F1 score, best first.
sorted(results, key=lambda x: x[1], reverse=True)

### Over-sampling (SMOTE)

In [None]:
# SMOTENC over-sampler. categorical_features gives the positional indices of
# the categorical columns in the feature matrix it is fitted on.
# NOTE(review): the indices are hard-coded, so they depend on the column order
# of the training frame — re-check them whenever the features change.
sm = SMOTENC(
    categorical_features=[  # TODO: generate this list automatically...
        2,
        3,
        5,
        6,
        19,
        20,
        21,
        22,
        23,
        24,
        25,
        26,
        27,
    ],
    random_state=0,
)

In [None]:
# Over-sample the training part only; the test part is left untouched.
# NOTE(review): when the cells run top to bottom, X_train / y_train are the
# ones from the under-sampling section above, not the default split — confirm
# this is intended.
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

In [None]:
# Class proportions in the training target after resampling.
y_train_sm.value_counts(normalize=True)

In [None]:
# Train on the over-sampled data and evaluate on the non-resampled test part;
# scores are stored under "SMOTE/...".
prefix = "SMOTE"
results = evaluate_models(models, prefix, X_train_sm, X_test, y_train_sm, y_test)

In [None]:
# Rank the models by class-1 F1 score, best first.
sorted(results, key=lambda x: x[1], reverse=True)

# Réseau de neurones

In [None]:
# Convert the current train/test splits to float32 NumPy arrays before they
# are fed to the neural network.
X_train, y_train, X_test, y_test = [
    np.asarray(split).astype("float32")
    for split in (X_train, y_train, X_test, y_test)
]

In [None]:
# (number of training rows, number of features); the second value is the
# network's input width.
X_train.shape

In [None]:
# Seed Python, NumPy and the Keras backend together: np.random.seed alone does
# not make the weight initialisation reproducible.
keras.utils.set_random_seed(0)
# Fully connected binary classifier: four hidden layers of 400 units each and
# a single sigmoid output unit.
model = keras.Sequential(
    [
        layers.Dense(400, activation="relu", input_shape=[X_train.shape[1]]),
        layers.Dense(400, activation="relu"),
        layers.Dense(400, activation="sigmoid"),
        layers.Dense(400, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ]
)

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, and
# binary accuracy tracked as the metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
)

In [None]:
# Stop training once the monitored quantity (Keras' default, since no monitor=
# is given) has not improved by at least 0.001 for 10 consecutive epochs, then
# restore the weights of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

In [None]:
# Validate on a slice of the training data (Keras takes the last 20%) instead
# of the test set. Early stopping picks and restores the best epoch from the
# validation results, so validating on X_test / y_test would leak test
# information into the model that is scored on that same test set afterwards.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=512,
    epochs=1000,  # upper bound; early stopping normally ends training sooner
    callbacks=[early_stopping],
    verbose=0,  # hide the output because we have so many epochs
)

In [None]:
# One row per epoch, one column per tracked loss/metric.
history_df = pd.DataFrame(history.history)

# Draw the loss curves, then the accuracy curves, starting the plots at epoch 5.
for curve_columns in (
    ["loss", "val_loss"],
    ["binary_accuracy", "val_binary_accuracy"],
):
    history_df.loc[5:, curve_columns].plot()

# Best values reached on the validation data over the whole run.
print(
    "Best Validation Loss: {:0.4f}\nBest Validation Accuracy: {:0.4f}".format(
        history_df["val_loss"].min(), history_df["val_binary_accuracy"].max()
    )
)

In [None]:
# Raw network outputs (sigmoid activations) for the test set.
y_pred = model.predict(X_test)

In [None]:
# Count of test rows on each side of the 0.5 threshold (True = output above 0.5).
sns.histplot(y_pred > 0.5, discrete=True)

In [None]:
# Keep a reference to the raw outputs before y_pred is replaced by the
# thresholded predictions.
y_pred_old = y_pred

In [None]:
# Turn the raw outputs into boolean class predictions with a 0.5 threshold.
y_pred = y_pred > 0.5

In [None]:
# Per-class precision / recall / F1 of the network on the test set.
print(classification_report(y_test, y_pred, labels=LABELS))

In [None]:
# Confusion matrix of the network on the test set (raw counts, since no
# normalize= is passed), drawn with the class labels on the axes.
cm = confusion_matrix(y_test, y_pred, labels=LABELS)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=LABELS)
disp.plot()

In [None]:
# Record the network's class-1 F1 score alongside the other models' scores.
nom_modele = "Réseau de Neurones"
ajout_score(model, nom_modele, y_test, y_pred)

# Sauvegarde des données

In [None]:
# Turn the collected [model, metric, value] rows into a table.
score_modeles_df = pd.DataFrame(score_modeles, columns=["Modèle", "Métrique", "Valeur"])

In [None]:
# Save the scores without the row index.
# NOTE(review): the data/results directory must already exist; to_csv does not
# create missing directories.
score_modeles_df.to_csv("data/results/classifications.csv", index=False)