# Import des outils / jeu de données

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
)

In [None]:
# Seed NumPy's global random generator and apply seaborn's default theme.
np.random.seed(0)
sns.set_theme()

In [None]:
# Dataset used for the target column ("Response") and the exploratory plots,
# indexed by its "ID" column.
# NOTE(review): parse_dates=True only asks pandas to try parsing the *index*
# as dates, not the data columns — confirm this is what is wanted for "ID".
df = pd.read_csv(
    "data/data-cleaned-feature-engineering.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

In [None]:
# Transformed version of the dataset, used to build the feature matrices.
# NOTE(review): targets are read from df while features come from this frame,
# so both files are assumed to hold the same rows in the same order — confirm.
df_transforme = pd.read_csv(
    "data/data-transformed.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

## Variables globales

In [None]:
# Names of the numeric columns, grouped by naming pattern:
# un-prefixed columns, "Mnt*" columns, then "Num*" columns.
var_numeriques = (
    ["Year_Birth", "Income", "Recency"]
    + [
        "MntWines",
        "MntFruits",
        "MntMeatProducts",
        "MntFishProducts",
        "MntSweetProducts",
        "MntGoldProds",
    ]
    + [
        "NumDealsPurchases",
        "NumWebPurchases",
        "NumCatalogPurchases",
        "NumStorePurchases",
        "NumWebVisitsMonth",
    ]
)

In [None]:
# Names of the categorical columns, ending with the target "Response".
var_categoriques = (
    ["Education", "Marital_Status"]
    + ["Kidhome", "Teenhome"]
    + [
        "AcceptedCmp1",
        "AcceptedCmp2",
        "AcceptedCmp3",
        "AcceptedCmp4",
        "AcceptedCmp5",
    ]
    + ["Response"]
)

In [None]:
# Feature matrix: every column of df_transforme except the target and
# "Dt_Customer"; pd.get_dummies one-hot encodes the remaining non-numeric columns.
X = pd.get_dummies(df_transforme.drop(columns=["Response", "Dt_Customer"]))
# Target as a one-column integer DataFrame, read from df (not df_transforme).
y = df[["Response"]].astype(int)

In [None]:
# 80/20 hold-out split with a fixed seed. The model sections below reuse these
# four variables without splitting again.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Fonctions et variables utiles

In [None]:
# Accumulator of [model name, metric name, value] rows filled by ajout_score
# and exported to CSV at the end of the notebook.
score_modeles = []

In [None]:
def ajout_score(model, nom_modele, y_test, y_pred):
    """Append the F1 scores of the first two classes to ``score_modeles``.

    Two rows are added to the module-level ``score_modeles`` list, one per
    class: ``[nom_modele, "score_f1_classe0", f1]`` followed by
    ``[nom_modele, "score_f1_classe1", f1]``.

    Args:
        model: Unused; kept so that existing calls keep working.
        nom_modele: Label stored in the first column of both rows.
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        None. ``score_modeles`` is extended in place.
    """
    clf_report = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T

    # Select the metric by name rather than by a magic column position.
    f1_par_classe = clf_report["f1-score"]

    # Rows are still taken by position: the per-class entries come first in
    # the report, and their index labels are the stringified class labels,
    # which depend on the label dtype (e.g. "0" vs "0.0").
    # Assumes a two-class problem.
    score_modeles.extend(
        [nom_modele, f"score_f1_classe{rang}", f1_par_classe.iloc[rang]]
        for rang in (0, 1)
    )

# Régression logistique

## Régression logistique simple

In [None]:
sns.boxplot(df, x="Response", y="Recency")

In [None]:
X = df_transforme[["Recency"]]

In [None]:
y = df[["Response"]].astype(int)

In [None]:
log_reg = LogisticRegression(random_state=0)

In [None]:
log_reg.fit(X_train, y_train)

In [None]:
y_pred = log_reg.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=log_reg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=log_reg.classes_)
disp.plot()

In [None]:
print(classification_report(y_test, y_pred, labels=log_reg.classes_))

In [None]:
sns.regplot(
    data=df, x="Income", y="Response", logistic=True, ci=None, line_kws={"color": "red"}
)

## Régression logistique multiple

In [None]:
X = pd.get_dummies(df_transforme.drop(columns=["Response"]))

In [None]:
y = df[["Response"]].astype(int)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Logistic regression with default hyper-parameters and a fixed seed.
model = LogisticRegression(random_state=0)

In [None]:
# Fit on the shared training split.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the shared test split.
y_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
# Confusion matrix with rows/columns ordered as model.classes_.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Store this model's per-class F1 scores in score_modeles.
nom_modele = "Régression logistique"
ajout_score(model, nom_modele, y_test, y_pred)

# Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Linear Discriminant Analysis with default hyper-parameters.
model = LinearDiscriminantAnalysis()

In [None]:
# Fit on the shared training split.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the shared test split.
y_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
# Confusion matrix with rows/columns ordered as model.classes_.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Store this model's per-class F1 scores in score_modeles.
nom_modele = "Linear Discriminant Analysis"
ajout_score(model, nom_modele, y_test, y_pred)

# Arbre de décision

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model = DecisionTreeClassifier()

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
nom_modele = "Decision Tree"
ajout_score(model, nom_modele, y_test, y_pred)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyper-parameters and a fixed seed.
model = RandomForestClassifier(random_state=0)

In [None]:
# Fit on the shared training split.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the shared test split.
y_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
# Confusion matrix with rows/columns ordered as model.classes_.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Store this model's per-class F1 scores in score_modeles.
nom_modele = "Random Forest"
ajout_score(model, nom_modele, y_test, y_pred)

# Support Vector Classifier

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Linear support vector classifier with default hyper-parameters and a fixed seed.
model = LinearSVC(random_state=0)

In [None]:
# Fit on the shared training split.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the shared test split.
y_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
# Confusion matrix with rows/columns ordered as model.classes_.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Store this model's per-class F1 scores in score_modeles.
nom_modele = "Support Vector Classifier"
ajout_score(model, nom_modele, y_test, y_pred)

# Naive Bayes

## Bernoulli Naive Bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB

In [None]:
# Bernoulli naive Bayes with default hyper-parameters.
# NOTE(review): by default BernoulliNB binarizes every feature at 0.0, so
# continuous columns are reduced to "positive or not" — confirm this is intended.
model = BernoulliNB()

In [None]:
# Fit on the shared training split.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the shared test split.
y_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
# Confusion matrix with rows/columns ordered as model.classes_.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Store this model's per-class F1 scores in score_modeles.
nom_modele = "Naive Bayes (Bernoulli)"
ajout_score(model, nom_modele, y_test, y_pred)

## Complement Naive Bayes

In [None]:
from sklearn.naive_bayes import ComplementNB

In [None]:
# Complement naive Bayes with default hyper-parameters.
# NOTE(review): ComplementNB rejects negative feature values — confirm that
# the transformed features are all non-negative.
model = ComplementNB()

In [None]:
# Fit on the shared training split.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the shared test split.
y_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
# Confusion matrix with rows/columns ordered as model.classes_.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Store this model's per-class F1 scores in score_modeles.
nom_modele = "Naive Bayes (Complement)"
ajout_score(model, nom_modele, y_test, y_pred)

# k-NN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# k-nearest neighbours classifier voting over the 4 closest training points.
model = KNeighborsClassifier(n_neighbors=4)

In [None]:
# Fit on the shared training split.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the shared test split.
y_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
# Confusion matrix with rows/columns ordered as model.classes_.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Store this model's per-class F1 scores in score_modeles.
nom_modele = "k-NN"
ajout_score(model, nom_modele, y_test, y_pred)

# XGBoost

In [None]:
import xgboost

In [None]:
# Gradient-boosted trees (XGBoost) with default hyper-parameters and a fixed seed.
model = xgboost.XGBClassifier(random_state=0)

In [None]:
# Fit on the shared training split.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the shared test split.
y_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
# Confusion matrix with rows/columns ordered as model.classes_.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Store this model's per-class F1 scores in score_modeles.
nom_modele = "XGBoost"
ajout_score(model, nom_modele, y_test, y_pred)

# Réseau de neurones

In [None]:
# Convert the split to plain float32 NumPy arrays for the network below.
# NOTE: this rebinds the shared split variables, so every later cell that
# uses X_train / X_test / y_train / y_test (including the Voting Classifier)
# receives these arrays instead of the original DataFrames.
X_train = np.asarray(X_train).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_test = np.asarray(X_test).astype("float32")
y_test = np.asarray(y_test).astype("float32")

In [None]:
# Shape of the training array; its second dimension sizes the input layer.
X_train.shape

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Seed Python, NumPy and TensorFlow in a single call. Seeding NumPy alone does
# not control Keras' weight initialisation, so runs were not reproducible.
keras.utils.set_random_seed(0)

# Fully connected binary classifier: four hidden layers of 400 units and a
# single sigmoid output unit giving a score in [0, 1].
model = keras.Sequential(
    [
        layers.Dense(400, activation="relu", input_shape=[X_train.shape[1]]),
        layers.Dense(400, activation="relu"),
        layers.Dense(400, activation="sigmoid"),
        layers.Dense(400, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ]
)

In [None]:
# Binary cross-entropy loss optimised with Adam; binary accuracy is tracked
# alongside the loss.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
)

In [None]:
# Stop training once the monitored quantity (Keras' default: val_loss) has not
# improved by at least 0.001 for 10 consecutive epochs, then restore the
# weights of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

In [None]:
history = model.fit(
    X_train,
    y_train,
    # Validate on the last 20% of the training data instead of the test set.
    # Early stopping with restore_best_weights selects the model on the
    # validation data, so using the test set there would leak it into model
    # selection and make the final test scores optimistic.
    validation_split=0.2,
    batch_size=512,
    epochs=1000,  # upper bound; early stopping normally ends training sooner
    callbacks=[early_stopping],
    verbose=0,  # hide the output because we have so many epochs
)

In [None]:
# Per-epoch metrics recorded by Keras during fit, one row per epoch.
history_df = pd.DataFrame(history.history)
# Start the plot at epoch 5
history_df.loc[5:, ["loss", "val_loss"]].plot()
history_df.loc[5:, ["binary_accuracy", "val_binary_accuracy"]].plot()

# Best values reached on the validation data over all epochs.
print(
    ("Best Validation Loss: {:0.4f}" + "\nBest Validation Accuracy: {:0.4f}").format(
        history_df["val_loss"].min(), history_df["val_binary_accuracy"].max()
    )
)

In [None]:
# Raw outputs of the final sigmoid unit for the test set.
y_pred = model.predict(X_test)

In [None]:
# Histogram of the thresholded (boolean) predictions.
# NOTE(review): a KDE over boolean values is unusual — possibly the raw scores
# were meant to be plotted; confirm.
sns.histplot(y_pred > 0.5, kde=True)

In [None]:
# Keep the raw scores before they are replaced by class decisions.
y_pred_old = y_pred

In [None]:
# Turn the scores into boolean class decisions with a 0.5 threshold.
y_pred = y_pred > 0.5

In [None]:
# No labels= argument here, unlike the scikit-learn sections: the labels are
# inferred from the data.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix with labels inferred from the data.
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

In [None]:
# Store this model's per-class F1 scores in score_modeles.
nom_modele = "Réseau de Neurones"
ajout_score(model, nom_modele, y_test, y_pred)

# Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Soft-voting ensemble: averages the predicted class probabilities of a
# logistic regression, a decision tree, an LDA and an XGBoost classifier.
model = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(random_state=0)),
        ("dt", DecisionTreeClassifier(random_state=0)),
        ("lda", LinearDiscriminantAnalysis()),
        ("xgb", xgboost.XGBClassifier(random_state=0)),
    ],
    voting="soft",
)

In [None]:
# NOTE: at this point X_train / y_train are the float32 NumPy arrays created
# in the neural-network section, not the original DataFrames.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the test split.
y_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
# Same report as a DataFrame (one row per class / average).
# NOTE(review): clf_report is not used afterwards in the visible notebook.
clf_report = pd.DataFrame(
    classification_report(y_test, y_pred, labels=model.classes_, output_dict=True)
).T

In [None]:
# Confusion matrix with rows/columns ordered as model.classes_.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Store this model's per-class F1 scores in score_modeles.
nom_modele = "Voting Classifier"
ajout_score(model, nom_modele, y_test, y_pred)

# Équilibrage des classes

In [None]:
# Draw 350 rows with Response == 0, with a fixed seed.
# NOTE(review): 350 is hard-coded; presumably chosen close to the number of
# Response == 1 rows — confirm.
samples0 = df_transforme[df_transforme["Response"] == 0].sample(350, random_state=0)

In [None]:
# Balanced dataset: the sampled Response == 0 rows followed by all the
# Response == 1 rows.
Xnew = pd.concat((samples0, df_transforme[df_transforme["Response"] == 1]))

In [None]:
print(Xnew)

In [None]:
# Class counts after balancing.
Xnew["Response"].hist()

In [None]:
# Rebuild features and target from the balanced dataset; here the target comes
# from the same frame as the features.
X = pd.get_dummies(Xnew.drop(columns=["Response", "Dt_Customer"]))
y = Xnew[["Response"]].astype(int)

In [None]:
X

In [None]:
# New 80/20 split with a fixed seed. It replaces the shared split variables
# used by the previous sections, and the test part is drawn from the balanced
# data as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## XGBoost

In [None]:
# XGBoost with default hyper-parameters and a fixed seed, this time on the
# balanced split.
model = xgboost.XGBClassifier(random_state=0)

In [None]:
# Fit on the balanced training split.
model.fit(X_train, y_train)

In [None]:
# Hard class predictions for the balanced test split.
y_pred = model.predict(X_test)

In [None]:
# Per-class precision / recall / F1 on the balanced test split.
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
# Confusion matrix with rows/columns ordered as model.classes_.
# The scores of this section are not added to score_modeles.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

## Neural Network

In [None]:
# Convert the balanced split to plain float32 NumPy arrays for the network
# below (this rebinds the four split variables).
X_train = np.asarray(X_train).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_test = np.asarray(X_test).astype("float32")
y_test = np.asarray(y_test).astype("float32")

In [None]:
# Shape of the training array; its second dimension sizes the input layer.
X_train.shape

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Seed Python, NumPy and TensorFlow in a single call. Seeding NumPy alone does
# not control Keras' weight initialisation, so runs were not reproducible.
keras.utils.set_random_seed(0)

# Fully connected binary classifier: four hidden layers of 400 units and a
# single sigmoid output unit giving a score in [0, 1].
model = keras.Sequential(
    [
        layers.Dense(400, activation="relu", input_shape=[X_train.shape[1]]),
        layers.Dense(400, activation="relu"),
        layers.Dense(400, activation="sigmoid"),
        layers.Dense(400, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ]
)

In [None]:
# Binary cross-entropy loss optimised with Adam; binary accuracy is tracked
# alongside the loss.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
)

In [None]:
# Stop training once the monitored quantity (Keras' default: val_loss) has not
# improved by at least 0.001 for 10 consecutive epochs, then restore the
# weights of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    patience=10,
    min_delta=0.001,
    restore_best_weights=True,
)

In [None]:
history = model.fit(
    X_train,
    y_train,
    # Validate on the last 20% of the training data instead of the test set.
    # Early stopping with restore_best_weights selects the model on the
    # validation data, so using the test set there would leak it into model
    # selection and make the final test scores optimistic.
    validation_split=0.2,
    batch_size=512,
    epochs=1000,  # upper bound; early stopping normally ends training sooner
    callbacks=[early_stopping],
    verbose=0,  # hide the output because we have so many epochs
)

In [None]:
# Per-epoch metrics recorded by Keras during fit, one row per epoch.
history_df = pd.DataFrame(history.history)
# Start the plot at epoch 5
history_df.loc[5:, ["loss", "val_loss"]].plot()
history_df.loc[5:, ["binary_accuracy", "val_binary_accuracy"]].plot()

# Best values reached on the validation data over all epochs.
print(
    ("Best Validation Loss: {:0.4f}" + "\nBest Validation Accuracy: {:0.4f}").format(
        history_df["val_loss"].min(), history_df["val_binary_accuracy"].max()
    )
)

In [None]:
# Raw outputs of the final sigmoid unit for the balanced test set.
y_pred = model.predict(X_test)

In [None]:
# Histogram of the thresholded (boolean) predictions.
# NOTE(review): a KDE over boolean values is unusual — possibly the raw scores
# were meant to be plotted; confirm.
sns.histplot(y_pred > 0.5, kde=True)

In [None]:
# Keep the raw scores before they are replaced by class decisions.
y_pred_old = y_pred

In [None]:
# Turn the scores into boolean class decisions with a 0.5 threshold.
y_pred = y_pred > 0.5

In [None]:
# No labels= argument here, unlike the scikit-learn sections: the labels are
# inferred from the data.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix normalised over the true labels, so each row sums to 1.
# The scores of this section are not added to score_modeles.
cm = confusion_matrix(y_test, y_pred, normalize="true")
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

# Sauvegarde des données

In [None]:
# One row per (model, metric) pair collected by ajout_score.
score_modeles_df = pd.DataFrame(score_modeles, columns=["Modèle", "Métrique", "Valeur"])

In [None]:
# NOTE(review): the "data/results" directory must already exist, and the
# default integer index is written as the first CSV column — confirm that
# readers of this file expect it.
score_modeles_df.to_csv("data/results/classifications.csv")