# Import des outils / jeu de données

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
)

In [None]:
# Apply seaborn's default theme to every figure drawn below.
sns.set_theme()
# Seed NumPy's global random generator so that reruns draw the same numbers.
np.random.seed(0)

In [None]:
# Feature-engineered dataset, indexed by its "ID" column; the target is read from
# this frame further down.
# NOTE(review): with parse_dates=True pandas only tries to parse the *index* as
# dates, no other column is converted -- confirm this is what is wanted here.
df = pd.read_csv(
    filepath_or_buffer="data/data-cleaned-feature-engineering.csv",
    index_col="ID",
    parse_dates=True,
    sep=",",
)

In [None]:
# Transformed dataset, indexed by its "ID" column; the feature matrix is built
# from this frame further down.
# NOTE(review): with parse_dates=True pandas only tries to parse the *index* as
# dates, no other column is converted -- confirm this is what is wanted here.
df_transforme = pd.read_csv(
    filepath_or_buffer="data/data-transformed.csv",
    index_col="ID",
    parse_dates=True,
    sep=",",
)

## Variables globales

In [None]:
# Names of the columns treated as numeric variables, grouped by name prefix.
var_numeriques = [
    # columns without a common prefix
    "Year_Birth", "Income", "Recency",
    # `Mnt*` columns
    "MntWines", "MntFruits", "MntMeatProducts",
    "MntFishProducts", "MntSweetProducts", "MntGoldProds",
    # `Num*` columns
    "NumDealsPurchases", "NumWebPurchases", "NumCatalogPurchases",
    "NumStorePurchases", "NumWebVisitsMonth",
]

In [None]:
# Names of the columns treated as categorical variables.
var_categoriques = [
    "Education", "Marital_Status",
    "Kidhome", "Teenhome",
    "Complain",
    # `AcceptedCmp*` columns
    "AcceptedCmp1", "AcceptedCmp2", "AcceptedCmp3", "AcceptedCmp4", "AcceptedCmp5",
    # column used as the prediction target in the modelling cells
    "Response",
]

# Autres modèles

In [None]:
# Feature matrix: every column of the transformed dataset except the target,
# with get_dummies one-hot encoding the non-numeric columns.
X = pd.get_dummies(df_transforme.drop(columns=["Response"]))
# Target as a 1-D integer Series (single brackets). scikit-learn classifiers
# expect y of shape (n_samples,); a one-column DataFrame makes several of them
# emit a DataConversionWarning.
# NOTE(review): X comes from df_transforme and y from df -- this assumes both
# files list the same IDs in the same order; confirm.
y = df["Response"].astype(int)

In [None]:
# Hold out 20 % of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

## Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
lda = LinearDiscriminantAnalysis()

In [None]:
lda.fit(X_train, y_train)

In [None]:
y_pred = lda.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, labels=lda.classes_))

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=lda.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lda.classes_)
disp.plot()

## Arbre de décision

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model = DecisionTreeClassifier()

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
model = RandomForestClassifier(random_state=0)

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

## XGBoost

In [None]:
import xgboost

In [None]:
model = xgboost.XGBClassifier(random_state=0)

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()

## Réseau de neurones

In [None]:
X_train = np.asarray(X_train).astype("float32")
y_train = np.asarray(y_train).astype("float32")
X_test = np.asarray(X_test).astype("float32")
y_test = np.asarray(y_test).astype("float32")

In [None]:
X_train.shape

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Small fully connected binary classifier: two hidden ReLU layers of 4 units
# followed by a single sigmoid output unit.
# The input width is read from the training matrix instead of being hard-coded,
# so the cell keeps working when the number of columns produced by get_dummies
# changes.
# NOTE(review): no TensorFlow seed is set in the visible cells, so weight
# initialisation may differ between runs.
model = keras.Sequential(
    [
        keras.Input(shape=(X_train.shape[1],)),
        layers.Dense(4, activation="relu"),
        layers.Dense(4, activation="relu"),
        layers.Dense(1, activation="sigmoid"),
    ]
)

In [None]:
# Binary classification set-up: cross-entropy loss on the sigmoid output,
# Adam optimiser, accuracy tracked as "binary_accuracy".
model.compile(
    loss="binary_crossentropy",
    metrics=["binary_accuracy"],
    optimizer="adam",
)

In [None]:
# Stop training once the monitored quantity (Keras' default, none is given here)
# has not improved by at least 0.001 for 10 epochs in a row, then restore the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    min_delta=0.001,
    patience=10,
)

In [None]:
# Train for at most 1000 epochs; the early_stopping callback may end the run sooner.
# Validation data is taken from the training arrays (validation_split), not
# from X_test / y_test.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=1000,
    batch_size=512,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=0,  # no per-epoch log: there can be very many epochs
)

In [None]:
# One row per epoch, one column per metric recorded by Keras during fit.
history_df = pd.DataFrame(history.history)
# Learning curves, leaving out rows 0-4 (the first five epochs).
history_df[["loss", "val_loss"]].loc[5:].plot()
history_df[["binary_accuracy", "val_binary_accuracy"]].loc[5:].plot()

# Best validation figures over the whole run.
print(
    "Best Validation Loss: {:0.4f}\nBest Validation Accuracy: {:0.4f}".format(
        history_df["val_loss"].min(),
        history_df["val_binary_accuracy"].max(),
    )
)

In [None]:
y_pred = model.predict(X_test)

In [None]:
sns.histplot(y_pred > 0.5, kde=True)

In [None]:
y_pred_old = y_pred

In [None]:
y_pred = y_pred > 0.5

In [None]:
print(classification_report(y_test, y_pred))  # , labels=model.classes_))

In [None]:
cm = confusion_matrix(y_test, y_pred)  # , labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)  # , display_labels=model.classes_)
disp.plot()

## Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
model = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(random_state=0)),
        ("dt", DecisionTreeClassifier(random_state=0)),
        ("lda", LinearDiscriminantAnalysis()),
        ("xgb", xgboost.XGBClassifier(random_state=0)),
    ],
    voting="soft",
)

In [None]:
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred, labels=model.classes_))

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot()