# Import des outils / jeu de données

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import prince
import seaborn as sns
import statsmodels.api as sm
from mlxtend.plotting import plot_pca_correlation_graph
from scipy.stats import bartlett, shapiro
from sklearn.cluster import DBSCAN, OPTICS, AgglomerativeClustering, KMeans
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import (
    calinski_harabasz_score,
    davies_bouldin_score,
    silhouette_score,
)
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import RobustScaler, StandardScaler
from statsmodels.formula.api import ols

In [None]:
# Single seed shared by the stochastic steps of this notebook (NumPy, PCA,
# prince, KMeans, GaussianMixture) so that results are reproducible.
SEED = 0

In [None]:
# Seed NumPy's global random generator with the shared seed.
np.random.seed(SEED)
# Apply seaborn's default theme to the figures drawn afterwards.
sns.set_theme()

In [None]:
# Load the cleaned, feature-engineered dataset, indexed by its "ID" column.
# NOTE(review): per the pandas documentation, `parse_dates=True` only attempts
# to parse the *index* as dates. If "ID" is a plain identifier this flag is
# probably unintended — confirm, then drop it or list the real date columns.
df = pd.read_csv(
    "data/data-cleaned-feature-engineering.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

In [None]:
# Load the transformed variant of the dataset, also indexed by "ID".
# NOTE(review): this frame is not referenced anywhere else in the visible part
# of the notebook — confirm it is still needed. As for `df`, `parse_dates=True`
# only targets the index; verify that this is intended.
df_transforme = pd.read_csv(
    "data/data-transformed.csv",
    sep=",",
    index_col="ID",
    parse_dates=True,
)

## Variables globales

In [None]:
# Columns of `df` treated as quantitative variables: they are standardised for
# the PCA, concatenated with the MCA coordinates for clustering, and plotted
# per cluster in the visualisation helpers.
var_numeriques = [
    "Year_Birth",
    "Income",
    "Recency",
    "MntWines",
    "MntFruits",
    "MntMeatProducts",
    "MntFishProducts",
    "MntSweetProducts",
    "MntGoldProds",
    "NumDealsPurchases",
    "NumWebPurchases",
    "NumCatalogPurchases",
    "NumStorePurchases",
    "NumWebVisitsMonth",
]

In [None]:
# Columns of `df` treated as qualitative variables: they are the input of the
# multiple correspondence analysis and are plotted per cluster in the
# visualisation helpers.
var_categoriques = [
    "Education",
    "Marital_Status",
    "Kidhome",
    "Teenhome",
    "AcceptedCmp1",
    "AcceptedCmp2",
    "AcceptedCmp3",
    "AcceptedCmp4",
    "AcceptedCmp5",
    "Response",
]

# Analyse multi-variée

## Analyse en Composantes Principales (ACP)

In [None]:
# Preprocessing for the PCA: standardise (centre and scale) the numeric
# columns. No transformer is declared for the other columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), var_numeriques),
    ],
)

In [None]:
# Standardised numeric variables, wrapped back into a DataFrame.
# Both the column names and the original "ID" index are re-attached: without
# `index=df.index` the frame would get a fresh RangeIndex and would no longer
# align with `df` or with the other derived frames, which are all indexed by
# `df.index`.
df_centre_reduit = pd.DataFrame(
    preprocessor.fit_transform(df),
    columns=var_numeriques,
    index=df.index,
)

In [None]:
# PCA estimator with no explicit number of components, seeded with SEED.
acp = PCA(random_state=SEED)

In [None]:
# Fit the PCA on the standardised numeric variables.
acp.fit(df_centre_reduit)

In [None]:
# Share of the total variance carried by each principal component, in
# component order.
variance_expliquee = pd.Series(acp.explained_variance_ratio_)

In [None]:
variance_expliquee

In [None]:
variance_expliquee.plot.barh()

In [None]:
# Coordinates of every observation on the principal axes, indexed like `df`.
# `acp` has already been fitted on `df_centre_reduit`, so `transform` is used
# instead of `fit_transform`, which would refit the whole model a second time.
composantes_principales = pd.DataFrame(
    acp.transform(df_centre_reduit), index=df.index
)

In [None]:
composantes_principales.head()

In [None]:
# Observations on the first two principal components, coloured by the
# "Response" column of `df` (both frames share the same index).
sns.scatterplot(composantes_principales, x=0, y=1, hue=df["Response"])

### Cercle de corrélations

In [None]:
# Correlation circle for principal components 1 and 2. The already computed
# coordinates and variances are passed in, restricted to the first two
# components so that both arguments have matching sizes.
plot_pca_correlation_graph(
    df_centre_reduit,
    df_centre_reduit.columns,
    X_pca=composantes_principales.iloc[:, :2],
    explained_variance=acp.explained_variance_[:2],
    dimensions=(1, 2),
)

In [None]:
# Correlation circle for principal components 3 and 4. The first four
# components are passed in because the highest requested dimension is 4.
# The second returned value (kept as `correlation_matrix`) is reused for the
# heatmap; the first one is discarded.
_, correlation_matrix = plot_pca_correlation_graph(
    df_centre_reduit,
    df_centre_reduit.columns,
    X_pca=composantes_principales.iloc[:, :4],
    explained_variance=acp.explained_variance_[:4],
    dimensions=(3, 4),
)

In [None]:
# Annotated heatmap of `correlation_matrix`; the colour scale is pinned to
# [-1, 1] so that the diverging palette is centred on zero.
plt.figure(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap="BrBG",
    linewidths=0.5,
    vmax=1,
    vmin=-1,
)

## Analyse Factorielle des Correspondances (AFC)

In [None]:
# Contingency table of Kidhome (rows) against Teenhome (columns).
table_contingence = pd.crosstab(df["Kidhome"], df["Teenhome"])

In [None]:
table_contingence

In [None]:
# Correspondence analysis of the current contingency table.
# Only the seed is set; the commented-out keyword arguments below are options
# that are deliberately left unset here.
ca = prince.CA(
    # n_components=3,
    # n_iter=3,
    # copy=True,
    # check_input=True,
    # engine='sklearn',
    random_state=SEED
)

ca = ca.fit(table_contingence)

In [None]:
ca.eigenvalues_summary

In [None]:
ca.plot(table_contingence)

In [None]:
# Contingency table of Marital_Status (rows) against Education (columns);
# this rebinds `table_contingence`, replacing the previous table.
table_contingence = pd.crosstab(df["Marital_Status"], df["Education"])

In [None]:
table_contingence

In [None]:
# Correspondence analysis of the Marital_Status x Education table, seeded.
ca = prince.CA(random_state=SEED)
ca = ca.fit(table_contingence)

In [None]:
ca.eigenvalues_summary

In [None]:
ca.plot(table_contingence)

In [None]:
# todo: à interpréter

## Analyse des Correspondances Multiples (ACM)

In [None]:
# Exploratory multiple correspondence analysis on the categorical variables,
# with no explicit number of components.
mca = prince.MCA(random_state=SEED)
mca = mca.fit(df[var_categoriques])

In [None]:
mca.plot(df[var_categoriques])

In [None]:
# todo: à interpréter

# Clustering

## ACM

In [None]:
# MCA used to turn the categorical variables into numeric coordinates for
# clustering; as many components are requested as there are categorical
# variables.
mca = prince.MCA(n_components=df[var_categoriques].shape[1], random_state=SEED)

In [None]:
# Fit the MCA on the categorical variables.
mca.fit(df[var_categoriques])

In [None]:
# Coordinates of every observation on the MCA components.
X_clust = mca.row_coordinates(df[var_categoriques])

In [None]:
# Rename the MCA coordinate columns after the categorical variables.
# NOTE(review): these columns are MCA components, not the original variables;
# the names only fit because the number of components equals the number of
# categorical variables. Consider neutral names if the columns are ever
# interpreted individually.
X_clust.columns = df[var_categoriques].columns

In [None]:
X_clust.head()

## Fusion des coordonnées de l'ACM et des variables quantitatives

In [None]:
# Clustering feature matrix: raw numeric variables side by side with the MCA
# coordinates (column-wise concatenation, aligned on the index).
X_clust = pd.concat((df[var_numeriques], X_clust), axis=1)

In [None]:
X_clust.head()

In [None]:
# Rescale numeric variables and MCA coordinates together with RobustScaler,
# then rebuild a DataFrame that keeps the column names and the index of `df`.
scaler = RobustScaler()
df_apres_scale = pd.DataFrame(
    scaler.fit_transform(X_clust),
    columns=X_clust.columns,
    index=df.index,
)

In [None]:
df_apres_scale.head()

In [None]:
# Working copy that receives one label column per clustering model, so that
# `df_apres_scale` stays a pure feature matrix for fitting and scoring.
df_avec_clusters = df_apres_scale.copy()

## Différents algorithmes de clustering

In [None]:
# Range of cluster counts tried by the models that take a number of clusters.
NB_CLUSTER_MIN = 2
NB_CLUSTER_MAX = 6  # exclusive upper bound

In [None]:
def _modeles_par_nb_clusters(prefixe, fabrique):
    """Return ``{f"{prefixe}{k}": fabrique(k)}`` for every tested cluster count.

    ``k`` ranges over ``[NB_CLUSTER_MIN, NB_CLUSTER_MAX)``; ``fabrique`` builds
    a fresh, unfitted estimator for a given ``k``.
    """
    return {
        f"{prefixe}{k}": fabrique(k)
        for k in range(NB_CLUSTER_MIN, NB_CLUSTER_MAX)
    }


# K-means, seeded.
dict_kmeans = _modeles_par_nb_clusters(
    "KMeans",
    lambda k: KMeans(n_clusters=k, random_state=SEED),
)

# Gaussian mixtures with one full covariance matrix per component, seeded.
dict_gmm = _modeles_par_nb_clusters(
    "GMM",
    lambda k: GaussianMixture(
        n_components=k, covariance_type="full", random_state=SEED
    ),
)

# Agglomerative clustering with the estimator's default linkage.
dict_cah_ward = _modeles_par_nb_clusters(
    "CAH (Ward) ",
    lambda k: AgglomerativeClustering(n_clusters=k),
)

# Agglomerative clustering, average linkage.
dict_cah_average = _modeles_par_nb_clusters(
    "CAH (average linkage) ",
    lambda k: AgglomerativeClustering(n_clusters=k, linkage="average"),
)

# Agglomerative clustering, single linkage.
dict_cah_simple = _modeles_par_nb_clusters(
    "CAH (single linkage) ",
    lambda k: AgglomerativeClustering(n_clusters=k, linkage="single"),
)

# Agglomerative clustering, complete linkage.
dict_cah_complete = _modeles_par_nb_clusters(
    "CAH (complete linkage) ",
    lambda k: AgglomerativeClustering(n_clusters=k, linkage="complete"),
)

In [None]:
# Every candidate model keyed by its display name. The unpacking order fixes
# the order in which the models are fitted and reported.
# OPTICS is added once, with default parameters and no number of clusters.
model_clusters = {
    **dict_kmeans,
    **dict_gmm,
    **dict_cah_ward,
    **dict_cah_average,
    **dict_cah_simple,
    **dict_cah_complete,
    "OPTICS": OPTICS(),
}

In [None]:
# Fit every candidate model on the scaled features, store its labels as a new
# categorical column of `df_avec_clusters`, and collect one row of quality
# metrics per model:
#   [name, cluster shares, silhouette, Calinski-Harabasz, Davies-Bouldin]
cluster_metrics = []

for model_name, model in model_clusters.items():
    # `fit_predict` is available on all the estimators used here (clusterers
    # and GaussianMixture alike), so no per-class special case is needed.
    raw_labels = model.fit_predict(df_apres_scale)

    # Labels are stored as strings in a categorical column so that plotting
    # libraries treat them as discrete groups rather than as numbers.
    df_avec_clusters[model_name] = pd.Categorical(
        pd.Series(raw_labels).astype(str)
    )
    labels = df_avec_clusters[model_name]

    # Share of observations per cluster, largest first. The conversion to
    # `str` is required because `str.join` only accepts strings.
    repartition = labels.value_counts(normalize=True).round(2).astype(str)

    # NOTE(review): OPTICS may emit a noise label; it is scored here as if it
    # were an ordinary cluster — confirm that this is the intended reading.
    if labels.nunique() >= 2:
        scores = [
            # silhouette: closer to 1 is better
            silhouette_score(df_apres_scale, labels),
            # Calinski-Harabasz: higher is better
            calinski_harabasz_score(df_apres_scale, labels),
            # Davies-Bouldin: closer to 0 is better
            davies_bouldin_score(df_apres_scale, labels),
        ]
    else:
        # The three scores are undefined for fewer than two clusters and
        # would raise; record NaN instead of aborting the whole comparison.
        scores = [np.nan, np.nan, np.nan]

    cluster_metrics.append([model_name, " | ".join(repartition), *scores])

In [None]:
# Summary table, one row per model. The column labels follow the order of
# the values appended to each `cluster_metrics` row.
pd.DataFrame(
    cluster_metrics,
    columns=[
        "Algorithme de clustering",
        "Répartition",
        "Silhouette",
        "Calinski-Harabasz",
        "Davies-Bouldin",
    ],
)

## Visualisation

In [None]:
def affiche_taille_clusters(nom_cluster):
    """Plot the size of each cluster of one labelling.

    ``nom_cluster`` is the name of a label column of ``df_avec_clusters``.
    """
    axe = sns.histplot(df_avec_clusters[nom_cluster], shrink=0.5)
    axe.set_title("Taille des clusters")

    plt.show()

In [None]:
def affiche_clusters_acp(nom_cluster):
    """Scatter the observations on two PCA planes, coloured by cluster.

    The left panel uses principal components 0 and 1, the right panel
    components 2 and 3. ``nom_cluster`` is the name of a label column of
    ``df_avec_clusters``.
    """
    etiquettes = df_avec_clusters[nom_cluster]
    # (panel title, x component, y component) for each panel, left to right.
    plans = (
        ("Clusters sur les composantes principales 0-1", 0, 1),
        ("Clusters sur les composantes principales 2-3", 2, 3),
    )

    _, axes = plt.subplots(1, 2, figsize=(12, 5))

    for axe, (titre, abscisse, ordonnee) in zip(axes, plans):
        axe.set_title(titre)
        sns.scatterplot(
            composantes_principales,
            x=abscisse,
            y=ordonnee,
            hue=etiquettes,
            alpha=0.8,
            ax=axe,
        )

    plt.show()

In [None]:
def affiche_clusters_var_quanti(nom_cluster):
    """Plot each quantitative variable broken down by cluster.

    One figure is shown per variable of ``var_numeriques``: a box plot per
    cluster on the left, and per-cluster histograms with a density curve on
    the right, each cluster being normalised on its own.
    """
    etiquettes = df_avec_clusters[nom_cluster]

    for nom_var in var_numeriques:
        valeurs = df[nom_var]
        _, (axe_boites, axe_histo) = plt.subplots(1, 2, figsize=(10, 3))

        sns.boxplot(x=valeurs, y=etiquettes, width=0.25, ax=axe_boites)
        sns.histplot(
            x=valeurs,
            hue=etiquettes,
            kde=True,
            stat="probability",
            common_norm=False,
            ax=axe_histo,
        )

        plt.show()

In [None]:
def affiche_clusters_var_quali(nom_cluster):
    """Plot each qualitative variable against the clusters, both ways round.

    One figure is shown per variable of ``var_categoriques``: on the left the
    variable's categories split by cluster, on the right the clusters split
    by category.
    """
    etiquettes = df_avec_clusters[nom_cluster]
    # Options shared by the two panels.
    options_communes = {"multiple": "dodge", "shrink": 0.5, "common_norm": True}

    for nom_var in var_categoriques:
        # Cast to str so that numeric codes are handled as categories.
        modalites = df[nom_var].astype(str)
        _, (axe_gauche, axe_droit) = plt.subplots(1, 2, figsize=(10, 4))

        sns.histplot(x=modalites, hue=etiquettes, ax=axe_gauche, **options_communes)
        sns.histplot(x=etiquettes, hue=modalites, ax=axe_droit, **options_communes)

        plt.show()

In [None]:
def affiche_clusters(nom_cluster):
    """Show every cluster visualisation for one labelling.

    The views are drawn in this order: cluster sizes, PCA planes,
    quantitative variables, qualitative variables.
    """
    vues = (
        affiche_taille_clusters,
        affiche_clusters_acp,
        affiche_clusters_var_quanti,
        affiche_clusters_var_quali,
    )
    for vue in vues:
        vue(nom_cluster)

In [None]:
affiche_clusters("KMeans2")

In [None]:
affiche_clusters("KMeans4")

In [None]:
affiche_clusters("CAH (average linkage) 2")

# Sauvegarde du Dataframe

In [None]:
# cluster3 = pd.DataFrame(k3.labels_, columns=["cluster3"])

In [None]:
# cluster3.to_csv("data/clusters-kmeans3.csv")