# Import des outils / jeu de données

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [None]:
# Apply seaborn's default theme to every figure drawn in this notebook.
sns.set_theme()
# Seed NumPy's global random generator so code relying on it is repeatable.
np.random.seed(0)

In [None]:
# Load the feature-engineered dataset; rows are indexed by the "ID" column.
# NOTE(review): parse_dates=True asks pandas to try parsing the *index* as
# dates — confirm this is intended for "ID".
df = pd.read_csv(
    filepath_or_buffer="data/data-cleaned-feature-engineering.csv",
    index_col="ID",
    parse_dates=True,
    sep=",",
)

In [None]:
# Load the transformed version of the dataset, also indexed by "ID".
# NOTE(review): parse_dates=True asks pandas to try parsing the *index* as
# dates — confirm this is intended for "ID".
df_transforme = pd.read_csv(
    filepath_or_buffer="data/data-transformed.csv",
    index_col="ID",
    parse_dates=True,
    sep=",",
)

## Variables globales

In [None]:
# Columns treated as numeric by the analyses below (scaling, K-Means, PCA).
var_numeriques = [
    "Year_Birth", "Income", "Recency",
    # Mnt* columns
    "MntWines", "MntFruits", "MntMeatProducts",
    "MntFishProducts", "MntSweetProducts", "MntGoldProds",
    # Num* columns
    "NumDealsPurchases", "NumWebPurchases", "NumCatalogPurchases",
    "NumStorePurchases", "NumWebVisitsMonth",
]
# Columns treated as categorical (used for the multiple correspondence analysis).
var_categoriques = [
    "Education", "Marital_Status", "Kidhome", "Teenhome", "Complain",
    # AcceptedCmp1 ... AcceptedCmp5
    *(f"AcceptedCmp{numero}" for numero in range(1, 6)),
    "Response",
]

# Analyse multi-variée

## Clustering

In [None]:
from sklearn.preprocessing import RobustScaler

# Scale the numeric columns of the transformed dataset with RobustScaler
# before clustering.
scaler = RobustScaler()
df_apres_scale = pd.DataFrame(
    scaler.fit_transform(df_transforme[var_numeriques]),
    # Label rows and columns from the data that was actually scaled, so the
    # result stays aligned with df_transforme instead of receiving a fresh
    # 0..n-1 index and column labels borrowed from another frame.
    columns=var_numeriques,
    index=df_transforme.index,
)

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Two K-Means models, with 3 and 4 clusters respectively.
# random_state is pinned (same value as the PCA / prince models in this
# notebook) so that re-running the cells yields the same clusters regardless
# of how much of NumPy's global random stream was consumed beforehand.
k3 = KMeans(n_clusters=3, random_state=0)
k4 = KMeans(n_clusters=4, random_state=0)

In [None]:
# Fit both K-Means models on the same scaled numeric features.
k3.fit(X=df_apres_scale[var_numeriques])
k4.fit(X=df_apres_scale[var_numeriques])

In [None]:
# Store the 3-cluster labels in df as a "cluster" column.
# The labels are a plain array assigned by position; the model was fitted on
# data derived from df_transforme, so this presumably relies on df and
# df_transforme sharing the same row order — TODO confirm.
df["cluster"] = k3.labels_

In [None]:
# Number of observations per cluster. discrete=True gives each integer label
# its own unit-width bar centred on the label instead of binning the labels
# as if they were a continuous variable.
sns.histplot(df["cluster"], discrete=True)

In [None]:
# Income distribution split by the 3-cluster labels, with a KDE curve per group.
sns.histplot(data=df, x="Income", hue="cluster", kde=True)

In [None]:
# Year_Birth against Income, coloured by the 3-cluster labels.
sns.scatterplot(data=df, x="Year_Birth", y="Income", hue="cluster")

In [None]:
# Display the numeric columns side by side with the assigned cluster label.
df[[*var_numeriques, "cluster"]]

In [None]:
# Overwrite the "cluster" column with the 4-cluster labels (assigned by position).
df["cluster"] = k4.labels_
# Number of observations per cluster. discrete=True gives each integer label
# its own unit-width bar centred on the label instead of binning the labels
# as if they were a continuous variable.
sns.histplot(df["cluster"], discrete=True)

In [None]:
# Income distribution split by the current "cluster" column (4-cluster labels
# at this point), with a KDE curve per group.
sns.histplot(data=df, x="Income", hue="cluster", kde=True)

In [None]:
# Year_Birth against Income, coloured by the current "cluster" column
# (4-cluster labels at this point).
sns.scatterplot(data=df, x="Year_Birth", y="Income", hue="cluster")

## Analyse en Composantes Principales (ACP)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# TODO: center and scale the variables before the PCA (handled in the cells below with StandardScaler)

In [None]:
from sklearn.preprocessing import RobustScaler, StandardScaler

In [None]:
# Use a StandardScaler for the PCA. This rebinds `scaler`, which previously
# held the RobustScaler used for the clustering step.
scaler = StandardScaler()

In [None]:
# Standardize the numeric columns of df for the PCA. This replaces the
# RobustScaler-based df_apres_scale built for the clustering step.
df_apres_scale = pd.DataFrame(
    scaler.fit_transform(df[var_numeriques]),
    columns=var_numeriques,
    # Keep df's index so the scaled rows stay aligned with df and with the
    # PCA coordinates (which are also indexed by df.index).
    index=df.index,
)

In [None]:
# Display the standardized numeric features.
df_apres_scale

In [None]:
# for var in var_numeriques:
#     _, ax = plt.subplots(1, 2, figsize=(8, 2))
#     sns.boxplot(df_apres_scale[var], width=0.25, ax=ax[0])
#     sns.histplot(df_apres_scale[var], kde=True, ax=ax[1])
#     plt.show()

In [None]:
# PCA model with a fixed random_state; n_components is left at its default.
acp = PCA(random_state=0)

In [None]:
# Fit the PCA on the standardized numeric features.
acp.fit(X=df_apres_scale)

In [None]:
# Share of the total variance explained by each principal component.
# The ratios describe components, not original variables, so they are labelled
# CP1, CP2, ... rather than with the input column names (which would wrongly
# suggest that, e.g., the first variable alone explains the first ratio).
variance_expliquee = pd.Series(
    acp.explained_variance_ratio_,
    index=[
        f"CP{rang}" for rang in range(1, len(acp.explained_variance_ratio_) + 1)
    ],
)

In [None]:
# Display the explained-variance ratios taken from the fitted PCA.
variance_expliquee

In [None]:
# Horizontal bar chart of the explained-variance ratios.
variance_expliquee.plot(kind="barh")

In [None]:
# Project the standardized observations onto the principal components.
# The PCA is already fitted on df_apres_scale, so only transform() is needed:
# this avoids a redundant refit and guarantees the coordinates come from the
# same fitted model whose explained variance is inspected in this notebook.
# Columns keep their default integer labels (0 = first component, ...).
df_acp = pd.DataFrame(acp.transform(df_apres_scale), index=df.index)

In [None]:
# Preview the first rows of the PCA coordinates.
df_acp.head()

In [None]:
# Observations in the plane of the first two principal components
# (columns 0 and 1 of df_acp).
sns.scatterplot(data=df_acp, x=0, y=1)

### Cercle de corrélations

In [None]:
from mlxtend.plotting import plot_pca_correlation_graph

In [None]:
# Correlation circle for components 1 and 2, reusing the already computed PCA
# coordinates and explained variances. The figure (first return value) is
# discarded; the second return value is kept as correlation_matrix.
_, correlation_matrix = plot_pca_correlation_graph(
    df_apres_scale,
    df_apres_scale.columns,
    dimensions=(1, 2),
    X_pca=df_acp.iloc[:, :2],
    explained_variance=acp.explained_variance_[:2],
)

In [None]:
# Annotated heatmap of the table returned by plot_pca_correlation_graph,
# drawn on a fixed [-1, 1] colour scale.
sns.heatmap(
    data=correlation_matrix,
    vmin=-1,
    vmax=1,
    cmap="BrBG",
    annot=True,
    linewidths=0.5,
)

### Test clusters

In [None]:
# Attach both K-Means labelings to the PCA coordinates (assigned by position).
# Note: k3 / k4 were fitted earlier on the RobustScaler-scaled df_transforme
# features, not on these PCA coordinates.
df_acp["clusterk3"] = k3.labels_
df_acp["clusterk4"] = k4.labels_

In [None]:
# First two principal components, coloured by the 3-cluster labels.
sns.scatterplot(data=df_acp, x=0, y=1, hue="clusterk3")

In [None]:
# Preview df_acp, which now also carries the clusterk3 / clusterk4 columns.
df_acp.head()

In [None]:
# First two principal components, coloured by df's "Response" column.
sns.scatterplot(data=df_acp, x=0, y=1, hue=df["Response"])

In [None]:
# First and third principal components, coloured by df's "Response" column.
sns.scatterplot(data=df_acp, x=0, y=2, hue=df["Response"])

In [None]:
# First two principal components, coloured by the 4-cluster labels.
sns.scatterplot(data=df_acp, x=0, y=1, hue="clusterk4")

## Analyse Factorielle des Correspondances (AFC)

In [None]:
import prince

In [None]:
# Contingency table: Kidhome values as rows, Teenhome values as columns.
table_contingence = pd.crosstab(index=df["Kidhome"], columns=df["Teenhome"])

In [None]:
# Display the Kidhome x Teenhome contingency table.
table_contingence

In [None]:
# Correspondence analysis of the contingency table; only random_state is set,
# every other option keeps prince's default. Options left disabled here:
#   n_components=3, n_iter=3, copy=True, check_input=True, engine='sklearn'
ca = prince.CA(random_state=0).fit(table_contingence)

In [None]:
# Show the eigenvalues summary of the fitted correspondence analysis.
ca.eigenvalues_summary

In [None]:
# Plot the fitted correspondence analysis for the Kidhome x Teenhome table.
ca.plot(table_contingence)

In [None]:
# Contingency table: Marital_Status values as rows, Education values as columns.
# This replaces the previous Kidhome x Teenhome table.
table_contingence = pd.crosstab(
    index=df["Marital_Status"],
    columns=df["Education"],
)

In [None]:
# Display the Marital_Status x Education contingency table.
table_contingence

In [None]:
# Correspondence analysis of the current contingency table, with a fixed
# random_state; `ca` is rebound to the newly fitted model.
ca = prince.CA(random_state=0).fit(table_contingence)

In [None]:
# Show the eigenvalues summary of the fitted correspondence analysis.
ca.eigenvalues_summary

In [None]:
# Plot the fitted correspondence analysis for the Marital_Status x Education table.
ca.plot(table_contingence)

In [None]:
# TODO: interpret the correspondence analysis results above

## Analyse des Correspondances Multiples (ACM)

In [None]:
# Multiple correspondence analysis on the categorical columns, with a fixed
# random_state.
mca = prince.MCA(random_state=0).fit(df[var_categoriques])

In [None]:
# Plot the fitted multiple correspondence analysis for the categorical columns.
mca.plot(df[var_categoriques])

In [None]:
# TODO: interpret the multiple correspondence analysis results above