# <center>SEGMENTATION DE LA CLIENTELE D'UNE ENTREPRISE.</center>

### OBJECTIF: 
       Apprendre à analyser un ensemble de données de transactions clients afin d'interpréter le comportement
       des clients d'un fournisseur.

### TACHES: 
   * Analyse exploratoire des données
   * Analyse en composantes principales
   * Création des clusters

## ANALYSE EXPLORATOIRE DES DONNEES

### IMPORTATION

In [None]:
from warnings import filterwarnings
filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, KernelPCA
from scipy.stats import pearsonr
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

### DONNEES

In [None]:
# Load the customer data from "customers.csv" (path relative to the working directory).
df = pd.read_csv("customers.csv")
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Print the column names, dtypes and non-null counts of the loaded table.
df.info()

In [None]:
# Summary statistics per column, transposed so that each column becomes a row.
df.describe().T

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Number of rows for each distinct value of "Channel".
df["Channel"].value_counts()

In [None]:
# Share of rows per "Region" value, in percent (normalized frequencies times 100).
100 * df["Region"].value_counts(normalize = True)

In [None]:
# Rename the "Delicassen" column to "Delicatessen".
df = df.rename(columns={"Delicassen": "Delicatessen"})

# Turn the numeric region codes into readable labels.
# `replace` leaves any value without an entry in the mapping untouched, so
# re-running this cell keeps the labels. With `map`, a second run would turn
# the whole column into NaN because the labels are no longer keys of the dict.
df["Region"] = df["Region"].replace({1: "Lisbon", 2: "Porto", 3: "Other"})

In [None]:
# Keep the column names in a plain Python list and display it.
colonnes = list(df.columns)
colonnes

In [None]:
# Draw one figure per column: a count plot for the two categorical columns
# and a histogram for every other column.
categorical_columns = ("Channel", "Region")

for colonne in colonnes:
    if colonne in categorical_columns:
        # NOTE(review): recent seaborn releases deprecate `palette` without
        # `hue` on categorical plots; confirm the target version before
        # changing this call.
        axes = sns.countplot(y=df[colonne], palette='cool')
    else:
        # `palette` is dropped here: it only applies when `hue` is set, so it
        # had no effect on the histogram.
        axes = sns.histplot(x=df[colonne])
    axes.set_title(colonne)
    # Show (and close) the current figure so the next column gets its own.
    plt.show()

In [None]:
# Seaborn pair plot of df: pairwise relationships between its columns.
# The trailing semicolon keeps the notebook from echoing the returned object.
sns.pairplot(df);

## ANALYSE EN COMPOSANTES PRINCIPALES

In [None]:
# Remove the two categorical descriptors so that only the remaining columns
# feed the component analysis, then preview the result.
data = df.drop(columns=["Region", "Channel"])
data.head()

In [None]:
# Standardization: learn each column's mean and standard deviation, then
# rescale the columns with them. From here on `data` holds the scaler's
# output instead of the original DataFrame.
scaler = StandardScaler()
scaler.fit(data)
data = scaler.transform(data)
data

In [None]:
# Linear PCA: project the standardized data onto two principal components.

pca = PCA(n_components = 2)
res_pca = pca.fit_transform(data)
res_pca

In [None]:
# Scatter plot of the observations in the plane of the two PCA components.
plt.scatter(res_pca[:, 0], res_pca[:, 1])
# Axis labels corrected: they previously read "composate".
plt.xlabel("composante_1")
plt.ylabel("composante_2")
# The trailing semicolon keeps the notebook from echoing the returned object.
plt.title("PC1 vs PC2");

In [None]:
# Pearson correlation coefficient between the two PCA components.
# pearsonr returns the coefficient first and the p-value second; only the
# coefficient is displayed.
coefficient, _p_value = pearsonr(res_pca[:, 0], res_pca[:, 1])
coefficient

In [None]:
# Non-linear PCA: kernel PCA with an RBF kernel, reduced to two components.
# random_state is pinned because scikit-learn uses it whenever the ARPACK or
# randomized eigen-solver is selected. Without it the embedding, and the
# clusters built on top of it, may differ from one run to the next.
kpca = KernelPCA(n_components=2, kernel="rbf", random_state=42)
res_kpca_rbf = kpca.fit_transform(data)
res_kpca_rbf

In [None]:
# Scatter plot of the observations in the plane of the two kernel-PCA components.
plt.scatter(res_kpca_rbf[:, 0], res_kpca_rbf[:, 1])
# Axis labels corrected: they previously read "composate".
plt.xlabel("composante_1")
plt.ylabel("composante_2")
# The trailing semicolon keeps the notebook from echoing the returned object.
plt.title("PC1 vs PC2");

In [None]:
# Append the two kernel-PCA coordinates to the dataset as new columns.
for component_index, column_name in enumerate(("x_kpca_rbf", "y_kpca_rbf")):
    df[column_name] = res_kpca_rbf[:, component_index]

In [None]:
# Preview the dataset, which now also carries the two kernel-PCA columns.
df.head()

## CREATION DES CLUSTERS (KMEANS)

In [None]:
# Choose the number of clusters with the elbow method.
# random_state is fixed so that the K-Means initialization, and therefore the
# elbow curve, is the same every time the cell is run.
kmeans = KMeans(random_state=42)
# NOTE(review): yellowbrick documents a 2-tuple `k` as a half-open range,
# i.e. k = 2..11 here; confirm against the installed version.
visualizer = KElbowVisualizer(kmeans, k=(2, 12))
visualizer.fit(res_kpca_rbf)
# The trailing semicolon keeps the notebook from echoing the returned object.
visualizer.show();

In [None]:
# Final K-Means model with 5 clusters on the kernel-PCA coordinates
# (presumably the value read off the elbow plot; confirm).
# random_state makes the cluster ids reproducible, which matters because the
# exported summary is indexed by them.
model = KMeans(n_clusters=5, random_state=42)
# fit_predict fits the model and returns the label of every sample in one
# call, replacing the separate fit() and predict() on the same data.
cluster = model.fit_predict(res_kpca_rbf)
cluster

In [None]:
# Kernel-PCA scatter plot with each observation colored by its cluster id.
plt.scatter(res_kpca_rbf[:, 0], res_kpca_rbf[:, 1], c=cluster)
# Axis labels corrected: they previously read "composate".
plt.xlabel("composante_1")
plt.ylabel("composante_2")
# The trailing semicolon keeps the notebook from echoing the returned object.
plt.title("PC1 vs PC2");

In [None]:
# Store each row's cluster id in a new "cluster" column.
df["cluster"] = cluster

In [None]:
# Preview the dataset, which now also carries the "cluster" column.
df.head()

In [None]:
# List the distinct cluster ids present in the dataset.
df["cluster"].unique()

### CONCLUSION

In [None]:
# Mean of every remaining column for each (cluster, Region) pair.
# The kernel-PCA coordinates and "Channel" are removed first so that they are
# not averaged.
profile_columns = df.drop(columns=["x_kpca_rbf", "y_kpca_rbf", "Channel"])
result = profile_columns.groupby(["cluster", "Region"]).mean()
result

In [None]:
# Write the grouped means to "result.csv" (path relative to the working directory).
result.to_csv("result.csv")