In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform


In [None]:
# Read the semicolon-delimited CSV into a DataFrame; the bare expression on
# the last line renders it as the cell output.
fic_epita_kantar_codes = pd.read_csv(
    'data/fic_epita_kantar_codes.csv',
    delimiter=';',
)
fic_epita_kantar_codes

In [None]:
from utils.utils import select_features

# Column names used as clustering inputs. The numbered "*_slice" families are
# generated rather than typed out; the resulting order is the same as the
# hand-written list it replaces.
a = [
    'A11', 'A12', 'A13', 'A14', 'A4', 'A5', 'A5bis',
    *[f'A8_{i}_slice' for i in range(1, 5)],
    *[f'{question}_{i}_slice' for question in ('B1', 'B2') for i in (1, 2)],
    'B3', 'B4', 'B6',
    *[f'C1_{i}_slice' for i in range(1, 10)],
]

# Build the feature space from those columns.
# NOTE(review): select_features is a project helper not visible here;
# presumably it returns the sub-frame restricted to `a` -- confirm.
features = select_features(fic_epita_kantar_codes, a)
features

In [None]:
# Missing values can be filled with 0 based on the nature of the questions.
#
# fillna() is used in its non-mutating form so the frame handed back by
# select_features is never modified in place (it may be a view of the source
# frame, which would raise SettingWithCopyWarning or silently alter it).
# Wrapping in pd.DataFrame makes the cell safe to re-run: after the first run
# `features` is already an ndarray, which has no .fillna of its own.
features = pd.DataFrame(features).fillna(0).to_numpy()

In [None]:
# Pull the 'weight' column out of the source frame as a NumPy array; it is
# passed to the clustering steps as per-row sample weights.
weights = fic_epita_kantar_codes.loc[:, 'weight'].to_numpy()

# Display the number of weights so it can be compared with the row count.
len(weights)

In [None]:
# Standardize the feature space: fit the scaler on the features, then apply it.
# NOTE(review): the scaling statistics are computed without the sample
# weights, while K-means is later fitted with sample_weight=weights --
# confirm this is intended (StandardScaler.fit also accepts sample_weight).
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

# Show the first standardized row as a quick sanity check.
features_scaled[0]

In [None]:
# Elbow method: fit a weighted K-means for each candidate k and record the
# resulting inertia (weighted within-cluster sum of squares).
inertia = []
k_range = range(2, 11)  # candidate cluster counts: 2 through 10 inclusive

for k in k_range:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4, so leaving it unset gives different inertia
    # curves (and a FutureWarning on 1.2-1.3) depending on the installed
    # version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(features_scaled, sample_weight=weights)
    inertia.append(kmeans.inertia_)

# Plot inertia against k; the "elbow" where the curve flattens suggests a
# reasonable number of clusters.
plt.figure(figsize=(8, 5))
plt.plot(k_range, inertia, marker='o')
plt.title("Elbow Method for Optimal k in K-means")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("Inertia")
plt.xticks(k_range)
plt.show()

In [None]:
from utils.utils import weighted_distance_matrix

# Pairwise distances between the standardized rows, computed with the sample
# weights.
# NOTE(review): weighted_distance_matrix is a project helper not visible
# here; presumably it returns a square, symmetric matrix with a zero
# diagonal, which squareform's default checks require -- confirm.
weighted_distances = weighted_distance_matrix(features_scaled, weights)

# linkage() takes the condensed (flattened upper-triangle) form of a
# precomputed distance matrix, so convert the square matrix first.
condensed_weighted_distances = squareform(weighted_distances, force='tovector')

# Hierarchical clustering on the weighted distances.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for
# Euclidean distances; with precomputed input it is the caller's job to
# ensure that holds -- confirm the weighted distances are Euclidean.
Z = linkage(condensed_weighted_distances, method='ward')
Z

In [None]:
# Draw the dendrogram for the linkage matrix Z. A title and axis labels are
# set so the figure stands on its own, matching the elbow plot earlier in
# the notebook. With orientation='top' the vertical axis carries the linkage
# distance at which clusters are merged.
plt.figure(figsize=(25, 10))
dendrogram(Z, orientation='top')
plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Observations")
plt.ylabel("Linkage distance")
plt.show()