In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import os
import time
from numba import cuda
import math

# Load the car-labelling CSV (semicolon-separated, UTF-8) and keep only the
# three numeric columns used for clustering, dropping rows with missing values.
DATA_PATH = "../data/ADEME-CarLabelling.csv"
colonnes_utiles = ['Poids à vide', 'Puissance fiscale', 'Prix véhicule']

df = (
    pd.read_csv(DATA_PATH, sep=';', encoding='utf-8')
    .loc[:, colonnes_utiles]
    .dropna()
)

# Keep rows strictly above the lower bound on each of the three columns.
poids_ok = df['Poids à vide'] > 400
puissance_ok = df['Puissance fiscale'] > 1
prix_ok = df['Prix véhicule'] > 1000
df = df[poids_ok & puissance_ok & prix_ok]

# Stack ten copies of the filtered rows and renumber the index from 0
# (presumably to enlarge the workload for the timing below; confirm intent).
df = pd.concat([df for _ in range(10)], ignore_index=True)

# Standardise every column: subtract its mean, divide by its standard deviation.
X = df.to_numpy().astype(np.float32)
X_mean = X.mean(axis=0)
X_std = X.std(axis=0)
X_scaled = (X - X_mean) / X_std


In [None]:
# Number of clusters, and the dimensions of the standardised data matrix.
k = 4
n_samples, n_features = X_scaled.shape

# Seed NumPy's global generator, then draw k distinct row indices and use
# those rows as the starting centroids.
np.random.seed(42)
seed_rows = np.random.choice(n_samples, k, replace=False)
centroids = X_scaled[seed_rows]

@cuda.jit
def cuda_assign_clusters(data, centroids, labels):
    """Store in ``labels[row]`` the index of the centroid closest to ``data[row]``.

    Each thread takes its 1-D grid position as a row index and does nothing
    when that index is past the last row of ``data``. Closeness is the sum of
    squared per-feature differences. A centroid only replaces the current best
    when its distance is strictly smaller, so ties keep the lower index. If no
    distance is below 1e20 the stored label is -1.

    Parameters
    ----------
    data : 2-D array indexed as ``data[row, feature]``.
    centroids : 2-D array indexed as ``centroids[cluster, feature]``.
    labels : 1-D output array indexed by row.
    """
    row = cuda.grid(1)
    if row >= data.shape[0]:
        return

    n_centroids = centroids.shape[0]
    n_dims = data.shape[1]

    nearest = -1
    nearest_sq = 1e20
    for c in range(n_centroids):
        sq = 0.0
        for d in range(n_dims):
            delta = data[row, d] - centroids[c, d]
            sq += delta * delta
        if sq < nearest_sq:
            nearest_sq = sq
            nearest = c

    labels[row] = nearest



In [None]:
def _update_centroids(data, labels, previous_centroids):
    """Return the mean of the rows assigned to each cluster.

    A cluster that received no rows keeps its previous centroid instead of
    being reset to the all-zero vector.

    Parameters
    ----------
    data : ndarray, indexed as ``data[sample, feature]``.
    labels : integer ndarray holding one cluster index per row of ``data``.
    previous_centroids : ndarray, indexed as ``[cluster, feature]``; the
        centroids that produced ``labels``.

    Returns
    -------
    ndarray with the same shape and dtype as ``previous_centroids``.
    """
    sums = np.zeros_like(previous_centroids)
    counts = np.zeros(previous_centroids.shape[0], dtype=np.int64)
    # ufunc.at accumulates every occurrence of a repeated index, which a plain
    # ``sums[labels] += data`` would not do.
    np.add.at(sums, labels, data)
    np.add.at(counts, labels, 1)

    updated = previous_centroids.copy()
    occupied = counts > 0
    updated[occupied] = sums[occupied] / counts[occupied, None]
    return updated


# NOTE: the measured time covers host-to-device transfers and every iteration.
# If this is the first launch of the kernel, Numba's JIT compilation is also
# included in the figure.
start_time = time.time()

# Copy the inputs to the GPU once; only the centroids are re-uploaded later.
d_data = cuda.to_device(X_scaled)
d_centroids = cuda.to_device(centroids)
d_labels = cuda.device_array(n_samples, dtype=np.int32)

# Launch enough blocks for one thread per sample; the kernel skips extra threads.
threads_per_block = 128
blocks_per_grid = math.ceil(n_samples / threads_per_block)

max_iter = 100
for _ in range(max_iter):
    # Assignment step on the GPU, then bring the labels back to the host.
    cuda_assign_clusters[blocks_per_grid, threads_per_block](d_data, d_centroids, d_labels)
    labels = d_labels.copy_to_host()

    # Update step on the CPU.
    new_centroids = _update_centroids(X_scaled, labels, centroids)

    # Stop once the centroids no longer move (within np.allclose tolerances).
    if np.allclose(centroids, new_centroids):
        break
    d_centroids = cuda.to_device(new_centroids)
    centroids = new_centroids

end_time = time.time()
print(f"Temps d'exécution total : {end_time - start_time:.2f} secondes")




In [None]:
# 3-D scatter of the first three columns of the standardised data, coloured
# by cluster label.
fig, ax = plt.subplots(figsize=(10, 7), subplot_kw={'projection': '3d'})
xs, ys, zs = X_scaled[:, 0], X_scaled[:, 1], X_scaled[:, 2]
ax.scatter(xs, ys, zs, c=labels, cmap='viridis', s=1)
ax.set(
    xlabel='Poids à vide',
    ylabel='Puissance fiscale',
    zlabel='Prix véhicule',
)
ax.set_title(f"Clustering K-Means (k={k}) des véhicules (CUDA simple)")
fig.tight_layout()
plt.show()

# Attach each row's cluster label, then display the per-cluster column means.
df['Cluster'] = labels
df.groupby('Cluster')[colonnes_utiles].mean()
