# K-Means for Classification

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.metrics import adjusted_rand_score, silhouette_score

## Load data

In [None]:
# Load the dataset from a CSV file in the current working directory.
df = pd.read_csv("Mall_Customers.csv")

In [None]:
# Preview the first three rows of the loaded data.
df.head(3)

In [None]:
# Print a summary of the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics using pandas' default settings.
df.describe()

In [None]:
# Remove the CustomerID column in place; it is not used as a feature.
df.drop(columns=["CustomerID"], inplace=True)

In [None]:
# Display the frame after the CustomerID column has been dropped.
df

## EDA

In [None]:
import math


def plot_all_histograms(df, title_prefix="", n_cols=3, bins=30):
    """Draw a histogram with a KDE overlay for every numeric column of *df*.

    The subplots are arranged on a grid ``n_cols`` wide with as many rows as
    needed, and the figure is displayed with ``plt.show()``.

    Args:
        df: DataFrame whose numeric columns are plotted.
        title_prefix: Text prepended to the column name in each subplot title.
        n_cols: Number of subplots per row.
        bins: Number of histogram bins passed to ``sns.histplot``.

    Returns:
        None. If *df* has no numeric columns nothing is drawn.

    Raises:
        ValueError: If ``n_cols`` is smaller than 1.
    """
    if n_cols < 1:
        raise ValueError(f"n_cols must be at least 1, got {n_cols}")

    num_cols = df.select_dtypes(include=[np.number]).columns
    # No numeric columns would give zero rows, i.e. a zero-height empty
    # figure; skip plotting entirely in that case.
    if len(num_cols) == 0:
        return

    n_rows = math.ceil(len(num_cols) / n_cols)

    # Scale the figure with the grid so each subplot is roughly 5x4 inches.
    plt.figure(figsize=(5 * n_cols, 4 * n_rows))

    # Subplot positions are 1-based, hence enumerate(..., 1).
    for i, col in enumerate(num_cols, 1):
        plt.subplot(n_rows, n_cols, i)
        sns.histplot(df[col], kde=True, bins=bins)
        plt.title(f"{title_prefix}{col}")
        # The title already names the column; drop the axis labels.
        plt.xlabel("")
        plt.ylabel("")

    plt.tight_layout()
    plt.show()

In [None]:
# Histograms of the numeric columns before encoding and scaling.
plot_all_histograms(df)

In [None]:
# Number of rows per Genre value; hue mirrors x so each bar gets its own colour.
sns.countplot(data=df, x="Genre", hue="Genre")

In [None]:
# Correlation matrix
plt.figure(figsize=(10, 8))
numeric_df = df.select_dtypes(include=[np.number])
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', center=0)
plt.title("Correlation Matrix")
plt.show()

## Normalize Features

In [None]:
# Replace each distinct Genre value with an integer code so the column can be
# scaled and clustered together with the numeric features.
label_encoder = LabelEncoder()

df['Genre'] = label_encoder.fit_transform(df['Genre'])

In [None]:
# Rescale every column with MinMaxScaler (default feature range [0, 1]).
# The result is kept in df_scaled and reused by the clustering cells below.
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df)

In [None]:
# Wrap the scaled values back into a DataFrame, reusing the column names.
# NOTE: this rebinds df to the scaled data; the unscaled values are no longer
# reachable under this name.
df = pd.DataFrame(df_scaled, columns = df.columns)
df.head(3)

In [None]:
# Histograms again, now on the encoded and min-max scaled columns.
plot_all_histograms(df)

## K-Means Clustering

In [None]:
# Fit K-Means for k = 2..10 and record, for each k, the inertia and the
# silhouette score of the resulting labelling.
inertia = []
silhouette_scores = []

k_values = range(2, 11)
for k in k_values:
    # n_init is set explicitly to the same value the final model uses, so the
    # scores that drive the choice of k come from the same configuration
    # rather than from the library's version-dependent default.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(df_scaled)
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(df_scaled, kmeans.labels_))

# Plot the elbow curve (inertia against k)
plt.plot(k_values, inertia, marker='o')
plt.xlabel('Số cụm (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method')
plt.show()

# Plot the silhouette score against k
plt.plot(k_values, silhouette_scores, marker='o', color='green')
plt.xlabel('Số cụm (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score cho từng k')
plt.show()


In [None]:
# Select the k with the highest silhouette score and report both values.
best_idx = int(np.argmax(silhouette_scores))
optimal_k = k_values[best_idx]
best_score = silhouette_scores[best_idx]
print(f"Số cluster tối ưu theo Silhouette Score: {optimal_k}")
print(f"Silhouette Score cao nhất: {best_score:.3f}")

In [None]:
# Fit the final model with the selected number of clusters and keep the
# cluster index assigned to each row.
kmeans_final = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = kmeans_final.fit(df_scaled).labels_

In [None]:
# Silhouette score of the final model. Use kmeans_final here: `kmeans` is the
# last model left over from the k sweep and has a different cluster count.
silhouette_score(df_scaled, kmeans_final.labels_)