In [40]:
import os
# Cap the parallelism visible to the numeric stack. This lives in the first
# cell so the variables are already set when scikit-learn is imported below.
os.environ.update(
    {
        "LOKY_MAX_CPU_COUNT": "4",  # CPU count reported to loky/joblib workers
        "OMP_NUM_THREADS": "1",  # OpenMP thread limit
    }
)

In [41]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [42]:
# Read the customer table from the working directory and preview ten rows.
df = pd.read_csv(filepath_or_buffer="Mall_Customers.csv")
df.head(n=10)


Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
5,6,Female,22,17,76
6,7,Female,35,18,6
7,8,Female,23,18,94
8,9,Male,64,19,3
9,10,Female,30,19,72


In [43]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(200, 5)

In [44]:
# NOTE(review): stale cell — the recorded run below fails with a ValueError.
# Nothing it needs is set up at this point in the notebook: `train_test_split`
# is never imported, `y` is never assigned in any visible cell, and `X_scaled`
# is only created in a later cell (so the values used here presumably came
# from leftover kernel state). A supervised train/test split is also not used
# by any later cell. Delete this cell, or restore the missing definitions, so
# that Restart Kernel -> Run All succeeds.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

ValueError: Found input variables with inconsistent numbers of samples: [200, 178]

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# dtype of each column in the loaded frame.
df.dtypes

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Raw (unscaled) view of the two columns plotted against each other.
xlabel = "Annual Income (k$)"
ylabel = "Spending Score (1-100)"

fig, ax = plt.subplots()
ax.scatter(df[xlabel], df[ylabel])
ax.set(xlabel=xlabel, ylabel=ylabel, title="Annual Income vs Spending Score")
plt.show()

In [None]:
# Keep only the two columns that the clustering cells below operate on.
X = df.loc[:, ["Annual Income (k$)", "Spending Score (1-100)"]]

In [None]:
# Standardise the selected features with scikit-learn's StandardScaler; the
# fitted scaler is kept so the same transform can be reapplied later.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Elbow method: fit scikit-learn's KMeans for k = 1..10 and record the
# inertia of each fitted model.
K_range = range(1, 11)
inertias = []

for k in K_range:
    km = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled)
    inertias.append(km.inertia_)

# Inertia against k; the bend in this curve suggests a cluster count.
fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(list(K_range), inertias, marker="o")
ax.set_xlabel("Number of clusters (k)")
ax.set_ylabel("Inertia")
ax.set_title("Elbow Method")
plt.show()


In [None]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

    Both arguments must support subtraction with each other (NumPy arrays of
    matching or broadcastable shape, or plain numbers).
    """
    delta = x1 - x2
    squared_total = np.sum(delta ** 2)
    return np.sqrt(squared_total)

In [45]:
class KMeany:
    """From-scratch k-means clustering (Lloyd's algorithm) on NumPy arrays.

    Parameters
    ----------
    k : int, default=3
        Number of clusters / centroids.
    max_iters : int, default=100
        Maximum number of centroid-update rounds performed by ``fit``.
    random_state : int, numpy.random.Generator or None, default=None
        Seed (or generator) used to pick the initial centroids. ``None``
        draws from NumPy's global random state, so a preceding
        ``np.random.seed(...)`` call still controls the result.

    Attributes
    ----------
    centroids : numpy.ndarray of shape (k, n_features), or None before ``fit``
        Current centroid coordinates (always floating point).
    clusters : list of list of int, or None before ``fit``
        ``clusters[i]`` holds the row indices of the training samples whose
        nearest centroid is ``centroids[i]``.
    """

    def __init__(self, k=3, max_iters=100, random_state=None):
        self.k = k
        self.max_iters = max_iters
        self.random_state = random_state
        self.centroids = None
        self.clusters = None

    def _nearest_centroid(self, X):
        """Return the index of the closest centroid for each row of 2-D ``X``."""
        # Broadcast to (n_samples, k, n_features). Squared distances are
        # enough here: sqrt is monotonic, so the argmin is the same.
        diffs = X[:, np.newaxis, :] - self.centroids[np.newaxis, :, :]
        return np.argmin(np.sum(diffs ** 2, axis=2), axis=1)

    def fit(self, X):
        """Estimate ``k`` centroids from the rows of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data; converted to a float array before use.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``k`` is not in ``1..n_samples``.
        """
        # Convert to float so centroid means are never truncated when the
        # caller passes integer data.
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got shape {X.shape}"
            )
        n_samples = X.shape[0]
        if not 1 <= self.k <= n_samples:
            raise ValueError(
                f"k must be between 1 and n_samples={n_samples}; got k={self.k}"
            )

        # Step 1: pick k distinct samples as the initial centroids.
        if self.random_state is None:
            # Legacy path: honours the global np.random seed, as before.
            seed_idx = np.random.choice(n_samples, self.k, replace=False)
        else:
            rng = np.random.default_rng(self.random_state)
            seed_idx = rng.choice(n_samples, self.k, replace=False)
        self.centroids = X[seed_idx]  # fancy indexing already yields a copy

        # Step 2: initial assignment (also keeps max_iters=0 well-defined).
        assignments = self._nearest_centroid(X)

        for _ in range(self.max_iters):
            previous = self.centroids.copy()

            # Step 3: move each centroid to the mean of its members; a
            # centroid with no members keeps its current position.
            for i in range(self.k):
                members = X[assignments == i]
                if len(members):
                    self.centroids[i] = members.mean(axis=0)

            # Step 4: reassign against the updated centroids so the stored
            # clusters always match the final centroids.
            assignments = self._nearest_centroid(X)

            # Step 5: stop once an update leaves every centroid unchanged.
            if np.array_equal(previous, self.centroids):
                break

        self.clusters = [
            np.flatnonzero(assignments == i).tolist() for i in range(self.k)
        ]

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Integer cluster labels in ``0..k-1``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``X`` is not 2-D.
        """
        if self.centroids is None:
            raise RuntimeError("KMeany.predict() called before fit()")
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got shape {X.shape}"
            )
        return self._nearest_centroid(X)


In [46]:
# KMeany.fit picks its initial centroids with NumPy's global random state;
# seeding it here makes the labels and centroids printed below reproducible
# across Restart Kernel -> Run All.
np.random.seed(42)

kmeans = KMeany(k=5)
kmeans.fit(X_scaled)

# Label every (scaled) sample with the index of its nearest fitted centroid.
labels = kmeans.predict(X_scaled)
print("Cluster labels:", labels)
print("Centroids:", kmeans.centroids)

Cluster labels: [3 0 3 0 3 0 3 0 3 0 3 0 3 0 3 0 3 0 3 0 3 0 3 0 3 0 3 0 3 0 3 0 3 0 3 0 3
 0 3 0 3 0 3 4 3 0 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 1 2 1 4 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 4 1 2 1 2 1
 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1]
Centroids: [[-1.32954532  1.13217788]
 [ 0.99158305  1.23950275]
 [ 1.03782678 -1.26622991]
 [-1.30751869 -1.13696536]
 [-0.2088822  -0.01892551]]


In [47]:
def inertia(X, centroids, labels):
    """Return the within-cluster sum of squared distances.

    For each row ``X[i]`` the squared Euclidean distance to its assigned
    centroid ``centroids[labels[i]]`` is added to a running total that starts
    at ``0.0``.
    """
    wcss = 0.0
    n_rows = len(X)
    for row in range(n_rows):
        residual = X[row] - centroids[labels[row]]
        wcss += np.sum(residual ** 2)
    return wcss

In [48]:
# Within-cluster sum of squares of the KMeany labelling on the scaled data.
wcss = inertia(X=X_scaled, centroids=kmeans.centroids, labels=labels)
wcss

np.float64(65.57885579985044)

In [49]:
# Mean silhouette coefficient of the KMeany labelling on the scaled features.
sil = silhouette_score(X_scaled, labels)
print("Silhouette Score:", sil)

Silhouette Score: 0.5539423799434205
