# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Step 1: Load the Mall Customers dataset
# NOTE: this is an absolute, machine-specific Windows path; the CSV must exist at
# exactly this location for the script to run.
data = pd.read_csv("C:\\Users\\91862\\Downloads\\BI_clustering\\6th Prac\\Mall_Customers.csv")

# View the first few rows of the dataset.
# A bare `data.head()` expression is only rendered by a notebook when it is the
# last statement of a cell (and never by a plain script), so print it explicitly.
print(data.head())

# Step 2: Data Cleaning and Preprocessing
# Only the income and spending-score columns are used as clustering features.
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
X = data[feature_columns]

# Standardize the features: learn the scaling parameters from X first, then
# apply them to the same data.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Step 3: Elbow plot to determine the number of clusters
# Fit KMeans once per candidate cluster count and record the inertia of each
# fit; the "elbow" of the resulting curve suggests a reasonable cluster count.
cluster_counts = range(1, 11)  # candidate cluster counts: 1 to 10

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (1.2/1.3 emit a FutureWarning when it is omitted), so
# leaving it unset makes the curve depend on the installed version.
inertia = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled).inertia_
    for k in cluster_counts
]

# Plot the Elbow Curve (the same cluster_counts drives both the fits and the x-axis)
plt.figure(figsize=(8, 6))
plt.plot(cluster_counts, inertia, marker='o')
plt.title('Elbow Method for Optimal Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Step 4: Train KMeans with the chosen number of clusters
# (5 is assumed here after analyzing the elbow plot).
# n_init is pinned explicitly because scikit-learn changed its default from 10
# to 'auto' in version 1.4; without it the clustering can differ by version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# fit_predict fits the model and returns the cluster index assigned to each row
# (the same values that are stored in kmeans.labels_).
labels = kmeans.fit_predict(X_scaled)

# Step 5: Project the scaled features onto two principal components (used only
# for visualization).
# NOTE(review): if X_scaled has just the two feature columns selected above,
# a 2-component PCA rotates the data rather than reducing it - confirm intent.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Step 6: Scatter plot of the samples in the projected 2D space, colored by
# their cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
points = ax.scatter(
    X_pca[:, 0],
    X_pca[:, 1],
    c=labels,
    cmap='viridis',
    marker='o',
)
ax.set_title('KMeans Clustering with PCA Reduction')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
fig.colorbar(points, ax=ax)
plt.show()