
# K-Means Clustering and PCA from Scratch using NumPy

This notebook implements:
- Principal Component Analysis (PCA)
- K-Means Clustering

Both algorithms are implemented **from scratch** without using any ML libraries.

Dataset used: Iris Dataset


In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



## Data Loading and Preprocessing

- The label column **Species** (the last column) is removed before the data is passed to the algorithms.
- It is stored separately for later comparison.
- Only numerical features are used.


In [None]:

# Read the raw CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
df = pd.read_csv('/mnt/data/8db8424d-6cf8-451c-af2e-a786508aeadd.csv')

# Feature matrix: every column except the label column.
# NOTE(review): if the CSV also carries a row-index column (e.g. `Id`), it stays
# in `data` and is treated as a feature — confirm against the file.
data = df.drop('Species', axis=1).values

# Label column, kept aside for the later comparison with the clusters.
species = df.loc[:, 'Species'].values

print("Dataset Shape:", data.shape)
df.head()



## Principal Component Analysis (PCA) from Scratch


In [None]:

def PCA_from_scratch(dataset, n_components=3):
    """Project a dataset onto its leading principal components.

    The features are standardized (zero mean, unit population standard
    deviation), the covariance matrix of the standardized data is
    eigendecomposed, and the data is projected onto the eigenvectors with the
    largest eigenvalues.

    Parameters
    ----------
    dataset : array-like of shape (n_samples, n_features)
        Numeric input data.
    n_components : int, default 3
        Number of leading components to project onto. If it exceeds the
        number of features, all components are used.

    Returns
    -------
    projection : ndarray of shape (n_samples, min(n_components, n_features))
        Standardized data expressed in the principal-component basis.
    eigenvalues : ndarray of shape (n_features,)
        All eigenvalues of the covariance matrix, in descending order.
    eigenvectors : ndarray of shape (n_features, n_features)
        Eigenvectors as columns, ordered to match ``eigenvalues``.

    Raises
    ------
    ValueError
        If ``dataset`` is not 2-D, has fewer than two samples, or
        ``n_components`` is smaller than 1.
    """
    dataset = np.asarray(dataset, dtype=float)
    if dataset.ndim != 2:
        raise ValueError("dataset must be 2-D with shape (n_samples, n_features)")
    if dataset.shape[0] < 2:
        raise ValueError("PCA needs at least two samples")
    if n_components < 1:
        raise ValueError("n_components must be at least 1")

    mean = dataset.mean(axis=0)
    std = dataset.std(axis=0)
    # A constant feature has zero spread; dividing by it would fill the matrix
    # with NaN/inf. Leave such a feature centred at 0 instead.
    std = np.where(std == 0, 1.0, std)
    standardized = (dataset - mean) / std

    # atleast_2d: np.cov collapses to a 0-d array for a single feature.
    cov_matrix = np.atleast_2d(np.cov(standardized, rowvar=False))

    # The covariance matrix is symmetric, so use eigh: it guarantees real
    # eigenvalues and orthonormal eigenvectors, which eig does not.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

    # eigh returns ascending order; PCA wants the largest variance first.
    idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]

    # An eigenvector's sign is arbitrary. Fix it so the largest-magnitude entry
    # of every vector is positive, which makes the projection reproducible.
    pivots = np.argmax(np.abs(eigenvectors), axis=0)
    signs = np.sign(eigenvectors[pivots, np.arange(eigenvectors.shape[1])])
    signs[signs == 0] = 1.0
    eigenvectors = eigenvectors * signs

    projection = standardized.dot(eigenvectors[:, :n_components])
    return projection, eigenvalues, eigenvectors

# Run PCA on the feature matrix; keep the projection for plotting and the
# spectrum for inspection.
pca_data, eigenvalues, eigenvectors = PCA_from_scratch(dataset=data)
print("Top 3 Eigenvalues:", eigenvalues[0:3])


In [None]:

# Kept from the original cell; older matplotlib versions need this import to
# register the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D

# 3-D scatter of the samples in the space of the first three components.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection='3d')

ax.scatter(xs=pca_data[:, 0], ys=pca_data[:, 1], zs=pca_data[:, 2])
ax.set(
    title="PCA Projection (First 3 PCs)",
    xlabel="PC1",
    ylabel="PC2",
    zlabel="PC3",
)
plt.show()



## K-Means Clustering from Scratch


In [None]:

def _assign_to_nearest(dataset, centroids):
    """Return, for every sample, the index of its closest centroid (Euclidean)."""
    distances = np.sqrt(((dataset - centroids[:, np.newaxis]) ** 2).sum(axis=2))
    return np.argmin(distances, axis=0)


def KMeans_from_scratch(dataset, k=3, max_iters=100):
    """Cluster ``dataset`` into ``k`` groups with Lloyd's K-Means algorithm.

    Initial centroids are ``k`` distinct samples drawn with a fixed seed (42),
    so repeated calls on the same data give the same result. The draw uses a
    private generator and leaves NumPy's global random state untouched.

    Parameters
    ----------
    dataset : array-like of shape (n_samples, n_features)
        Numeric input data.
    k : int, default 3
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    max_iters : int, default 100
        Maximum number of centroid updates. With ``max_iters <= 0`` the samples
        are simply assigned to the initial centroids.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Index of the centroid each sample is assigned to. Always consistent
        with the returned ``centroids``.
    centroids : ndarray of shape (k, n_features)
        Final cluster centres.

    Raises
    ------
    ValueError
        If ``dataset`` is not 2-D or ``k`` is outside ``[1, n_samples]``.
    """
    dataset = np.asarray(dataset, dtype=float)
    if dataset.ndim != 2:
        raise ValueError("dataset must be 2-D with shape (n_samples, n_features)")
    n_samples = len(dataset)
    if not 1 <= k <= n_samples:
        raise ValueError("k must be between 1 and the number of samples")

    # A local RandomState(42) makes the same draw as np.random.seed(42)
    # followed by np.random.choice, without reseeding the global generator.
    rng = np.random.RandomState(42)
    random_idx = rng.choice(n_samples, k, replace=False)
    centroids = dataset[random_idx]

    # Assign before the loop so `labels` exists even when max_iters is 0.
    labels = _assign_to_nearest(dataset, centroids)

    for _ in range(max_iters):
        new_centroids = centroids.copy()
        for i in range(k):
            members = dataset[labels == i]
            # An empty cluster keeps its previous centre; the mean of zero
            # points is NaN and would poison every later distance.
            if len(members) > 0:
                new_centroids[i] = members.mean(axis=0)

        # Centroids stop moving once the assignment is stable.
        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids
        # Re-assign right after each update so labels always match centroids.
        labels = _assign_to_nearest(dataset, centroids)

    return labels, centroids

# Cluster the feature matrix into three groups (one per expected species).
labels, centroids = KMeans_from_scratch(dataset=data, k=3)



## Cluster Visualization (Predicted vs Actual)


In [None]:

# Samples in the PC1/PC2 plane, coloured by the cluster K-Means assigned.
plt.figure(figsize=(7, 5))
plt.gca().set(
    title="Clusters Predicted by K-Means",
    xlabel="PC1",
    ylabel="PC2",
)
plt.scatter(x=pca_data[:, 0], y=pca_data[:, 1], c=labels)
plt.show()


In [None]:

# Encode each species name as an integer: its position in the sorted list of
# unique names.
species_unique = np.unique(species)
species_map = dict(zip(species_unique, range(len(species_unique))))
actual_labels = np.array([species_map[name] for name in species])

# Same PC1/PC2 view as the K-Means plot, coloured by the true species.
plt.figure(figsize=(7, 5))
plt.gca().set(
    title="Actual Species Distribution",
    xlabel="PC1",
    ylabel="PC2",
)
plt.scatter(x=pca_data[:, 0], y=pca_data[:, 1], c=actual_labels)
plt.show()



## Accuracy Comparison


In [None]:

from itertools import permutations

def calculate_accuracy(true_labels, pred_labels):
    """Best achievable accuracy of a clustering against ground-truth classes.

    Cluster ids are arbitrary, so every one-to-one assignment of predicted
    clusters to true classes is tried and the highest fraction of matching
    samples is returned. Labels may be any values ``np.unique`` can sort
    (integers, strings, ...); they need not be contiguous or start at 0, and
    the two label sets need not share values.

    If there are more clusters than classes, the surplus clusters are matched
    to no class and their samples count as errors.

    The search is exhaustive (factorial in the number of clusters), which is
    only practical for a small number of clusters.

    Parameters
    ----------
    true_labels : array-like of shape (n_samples,)
        Ground-truth class of each sample.
    pred_labels : array-like of shape (n_samples,)
        Cluster id assigned to each sample.

    Returns
    -------
    float
        Accuracy in ``[0, 1]`` under the best cluster-to-class assignment;
        ``0.0`` for empty input.

    Raises
    ------
    ValueError
        If the two label arrays differ in length.
    """
    true_labels = np.asarray(true_labels).ravel()
    pred_labels = np.asarray(pred_labels).ravel()
    if true_labels.shape != pred_labels.shape:
        raise ValueError("true_labels and pred_labels must have the same length")

    n_samples = true_labels.size
    if n_samples == 0:
        return 0.0

    # Re-index both label sets to 0..n-1 so arbitrary label values are handled.
    true_ids, true_idx = np.unique(true_labels, return_inverse=True)
    pred_ids, pred_idx = np.unique(pred_labels, return_inverse=True)

    n_pred = len(pred_ids)
    # Pad with all-zero columns when there are more clusters than classes, so
    # every cluster still gets a (possibly empty) partner.
    n_cols = max(n_pred, len(true_ids))

    # contingency[p, t] = number of samples in cluster p whose true class is t.
    contingency = np.zeros((n_pred, n_cols), dtype=np.int64)
    np.add.at(contingency, (pred_idx, true_idx), 1)

    rows = np.arange(n_pred)
    best_hits = max(
        contingency[rows, list(assignment)].sum()
        for assignment in permutations(range(n_cols), n_pred)
    )
    return best_hits / n_samples

# Score the K-Means clusters against the true species labels.
accuracy = calculate_accuracy(true_labels=actual_labels, pred_labels=labels)
print("Clustering Accuracy:", accuracy)
