## Data Loading and Preprocessing
 Load the Iris dataset, separate the species labels, drop unnecessary columns, and standardize the features.


In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from collections import Counter

# Read the Iris CSV, keep the species column aside as ground truth, and retain
# only the measurement columns as features.
df = pd.read_csv('Iris Dataset.csv')
species_labels = df['Species'].copy()
data = df.drop(columns=['Id', 'Species'])

# Standardize the data
def standardize(data):
    """Scale every feature to zero mean and unit variance (z-score).

    Statistics are computed along axis 0, i.e. one mean and one standard
    deviation per column when ``data`` is a 2-D samples-by-features array.

    Parameters
    ----------
    data : array-like
        Numeric data with samples along axis 0.

    Returns
    -------
    Standardized data with the same shape as ``data``. A constant feature
    (standard deviation of exactly 0) is centered only, so it comes back as
    all zeros instead of NaN.
    """
    mean = np.mean(data, axis=0)
    std = np.std(data, axis=0)
    # Dividing by a zero std would turn a constant column into 0/0 = NaN and
    # poison every downstream computation; a divisor of 1 keeps it at 0.
    safe_std = np.where(std == 0, 1.0, std)
    return (data - mean) / safe_std

# Extract the feature table as a plain NumPy array, then z-score it.
X = data.to_numpy()
X_std = standardize(X)

## Principal Component Analysis
 Perform PCA to reduce the dataset to 3 dimensions and analyze the variance explained by each principal component.


In [23]:
def pca(X, n_components=3):
    """Project ``X`` onto its leading principal components.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Input data. It is mean-centered internally before projection, so the
        result is the same whether or not the caller centered it already.
    n_components : int, default 3
        Number of leading components to keep.

    Returns
    -------
    X_pca : ndarray, shape (n_samples, n_components)
        Centered data expressed in the principal-component basis.
    eigenvalues : ndarray, shape (n_features,)
        All eigenvalues of the covariance matrix, sorted in descending order.
    principal_components : ndarray, shape (n_features, n_components)
        Eigenvectors (as columns) belonging to the largest eigenvalues.
    """
    X = np.asarray(X, dtype=float)
    # np.cov centers the data internally; center the projected data as well so
    # the scores are consistent with the eigen-decomposition.
    X_centered = X - X.mean(axis=0)
    # atleast_2d: np.cov collapses to a 0-d array when there is one feature.
    cov_matrix = np.atleast_2d(np.cov(X_centered.T))
    # A covariance matrix is symmetric, so use eigh: it guarantees real
    # eigenvalues and orthonormal eigenvectors, whereas the general-purpose
    # eig may return complex values contaminated by round-off.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
    sorted_idx = np.argsort(eigenvalues)[::-1]
    eigenvalues = eigenvalues[sorted_idx]
    eigenvectors = eigenvectors[:, sorted_idx]
    principal_components = eigenvectors[:, :n_components]
    X_pca = np.dot(X_centered, principal_components)
    return X_pca, eigenvalues, principal_components

# Reduce the standardized features to three components and express each
# eigenvalue as a percentage of the total variance.
X_pca, eigenvalues, pcs = pca(X_std, n_components=3)
variance_explained = eigenvalues / eigenvalues.sum() * 100

## K-Means Clustering
 Cluster the standardized data into 3 groups and compute centroids and cluster assignments.

In [24]:
def _nearest_centroid(X, centroids):
    """Return, for every row of ``X``, the index of its closest centroid."""
    # Squared Euclidean distance, shape (k, n_samples); the square root is
    # monotonic and therefore unnecessary for picking the minimum.
    sq_dist = ((X[np.newaxis, :, :] - centroids[:, np.newaxis, :]) ** 2).sum(axis=2)
    return np.argmin(sq_dist, axis=0)


def k_means(X, k=3, max_iters=100, n_init=10, random_state=None):
    """Cluster ``X`` with Lloyd's k-means, keeping the best of several restarts.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Data to cluster.
    k : int, default 3
        Number of clusters.
    max_iters : int, default 100
        Maximum number of update iterations per restart.
    n_init : int, default 10
        Number of random restarts; the run with the lowest inertia
        (within-cluster sum of squared distances) is returned.
    random_state : int or None, default None
        Seed for the initial centroid selection. ``None`` draws from NumPy's
        global random state, so ``np.random.seed`` still controls the result.

    Returns
    -------
    clusters : ndarray of int, shape (n_samples,)
        Index of the centroid each sample is assigned to.
    centroids : ndarray, shape (k, n_features)
        Final centroid coordinates, consistent with ``clusters``.
    Both are ``None`` when ``n_init`` is 0.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of samples.
    """
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(
            f"k must be between 1 and the number of samples ({n_samples}), got {k}"
        )
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    best_clusters = None
    best_centroids = None
    best_inertia = float('inf')

    for _ in range(n_init):
        # Initialise with k distinct samples chosen at random.
        centroids = X[rng.choice(n_samples, k, replace=False)]
        for _ in range(max_iters):
            clusters = _nearest_centroid(X, centroids)
            # Start from the previous positions so that a cluster which lost
            # all of its members keeps its centroid instead of becoming NaN
            # (the mean of an empty slice).
            new_centroids = centroids.copy()
            for i in range(k):
                members = X[clusters == i]
                if len(members):
                    new_centroids[i] = members.mean(axis=0)
            converged = np.allclose(centroids, new_centroids, rtol=1e-4)
            centroids = new_centroids
            if converged:
                break
        # Re-assign against the final centroids so the returned labels, the
        # returned centroids and the inertia all describe the same state
        # (also covers max_iters == 0, where the loop body never runs).
        clusters = _nearest_centroid(X, centroids)
        inertia = np.sum((X - centroids[clusters]) ** 2)
        if inertia < best_inertia:
            best_inertia = inertia
            best_clusters = clusters
            best_centroids = centroids

    return best_clusters, best_centroids

# K-means starts from randomly chosen samples, which draw from NumPy's global
# random state; fix the seed so the clustering (and every metric derived from
# it) is reproducible from one run of the notebook to the next.
np.random.seed(42)
clusters, centroids = k_means(X_std, k=3)

## Cluster Accuracy Evaluation
 Map clusters to actual species using majority voting and compute accuracy.

In [25]:
def map_clusters_to_species(clusters, species_labels):
    """Label every cluster with the species that occurs most often in it.

    ``clusters`` holds one cluster id per sample and ``species_labels`` the
    true species of the same samples, in the same order. Returns a dict
    mapping each distinct cluster id to its majority species.
    """
    def majority_label(cluster_id):
        # Species of the samples assigned to this cluster.
        members = species_labels[clusters == cluster_id]
        (label, _count), = Counter(members).most_common(1)
        return label

    return {
        cluster_id: majority_label(cluster_id)
        for cluster_id in np.unique(clusters)
    }

# Translate every sample's cluster id into the majority species of that
# cluster, then score the percentage of samples whose translated label
# matches the true species.
cluster_map = map_clusters_to_species(clusters, species_labels)
predicted_species = np.array([cluster_map[cluster_id] for cluster_id in clusters])
accuracy = 100 * np.mean(predicted_species == species_labels)


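## Confusion Matrix
 Build a confusion matrix comparing the actual species with the species predicted from the cluster assignments.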

In [26]:
def create_confusion_matrix(true_labels, pred_labels):
    """Count how often each true class is predicted as each class.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth label of every sample.
    pred_labels : array-like
        Predicted label of every sample, in the same order.

    Returns
    -------
    matrix : ndarray of int, shape (n_classes, n_classes)
        ``matrix[i, j]`` is the number of samples whose true class is
        ``classes[i]`` and whose predicted class is ``classes[j]``.
    classes : ndarray
        Sorted distinct labels occurring in either input.

    Raises
    ------
    ValueError
        If the two label sequences differ in length.
    """
    true_arr = np.asarray(true_labels)
    pred_arr = np.asarray(pred_labels)
    if len(true_arr) != len(pred_arr):
        # zip() would silently drop the surplus samples and skew the counts.
        raise ValueError(
            f"true_labels and pred_labels differ in length: "
            f"{len(true_arr)} != {len(pred_arr)}"
        )
    # Build the class list from both inputs: a predicted label that never
    # occurs among the true labels still needs its own column instead of
    # raising KeyError.
    classes = np.unique(np.concatenate([true_arr, pred_arr]))
    matrix = np.zeros((len(classes), len(classes)), dtype=int)
    class_to_idx = {cls: i for i, cls in enumerate(classes)}
    for true, pred in zip(true_arr, pred_arr):
        matrix[class_to_idx[true], class_to_idx[pred]] += 1
    return matrix, classes

# Tabulate true species (rows) against cluster-derived predictions (columns).
conf_matrix, classes = create_confusion_matrix(
    true_labels=species_labels,
    pred_labels=predicted_species,
)
