<a href="https://colab.research.google.com/github/AgarwalMayank2/Face_Detection/blob/main/applying_ML_algorithms/Clustering_PRMLproj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Task : Identify Face in image using clustering algorithm
dataset : using processed dataset

we extracted CNN(ResNet), HoG, LBP features and made these processed datasets

processed datasets : https://github.com/AgarwalMayank2/Face_Detection/tree/main/processed_dataset


# Clustering on CNN features

## Loading filtered_CNN_features_dataset

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import random
from sklearn.metrics import accuracy_score, adjusted_rand_score, confusion_matrix
from scipy.optimize import linear_sum_assignment

In [22]:
url_filtered_CNN_features_dataset = 'https://raw.githubusercontent.com/AgarwalMayank2/Face_Detection/refs/heads/main/processed_dataset/filtered_CNN_features_dataset.csv' #for CNN limited
df_CNN = pd.read_csv(url_filtered_CNN_features_dataset) # reading url for extracted CNN_features_dataset_limited.csv
df_CNN.drop('Unnamed: 0', axis=1, inplace=True)

#dropping those labels whose number of datapoints are less than 80
# Get the counts of each label
label_counts = df_CNN['2048'].value_counts()

# Filter out labels with counts less than 80
labels_to_keep = label_counts[label_counts >= 80].index

# Filter the DataFrame
df_CNN = df_CNN[df_CNN['2048'].isin(labels_to_keep)]

df_CNN

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048
207,0.334396,1.031447,0.751403,0.061664,1.358539,0.131194,2.189165,0.168872,0.077561,0.128026,...,0.238129,0.033446,0.691363,1.453779,0.023197,1.180975,0.147620,0.132144,0.061259,Donald_Rumsfeld
208,0.052159,1.385429,0.948584,0.242452,0.765526,0.205163,2.773744,0.487321,0.375770,0.297134,...,0.151970,0.030394,0.931749,1.507545,0.074761,1.410434,0.187171,0.037343,0.192220,Donald_Rumsfeld
209,0.215924,1.243201,1.040831,0.082872,0.996569,0.093324,3.699462,0.166822,0.085976,0.420676,...,0.280522,0.046874,0.933633,1.932075,0.076832,1.779623,0.345520,0.009288,0.176782,Donald_Rumsfeld
210,0.273529,1.568788,1.131568,0.069734,1.060217,0.003534,3.313872,0.470634,0.280771,0.550614,...,0.392776,0.303419,0.718607,1.815761,0.034294,1.519493,0.046060,0.031623,0.141197,Donald_Rumsfeld
211,0.209844,1.102461,0.380182,0.109593,0.713786,0.054662,2.087155,0.206262,0.103134,0.152454,...,0.100046,0.225041,0.373597,0.834548,0.102522,0.784862,0.023190,0.141059,0.058093,Donald_Rumsfeld
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3959,0.312240,1.009504,0.836836,0.018082,0.645396,0.066804,2.271123,0.094211,0.141208,0.130362,...,0.292594,0.216060,0.466659,0.785201,0.021710,0.663016,0.343248,0.060820,0.134825,Colin_Powell
3960,0.318314,1.077076,0.959523,0.018094,0.547238,0.033101,2.574726,0.171667,0.403900,0.141492,...,0.184252,0.081133,0.731656,1.326214,0.040101,1.001839,0.011269,0.099451,0.093939,Colin_Powell
3961,0.344084,0.716365,1.162212,0.025244,1.336606,0.032508,3.778199,0.205079,0.106398,0.161860,...,0.078263,0.198066,0.789224,1.613293,0.019929,1.323015,0.041566,0.020037,0.051515,Colin_Powell
3962,0.349864,0.792629,0.792507,0.079453,1.206452,0.083974,2.950798,0.072012,0.180612,0.077492,...,0.130345,0.346125,0.530768,1.389377,0.117860,1.205199,0.044289,0.032452,0.238427,Colin_Powell


Splitting dataframe into train and test in 4:1 ratio

In [23]:
# Separate features and labels
X = df_CNN.iloc[:, :-1]
y = df_CNN.iloc[:, -1]

# Encode labels (alphabetically)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# encoding is benificial as working on numbers is lot easier than working on string

# Ensure stratified split (16 training, 4 testing per class)
X_train_CNN, X_test_CNN, y_train_CNN, y_test_CNN = train_test_split(X, y_encoded, test_size=1/5, random_state=42, stratify=y_encoded)

# Normalize features
scaler = StandardScaler()
X_train_CNN = scaler.fit_transform(X_train_CNN)
X_test_CNN = scaler.transform(X_test_CNN)

print(f"Dataset size: {df_CNN.shape}")
print(f"Training size: {X_train_CNN.shape}, Testing size: {X_test_CNN.shape}")

Dataset size: (1140, 2049)
Training size: (912, 2048), Testing size: (228, 2048)


## Function of algorithm

In [24]:
class CustomKMeans:
  def __init__(self , n_clusters=2,max_iter=100):
    self.n_clusters =n_clusters
    self.max_iter=max_iter
    self.centroids=None
  def fit_predict(self,X):
    random_index=random.sample(range(0,X.shape[0]),self.n_clusters)
    self.centroids = X[random_index]

    for i in range(self.max_iter):
      cluster_group = self.assign_clusters(X)
      old_centroids = self.centroids
      self.centroids=self.move_centroids(X,cluster_group)
      if (old_centroids == self.centroids).all():
        break
    return cluster_group
  def assign_clusters(self,X):
    cluster_group = []
    distances = []
    for row in X:
      for centroid in self.centroids:
          distances.append(np.sqrt(np.dot(row-centroid,row-centroid)))
      min_distance = min(distances)
      index_pos = distances.index(min_distance)
      cluster_group.append(index_pos)
      distances.clear()

    return np.array(cluster_group)
  def move_centroids(self,X,cluster_group):
    new_centroids = []
    cluster_type = np.unique(cluster_group)
    for type in cluster_type:
      new_centroids.append(X[cluster_group == type].mean(axis=0))
    return np.array(new_centroids)

In [25]:
# Initialize custom KMeans with desired number of clusters
n_clusters = len(np.unique(y_train_CNN))
kmeans = CustomKMeans(n_clusters=n_clusters, max_iter=100)
train_clusters = kmeans.fit_predict(X_train_CNN)

# ---- 2. Map clusters to known labels using Hungarian algorithm ----
def get_cluster_label_mapping(y_true, cluster_ids):
    conf_mat = confusion_matrix(y_true, cluster_ids)
    row_ind, col_ind = linear_sum_assignment(-conf_mat)
    mapping = dict(zip(col_ind, row_ind))  # cluster_id -> label
    return mapping

cluster_to_label = get_cluster_label_mapping(y_train_CNN, train_clusters)

# ---- 3. Predict on NEW unseen data (or test data) ----
def predict_with_cluster_labels(kmeans_model, X_new, cluster_to_label):
    clusters = kmeans_model.assign_clusters(X_new)
    predicted_labels = np.array([cluster_to_label.get(cluster, -1) for cluster in clusters])
    return predicted_labels

# Example: Predict on test data
y_test_pred = predict_with_cluster_labels(kmeans, X_test_CNN, cluster_to_label)
# --- Predict on training data as well ---
y_train_pred = predict_with_cluster_labels(kmeans, X_train_CNN, cluster_to_label)

# --- Evaluation ---
train_acc = accuracy_score(y_train_CNN, y_train_pred)
test_acc = accuracy_score(y_test_CNN, y_test_pred)

print(f"\nTraining Accuracy (cluster-mapped): {train_acc:.2f}")
print(f"Test Accuracy (cluster-mapped): {test_acc:.2f}")


Training Accuracy (cluster-mapped): 0.30
Test Accuracy (cluster-mapped): 0.32
