<a href="https://colab.research.google.com/github/AgarwalMayank2/Face_Detection/blob/main/applying_ML_algorithms/KNN_PRMLproj.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# KNN Model

In [None]:
import numpy as np
import math

In [None]:
def euclidean_dist(x,y):
  """Return the Euclidean (L2) distance between two coordinate sequences.

  Coordinates are paired positionally with ``zip``, so when the inputs differ
  in length the surplus trailing coordinates of the longer one are ignored.
  """
  squared_total = 0
  for coord_x, coord_y in zip(x, y):
    squared_total += (coord_x - coord_y) ** 2
  return math.sqrt(squared_total)

In [None]:
from collections import Counter

In [None]:
def knn(k, X_train, X_test, y_test):
  """Classify each row of X_test by majority vote of its k nearest training rows.

  Distances are computed with `euclidean_dist`. Training rows are ranked by
  distance (ties keep training-row order) and the most frequent encoded label
  among the k closest rows wins. The winning label is decoded to its original
  name, a sample of the predictions is printed as progress, and the full list
  is handed to `findAccuracy` for scoring.

  NOTE: the training labels and the label decoder are not parameters. They are
  read from the notebook globals `y_train` and `label_encoder`, so both must
  correspond to `X_train` at the time of the call.

  Parameters:
    k: number of neighbours that vote; capped at the number of training rows.
    X_train: row-indexable 2-D array of training features (needs `.shape[0]`).
    X_test: row-indexable 2-D array of test features (needs `.shape[0]`).
    y_test: encoded true labels for X_test, forwarded to `findAccuracy`.

  Returns:
    None. Results are reported through printed output only.
  """
  predictions=[]
  n_train = X_train.shape[0]
  # Clamp to [0, n_train] so a non-positive k yields "no neighbours" rather
  # than a negative slice bound.
  n_neighbours = max(0, min(k, n_train))

  for i in range(X_test.shape[0]):
    # (distance, training-row index) for every training row
    distances = [(euclidean_dist(X_test[i], X_train[j]), j) for j in range(n_train)]
    # sort on the distance only; the stable sort keeps row order among ties
    distances.sort(key=lambda pair: pair[0])
    nearest_labels = [y_train[j] for _, j in distances[:n_neighbours]]

    if nearest_labels:
      most_common_label = Counter(nearest_labels).most_common(1)[0][0]
      predictions.append(label_encoder.inverse_transform([most_common_label])[0])
    else:
      # No neighbour voted (k < 1 or empty training set). The previous code
      # passed a -1 sentinel to inverse_transform, which raises for a label
      # the encoder never saw; report the sample as unknown instead.
      predictions.append('Unknown')

    # progress sample: the first 101 predictions plus a few later checkpoints
    if i<=100 or i==300 or i==500 or i==800 or i==900:
      print(f'Label {i} detected : {predictions[i]}')

  # findAccuracy prints the full prediction list itself, so it is not printed
  # a second time here.
  findAccuracy(predictions, y_test)

In [None]:
def findAccuracy(predictions, y_test):
  """Print and return the fraction of predictions that match the true labels.

  `predictions` holds decoded label names, while `y_test` holds encoded labels;
  each true label is decoded through the notebook-global `label_encoder`
  before it is compared with the prediction at the same position.

  Parameters:
    predictions: sequence of predicted label names.
    y_test: indexable sequence of encoded true labels, at least as long as
      `predictions`.

  Returns:
    The accuracy as a float between 0 and 1, or NaN when `predictions` is
    empty (there is nothing to score).
  """
  print(predictions)
  total = len(predictions)
  if total == 0:
    # Guard the division below: an empty prediction list used to raise
    # ZeroDivisionError.
    print("Accuracy: n/a (no predictions)")
    return float('nan')

  count=0
  for i in range(total):
    if predictions[i]==label_encoder.inverse_transform([y_test[i]])[0]:
      count+=1

  accuracy = count/total
  print(f"Accuracy: {accuracy * 100:.5f}%")
  return accuracy

# Reading and filtering the CNN features

In [None]:
# Raw GitHub URL of the filtered CNN feature table.
url_filtered_CNN_features_dataset = 'https://raw.githubusercontent.com/AgarwalMayank2/Face_Detection/refs/heads/main/processed_dataset/filtered_CNN_features_dataset.csv' # filtered CNN features
df = pd.read_csv(url_filtered_CNN_features_dataset) # download filtered_CNN_features_dataset.csv into a DataFrame
# Show (rows, columns) of the raw table as the cell output.
df.shape

(4324, 2050)

In [None]:
# Count rows per identity; column '2048' holds the person label.
label_counts = df['2048'].value_counts()
# Keep only identities that have at least 80 rows.
valid_labels = label_counts[label_counts >= 80].index
# Filter and drop the leftover CSV index column in one expression. A
# non-inplace drop returns a new DataFrame, so nothing is mutated on a slice
# of `df` (the inplace drop raised pandas' SettingWithCopyWarning).
filtered_df_cnn = df[df['2048'].isin(valid_labels)].drop('Unnamed: 0', axis=1)
filtered_df_cnn.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_cnn.drop('Unnamed: 0', axis=1, inplace=True)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048
207,0.334396,1.031447,0.751403,0.061664,1.358539,0.131194,2.189165,0.168872,0.077561,0.128026,...,0.238129,0.033446,0.691363,1.453779,0.023197,1.180975,0.14762,0.132144,0.061259,Donald_Rumsfeld
208,0.052159,1.385429,0.948584,0.242452,0.765526,0.205163,2.773744,0.487321,0.37577,0.297134,...,0.15197,0.030394,0.931749,1.507545,0.074761,1.410434,0.187171,0.037343,0.19222,Donald_Rumsfeld
209,0.215924,1.243201,1.040831,0.082872,0.996569,0.093324,3.699462,0.166822,0.085976,0.420676,...,0.280522,0.046874,0.933633,1.932075,0.076832,1.779623,0.34552,0.009288,0.176782,Donald_Rumsfeld
210,0.273529,1.568788,1.131568,0.069734,1.060217,0.003534,3.313872,0.470634,0.280771,0.550614,...,0.392776,0.303419,0.718607,1.815761,0.034294,1.519493,0.04606,0.031623,0.141197,Donald_Rumsfeld
211,0.209844,1.102461,0.380182,0.109593,0.713786,0.054662,2.087155,0.206262,0.103134,0.152454,...,0.100046,0.225041,0.373597,0.834548,0.102522,0.784862,0.02319,0.141059,0.058093,Donald_Rumsfeld


# Train/Test split - for CNN dataset

In [None]:
# Separate features and labels
# Separate features (all columns but the last) from labels (last column)
X_cnn = filtered_df_cnn.iloc[:, :-1]
y_cnn = filtered_df_cnn.iloc[:, -1]

# Encode labels (alphabetically)
# NOTE: `label_encoder` is a notebook global that knn() and findAccuracy() use to decode labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_cnn)
# encoding is beneficial because integer labels are easier to work with than strings

# Stratified 80/20 split: every class keeps the same train/test proportion (fixed seed for reproducibility)
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(X_cnn, y_encoded, test_size=1/5, random_state=42, stratify=y_encoded)

# Standardize features: the scaler is fit on the training rows only, then applied to the test rows
scaler = StandardScaler()
X_train_cnn = scaler.fit_transform(X_train_cnn)
X_test_cnn = scaler.transform(X_test_cnn)

# Report the sizes of the filtered table and of both splits
print(f"Dataset size: {filtered_df_cnn.shape}")
print(f"Training size: {X_train_cnn.shape}, Testing size: {X_test_cnn.shape}")

Dataset size: (1140, 2049)
Training size: (912, 2048), Testing size: (228, 2048)


# Applying PCA for dimensionality reduction

In [None]:
from sklearn.decomposition import PCA

## PCA on CNN features

In [None]:
# Reduce the standardized CNN features to 900 principal components.
pca=PCA(n_components=900)

# Fit the projection on the training rows only, then apply it to the test rows.
X_train_cnn_pca=pca.fit_transform(X_train_cnn)
X_test_cnn_pca=pca.transform(X_test_cnn)

# NOTE(review): the recorded output of this cell shows 1248/312 rows, while the
# split cell reports 912/228 rows - the saved output looks stale; re-run to confirm.
print(X_train_cnn_pca.shape)
print(X_test_cnn_pca.shape)

(1248, 900)
(312, 900)


In [None]:
import numpy as np
# Total fraction of variance retained by the components of the fitted `pca`.
print(np.sum(pca.explained_variance_ratio_))  # Should be close to 1

# Setting y_train and y_test

In [None]:
# Point the notebook-global label arrays at the CNN split.
# knn() reads the global `y_train` for the training labels.
y_train = y_train_cnn
y_test = y_test_cnn

# Peek at the first ten encoded labels of each split.
print(y_train[:10])
print(y_test[:10])

[2 0 1 1 2 2 2 0 2 2]
[2 4 2 1 2 3 2 0 4 2]


## Printing y_test

In [None]:
# Decode and print the names of the first 101 test labels (positions 0-100).
for encoded_label in y_test[:101]:
  print(label_encoder.inverse_transform([encoded_label])[0])

George_W_Bush
Tony_Blair
George_W_Bush
Donald_Rumsfeld
George_W_Bush
Gerhard_Schroeder
George_W_Bush
Colin_Powell
Tony_Blair
George_W_Bush
Colin_Powell
Colin_Powell
Gerhard_Schroeder
George_W_Bush
Donald_Rumsfeld
Tony_Blair
George_W_Bush
Colin_Powell
Colin_Powell
George_W_Bush
George_W_Bush
Colin_Powell
George_W_Bush
Colin_Powell
Donald_Rumsfeld
Colin_Powell
George_W_Bush
George_W_Bush
Colin_Powell
George_W_Bush
George_W_Bush
Colin_Powell
George_W_Bush
Tony_Blair
Donald_Rumsfeld
Donald_Rumsfeld
Tony_Blair
George_W_Bush
Tony_Blair
George_W_Bush
Gerhard_Schroeder
Donald_Rumsfeld
Colin_Powell
Gerhard_Schroeder
George_W_Bush
Tony_Blair
Gerhard_Schroeder
Donald_Rumsfeld
Donald_Rumsfeld
George_W_Bush
George_W_Bush
George_W_Bush
Gerhard_Schroeder
Colin_Powell
George_W_Bush
George_W_Bush
Gerhard_Schroeder
Colin_Powell
George_W_Bush
Colin_Powell
Tony_Blair
George_W_Bush
Tony_Blair
George_W_Bush
Tony_Blair
George_W_Bush
Colin_Powell
George_W_Bush
George_W_Bush
Donald_Rumsfeld
Donald_Rumsfeld
Geo

# Reading LBP features

In [None]:
# Raw GitHub URL of the filtered LBP feature table.
url_filtered_lbp_features = 'https://raw.githubusercontent.com/AgarwalMayank2/Face_Detection/refs/heads/main/processed_dataset/filtered_LBP_features_dataset.csv'
# NOTE: this rebinds `df`, replacing the CNN table loaded earlier in the notebook.
df = pd.read_csv(url_filtered_lbp_features)

In [None]:
# Count rows per identity; column '256' holds the person label.
label_counts = df['256'].value_counts()
# Keep only identities that have at least 80 rows.
valid_labels = label_counts[label_counts >= 80].index
# Filter and drop the leftover CSV index column in one expression. A
# non-inplace drop returns a new DataFrame, so nothing is mutated on a slice
# of `df` (the inplace drop raised pandas' SettingWithCopyWarning).
filtered_df_LBP = df[df['256'].isin(valid_labels)].drop('Unnamed: 0', axis=1)
# Bare last expression gives the rich table display, matching the CNN filter cell.
filtered_df_LBP.head()

         0     1      2      3     4     5      6      7      8     9  ...  \
199  747.0  72.0  433.0  185.0  61.0   6.0  224.0  354.0  392.0  30.0  ...   
200  516.0  33.0  438.0  186.0  34.0   2.0  172.0  316.0  243.0  23.0  ...   
201  805.0  76.0  366.0  204.0  68.0  10.0  181.0  307.0  537.0  54.0  ...   
202  572.0  42.0  304.0  219.0  37.0   3.0  247.0  736.0  587.0  22.0  ...   
203  577.0  49.0  403.0  268.0  87.0   4.0  334.0  641.0  494.0  34.0  ...   

       247     248    249   250    251    252    253    254      255  \
199  498.0  3001.0  734.0  36.0  578.0  616.0  546.0  453.0  12702.0   
200  341.0  2309.0  466.0  24.0  354.0  500.0  426.0  516.0  17530.0   
201  529.0  2171.0  559.0  37.0  413.0  581.0  438.0  537.0  22703.0   
202  624.0  3621.0  582.0  28.0  273.0  590.0  465.0  315.0   3466.0   
203  564.0  1535.0  548.0  21.0  284.0  500.0  448.0  280.0   4402.0   

               256  
199  George_W_Bush  
200  George_W_Bush  
201  George_W_Bush  
202  George_W_

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_LBP.drop('Unnamed: 0', axis=1, inplace=True)


# Train/Test split - for LBP dataset

In [None]:
# Separate features and labels
# Separate features (all columns but the last) from labels (last column)
X_LBP = filtered_df_LBP.iloc[:, :-1]
y_LBP = filtered_df_LBP.iloc[:, -1]

# Encode labels (alphabetically)
# NOTE: this rebinds the global `label_encoder` (previously fit on the CNN labels);
# knn() and findAccuracy() decode through whichever encoder is bound when they run.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y_LBP)
# encoding is beneficial because integer labels are easier to work with than strings

# Stratified 80/20 split: every class keeps the same train/test proportion (fixed seed for reproducibility)
X_train_LBP, X_test_LBP, y_train_LBP, y_test_LBP = train_test_split(X_LBP, y_encoded, test_size=1/5, random_state=42, stratify=y_encoded)

# Standardize features: the scaler is fit on the training rows only, then applied to the test rows
# (this also rebinds the global `scaler` that was fit on the CNN features)
scaler = StandardScaler()
X_train_LBP = scaler.fit_transform(X_train_LBP)
X_test_LBP = scaler.transform(X_test_LBP)

# Report the sizes of the filtered table and of both splits
print(f"Dataset size: {filtered_df_LBP.shape}")
print(f"Training size: {X_train_LBP.shape}, Testing size: {X_test_LBP.shape}")

Dataset size: (1140, 257)
Training size: (912, 256), Testing size: (228, 256)


# Setting y_train and y_test

In [None]:
# Re-point the notebook-global label arrays at the LBP split.
# Every knn() call after this cell takes its training labels from this `y_train`.
y_train = y_train_LBP
y_test = y_test_LBP

# Peek at the first ten encoded labels of each split.
print(y_train[:10])
print(y_test[:10])

[2 0 1 1 2 2 2 0 2 2]
[2 4 2 1 2 3 2 0 4 2]


## Printing y_test for checking

In [None]:
# Decode and print the names of the first 101 test labels (positions 0-100).
for encoded_label in y_test[:101]:
  print(label_encoder.inverse_transform([encoded_label])[0])

George_W_Bush
Tony_Blair
George_W_Bush
Donald_Rumsfeld
George_W_Bush
Gerhard_Schroeder
George_W_Bush
Colin_Powell
Tony_Blair
George_W_Bush
Colin_Powell
Colin_Powell
Gerhard_Schroeder
George_W_Bush
Donald_Rumsfeld
Tony_Blair
George_W_Bush
Colin_Powell
Colin_Powell
George_W_Bush
George_W_Bush
Colin_Powell
George_W_Bush
Colin_Powell
Donald_Rumsfeld
Colin_Powell
George_W_Bush
George_W_Bush
Colin_Powell
George_W_Bush
George_W_Bush
Colin_Powell
George_W_Bush
Tony_Blair
Donald_Rumsfeld
Donald_Rumsfeld
Tony_Blair
George_W_Bush
Tony_Blair
George_W_Bush
Gerhard_Schroeder
Donald_Rumsfeld
Colin_Powell
Gerhard_Schroeder
George_W_Bush
Tony_Blair
Gerhard_Schroeder
Donald_Rumsfeld
Donald_Rumsfeld
George_W_Bush
George_W_Bush
George_W_Bush
Gerhard_Schroeder
Colin_Powell
George_W_Bush
George_W_Bush
Gerhard_Schroeder
Colin_Powell
George_W_Bush
Colin_Powell
Tony_Blair
George_W_Bush
Tony_Blair
George_W_Bush
Tony_Blair
George_W_Bush
Colin_Powell
George_W_Bush
George_W_Bush
Donald_Rumsfeld
Donald_Rumsfeld
Geo

# Calling knn function without applying PCA or LDA

In [None]:
# Baseline: k=5 on the standardized LBP features alone (no PCA or LDA).
knn(5, X_train_LBP, X_test_LBP, y_test)

In [None]:
# Baseline: k=5 on the standardized CNN features alone (no PCA or LDA).
# NOTE(review): knn() takes training labels from the global `y_train`, and both
# `y_train` and `y_test` were last set from the LBP split - confirm the CNN and
# LBP splits share the same row order and labels before trusting this score.
knn(5, X_train_cnn, X_test_cnn, y_test)

# Combined features

In [None]:
# Concatenate CNN and LBP features column-wise, so row i of the result is
# row i of the CNN split followed by row i of the LBP split.
# NOTE(review): this assumes row i refers to the same image in both splits. The
# head() outputs of the two filtered tables start at different rows and people
# (207 / Donald_Rumsfeld vs 199 / George_W_Bush) - verify the alignment.
combined_X_train = np.concatenate((X_train_cnn, X_train_LBP), axis=1)
combined_X_test = np.concatenate((X_test_cnn, X_test_LBP), axis=1)

In [None]:
# Number of CNN feature columns.
print(X_train_cnn.shape[1])

2048


In [None]:
# (rows, columns) of the LBP training features.
print(X_train_LBP.shape)

(912, 256)


In [None]:
# Sanity check: combined width should equal CNN columns + LBP columns in both splits.
print(combined_X_train.shape[1])
print(combined_X_test.shape[1])

2304
2304


## Applying PCA on Combined features

In [None]:
# Reduce the combined CNN+LBP features to 500 principal components.
# NOTE: this rebinds `pca`, replacing the 900-component model fit on CNN features.
pca=PCA(n_components=500)

# Fit the projection on the training rows only, then apply it to the test rows.
X_train_pca=pca.fit_transform(combined_X_train)
X_test_pca=pca.transform(combined_X_test)

# Confirm both splits now have 500 columns.
print(X_train_pca.shape)
print(X_test_pca.shape)

(912, 500)
(228, 500)


## Applying LDA on Combined features

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Apply LDA with at most (number of classes - 1) components
# The class count comes from the distinct values of the global `y_train`.
lda = LDA(n_components=min(500, len(set(y_train)) - 1))

# LDA is supervised: fit on the training features and labels, then project the test rows.
X_train_lda = lda.fit_transform(combined_X_train, y_train)
X_test_lda = lda.transform(combined_X_test)

# Confirm the reduced dimensionality of both splits.
print(X_train_lda.shape)
print(X_test_lda.shape)

(912, 4)
(228, 4)


# Calling knn function after applying PCA on combined features

In [None]:
# k=5 on the PCA-reduced combined features.
knn(5, X_train_pca, X_test_pca, y_test)

Label 0 detected : George_W_Bush
Label 1 detected : George_W_Bush
Label 2 detected : George_W_Bush
Label 3 detected : Donald_Rumsfeld
Label 4 detected : George_W_Bush
Label 5 detected : Donald_Rumsfeld
Label 6 detected : George_W_Bush
Label 7 detected : George_W_Bush
Label 8 detected : Tony_Blair
Label 9 detected : George_W_Bush
Label 10 detected : Tony_Blair
Label 11 detected : Donald_Rumsfeld
Label 12 detected : Tony_Blair
Label 13 detected : Tony_Blair
Label 14 detected : George_W_Bush
Label 15 detected : Tony_Blair
Label 16 detected : Donald_Rumsfeld
Label 17 detected : Colin_Powell
Label 18 detected : George_W_Bush
Label 19 detected : George_W_Bush
Label 20 detected : George_W_Bush
Label 21 detected : George_W_Bush
Label 22 detected : George_W_Bush
Label 23 detected : Colin_Powell
Label 24 detected : George_W_Bush
Label 25 detected : Colin_Powell
Label 26 detected : Colin_Powell
Label 27 detected : Tony_Blair
Label 28 detected : George_W_Bush
Label 29 detected : George_W_Bush
Labe

In [None]:
# k=5 on the combined features without PCA, for comparison with the PCA run.
knn(5, combined_X_train, combined_X_test, y_test)

Label 0 detected : George_W_Bush
Label 1 detected : George_W_Bush
Label 2 detected : George_W_Bush
Label 3 detected : Donald_Rumsfeld
Label 4 detected : George_W_Bush
Label 5 detected : Donald_Rumsfeld
Label 6 detected : George_W_Bush
Label 7 detected : George_W_Bush
Label 8 detected : Tony_Blair
Label 9 detected : George_W_Bush
Label 10 detected : Tony_Blair
Label 11 detected : Donald_Rumsfeld
Label 12 detected : Tony_Blair
Label 13 detected : George_W_Bush
Label 14 detected : George_W_Bush
Label 15 detected : Tony_Blair
Label 16 detected : Donald_Rumsfeld
Label 17 detected : Colin_Powell
Label 18 detected : George_W_Bush
Label 19 detected : George_W_Bush
Label 20 detected : George_W_Bush
Label 21 detected : George_W_Bush
Label 22 detected : George_W_Bush
Label 23 detected : Colin_Powell
Label 24 detected : George_W_Bush
Label 25 detected : Colin_Powell
Label 26 detected : Colin_Powell
Label 27 detected : Tony_Blair
Label 28 detected : George_W_Bush
Label 29 detected : George_W_Bush
L

# Calling knn function after applying LDA on combined features

In [None]:
# k=3 on the LDA-projected combined features.
knn(3, X_train_lda, X_test_lda, y_test)

In [None]:
# k=5 on the LDA-projected combined features.
knn(5, X_train_lda, X_test_lda, y_test)

Label 0 detected : George_W_Bush
Label 1 detected : Tony_Blair
Label 2 detected : George_W_Bush
Label 3 detected : Donald_Rumsfeld
Label 4 detected : George_W_Bush
Label 5 detected : George_W_Bush
Label 6 detected : Gerhard_Schroeder
Label 7 detected : Colin_Powell
Label 8 detected : Tony_Blair
Label 9 detected : George_W_Bush
Label 10 detected : Colin_Powell
Label 11 detected : Colin_Powell
Label 12 detected : Gerhard_Schroeder
Label 13 detected : George_W_Bush
Label 14 detected : Donald_Rumsfeld
Label 15 detected : George_W_Bush
Label 16 detected : George_W_Bush
Label 17 detected : Tony_Blair
Label 18 detected : George_W_Bush
Label 19 detected : George_W_Bush
Label 20 detected : George_W_Bush
Label 21 detected : Donald_Rumsfeld
Label 22 detected : George_W_Bush
Label 23 detected : Colin_Powell
Label 24 detected : George_W_Bush
Label 25 detected : Colin_Powell
Label 26 detected : George_W_Bush
Label 27 detected : George_W_Bush
Label 28 detected : Gerhard_Schroeder
Label 29 detected : 

In [None]:
# k=7 on the LDA-projected combined features.
knn(7, X_train_lda, X_test_lda, y_test)