In [None]:
import pickle, os
from sklearn.cluster import KMeans
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import rand_score

In [None]:
# Attach Google Drive to the Colab runtime so the dataset files can be read
# from the mounted directory tree.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Load and Prepare Data

In [None]:
def load_dataset(name_file, data_dir='/content/drive/MyDrive/CI_Dataset'):
    """Load a pickled object from the dataset directory.

    Args:
        name_file: File name of the pickle, relative to `data_dir`.
        data_dir: Directory that contains the file. Defaults to the notebook's
            Google Drive dataset folder, so existing one-argument calls are
            unaffected.

    Returns:
        The unpickled object (the notebook expects a dict of arrays).

    Raises:
        FileNotFoundError: If the file does not exist under `data_dir`.
    """
    file_path = os.path.join(data_dir, name_file)

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load pickles that come from a trusted source.
    with open(file_path, 'rb') as f:
        return pickle.load(f)

In [None]:
# Read the feature file and unpack the train/test splits in one go, then
# report the array shapes as a quick sanity check.
loaders_dict = load_dataset("dataset-flowers102-features.pkl")
x_train, x_test, y_train, y_test = (
    loaders_dict[key] for key in ("x_train", "x_test", "y_train", "y_test")
)
print(f"x_train:{x_train.shape}, y_train:{y_train.shape}")
print(f"x_test:{x_test.shape}, y_test:{y_test.shape}")

x_train:(4094, 512), y_train:(4094,)
x_test:(4095, 512), y_test:(4095,)


## K-Means Clustering

In [None]:
# Partition the training features into N_CLUSTERS groups.
# random_state is fixed so that the clustering -- and every accuracy/purity
# figure derived from it below -- is reproducible across kernel restarts.
# n_init is pinned explicitly because its default differs between
# scikit-learn releases.
N_CLUSTERS = 50
RANDOM_STATE = 42
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=RANDOM_STATE).fit(x_train)



## KNN

In [None]:
def knn_test(test_datapoint, compare_datapoints, compare_labels, k):
    """Predict a label for one sample by majority vote of its k nearest points.

    Distances are squared Euclidean. `compare_labels` must hold non-negative
    integers because the vote is tallied with np.bincount; on a tied vote the
    smallest label wins.

    Args:
        test_datapoint: 1-D feature vector to classify.
        compare_datapoints: 2-D array of candidate neighbours, one per row.
        compare_labels: Integer label of each row in `compare_datapoints`.
        k: Number of neighbours that take part in the vote.

    Returns:
        The label with the most votes among the k nearest candidates.
    """
    offsets = test_datapoint - compare_datapoints
    squared_distances = (offsets ** 2).sum(axis=1)
    nearest_rows = np.argsort(squared_distances)[:k]
    votes = np.bincount(compare_labels[nearest_rows])
    return votes.argmax()

In [None]:
def cluster_select(datapoint, clustering, k):
    """Return the ids of the k clusters whose centres are closest to a sample.

    Args:
        datapoint: 1-D feature vector.
        clustering: Fitted clustering object exposing `cluster_centers_`
            (one centre per row).
        k: Number of clusters to select. Values larger than the number of
            centres simply return all of them.

    Returns:
        1-D integer array of cluster ids, ordered from nearest to farthest
        centre (squared Euclidean distance).
    """
    cluster_centers = clustering.cluster_centers_
    distances = np.sum((datapoint - cluster_centers) ** 2, axis=1)
    # Row i of cluster_centers_ is the centre of cluster i, so the sorted row
    # positions already are the cluster ids. Feeding the centres back through
    # predict() would only repeat this lookup at extra cost, and could
    # collapse two nearly identical centres into the same id.
    return np.argsort(distances)[:k]

## Inference

In [None]:
def inference(datapoint, train_datapoints, train_labels, clustering, k1, k2, true_label=None):
    """Classify one sample with KNN restricted to its nearest clusters.

    The k1 clusters whose centres are closest to `datapoint` are selected, and
    a k2-nearest-neighbour vote is taken over the training points assigned to
    those clusters.

    Args:
        datapoint: 1-D feature vector to classify.
        train_datapoints: 2-D array of training features, one sample per row.
        train_labels: Integer class label of each training sample.
        clustering: Fitted clustering object exposing `labels_` (cluster id of
            each training sample) and `cluster_centers_`.
        k1: Number of nearest clusters to search.
        k2: Number of neighbours that vote.
        true_label: Optional ground-truth label of `datapoint`. When given,
            purity diagnostics for the selected clusters are printed; when
            omitted, nothing is printed, so bulk evaluation loops can call
            this function with six arguments.

    Returns:
        The predicted class label.
    """
    cluster_index = cluster_select(datapoint, clustering, k1)
    cluster_mask = np.isin(clustering.labels_, cluster_index)
    label = knn_test(datapoint, train_datapoints[cluster_mask], train_labels[cluster_mask], k2)

    # Diagnostics are opt-in: calculate_purity is only looked up when a
    # ground-truth label is supplied.
    if true_label is not None:
        calculate_purity(cluster_index, label, clustering, train_labels, true_label)

    return label

In [None]:
# Baseline evaluation with a hand-picked configuration.
k1_baseline = 4        # number of nearest clusters searched per test sample
k2_baseline = 5        # number of neighbours that vote
cluster_penalty = 0.2  # score points deducted per searched cluster

y_pred = [
    inference(test_datapoint, x_train, y_train, kmeans, k1_baseline, k2_baseline)
    for test_datapoint in x_test
]

# Accuracy in percent, computed once and reused for the penalised score so the
# two printed figures cannot drift apart.
accuracy = accuracy_score(y_test, np.array(y_pred)) * 100
scores = accuracy - k1_baseline * cluster_penalty

print(accuracy, scores)

## Find Optimal Values

In [None]:
# Exhaustive search over (number of searched clusters, number of neighbours).
# scores[i][j] holds the accuracy in percent for clsuters_range[i] and
# knn_range[j], minus 0.2 points per searched cluster.
clsuters_range = list(range(1,50))
knn_range = list(range(1,15,2))
scores = np.zeros((len(clsuters_range), len(knn_range)))

for i, n_searched_clusters in enumerate(clsuters_range):
    for j, n_neighbors in enumerate(knn_range):
        y_pred = [
            inference(test_datapoint, x_train, y_train, kmeans, n_searched_clusters, n_neighbors)
            for test_datapoint in x_test
        ]
        accuracy_percent = accuracy_score(y_test, np.array(y_pred)) * 100
        scores[i][j] = accuracy_percent - n_searched_clusters * 0.2

In [None]:
# Show the whole score grid, then the parameter pair at its maximum.
print(scores)
max_index = np.unravel_index(scores.argmax(), scores.shape)
best_cluster_pos, best_knn_pos = max_index
print(clsuters_range[best_cluster_pos], knn_range[best_knn_pos])

## Correlation between KNN Labels and Cluster Labels

In [None]:
def _purity_percentages(member_mask, train_labels, label, true_label):
    """Return the (predicted-label, true-label) shares, in percent, among the masked points."""
    members = train_labels[member_mask]
    n_members = len(members)
    if n_members == 0:
        # Nothing was selected: report 0 instead of dividing by zero.
        return 0.0, 0.0
    pred_purity = int(np.count_nonzero(members == label)) / n_members * 100
    true_purity = int(np.count_nonzero(members == true_label)) / n_members * 100
    return pred_purity, true_purity


def calculate_purity(cluster_index, label, clustering, train_labels, true_label):
    """Print how strongly the predicted and true labels are represented in the selected clusters.

    For each selected cluster, and then for the union of all selected
    clusters, prints the percentage of training points carrying the predicted
    label and the percentage carrying the true label.

    Args:
        cluster_index: Iterable of cluster ids that were searched.
        label: Label predicted for the sample.
        clustering: Fitted clustering object exposing `labels_` (cluster id of
            each training sample).
        train_labels: Class label of each training sample.
        true_label: Ground-truth label of the sample.

    Returns:
        None. The figures are printed.
    """
    cluster_labels = clustering.labels_

    for index in cluster_index:
        pred_purity, true_purity = _purity_percentages(
            np.isin(cluster_labels, index), train_labels, label, true_label
        )
        print("Cluster {} : purity of the predicted label: {}".format(index, pred_purity))
        print("Cluster {} : purity of the true label: {}".format(index, true_purity))

    # Same statistic over all selected clusters taken together.
    pred_purity, true_purity = _purity_percentages(
        np.isin(cluster_labels, cluster_index), train_labels, label, true_label
    )
    print("Integrated Cluster : purity of the predicted label : {}".format(pred_purity))
    print("Integrated Cluster : purity of the true label: {}".format(true_purity))

In [None]:
# Purity diagnostics for two hand-picked test samples, using the chosen
# (k1, k2) configuration.
y_pred = []
k1_opt = 4
k2_opt = 5

# NOTE(review): the sample indices are hard-coded; whether sample 27 is really
# misclassified and sample 9 correctly classified depends on the fitted
# clustering -- confirm against the current run.
for heading, sample_idx in (("Misclassified Data:", 27), ("Correctly Classified Data:", 9)):
    print(heading)
    y_pred.append(
        inference(x_test[sample_idx], x_train, y_train, kmeans, k1_opt, k2_opt, y_test[sample_idx])
    )

Misclassified Data:
Cluster 6 : purity of the predicted label: 25.405405405405407
Cluster 6 : purity of the true label: 25.405405405405407
Cluster 1 : purity of the predicted label: 12.903225806451612
Cluster 1 : purity of the true label: 12.903225806451612
Cluster 11 : purity of the predicted label: 6.25
Cluster 11 : purity of the true label: 6.25
Cluster 15 : purity of the predicted label: 1.2658227848101267
Cluster 15 : purity of the true label: 1.2658227848101267
Integrated Cluster : purity of the predicted label : 13.953488372093023
Integrated Cluster : purity of the true label: 13.953488372093023
Correctly Classified Data:
Cluster 9 : purity of the predicted label: 81.1881188118812
Cluster 9 : purity of the true label: 81.1881188118812
Cluster 48 : purity of the predicted label: 3.9473684210526314
Cluster 48 : purity of the true label: 3.9473684210526314
Cluster 42 : purity of the predicted label: 2.3622047244094486
Cluster 42 : purity of the true label: 2.3622047244094486
Cluste

## K-Means Evaluation

In [None]:
def purity_score(y_true, y_pred):
    """Compute clustering purity.

    Each cluster is credited with its most frequent true class; purity is the
    fraction of samples whose true class equals the majority class of the
    cluster they were assigned to.

    Args:
        y_true: 1-D array of non-negative integer class labels.
        y_pred: 1-D array of cluster ids, one per sample. The ids may be any
            values; they need not be contiguous or start at zero.

    Returns:
        Purity as a float in [0, 1].
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # cluster_pos[n] is the position of sample n's cluster inside cluster_ids.
    # Indexing by this position, not by the raw cluster id, keeps the lookup
    # valid when ids are sparse (e.g. {10, 30}) or do not start at zero.
    cluster_ids, cluster_pos = np.unique(y_pred, return_inverse=True)
    cluster_pos = cluster_pos.reshape(-1)

    majority_class = np.empty(len(cluster_ids), dtype=y_true.dtype)
    for pos in range(len(cluster_ids)):
        majority_class[pos] = np.argmax(np.bincount(y_true[cluster_pos == pos]))

    return np.sum(y_true == majority_class[cluster_pos]) / len(y_true)

In [None]:
# Score the K-Means partition of the training set against the class labels.
y_pred = kmeans.labels_

# Agreement between the cluster assignment and the class labels.
rand_index = rand_score(y_train, y_pred)
print(f"Rand Index: {rand_index}")

# Fraction of samples that match the majority class of their cluster.
purity = purity_score(y_train, y_pred)
print(f"Purity: {purity}")

Rand Index: 0.9801593889790748
Purity: 0.5288226673180264
