# KNN classification on test dataset (transformation of images)

In [1]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import NearestNeighbors
from tensorflow.data.experimental import AUTOTUNE
from tensorflow.data.experimental import cardinality
from tensorflow.keras import layers, models, Sequential, Input, Model
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing import image_dataset_from_directory

In [2]:
# Directory of the trained CNN that is loaded with load_model() and used as the
# embedding extractor.
# Alternative checkpoint: '../models/Full_training/20201211_202203_VGG16_v3_4/'
CNN_model_path = '../models/Full_training/20201212_205911_VGG16_v3_27/'

# Root folder of the cleaned image data. Set the CLEAN_DATA_DIR environment
# variable to run the notebook on another machine; the default keeps the
# original location.
clean_data_dir = os.environ.get('CLEAN_DATA_DIR', r'C:\Users\pitip\OneDrive\Bureau\raw_data\Clean_Data')

# Reference database: the images the nearest-neighbour index is fitted on.
images_db_dir = os.path.join(clean_data_dir, 'Test')
# Query set: original images plus their transformed variants.
test_dir_imext = os.path.join(clean_data_dir, 'Recog_test_dataset')

# KNN model preparation
- Load the trained CNN model
- Build the reference image database used to fit the KNN
- Compute the embedding of every database image

In [3]:
# Restore the trained CNN from the directory configured in CNN_model_path.
model = load_model(filepath=CNN_model_path)



In [4]:
%%time

BATCH_SIZE = 32
IMG_SIZE = (224, 224)

layer_outputs = [model.layers[-1].input]
embedding_model = models.Model(inputs=model.input, outputs=layer_outputs)

img_list = []
for dir_path in os.listdir(images_db_dir):
    img_list += os.listdir(os.path.join(images_db_dir, dir_path))

test_dataset = image_dataset_from_directory(images_db_dir, shuffle=False, batch_size=BATCH_SIZE, image_size=IMG_SIZE, label_mode='int', 
                                             labels=img_list, interpolation='bilinear')

image_embeddings = embedding_model.predict(test_dataset)
image_embeddings.shape

Found 1390 files belonging to 12 classes.
Wall time: 5min 58s


(1390, 400)

In [5]:
# Nearest-neighbour index over the database embeddings. Only kneighbors() is
# used downstream, so an unsupervised NearestNeighbors index replaces the
# regressor that was previously fitted on file-name strings. Neighbour indices
# returned by the index are positions in image_embeddings, i.e. in img_list.
knr = NearestNeighbors(n_neighbors=1)
knr.fit(image_embeddings)

KNeighborsRegressor(n_neighbors=1)

## KNN prediction

In [6]:
%%time
img_list_imext = []
for dir_path in os.listdir(test_dir_imext):
    img_list_imext += os.listdir(os.path.join(test_dir_imext, dir_path))
test_dataset_imext = image_dataset_from_directory(test_dir_imext, shuffle=False, batch_size=BATCH_SIZE, image_size=IMG_SIZE, label_mode='int', 
                                             labels=img_list_imext, interpolation='bilinear')


#print(img_list_imext)
image_embeddings_imext = embedding_model.predict(test_dataset_imext)
image_embeddings_imext.shape

Found 158 files belonging to 1 classes.
Wall time: 41.2 s


(158, 400)

In [7]:
# Query the index once for all query embeddings instead of once per image.
# Both returned arrays have one row per query image and one column per rank
# (column 0 = closest database image).
neighbor_dist, neighbor_idx = knr.kneighbors(X=image_embeddings_imext, n_neighbors=3, return_distance=True)

# Neighbour indices are positions in the fitted database, hence in img_list.
pred_1, pred_2, pred_3 = ([img_list[j] for j in neighbor_idx[:, rank]] for rank in range(3))
dist_1, dist_2, dist_3 = (neighbor_dist[:, rank].tolist() for rank in range(3))

In [8]:
# Split each query file name once on '_' after removing the extension:
# '10030.jpg' -> ['10030'], '10030_3.jpg' -> ['10030', '3'].
name_parts = [os.path.splitext(fname)[0].split('_') for fname in img_list_imext]

# One row per query image: its file name, the source image it derives from,
# the transform code (0 when the name has no '_<code>' suffix) and the three
# nearest database images with their distances.
results_df = pd.DataFrame({
    'Label': img_list_imext,
    'Original_image': [parts[0] + '.jpg' for parts in name_parts],
    'Transform': [int(parts[-1]) if len(parts) == 2 else 0 for parts in name_parts],
    'pred_1': pred_1,
    'pred_2': pred_2,
    'pred_3': pred_3,
    'dist_1': dist_1,
    'dist_2': dist_2,
    'dist_3': dist_3,
})
results_df.head()

Unnamed: 0,Label,Original_image,Transform,pred_1,pred_2,pred_3,dist_1,dist_2,dist_3
0,10030.jpg,10030.jpg,0,10030.jpg,99482.jpg,69562.jpg,0.232015,3.994799,4.68716
1,10030_1.jpg,10030.jpg,1,10030.jpg,99482.jpg,91143.jpg,5.505867,5.720191,5.729329
2,10030_2.jpg,10030.jpg,2,51451.jpg,7391.jpg,99482.jpg,5.209195,5.27185,5.34507
3,10030_3.jpg,10030.jpg,3,4825.jpg,22335.jpg,75701.jpg,5.173648,5.554562,5.694982
4,10030_4.jpg,10030.jpg,4,10030.jpg,99482.jpg,69562.jpg,1.777897,4.237323,5.001415


In [9]:
# Top-1 accuracy: share of query images whose closest database image is the
# image they were derived from.
accuracy_score(y_true=results_df['Original_image'], y_pred=results_df['pred_1'])

0.7215189873417721

In [10]:
# Preview the first 20 rows to compare the predictions of one image across its transforms.
results_df.head(n=20)

Unnamed: 0,Label,Original_image,Transform,pred_1,pred_2,pred_3,dist_1,dist_2,dist_3
0,10030.jpg,10030.jpg,0,10030.jpg,99482.jpg,69562.jpg,0.232015,3.994799,4.68716
1,10030_1.jpg,10030.jpg,1,10030.jpg,99482.jpg,91143.jpg,5.505867,5.720191,5.729329
2,10030_2.jpg,10030.jpg,2,51451.jpg,7391.jpg,99482.jpg,5.209195,5.27185,5.34507
3,10030_3.jpg,10030.jpg,3,4825.jpg,22335.jpg,75701.jpg,5.173648,5.554562,5.694982
4,10030_4.jpg,10030.jpg,4,10030.jpg,99482.jpg,69562.jpg,1.777897,4.237323,5.001415
5,10030_5.jpg,10030.jpg,5,10030.jpg,99482.jpg,69562.jpg,2.249013,4.357912,4.79414
6,10030_6.jpg,10030.jpg,6,10030.jpg,99482.jpg,69562.jpg,3.136268,4.658548,4.877944
7,10030_7.jpg,10030.jpg,7,4825.jpg,10521.jpg,101093.jpg,5.635366,5.7436,5.846428
8,13171.jpg,13171.jpg,0,13171.jpg,79071.jpg,83215.jpg,2.128714,5.912957,6.009899
9,13171_1.jpg,13171.jpg,1,19980.jpg,8986.jpg,19880.jpg,5.262571,5.419285,5.990157


In [11]:
def top3_accuracy(results_df):
    """Flag, per query image, whether the true image is among its 3 predictions.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the columns 'Label', 'Original_image', 'Transform',
        'pred_1', 'pred_2' and 'pred_3'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of those six columns plus four 0/1 integer columns:
        '1_st', '2_nd', '3_rd' (the prediction of that rank equals
        'Original_image') and 'in_top_3' (at least one of them does).

    Also prints the overall top-3 accuracy; for an empty frame it prints
    ``nan`` instead of dividing by zero.
    """
    results_df2 = results_df[['Label', 'Original_image', 'Transform', 'pred_1', 'pred_2', 'pred_3']].copy()

    # Flag column -> prediction column it is derived from.
    rank_columns = {'1_st': 'pred_1', '2_nd': 'pred_2', '3_rd': 'pred_3'}
    for flag_col, pred_col in rank_columns.items():
        results_df2[flag_col] = results_df2[pred_col].eq(results_df2['Original_image']).astype('int64')

    # A row is a top-3 hit as soon as one of the three rank flags is 1.
    results_df2['in_top_3'] = results_df2[list(rank_columns)].max(axis=1)

    top3_rate = results_df2['in_top_3'].mean() if len(results_df2) else float('nan')
    print(f"Top 3 accuracy: {top3_rate:.4f}")
    return results_df2
    

In [12]:
# Per-row top-3 hit flags; the call also prints the overall top-3 accuracy.
results_df_top_3 = top3_accuracy(results_df=results_df)

Top 3 accuracy: 0.7848


In [13]:
# Position in this list = value of the 'Transform' column (0 = untransformed image).
transform_list = ['original', 'crop', 'extend', 'rotate', 'contrast', 'color balance', 'noise', 'all modif']

# Top-1 accuracy per transform, in transform_list order.
acc_list = []
for transform_id, transform in enumerate(transform_list):
    # Top-1: closest database image equals the source image.
    top1_rows = results_df[results_df['Transform'] == transform_id]
    acc = accuracy_score(top1_rows['Original_image'], top1_rows['pred_1'])
    acc_list.append(acc)

    # Top-3: share of rows whose source image is among the three closest.
    top3_hits = results_df_top_3.loc[results_df_top_3['Transform'] == transform_id, 'in_top_3']
    top3_acc = top3_hits.sum() / len(top3_hits)

    print(f"Transformation method: {transform} - Accuracy: {acc:.4f} - Top 3 accuracy: {top3_acc:.4f}")

Transformation method: original - Accuracy: 0.9500 - Top 3 accuracy: 0.9500
Transformation method: crop - Accuracy: 0.5500 - Top 3 accuracy: 0.7000
Transformation method: extend - Accuracy: 0.4000 - Top 3 accuracy: 0.5500
Transformation method: rotate - Accuracy: 0.6000 - Top 3 accuracy: 0.6500
Transformation method: contrast - Accuracy: 0.9500 - Top 3 accuracy: 0.9500
Transformation method: color balance - Accuracy: 1.0000 - Top 3 accuracy: 1.0000
Transformation method: noise - Accuracy: 0.9500 - Top 3 accuracy: 0.9500
Transformation method: all modif - Accuracy: 0.3684 - Top 3 accuracy: 0.5263
