In [None]:
#========================================== Program for K-NN; Random Classifier; Evaluation
# RAPIDS cuML kNN model

import cudf, cuml, cupy
from cuml.neighbors import KNeighborsClassifier as cuKNeighbors
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Show which cuML build this notebook is running against.
print('cuML version',cuml.__version__)


def unpickle(file):
    """Read one pickled object from the file at path ``file`` and return it.

    The file is opened in binary mode and decoded with ``encoding="latin1"``.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file,
    so this must only be pointed at trusted data.
    """
    with open(file, 'rb') as handle:
        return pickle.load(handle, encoding="latin1")

# Unpickle the three input files. The training samples and labels are later
# passed to DATA_Processing as its data_list / class_list arguments, and the
# validation samples as its validation_list argument.
data_FULL_training = unpickle(r'../input/trimagi/training_x.dat') # training samples
Label_FULL_training = unpickle(r'../input/trimagi/training_y.dat') # labels for the training samples

data_Validation = unpickle(r'../input/trimagi/validation_x.dat') # validation samples (no labels are loaded for them here)

#Function for pre-processing
    
def _first_channel_as_vectors(images):
    """Stack images, keep channel 0 only, and flatten each image to one row."""
    stack = np.array(images)
    first_channel = stack[:, :, :, 0]
    return first_channel.reshape(
        first_channel.shape[0],
        first_channel.shape[1] * first_channel.shape[2],
    )


def DATA_Processing(data_list,class_list,validation_list):
    """Drop malformed training samples and flatten images to 64-value vectors.

    Every entry of ``data_list`` whose ``shape`` is not exactly ``(8, 8, 3)``
    is discarded together with the label at the same position in
    ``class_list``; a message naming the original index is printed for each
    discarded entry. The surviving training images and all validation images
    are then reduced to their first channel (index 0 on the last axis) and
    reshaped to rows of 8*8 = 64 values.

    Fixes over the previous implementation:
      * index 0 is now checked too (the old scan started at 1);
      * removal no longer pops ascending indices from a shrinking list, which
        shifted every index after the first removal and deleted the wrong
        samples/labels;
      * entries with fewer than three dimensions are treated as malformed
        instead of raising ``IndexError``;
      * the caller's lists are left untouched (new lists are built instead).

    Parameters
    ----------
    data_list : sequence of array-like
        Training samples; each is expected to expose a ``shape`` attribute.
    class_list : sequence
        One label per training sample, in the same order as ``data_list``.
    validation_list : sequence of array-like
        Validation samples. They are NOT shape-checked here, so they must
        already be uniform (8, 8, 3) arrays for the stacking to succeed.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, labels, validation)`` where ``data`` has shape
        ``(n_kept, 64)``, ``labels`` has shape ``(n_kept,)`` and
        ``validation`` has shape ``(n_validation, 64)``.

    Raises
    ------
    ValueError
        If ``data_list`` and ``class_list`` have different lengths.
    """
    expected_shape = (8, 8, 3)

    if len(data_list) != len(class_list):
        raise ValueError(
            "data_list and class_list must have the same length "
            "(got {} and {})".format(len(data_list), len(class_list))
        )

    # Single pass: keep well-formed samples with their labels, report the rest.
    kept_images = []
    kept_labels = []
    for idx, (image, label) in enumerate(zip(data_list, class_list)):
        if tuple(getattr(image, "shape", ())) != expected_shape:
            print("Removing value at index", idx)
            continue
        kept_images.append(image)
        kept_labels.append(label)

    # 8x8x3 image -> channel 0 only -> 64-value vector.
    # NOTE(review): assumes channel 0 carries all the information needed
    # (e.g. grayscale stored in every channel) - confirm against the data.
    data = _first_channel_as_vectors(kept_images)
    labels = np.array(kept_labels)
    validation = _first_channel_as_vectors(validation_list)

    return data, labels, validation


# Clean the training set and flatten training/validation images to vectors.
X,Y,VALIDATION_TST = DATA_Processing(data_FULL_training,Label_FULL_training,data_Validation)

# Hold out a stratified 5% of the cleaned training data as a labelled test
# set; random_state=0 keeps the split reproducible.
# Alternative fixed-size split that was tried:
#   train_test_split(X, Y, test_size=10000, train_size=50000, random_state=0, stratify=Y)
X, Q_TST, Y, Q_Label = train_test_split(X, Y, test_size=0.05, random_state=0, stratify=Y)

# Cast all four splits (features and labels alike) to float32.
# VALIDATION_TST is left in whatever dtype DATA_Processing returned.
X, Y, Q_TST, Q_Label = (
    np.array(split).astype("float32") for split in (X, Y, Q_TST, Q_Label)
)

# Report the resulting shapes.
print("Training Dataset's Dimensions:" ,X.shape,"class Dim: ", Y.shape)
print("Test Dataset's Dimensions:" ,Q_TST.shape,"class Dim: ", Q_Label.shape)
print()

print("Validation Dataset's Dimensions:" ,VALIDATION_TST.shape)




In [None]:
# This function is only used to record time it takes to perform search over multiple K values. 
def Tuning_KNN(Max_K=10):
    """Sweep K for a cuML k-NN classifier and plot hold-out accuracy against K.

    Despite its name, ``Max_K`` is the *exclusive lower* end of the sweep:
    the K values evaluated are ``Max_K + 1`` through ``100`` inclusive (the
    upper end is fixed). For each K a fresh classifier is fitted on the
    module-level ``X`` / ``Y``, used to predict ``Q_TST``, and scored against
    ``Q_Label`` with ``accuracy_score``. Each accuracy is printed as it is
    computed and the full curve is drawn as a dashed line at the end.

    Parameters
    ----------
    Max_K : int, default 10
        One less than the first K to evaluate.

    Returns
    -------
    None
    """
    k_values = []
    accuracies = []

    print("======================================================================================")
    print()

    for k in range(Max_K + 1, 101):
        knn = cuKNeighbors(n_neighbors=k)
        knn.fit(X, Y)
        predicted = knn.predict(Q_TST)
        score = accuracy_score(Q_Label, predicted)
        # Drop the reference to the fitted model before the next one is built.
        del knn
        print("K=",k,": ",score)
        k_values.append(k)
        accuracies.append(score)

    plt.figure()
    plt.plot(k_values, accuracies, '--')
    plt.show()
    
%timeit -n1 -r1  Tuning_KNN(44) # one timed run; Tuning_KNN(44) uses n_neighbors=i+1 for i in range(44,100), i.e. K=45 through K=100

In [None]:
# This block will run a search to find optimal K value for our data.
# Either provide specific values to test K in list below 
# or give a range directly in for-loop eg: (123 to 141) to search on all Ks


# Search for a good K: for every K in the scanned range, fit on the training
# split, report hold-out accuracy on Q_TST, then run 5-fold cross-validation
# on the training split and record the per-fold scores.
#
# Accuracy_List[j] holds the array of five per-fold CV scores for K == count[j].
Accuracy_List=[]
count=[]
# Alternative coarse grid of K values. It is not used by the loop below, which
# scans a contiguous range instead; iterate over Ks to test these values.
Ks=[400,420,440,460,480,500,520,540,560,580,600,620,640,660,680,700,
    720,740,760,780,800,820,840,860,880,900,920,940,960,980,1000]

# Plain aliases (not copies) of the training/test arrays. They are loop
# invariant, so they are bound once; the names are kept in case a later cell
# refers to them.
X1, Y1, Q_TST1 = X, Y, Q_TST

for i in range(123,141):
    model = cuKNeighbors(n_neighbors=i)
    model.fit(X1, Y1)
    BATCH = model.predict(Q_TST1)
    BATCH_Acc = accuracy_score(Q_Label, BATCH)
    print("K=",(i),": ",BATCH_Acc)
    print()
    # 5-fold CV on the training split; print each fold's score and their mean.
    cv_scores = cross_val_score(model, X, Y, cv=5)
    print(cv_scores)
    print('cv_scores mean:{}'.format(np.mean(cv_scores)))
    print()
    Accuracy_List.append(cv_scores)
    count.append(i)

# Plot ONE curve: mean CV accuracy per K. Passing Accuracy_List directly made
# matplotlib treat each of the five fold columns as a separate, unlabeled
# line, which is not the accuracy-vs-K curve this search is meant to show.
mean_cv_accuracy = [np.mean(fold_scores) for fold_scores in Accuracy_List]

plt.figure()
plt.plot(count,mean_cv_accuracy,'--')
plt.xlabel('K (n_neighbors)')
plt.ylabel('mean 5-fold CV accuracy')
plt.show()

