## Implement K Nearest Neighbour From Scratch and Compare with the Built-in Applying on Breast Cancer Dataset



Audity Ghosh
<br>CSE, RUET

#### Objectives:
- Implement KNN with Python
- How KNN works?
- SKlearn KNN
- Comparison using accuracy



#### References:
[1] Implement KNN from scratch https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/


### Import libraries

In [146]:
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

#### set K

In [147]:
K_of_Knn = 5

### Load data, Breast Cancer Wisconsin (Diagnostic) Data Set , Benign=0, Malignent = 1, Two Classes

In [148]:
data =  pd.read_csv("Breast_Cancer_Dataset.csv")
data.head()

Unnamed: 0,diagnosis,symmetry_worst,fractal_dimension_worst
0,M,0.4601,0.1189
1,M,0.275,0.08902
2,M,0.3613,0.08758
3,M,0.6638,0.173
4,M,0.2364,0.07678


#### move diagnosis column to the last

In [149]:
data = data.reindex(columns = [col for col in data.columns if col != 'diagnosis'] + ['diagnosis'])
data.head()

Unnamed: 0,symmetry_worst,fractal_dimension_worst,diagnosis
0,0.4601,0.1189,M
1,0.275,0.08902,M
2,0.3613,0.08758,M
3,0.6638,0.173,M
4,0.2364,0.07678,M


In [150]:
data.shape

(569, 3)

#### Drop null columns

In [151]:
data = data.dropna(axis=1,inplace=False)
data.head()

Unnamed: 0,symmetry_worst,fractal_dimension_worst,diagnosis
0,0.4601,0.1189,M
1,0.275,0.08902,M
2,0.3613,0.08758,M
3,0.6638,0.173,M
4,0.2364,0.07678,M


In [152]:
data.shape

(569, 3)

In [153]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   symmetry_worst           569 non-null    float64
 1   fractal_dimension_worst  569 non-null    float64
 2   diagnosis                569 non-null    object 
dtypes: float64(2), object(1)
memory usage: 13.5+ KB


In [154]:
data.describe()

Unnamed: 0,symmetry_worst,fractal_dimension_worst
count,569.0,569.0
mean,0.290076,0.083946
std,0.061867,0.018061
min,0.1565,0.05504
25%,0.2504,0.07146
50%,0.2822,0.08004
75%,0.3179,0.09208
max,0.6638,0.2075


#### Replace Benign with 0, Malignent with 1

In [155]:
data['diagnosis'] = data['diagnosis'].replace(['B', 'M'], [0, 1])

In [156]:
data['diagnosis'].value_counts()

0    357
1    212
Name: diagnosis, dtype: int64

#### Convert dataframe to 2D List

In [157]:
df = data

In [158]:
data = data.values.tolist()

### Function to calculate the Manhattan distance between two rows

In [159]:

def manhattan_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        distance += abs(row1[i] - row2[i])
    return distance

### Function to calculate the Eucildian distance between two rows

In [160]:
def euclidean_distance(row1, row2):
    distance = 0.0
    for i in range(len(row1)-1):
        
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)
 

### Locate the closest neighbors

In [161]:

def get_neighbors(train, test_row, K):
    
    distances = []
    
    for train_row in train:
        dist = manhattan_distance(test_row, train_row)
        distances.append((train_row, dist)) # get all distances from the test row to all train samples 
        
    distances.sort(key=lambda tup: tup[1]) # sort by distance increasingly
    
    neighbors = []
    for i in range(K): # choose only k closest neighbours 
        neighbors.append(distances[i][0])
        
    return neighbors

In [162]:
# test of k closest neighbours' labels 
neighbors = get_neighbors(data, data[0], K_of_Knn)
for neighbor in neighbors:
    print(neighbor[-1])
    #print(neighbor)

1.0
1.0
1.0
1.0
1.0


### prediction with the closest neighbors

In [163]:

def predict_classification(train, test_row, num_neighbors):
    neighbors = get_neighbors(train, test_row, num_neighbors) # k closest neighbours 
    output_values = [row[-1] for row in neighbors] # their labels 
    #print(output_values)
    prediction = max(set(output_values), key=output_values.count) # which label is maximum in population among k neighbours
    return prediction

In [164]:
prediction = predict_classification(data, data[5], K_of_Knn)
print('Desired Output %d, Actual Output %d.' % (data[0][-1], prediction))

Desired Output 1, Actual Output 1.


In [165]:
len(data)

569

## Split and Predict 

In [166]:
def split_predict_compare(train_percent,test_percent):
    
    no_of_train_samples = int(len(data)*train_percent*0.01)
    no_of_test_samples = int(len(data)*test_percent*0.01)
    #print(no_of_train_samples,no_of_test_samples)
    
    #train = data[:no_of_train_samples]
    #test = data[-no_of_test_samples:]
    
    
    #print(test)
    
    train = df.sample(frac = train_percent/100,random_state = 42).values.tolist()
    
    test = []
    for row in data:
        if row not in train:
            test.append(row)
    
    
    
    
    ## Manual KNN
    TP,TN,FP,FN = 0,0,0,0
    
    for row in test:
        prediction = predict_classification(train, row, K_of_Knn)
        #print('Desired Output %d, Actual Output %d.' % (row[-1], prediction))
        if(row[-1]==1 and prediction==1):
            TP+=1
        elif(row[-1]==0 and prediction==0):
            TN+=1
        elif(row[-1]==0 and prediction==1):
            FP+=1
        else:
            FN+=1
            
            
    
    #print(TP,TN,FP,FN)
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    
    print(f"Using {train_percent}% train and {test_percent}% test with KNN raw code is", accuracy)
    
    
    
    
    
    
    ## Built-In KNN 
    
    if(test_percent!=0):
        
        ## KNN SKLEARN    
        # Split dataset into training set and test set
        X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'diagnosis'], df['diagnosis'], test_size=test_percent*0.01)

        #Create KNN Classifier
        knn = KNeighborsClassifier(n_neighbors=K_of_Knn)
        

        #Train the model using the training sets
        knn.fit(X_train, y_train)

        #Predict the response for test dataset
        y_pred = knn.predict(X_test)
        
        
        

        print(f"Using {train_percent}% train and {test_percent}% test with built-in KNN is",metrics.accuracy_score(y_test, y_pred))
    
        
        
    
    

### 70% train data, 30% test data

In [167]:
split_predict_compare(70,30)

Using 70% train and 30% test with KNN raw code is 0.6374269005847953
Using 70% train and 30% test with built-in KNN is 0.6491228070175439


### 80% train data, 20% test data

In [168]:
split_predict_compare(80,20)

Using 80% train and 20% test with KNN raw code is 0.6578947368421053
Using 80% train and 20% test with built-in KNN is 0.6842105263157895
