<a href="https://colab.research.google.com/github/dinuka-rp/Python-Machine-Learning/blob/master/KNN_ClassificationBuild2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Creating a K Nearest Neighbor Classifier from scratch, Training & Testing with the breast-cancer dataset

In [None]:
# Colab-only setup: mount Google Drive at /content/gdrive so files stored on
# the drive can be read through that path.
from google.colab import drive
drive.mount('/content/gdrive')

In [2]:
import numpy as np
from math import sqrt
import warnings
from collections import Counter
import pandas as pd   # to load in the dataset
import random     # used to shuffle the dataset

## KNN Algorithm

In [10]:
def k_nearest_neighbours(data, predict, k=3):
  """Classify `predict` by majority vote among its k nearest training points.

  Args:
    data: mapping of group label -> iterable of feature vectors belonging to
      that group.
    predict: feature vector to classify (same length as the training vectors).
    k: number of nearest neighbours that take part in the vote.

  Returns:
    The group label occurring most often among the k nearest training points.

  Raises:
    ValueError: if `k` is smaller than 1, or if `data` holds no feature
      vectors at all (there would be nothing to vote with).

  Warns:
    UserWarning: if `k` is less than or equal to the number of groups in `data`.
  """
  if k < 1:
    # k == 0 leaves no voters and a negative k would silently slice from the
    # wrong end of the sorted distances, so reject both explicitly.
    raise ValueError("k must be a positive integer")

  if len(data) >= k:
    warnings.warn("k is set to a value less than or equal to the total voting groups")

  # Convert the query point once instead of once per training point.
  target = np.array(predict)

  distances = []
  for group in data:
    for features in data[group]:
      euclidean_distance = np.linalg.norm(np.array(features) - target)
      distances.append([euclidean_distance, group])

  if not distances:
    raise ValueError("data contains no feature vectors to vote with")

  # Ascending sort puts the closest points first; pairs with equal distance
  # are ordered by their group label. Keep the groups of the k closest.
  votes = [group for _, group in sorted(distances)[:k]]

  # most_common(1) -> [(label, count)]; return only the label.
  vote_result = Counter(votes).most_common(1)[0][0]

  return vote_result

## Feature Engineering


In [4]:
# importing data file
data_file_name = 'breast-cancer-wisconsin.data'
data_file_path = f"/content/gdrive/My Drive/Extra Learning/ML - Python/KNN-Classification/{data_file_name}"

df = pd.read_csv(data_file_path)
print(df.head())

# Replace the '?' placeholders with a value far outside the range of the data
# so that every column can be converted to float below.
df = df.replace('?', -99999)

# The id column has no effect on the result, so drop it. The column is named
# through the `columns=` keyword: passing the axis positionally
# (`df.drop(['id'], 1)`) is rejected by pandas >= 2.0.
df = df.drop(columns=['id'])

full_data = df.astype(float).values.tolist()    # converting all data to float and saving all values as a list

        id  clump_thickness  unif_cell_size  ...  norm_nucleoli  mitoses  class
0  1000025                5               1  ...              1        1      2
1  1002945                5               4  ...              2        1      2
2  1015425                3               1  ...              1        1      2
3  1016277                6               8  ...              7        1      2
4  1017023                4               1  ...              1        1      2

[5 rows x 11 columns]


In [5]:
# Show the first rows before and after shuffling. random.shuffle reorders
# full_data in place; no seed is set in this cell, so the order differs
# between runs.
head_count = 5
divider = '~' * 20

print(full_data[:head_count])
random.shuffle(full_data)
print(divider)
print(full_data[:head_count])

[[5.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0], [5.0, 4.0, 4.0, 5.0, 7.0, 10.0, 3.0, 2.0, 1.0, 2.0], [3.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 1.0, 1.0, 2.0], [6.0, 8.0, 8.0, 1.0, 3.0, 4.0, 3.0, 7.0, 1.0, 2.0], [4.0, 1.0, 1.0, 3.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0]]
~~~~~~~~~~~~~~~~~~~~
[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0], [1.0, 1.0, 1.0, 3.0, 2.0, 3.0, 1.0, 1.0, 1.0, 2.0], [5.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0], [8.0, 5.0, 6.0, 2.0, 3.0, 10.0, 6.0, 6.0, 1.0, 4.0], [4.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0]]


In [6]:
test_size = 0.2
train_set = {2:[], 4:[]}
test_set = {2:[], 4:[]}

# Split on an explicit index rather than slicing with a negated count:
# when the test count rounds down to 0, `full_data[:-0]` is an empty list and
# `full_data[-0:]` is the whole list, which would swap train and test.
n_test = int(test_size*len(full_data))
split_index = max(len(full_data) - n_test, 0)

train_data = full_data[:split_index]     # first 80% of data
test_data = full_data[split_index:]      # last 20% of data

A leading `-` in an index or slice counts from the end of the list.
[stackoverflow explanation](https://stackoverflow.com/questions/509211/understanding-slice-notation)

### Populating train_set and test_set: each row's features (without the label) are stored under its label (2 or 4)

In [7]:
# The last element of each row is its label (2 or 4 - expected prediction/state
# of the patient); everything before it is the feature list, which is appended
# to the bucket for that label.
for rows, buckets in ((train_data, train_set), (test_data, test_set)):
  for row in rows:
    label, features = row[-1], row[:-1]
    buckets[label].append(features)

  

## Training KNN model & Testing to get the Accuracy

In [14]:
# Classify every test sample against the training set and record whether the
# predicted group equals the group the sample is stored under.
# k=5 because sklearn uses k=5 as default.
outcomes = [
  k_nearest_neighbours(train_set, sample, k=5) == label
  for label, samples in test_set.items()   # the 2 & 4 groups of the test dataset
  for sample in samples                    # each feature list within the group
]

correct = sum(outcomes)
total = len(outcomes)

print('Accuracy:', correct/total)

Accuracy: 0.9856115107913669
