<a href="https://colab.research.google.com/github/AmaniBKL/Projects-ML/blob/main/Supervised_ML_KNN_from_scratch_iris_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Handle Data**

In [None]:
# load our data file.
import csv
data = '/content/iris.data.txt'  # location of the iris CSV file (reused by later cells).
# echo every record of the file, re-joining its fields with ', '.
with open(data, 'r') as csvfile:
  for fields in csv.reader(csvfile):
    # an empty line in the file yields an empty field list and prints as a blank line.
    print(', '.join(fields))

5.1, 3.5, 1.4, 0.2, Iris-setosa
4.9, 3.0, 1.4, 0.2, Iris-setosa
4.7, 3.2, 1.3, 0.2, Iris-setosa
4.6, 3.1, 1.5, 0.2, Iris-setosa
5.0, 3.6, 1.4, 0.2, Iris-setosa
5.4, 3.9, 1.7, 0.4, Iris-setosa
4.6, 3.4, 1.4, 0.3, Iris-setosa
5.0, 3.4, 1.5, 0.2, Iris-setosa
4.4, 2.9, 1.4, 0.2, Iris-setosa
4.9, 3.1, 1.5, 0.1, Iris-setosa
5.4, 3.7, 1.5, 0.2, Iris-setosa
4.8, 3.4, 1.6, 0.2, Iris-setosa
4.8, 3.0, 1.4, 0.1, Iris-setosa
4.3, 3.0, 1.1, 0.1, Iris-setosa
5.8, 4.0, 1.2, 0.2, Iris-setosa
5.7, 4.4, 1.5, 0.4, Iris-setosa
5.4, 3.9, 1.3, 0.4, Iris-setosa
5.1, 3.5, 1.4, 0.3, Iris-setosa
5.7, 3.8, 1.7, 0.3, Iris-setosa
5.1, 3.8, 1.5, 0.3, Iris-setosa
5.4, 3.4, 1.7, 0.2, Iris-setosa
5.1, 3.7, 1.5, 0.4, Iris-setosa
4.6, 3.6, 1.0, 0.2, Iris-setosa
5.1, 3.3, 1.7, 0.5, Iris-setosa
4.8, 3.4, 1.9, 0.2, Iris-setosa
5.0, 3.0, 1.6, 0.2, Iris-setosa
5.0, 3.4, 1.6, 0.4, Iris-setosa
5.2, 3.5, 1.5, 0.2, Iris-setosa
5.2, 3.4, 1.4, 0.2, Iris-setosa
4.7, 3.2, 1.6, 0.2, Iris-setosa
4.8, 3.1, 1.6, 0.2, Iris-setosa
5.4, 3.4

In [None]:
# split the data into a training and test dataset.
# each row is randomly assigned to either the training set or the test set.
import csv
import random
def loadDataset(filename, split, trainingSet=None, testSet=None):
  """Load a CSV dataset and randomly split its rows into training and test rows.

  Every column except the last one is converted to float; the last column is
  kept as a string (the class label). Each non-blank row is appended to
  trainingSet when random.random() < split, otherwise to testSet.

  Args:
    filename: path of the CSV file to read.
    split: probability that a row is put in the training set (e.g. 0.67).
    trainingSet: list that receives the training rows; a new list is
      created when omitted.
    testSet: list that receives the test rows; a new list is created when
      omitted.

  Returns:
    The (trainingSet, testSet) pair, so the function is usable without
    passing output lists.

  Raises:
    ValueError: if a feature column cannot be parsed as a float.
  """
  # fresh lists per call: mutable defaults would be shared between calls.
  if trainingSet is None:
    trainingSet = []
  if testSet is None:
    testSet = []
  # newline='' lets the csv module handle line endings itself.
  with open(filename, 'r', newline='') as csvfile:
    for row in csv.reader(csvfile):
      # skip blank lines wherever they occur (typically one at the end of the
      # file) instead of assuming the last row is always the blank one.
      if not any(field.strip() for field in row):
        continue
      row[:-1] = [float(value) for value in row[:-1]]
      # random.random() returns a number n with 0 <= n < 1.0.
      if random.random() < split:
        trainingSet.append(row)
      else:
        testSet.append(row)
  return trainingSet, testSet

In [None]:
# try loadDataset on the iris file.
# each row goes to the training set with probability 0.66, i.e. roughly a two-thirds / one-third split.
trainingSet, testSet = [], []
loadDataset(data, 0.66, trainingSet, testSet)
print('Train: ' + str(len(trainingSet)))
print('Test: ' + str(len(testSet)))

Train: 99
Test: 50


# **2. Similarity**

In [None]:
# calculate the similarity between any two given data instances.
# both instances must have at least `length` numeric leading elements.
# `length` is how many leading elements are compared, so a trailing class label can be left out.
import math
def euclideanDistance(instance1, instance2, length):
  """Return the Euclidean distance between two instances.

  Only the first `length` elements of each instance are compared, which lets
  callers leave a trailing class label out of the computation.
  """
  squared_total = 0
  for idx in range(length):
    gap = instance1[idx] - instance2[idx]
    squared_total = squared_total + gap ** 2  # accumulate the squared differences.
  return math.sqrt(squared_total)

In [None]:
# quick check of euclideanDistance on two labelled sample points.
# only the first 3 elements are compared, so the labels 'a' / 'b' are ignored.
data1 = [2, 2, 2, 'a']
data2 = [4, 4, 4, 'b']
d = euclideanDistance(data1, data2, 3)
print('Distance: ' + repr(d))

Distance: 3.4641016151377544


# **3. Neighbors**

In [None]:
# collect the k most similar instances (neighbors) from the training set for a given unseen test instance.
import operator
def getNeighbors(trainingSet, testInstance, k):
  """Return the k training instances closest to testInstance.

  Distances are computed with euclideanDistance over the first
  len(testInstance) - 1 elements, so testInstance is expected to end with a
  class label (or a placeholder) that is left out of the comparison.

  Args:
    trainingSet: list of training instances.
    testInstance: the instance whose neighbors are wanted.
    k: number of neighbors to return.

  Returns:
    A list with the nearest instances, closest first. It holds fewer than k
    items when trainingSet has fewer than k instances, and is empty when
    k <= 0.
  """
  length = len(testInstance) - 1
  # pair every training instance with its distance to the test instance.
  distances = [
    (instance, euclideanDistance(testInstance, instance, length))
    for instance in trainingSet
  ]
  distances.sort(key=operator.itemgetter(1))  # smallest distance first.
  # slice after the full sort so that all k neighbors are returned;
  # max() keeps a negative k from slicing from the end of the list.
  return [instance for instance, _ in distances[:max(k, 0)]]

In [None]:
# check getNeighbors on a two-point training set.
# NOTE: testInst carries no label element, and getNeighbors leaves the last
# element out of the distance, so only its first two values are compared here.
trainSet = [[2, 2, 2, 'a'], [4, 4, 4, 'b']]
testInst = [5, 5, 5]
k = 1
voisins = getNeighbors(trainSet, testInst, k)
print(voisins)

[[4, 4, 4, 'b']]


# **4. Response**

In [None]:
# devise a predicted response based on those neighbors.
import operator
def getResponse(neighbors):
  """Return the class label that receives the most votes among neighbors.

  Each neighbor votes for its last element (its class label). When several
  labels are tied, the one that was seen first wins, because the sort is
  stable. An empty neighbors list raises IndexError.
  """
  tally = {}  # class label -> number of votes.
  for neighbor in neighbors:
    label = neighbor[-1]
    tally[label] = tally.get(label, 0) + 1
  # rank the (label, votes) pairs from most to fewest votes.
  ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
  winner = ranked[0]
  return winner[0]

In [None]:
# check getResponse on three hand-made neighbors: two vote 'a', one votes 'b'.
voisins = [
  [1, 1, 1, 'a'],
  [2, 2, 2, 'a'],
  [3, 3, 3, 'b'],
]
response = getResponse(voisins)
print(response)

a


# **5. Accuracy**

In [None]:
# evaluate the accuracy of predictions.
# sums the total correct predictions and returns the accuracy as a percentage of correct classifications.
# this is the classification accuracy.
def getAccuracy(testSet, predictions):
  """Return the percentage of test rows whose label matches its prediction.

  The label of a row is its last element; predictions[i] is compared with
  the label of testSet[i]. An empty testSet raises ZeroDivisionError.
  """
  hits = sum(
    1 for idx, row in enumerate(testSet) if row[-1] == predictions[idx]
  )
  # share of correct predictions among all test rows, as a percentage.
  return (hits / float(len(testSet))) * 100.0

In [None]:
# check getAccuracy: two of the three predictions below match the row labels.
testSet0 = [
  [1, 1, 1, 'a'],
  [2, 2, 2, 'a'],
  [3, 3, 3, 'b'],
]
predictions0 = ['a', 'a', 'a']
accuracy0 = getAccuracy(testSet0, predictions0)
print(accuracy0)

66.66666666666666


# **6. Main**

In [None]:
# main function.
def main(filename='/content/iris.data.txt', split=0.67, k=5):
  """Run the whole KNN pipeline and print the split sizes and the accuracy.

  Loads and splits the dataset with loadDataset, predicts a class for every
  test row with getNeighbors + getResponse, and scores the predictions with
  getAccuracy.

  Args:
    filename: path of the CSV dataset; defaults to the iris file.
    split: probability that a row is put in the training set.
    k: number of neighbors requested for each prediction.
  """
  trainingSet = []
  testSet = []
  loadDataset(filename, split, trainingSet, testSet)
  print('Train: ' + repr(len(trainingSet)))
  print('Test: ' + repr(len(testSet)))

  # one prediction per test row, in the same order as testSet.
  predictions = []
  for instance in testSet:
    neighbors = getNeighbors(trainingSet, instance, k)
    predictions.append(getResponse(neighbors))

  accuracy = getAccuracy(testSet, predictions)
  print('Accuracy= ' + repr(accuracy) + '%')


In [None]:
import csv
import random
import math
import operator
# run the full KNN pipeline: load and split the data, classify the test rows, print the accuracy.
main() 

Train: 100
Test: 49
Accuracy= 97.95918367346938%


# **7. Another distance metric**

In [None]:
# import math library
from math import *
from decimal import Decimal
 
#  Minkowski distance function.
def proot(value, root):
  """Return the root-th root of value as a Decimal rounded to 3 decimal places."""
  exponent = Decimal(1 / float(root))  # a root is a power with exponent 1/root.
  base = Decimal(value)
  return round(base ** exponent, 3)
 
def MinkowskiDistance(x, y, p, length=3):
  """Return the Minkowski distance of order p between x and y.

  Only the first `length` elements of x and y are compared, so a trailing
  class label can be left out. The default of 3 keeps the behavior of
  comparing elements 0, 1 and 2; pass e.g. len(x) - 1 to use every feature
  of a longer labelled instance.

  Args:
    x: first instance (numeric leading elements).
    y: second instance (numeric leading elements).
    p: order of the distance (1 = Manhattan, 2 = Euclidean).
    length: number of leading elements to compare.

  Returns:
    The p-th root of the summed |x[i] - y[i]| ** p, as returned by proot.

  Raises:
    IndexError: if x or y has fewer than `length` elements.
  """
  total = 0
  for index in range(length):
    # `**` rather than pow(): `from math import *` in this file replaces the
    # builtin pow with math.pow, and `**` does not depend on that.
    total += abs(x[index] - y[index]) ** p
  return proot(total, p)

In [None]:
# check MinkowskiDistance of order p = 3 on two labelled sample points.
p = 3
data1 = [2, 2, 2, 'a']
data2 = [4, 4, 4, 'b']
d = 0
print(MinkowskiDistance(data1, data2, p))

2.884
