## import libraries

In [2]:
import pandas as pd
import numpy as np
from math import sqrt

### read csv

In [3]:
df_train = pd.read_csv("data/iris/iris_train.csv", delimiter=',', header=None, names=[i for i in range(5)], index_col=False)

In [4]:
df_test = pd.read_csv("data/iris/iris_test.csv", delimiter=',', header=None, names=[i for i in range(5)], index_col=False)

### exploratory

In [5]:
df_train.head(5)

Unnamed: 0,0,1,2,3,4
0,7.7,3.8,6.7,2.2,2
1,5.0,3.4,1.6,0.4,0
2,6.7,3.0,5.0,1.7,1
3,5.9,3.0,4.2,1.5,1
4,6.6,2.9,4.6,1.3,1


In [6]:
df_test.head(5)

Unnamed: 0,0,1,2,3,4
0,5.6,2.9,3.6,1.3,1
1,6.4,3.2,4.5,1.5,1
2,5.1,3.8,1.9,0.4,0
3,4.6,3.2,1.4,0.2,0
4,5.2,3.5,1.5,0.2,0


### get features

In [7]:
x_train = df_train.iloc[:,:-1]
x_train.head(5)

Unnamed: 0,0,1,2,3
0,7.7,3.8,6.7,2.2
1,5.0,3.4,1.6,0.4
2,6.7,3.0,5.0,1.7
3,5.9,3.0,4.2,1.5
4,6.6,2.9,4.6,1.3


In [8]:
x_test = df_test.iloc[:,:-1]
x_test.head(5)

Unnamed: 0,0,1,2,3
0,5.6,2.9,3.6,1.3
1,6.4,3.2,4.5,1.5
2,5.1,3.8,1.9,0.4
3,4.6,3.2,1.4,0.2
4,5.2,3.5,1.5,0.2


### get labels

In [9]:
y_train = df_train.iloc[:,-1:]
y_train.head(5)

Unnamed: 0,4
0,2
1,0
2,1
3,1
4,1


In [10]:
y_test = df_test.iloc[:,-1:]
y_test.head(5)

Unnamed: 0,4
0,1
1,1
2,0
3,0
4,0


In [11]:
y_trainset=y_train.values
y_trainset[:5]

array([[2],
       [0],
       [1],
       [1],
       [1]], dtype=int64)

In [12]:
y_testset=y_test.values
y_testset[:5]

array([[1],
       [1],
       [0],
       [0],
       [0]], dtype=int64)

###  data normalization

#### min max scale

In [13]:
x_train = (x_train-x_train.min())/(x_train.max()-x_train.min())
x_train.head(5)

Unnamed: 0,0,1,2,3
0,0.944444,0.75,0.965517,0.875
1,0.194444,0.583333,0.086207,0.125
2,0.666667,0.416667,0.672414,0.666667
3,0.444444,0.416667,0.534483,0.583333
4,0.638889,0.375,0.603448,0.5


In [14]:
x_trainset=x_train.values
x_trainset[:5]

array([[0.94444444, 0.75      , 0.96551724, 0.875     ],
       [0.19444444, 0.58333333, 0.0862069 , 0.125     ],
       [0.66666667, 0.41666667, 0.67241379, 0.66666667],
       [0.44444444, 0.41666667, 0.53448276, 0.58333333],
       [0.63888889, 0.375     , 0.60344828, 0.5       ]])

In [15]:
x_test = (x_test-x_test.min())/(x_test.max()-x_test.min())
x_test.head(5)

Unnamed: 0,0,1,2,3
0,0.363636,0.35,0.45614,0.5
1,0.606061,0.5,0.614035,0.583333
2,0.212121,0.8,0.157895,0.125
3,0.060606,0.5,0.070175,0.041667
4,0.242424,0.65,0.087719,0.041667


In [16]:
x_testset=x_test.values
x_testset[:5]

array([[0.36363636, 0.35      , 0.45614035, 0.5       ],
       [0.60606061, 0.5       , 0.61403509, 0.58333333],
       [0.21212121, 0.8       , 0.15789474, 0.125     ],
       [0.06060606, 0.5       , 0.07017544, 0.04166667],
       [0.24242424, 0.65      , 0.0877193 , 0.04166667]])

### data handling

In [17]:
y = []
for i in y_trainset:
    y.append(i[0])

In [18]:
trainset = np.insert(x_trainset, 4, y, axis=1) 
trainset

array([[0.94444444, 0.75      , 0.96551724, 0.875     , 2.        ],
       [0.19444444, 0.58333333, 0.0862069 , 0.125     , 0.        ],
       [0.66666667, 0.41666667, 0.67241379, 0.66666667, 1.        ],
       [0.44444444, 0.41666667, 0.53448276, 0.58333333, 1.        ],
       [0.63888889, 0.375     , 0.60344828, 0.5       , 1.        ],
       [0.22222222, 0.54166667, 0.10344828, 0.16666667, 0.        ],
       [0.38888889, 0.33333333, 0.51724138, 0.5       , 1.        ],
       [0.19444444, 0.        , 0.4137931 , 0.375     , 1.        ],
       [0.72222222, 0.45833333, 0.65517241, 0.58333333, 1.        ],
       [0.55555556, 0.20833333, 0.65517241, 0.58333333, 1.        ],
       [0.25      , 0.58333333, 0.05172414, 0.04166667, 0.        ],
       [0.22222222, 0.75      , 0.06896552, 0.08333333, 0.        ],
       [0.77777778, 0.41666667, 0.82758621, 0.83333333, 2.        ],
       [0.27777778, 0.70833333, 0.06896552, 0.04166667, 0.        ],
       [0.13888889, 0.58333333, 0.

In [19]:
yi = []
for i in y_testset:
    yi.append(i[0])

In [20]:
testset = np.insert(x_testset, 4, yi , axis =1) 
testset

array([[0.36363636, 0.35      , 0.45614035, 0.5       , 1.        ],
       [0.60606061, 0.5       , 0.61403509, 0.58333333, 1.        ],
       [0.21212121, 0.8       , 0.15789474, 0.125     , 0.        ],
       [0.06060606, 0.5       , 0.07017544, 0.04166667, 0.        ],
       [0.24242424, 0.65      , 0.0877193 , 0.04166667, 0.        ],
       [0.15151515, 0.45      , 0.0877193 , 0.        , 0.        ],
       [0.90909091, 0.3       , 0.89473684, 0.75      , 2.        ],
       [0.39393939, 0.4       , 0.56140351, 0.45833333, 1.        ],
       [0.60606061, 0.25      , 0.75438596, 0.75      , 2.        ],
       [0.33333333, 1.        , 0.07017544, 0.04166667, 0.        ],
       [0.33333333, 0.1       , 0.47368421, 0.375     , 1.        ],
       [0.60606061, 0.3       , 0.80701754, 0.83333333, 2.        ],
       [0.21212121, 0.6       , 0.0877193 , 0.04166667, 0.        ],
       [0.15151515, 0.15      , 0.61403509, 0.66666667, 2.        ],
       [0.48484848, 0.6       , 0.

### trainning

In [21]:
def Euclidean_distance(row1, row2):
    distance = 0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

"""
second function we have created is Get_Neighbors with the help of this function we can find out our nearest 3 datapoints.This function shows that the distance between our test data and training row is different.
"""

def Get_Neighbors(train, test_row, num):
   
    distance = list() # []
    data = []
    for i in train:
        dist = Euclidean_distance(test_row, i)
        distance.append(dist)
        data.append(i)
    distance = np.array(distance)
    data = np.array(data)

    """ we are finding index of min distance """

    index_dist = distance.argsort()

    """ we arange our data acco. to index """

    data  = data[index_dist]

    """ we are slicing num number of datas """

    neighbors = data[:num]
    
    return neighbors

"""Here we have created third function Predict_Classification for the prediction of values through our trained model"""  

def predict_classification(train, test_row, num):
    Neighbors = Get_Neighbors(train, test_row, num)
    Classes = []
    for i in Neighbors:
        Classes.append(i[-1])
    prediction = max(Classes, key= Classes.count)
    return prediction

"""We have created Evaluate Function for checking the accuracy of our predicted values """

def Evaluate(y_true, y_pred):
    n_correct = 0
    for i in range(len(y_true)):
        if y_true[i] == y_pred[i]:
            n_correct += 1

    acc = n_correct/len(y_true)
    
    return acc

### testing

In [41]:
prediction = predict_classification(trainset, testset[9], 4)

print("Label là:", prediction)

Label là: 0.0


### evaluate

In [24]:
#created an empty list y_pred where we store all the prediction values.
y_pred = []
y_true = testset[:, -1]
for i in testset:
    prediction = predict_classification(trainset, i, 3)
    y_pred.append(prediction)

Evaluate(y_true, y_pred)

0.9

In [23]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred)

array([[17,  0,  0],
       [ 0, 15,  0],
       [ 0,  5, 13]], dtype=int64)