## import libraries

In [1]:
import pandas as pd
import numpy as np
from math import sqrt

### prepare data


In [20]:
df_train = pd.read_csv("data/optics/opt_train.csv", delimiter=',', header=None, names=[i for i in range(65)], index_col=False)

In [21]:
df_test = pd.read_csv("data/optics/opt_test.csv", delimiter=',', header=None, names=[i for i in range(65)], index_col=False)

### exploratory

In [22]:
df_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0,1,6,15,12,1,0,0,0,7,...,0,0,0,6,14,7,1,0,0,0
1,0,0,10,16,6,0,0,0,0,7,...,0,0,0,10,16,15,3,0,0,0
2,0,0,8,15,16,13,0,0,0,1,...,0,0,0,9,14,0,0,0,0,7
3,0,0,0,3,11,16,0,0,0,0,...,0,0,0,0,1,15,2,0,0,4
4,0,0,5,14,4,0,0,0,0,0,...,0,0,0,4,12,14,7,0,0,6


In [23]:
df_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0,0,5,13,9,1,0,0,0,0,...,0,0,0,6,13,10,0,0,0,0
1,0,0,0,12,13,5,0,0,0,0,...,0,0,0,0,11,16,10,0,0,1
2,0,0,0,4,15,12,0,0,0,0,...,0,0,0,0,3,11,16,9,0,2
3,0,0,7,15,13,1,0,0,0,8,...,0,0,0,7,13,13,9,0,0,3
4,0,0,0,1,11,0,0,0,0,0,...,0,0,0,0,2,16,4,0,0,4


### get features

In [24]:
x_train = df_train.iloc[:,:-1]
x_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0,1,6,15,12,1,0,0,0,7,...,0,0,0,0,6,14,7,1,0,0
1,0,0,10,16,6,0,0,0,0,7,...,3,0,0,0,10,16,15,3,0,0
2,0,0,8,15,16,13,0,0,0,1,...,0,0,0,0,9,14,0,0,0,0
3,0,0,0,3,11,16,0,0,0,0,...,0,0,0,0,0,1,15,2,0,0
4,0,0,5,14,4,0,0,0,0,0,...,12,0,0,0,4,12,14,7,0,0


In [25]:
x_test = df_test.iloc[:,:-1]
x_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0,0,5,13,9,1,0,0,0,0,...,0,0,0,0,6,13,10,0,0,0
1,0,0,0,12,13,5,0,0,0,0,...,0,0,0,0,0,11,16,10,0,0
2,0,0,0,4,15,12,0,0,0,0,...,5,0,0,0,0,3,11,16,9,0
3,0,0,7,15,13,1,0,0,0,8,...,9,0,0,0,7,13,13,9,0,0
4,0,0,0,1,11,0,0,0,0,0,...,0,0,0,0,0,2,16,4,0,0


### get labels

In [26]:
y_train = df_train.iloc[:,-1:]
y_train.head(5)

Unnamed: 0,64
0,0
1,0
2,7
3,4
4,6


In [27]:
y_trainset=y_train.values
y_trainset[:5]

array([[0],
       [0],
       [7],
       [4],
       [6]], dtype=int64)

In [28]:
y_test = df_test.iloc[:,-1:]
y_test.head(5)

Unnamed: 0,64
0,0
1,1
2,2
3,3
4,4


In [29]:
y_testset=y_test.values
y_testset[:5]

array([[0],
       [1],
       [2],
       [3],
       [4]], dtype=int64)

### data normalization

#### min max scale

In [30]:
x_train = (x_train-x_train.min())/(x_train.max()-x_train.min())
x_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,,0.125,0.375,0.9375,0.75,0.0625,0.0,0.0,0.0,0.466667,...,0.0,0.0,0.0,0.0,0.375,0.875,0.4375,0.0625,0.0,0.0
1,,0.0,0.625,1.0,0.375,0.0,0.0,0.0,0.0,0.466667,...,0.1875,0.0,0.0,0.0,0.625,1.0,0.9375,0.1875,0.0,0.0
2,,0.0,0.5,0.9375,1.0,0.8125,0.0,0.0,0.0,0.066667,...,0.0,0.0,0.0,0.0,0.5625,0.875,0.0,0.0,0.0,0.0
3,,0.0,0.0,0.1875,0.6875,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0625,0.9375,0.125,0.0,0.0
4,,0.0,0.3125,0.875,0.25,0.0,0.0,0.0,0.0,0.0,...,0.75,0.0,0.0,0.0,0.25,0.75,0.875,0.4375,0.0,0.0


In [31]:
x_train.fillna(0, inplace = True)
x_trainset=x_train.values
x_trainset[:5]

array([[0.        , 0.125     , 0.375     , 0.9375    , 0.75      ,
        0.0625    , 0.        , 0.        , 0.        , 0.46666667,
        1.        , 0.375     , 0.375     , 0.625     , 0.        ,
        0.        , 0.        , 0.5       , 1.        , 0.125     ,
        0.        , 0.6875    , 0.125     , 0.        , 0.        ,
        0.3125    , 1.        , 0.1875    , 0.        , 0.3125    ,
        0.4375    , 0.        , 0.        , 0.46666667, 0.8125    ,
        0.1875    , 0.        , 0.5       , 0.5       , 0.        ,
        0.        , 0.25      , 0.75      , 0.        , 0.0625    ,
        0.8125    , 0.3125    , 0.        , 0.        , 0.        ,
        0.875     , 0.5625    , 0.9375    , 0.5625    , 0.        ,
        0.        , 0.        , 0.        , 0.375     , 0.875     ,
        0.4375    , 0.0625    , 0.        , 0.        ],
       [0.        , 0.        , 0.625     , 1.        , 0.375     ,
        0.        , 0.        , 0.        , 0.        , 0.4

In [32]:
x_test = (x_test-x_test.min())/(x_test.max()-x_test.min())
x_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,,0.0,0.3125,0.8125,0.5625,0.0625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.375,0.8125,0.625,0.0,0.0,0.0
1,,0.0,0.0,0.75,0.8125,0.3125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.6875,1.0,0.625,0.0,0.0
2,,0.0,0.0,0.25,0.9375,0.75,0.0,0.0,0.0,0.0,...,0.3125,0.0,0.0,0.0,0.0,0.1875,0.6875,1.0,0.5625,0.0
3,,0.0,0.4375,0.9375,0.8125,0.0625,0.0,0.0,0.0,0.5,...,0.5625,0.0,0.0,0.0,0.4375,0.8125,0.8125,0.5625,0.0,0.0
4,,0.0,0.0,0.0625,0.6875,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.125,1.0,0.25,0.0,0.0


In [33]:
x_test.fillna(0, inplace = True)
x_testset=x_test.values
x_testset[:5]

array([[0.        , 0.        , 0.3125    , 0.8125    , 0.5625    ,
        0.0625    , 0.        , 0.        , 0.        , 0.        ,
        0.8125    , 0.9375    , 0.625     , 0.9375    , 0.3125    ,
        0.        , 0.        , 0.1875    , 0.9375    , 0.125     ,
        0.        , 0.6875    , 0.5       , 0.        , 0.        ,
        0.26666667, 0.75      , 0.        , 0.        , 0.5       ,
        0.53333333, 0.        , 0.        , 0.35714286, 0.5       ,
        0.        , 0.        , 0.5625    , 0.57142857, 0.        ,
        0.        , 0.25      , 0.6875    , 0.        , 0.0625    ,
        0.75      , 0.4375    , 0.        , 0.        , 0.125     ,
        0.875     , 0.3125    , 0.625     , 0.75      , 0.        ,
        0.        , 0.        , 0.        , 0.375     , 0.8125    ,
        0.625     , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.75      , 0.8125    ,
        0.3125    , 0.        , 0.        , 0.        , 0. 

### data handling

In [34]:
y = []
for i in y_trainset:
    y.append(i[0])

In [35]:
trainset = np.insert(x_trainset, 64, y, axis=1) 
trainset[:5]

array([[0.        , 0.125     , 0.375     , 0.9375    , 0.75      ,
        0.0625    , 0.        , 0.        , 0.        , 0.46666667,
        1.        , 0.375     , 0.375     , 0.625     , 0.        ,
        0.        , 0.        , 0.5       , 1.        , 0.125     ,
        0.        , 0.6875    , 0.125     , 0.        , 0.        ,
        0.3125    , 1.        , 0.1875    , 0.        , 0.3125    ,
        0.4375    , 0.        , 0.        , 0.46666667, 0.8125    ,
        0.1875    , 0.        , 0.5       , 0.5       , 0.        ,
        0.        , 0.25      , 0.75      , 0.        , 0.0625    ,
        0.8125    , 0.3125    , 0.        , 0.        , 0.        ,
        0.875     , 0.5625    , 0.9375    , 0.5625    , 0.        ,
        0.        , 0.        , 0.        , 0.375     , 0.875     ,
        0.4375    , 0.0625    , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.625     , 1.        , 0.375     ,
        0.        , 0.        , 0.        , 0. 

In [36]:
yi = []
for i in y_testset:
    yi.append(i[0])

In [37]:
testset = np.insert(x_testset, 64, yi , axis =1) 
testset[:10]

array([[0.        , 0.        , 0.3125    , 0.8125    , 0.5625    ,
        0.0625    , 0.        , 0.        , 0.        , 0.        ,
        0.8125    , 0.9375    , 0.625     , 0.9375    , 0.3125    ,
        0.        , 0.        , 0.1875    , 0.9375    , 0.125     ,
        0.        , 0.6875    , 0.5       , 0.        , 0.        ,
        0.26666667, 0.75      , 0.        , 0.        , 0.5       ,
        0.53333333, 0.        , 0.        , 0.35714286, 0.5       ,
        0.        , 0.        , 0.5625    , 0.57142857, 0.        ,
        0.        , 0.25      , 0.6875    , 0.        , 0.0625    ,
        0.75      , 0.4375    , 0.        , 0.        , 0.125     ,
        0.875     , 0.3125    , 0.625     , 0.75      , 0.        ,
        0.        , 0.        , 0.        , 0.375     , 0.8125    ,
        0.625     , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.75      , 0.8125    ,
        0.3125    , 0.        , 0.        , 0. 

### trainning

In [38]:
def Euclidean_distance(row1, row2):
    distance = 0
    for i in range(len(row1)-1):
        distance += (row1[i] - row2[i])**2
    return sqrt(distance)

"""
second function we have created is Get_Neighbors with the help of this function we can find out our nearest 3 datapoints.This function shows that the distance between our test data and training row is different.
"""

def Get_Neighbors(train, test_row, num):
   
    distance = list() # []
    data = []
    for i in train:
        dist = Euclidean_distance(test_row, i)
        distance.append(dist)
        data.append(i)
    distance = np.array(distance)
    data = np.array(data)

    """ we are finding index of min distance """

    index_dist = distance.argsort()

    """ we arange our data acco. to index """

    data  = data[index_dist]

    """ we are slicing num number of datas """

    neighbors = data[:num]
    
    return neighbors

"""Here we have created third function Predict_Classification for the prediction of values through our trained model"""  

def predict_classification(train, test_row, num):
    Neighbors = Get_Neighbors(train, test_row, num)
    Classes = []
    for i in Neighbors:
        Classes.append(i[-1])
    prediction = max(Classes, key= Classes.count)
    return prediction

"""We have created Evaluate Function for checking the accuracy of our predicted values """

def Evaluate(y_true, y_pred):
    n_correct = 0
    for i in range(len(y_true)):
        if y_true[i] == y_pred[i]:
            n_correct += 1

    acc = n_correct/len(y_true)
    
    return acc

In [49]:
prediction = predict_classification(trainset, testset[5], 3)

print("Label là:", prediction)

Label là: 9.0


### Evaluate

In [51]:
#created an empty list y_pred where we store all the prediction values.
y_pred = []
y_true = testset[:, -1]
for i in testset:
    prediction = predict_classification(trainset, i, 1)
    y_pred.append(prediction)

Evaluate(y_true, y_pred)

0.9782971619365609

In [52]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred)

array([[178,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0, 181,   0,   0,   0,   0,   0,   0,   1,   0],
       [  0,   2, 175,   0,   0,   0,   0,   0,   0,   0],
       [  0,   0,   0, 179,   0,   0,   0,   2,   0,   2],
       [  0,   2,   0,   0, 178,   0,   0,   0,   1,   0],
       [  0,   0,   0,   0,   1, 179,   0,   0,   0,   2],
       [  1,   0,   0,   0,   0,   1, 179,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0, 176,   0,   3],
       [  0,   9,   0,   1,   0,   0,   0,   0, 163,   1],
       [  0,   0,   0,   3,   2,   2,   0,   0,   3, 170]], dtype=int64)