In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import Counter

In [25]:
def find_distance(a,b,p=1):
    dim = len(a)
    distance = 0
    for d in range(dim):
        distance += abs(a[d]-b[d])**p
    distance = distance**(1/p)
    return distance

In [26]:
def test_distance(test_point,train_points,y_train,k,p=1):
    distances = []
    for i in range(len(train_points)):
        distances.append(find_distance(test_point,train_points[i],p))
        
    df_distance = pd.DataFrame(data=distances,index=y_train.index,columns=["dist"])
    df_nn = df_distance.sort_values(by=["dist"],axis=0)[:k]
    return df_nn

In [27]:
def return_most_common(y,df_nn):
    counter = Counter(y[df_nn.index])
    return counter.most_common()[0][0]

In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
def knn_predict(X_train,y_train,X_test,k=5,p=1):
    y_predicted_data=[]
    for test_point in X_test:
        distances = test_distance(test_point,X_train,y_train,k,p)
        prediction = return_most_common(y_train,distances)
        y_predicted_data.append(prediction)
    return y_predicted_data

In [30]:
from sklearn.metrics import accuracy_score

In [31]:
glass_data = pd.read_csv("glass.csv")
glass_data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [32]:
glass_data.isnull().sum()

RI      0
Na      0
Mg      0
Al      0
Si      0
K       0
Ca      0
Ba      0
Fe      0
Type    0
dtype: int64

In [33]:
X = glass_data.drop("Type", axis=1)
X

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0
...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0


In [34]:
y = glass_data["Type"]
y

0      1
1      1
2      1
3      1
4      1
      ..
209    7
210    7
211    7
212    7
213    7
Name: Type, Length: 214, dtype: int64

In [48]:
def predict(X, y):
    np.random.seed(42)
    test_sizes = [0.3, 0.1]
    k_values = [3, 5, 7]
    p_values = [1, 2]
    print("="*40)
    for k in k_values:
        print(f"For k value = {k}")
        for size in test_sizes:
            print(f"For test size = {size}")
            X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=size, random_state=41)
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            for p in p_values:
                if p == 2:
                    print("For Euclidean distance:")
                else:
                    print("For Manhattan distance:")
                y_preds = knn_predict(X_train, y_train, X_test, k=k, p=p)
                print("Accuracy score is: ", accuracy_score(y_test, y_preds))
            print("-"*30)
        print("="*40)

In [49]:
def predict_leave_one_out(X, y):
    np.random.seed(42)
    k_values = [3, 5, 7]
    p_values = [1, 2]
    for k in k_values:
        print(f"For k value = {k}")
        print("For leave one out")
        for p in p_values:
            if p == 2:
                print("For Euclidean distance:")
            else:
                print("For Manhattan distance:")
            scores = []
            for i in range(100):
                random_idx = np.random.choice(range(len(X)))
                X_train, X_test = X.drop(random_idx, axis=0), X.iloc[random_idx]
                y_train, y_test = y.drop(random_idx, axis=0), y.iloc[random_idx]
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = X_test.to_numpy()
                X_test = X_test.reshape(1, -1)
                X_test = scaler.transform(X_test)
                y_preds = knn_predict(X_train, y_train, X_test, k=k, p=p)
                if y_preds[0] == y_test:
                    scores.append(1)
                else:
                    scores.append(0)
            scores = pd.Series(data=scores)
            scores = scores.value_counts()
            print("Percentage of correct predictions: ", (scores[0]/sum(scores))*100, "%")
        print("="*40)

In [50]:
predict(X,y)

For k value = 3
For test size = 0.3
For Manhattan distance:
Accuracy score is:  0.9444444444444444
For Euclidean distance:
Accuracy score is:  0.8888888888888888
------------------------------
For test size = 0.1
For Manhattan distance:
Accuracy score is:  1.0
For Euclidean distance:
Accuracy score is:  1.0
------------------------------
For k value = 5
For test size = 0.3
For Manhattan distance:
Accuracy score is:  0.9444444444444444
For Euclidean distance:
Accuracy score is:  0.9444444444444444
------------------------------
For test size = 0.1
For Manhattan distance:
Accuracy score is:  1.0
For Euclidean distance:
Accuracy score is:  1.0
------------------------------
For k value = 7
For test size = 0.3
For Manhattan distance:
Accuracy score is:  0.8888888888888888
For Euclidean distance:
Accuracy score is:  0.8333333333333334
------------------------------
For test size = 0.1
For Manhattan distance:
Accuracy score is:  1.0
For Euclidean distance:
Accuracy score is:  1.0
-----------

In [51]:
predict_leave_one_out(X, y)

For k value = 3
For leave one out
For Manhattan distance:
Percentage of correct predictions:  6.0 %
For Euclidean distance:
Percentage of correct predictions:  4.0 %
For k value = 5
For leave one out
For Manhattan distance:
Percentage of correct predictions:  5.0 %
For Euclidean distance:
Percentage of correct predictions:  4.0 %
For k value = 7
For leave one out
For Manhattan distance:
Percentage of correct predictions:  7.000000000000001 %
For Euclidean distance:
Percentage of correct predictions:  5.0 %


In [52]:
fruit_data = pd.read_csv("Downloads/fruit.csv")
fruit_data.head()

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [53]:
fruit_data.isnull().sum()

fruit_label      0
fruit_name       0
fruit_subtype    0
mass             0
width            0
height           0
color_score      0
dtype: int64

In [54]:
fruit_data = fruit_data.drop("fruit_name", axis=1)
fruit_data = fruit_data.drop("fruit_subtype", axis=1)
fruit_data.head()

Unnamed: 0,fruit_label,mass,width,height,color_score
0,1,192,8.4,7.3,0.55
1,1,180,8.0,6.8,0.59
2,1,176,7.4,7.2,0.6
3,2,86,6.2,4.7,0.8
4,2,84,6.0,4.6,0.79


In [55]:
X = fruit_data.drop("fruit_label", axis=1)
X

Unnamed: 0,mass,width,height,color_score
0,192,8.4,7.3,0.55
1,180,8.0,6.8,0.59
2,176,7.4,7.2,0.6
3,86,6.2,4.7,0.8
4,84,6.0,4.6,0.79
5,80,5.8,4.3,0.77
6,80,5.9,4.3,0.81
7,76,5.8,4.0,0.81
8,178,7.1,7.8,0.92
9,172,7.4,7.0,0.89


In [56]:
y = fruit_data["fruit_label"]
y

0     1
1     1
2     1
3     2
4     2
5     2
6     2
7     2
8     1
9     1
10    1
11    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    3
25    3
26    3
27    3
28    3
29    3
30    3
31    3
32    3
33    3
34    3
35    3
36    3
37    3
38    3
39    3
40    3
41    3
42    3
43    4
44    4
45    4
46    4
47    4
48    4
49    4
50    4
51    4
52    4
53    4
54    4
55    4
56    4
57    4
58    4
Name: fruit_label, dtype: int64

In [57]:
def predict(X, y):
    np.random.seed(42)
    test_sizes = [0.3, 0.1]
    k_values = [3, 5, 7]
    p_values = [1, 2]
    print("="*40)
    for k in k_values:
        print(f"For k value = {k}")
        for size in test_sizes:
            print(f"For test size = {size}")
            X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=size, random_state=41)
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            for p in p_values:
                if p == 2:
                    print("For Euclidean distance:")
                else:
                    print("For Manhattan distance:")
                y_preds = knn_predict(X_train, y_train, X_test, k=k, p=p)
                print("Accuracy score is: ", accuracy_score(y_test, y_preds))
            print("-"*30)
        print("="*40)

In [58]:
def predict_leave_one_out(X, y):
    np.random.seed(42)
    k_values = [3, 5, 7]
    p_values = [1, 2]
    for k in k_values:
        print(f"For k value = {k}")
        print("For leave one out")
        for p in p_values:
            if p == 2:
                print("For Euclidean distance:")
            else:
                print("For Manhattan distance:")
            scores = []
            for i in range(100):
                random_idx = np.random.choice(range(len(X)))
                X_train, X_test = X.drop(random_idx, axis=0), X.iloc[random_idx]
                y_train, y_test = y.drop(random_idx, axis=0), y.iloc[random_idx]
                scaler = StandardScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = X_test.to_numpy()
                X_test = X_test.reshape(1, -1)
                X_test = scaler.transform(X_test)
                y_preds = knn_predict(X_train, y_train, X_test, k=k, p=p)
                if y_preds[0] == y_test:
                    scores.append(1)
                else:
                    scores.append(0)
            scores = pd.Series(data=scores)
            scores = scores.value_counts()
            print("Percentage of correct predictions: ", (scores[0]/sum(scores))*100, "%")
        print("="*40)

In [59]:
predict(X,y)

For k value = 3
For test size = 0.3
For Manhattan distance:
Accuracy score is:  0.9444444444444444
For Euclidean distance:
Accuracy score is:  0.8888888888888888
------------------------------
For test size = 0.1
For Manhattan distance:
Accuracy score is:  1.0
For Euclidean distance:
Accuracy score is:  1.0
------------------------------
For k value = 5
For test size = 0.3
For Manhattan distance:
Accuracy score is:  0.9444444444444444
For Euclidean distance:
Accuracy score is:  0.9444444444444444
------------------------------
For test size = 0.1
For Manhattan distance:
Accuracy score is:  1.0
For Euclidean distance:
Accuracy score is:  1.0
------------------------------
For k value = 7
For test size = 0.3
For Manhattan distance:
Accuracy score is:  0.8888888888888888
For Euclidean distance:
Accuracy score is:  0.8333333333333334
------------------------------
For test size = 0.1
For Manhattan distance:
Accuracy score is:  1.0
For Euclidean distance:
Accuracy score is:  1.0
-----------

In [60]:
predict_leave_one_out(X, y)

For k value = 3
For leave one out
For Manhattan distance:
Percentage of correct predictions:  6.0 %
For Euclidean distance:
Percentage of correct predictions:  4.0 %
For k value = 5
For leave one out
For Manhattan distance:
Percentage of correct predictions:  5.0 %
For Euclidean distance:
Percentage of correct predictions:  4.0 %
For k value = 7
For leave one out
For Manhattan distance:
Percentage of correct predictions:  7.000000000000001 %
For Euclidean distance:
Percentage of correct predictions:  5.0 %
