In [67]:
import numpy as np
import scipy.spatial
from collections import Counter
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import euclidean_distances, manhattan_distances
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [17]:
clm = [
    'class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises?', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape',
    'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
    'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population',
    'habitat'
]

In [18]:
class KNN:

    def __init__(self, K, P, dist_algo):
        self.K = K
        self.P = P
        self.dist_algo = dist_algo

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def distance(self, X1, X2):
        distance = scipy.spatial.distance.euclidean(X1, X2)

    def predict(self, X_test):
        self.predictions = []

        if self.dist_algo == "minkowski":
            distances_fn = lambda x1, x2: (
                (abs(x1 - x2))**self.P).sum()**(1 / self.P)
        elif self.dist_algo == "euclidean":
            distances_fn = euclidean_distances
        else:
            print(
                "Invalid Distance Algorithm, user from minkowski, euclidean! ")
            return []

        for i in range(len(X_test)):
            distances = distances_fn(self.X_train,
                                     X_test.iloc[i].values.reshape(1, -1))

            # Store distances in a dataframe
            distance_data = pd.DataFrame(data=distances, columns=['dist'])

            # Sort distances, and only consider the k closest points
            nearest_neighbours = distance_data.sort_values(by=['dist'],
                                                           axis=0)[:self.K]

            # Create counter object to track the labels of k closest neighbors
            counter = Counter(self.y_train[nearest_neighbours.index])

            # Get most common label from all the nearest neighbors
            prediction = counter.most_common()[0][0]

            # Append predicted label to output list
            self.predictions.append(prediction)

        return self.predictions

    # function of Minkowski distance algorithm
    def minkowski(self, x, y, P=1):

        distance = 0
        for a, b in zip(x, y):
            distance += abs(a - b)**P
        distance = float(distance**(1 / float(P)))

        return distance

    # function of Euclidean distance algorithm
    def euclidean(self, x, y):

        distance = np.sqrt(sum(pow(a - b, 2) for a, b in zip(x, y)))

        return distance

In [19]:
def score(self, X_test, y_test):
    predictions = self.predict(X_test)
    return (predictions == y_test).sum() / len(y_test)

In [21]:
data = pd.read_csv("MushroomData_8000.txt", header=None, names=clm)
data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,p,k,s,e,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,p
7996,e,f,s,n,f,n,a,c,b,o,...,s,o,o,p,n,o,p,y,v,l
7997,p,k,s,n,f,y,f,c,n,b,...,k,p,p,p,w,o,e,w,v,l
7998,p,k,y,n,f,s,f,c,n,b,...,s,p,p,p,w,o,e,w,v,p


In [74]:
le = LabelEncoder()
le2 = LabelEncoder()
y = data['class']
X = data.drop(['class'], axis=1)
y = le.fit_transform(y)
X = X.apply(le2.fit_transform)

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

In [76]:
X_test

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises?,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
2215,5,3,4,1,5,1,0,0,10,1,...,2,7,3,0,2,1,4,3,4,0
2582,2,0,2,1,5,1,0,0,5,1,...,2,7,3,0,2,1,4,3,4,0
1662,5,3,9,1,0,1,0,0,2,0,...,2,7,7,0,2,1,4,2,2,1
3027,5,0,3,0,2,1,0,0,2,0,...,1,4,0,0,2,1,2,1,5,0
4343,5,3,9,0,2,1,0,0,3,0,...,1,6,6,0,2,1,2,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1079,5,3,4,1,3,1,0,0,7,0,...,3,7,7,0,2,1,4,2,3,4
7979,3,2,4,0,5,0,0,0,11,0,...,2,5,5,0,0,1,4,4,4,2
1115,2,0,3,0,5,1,1,0,7,1,...,2,7,7,0,2,1,0,2,3,1
6093,2,3,4,0,2,1,0,1,0,1,...,1,6,6,0,2,1,0,7,4,2


In [77]:
y_test

array([0, 0, 0, ..., 0, 1, 1])

In [102]:
knn = KNN(K=6,P=2,dist_algo="euclidean")

In [103]:
knn.fit(X_train, y_train)

In [104]:
unknown_data = pd.read_csv('MushroomData_Unknwon_100.txt', names=clm[1:])
unknown_data = unknown_data.apply(le2.fit_transform)

In [105]:
y_pred = knn.predict(unknown_data)
print(y_pred)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [106]:
le.inverse_transform(y_pred)

array(['p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p',
       'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e',
       'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p',
       'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p',
       'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p',
       'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'p',
       'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p',
       'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p'], dtype=object)