In [46]:
import pandas as pd
import numpy as np
from scipy.stats import mode
import math

In [47]:
data_source = 'http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'

names = ['buying',
         'maint',
         'doors',
         'persons',
         'lug_boot',
         'safety',
         'class']

cdf = pd.read_csv(
    filepath_or_buffer=data_source, 
    names=names, 
    sep=','
)

URLError: <urlopen error [Errno 11001] getaddrinfo failed>

In [48]:
cdf.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [49]:
class CategoricalNB():
    def __init__(self, train_df, test_df, label):
        self.X_train, self.y_train = self.split_features_targets(df=train_df, label=label)
        self.X_test, self.y_test = self.split_features_targets(df=test_df, label=label)

        self.X_test_vals = self.X_test.values
        self.y_test_vals = self.y_test.values

        self.X_likelihood, self.y_likelihood = self.compute_likelihood()

    def split_features_targets(self, df, label):
        X = df.drop(columns=[label], axis=1)
        y = df[label]
        return X, y

    def compute_likelihood(self):
        X_likelihood = {}

        yc_df = self.y_train.value_counts().to_frame()
        yc_df.reset_index(inplace=True)
        yc_df.columns = ['class', 'count']

        y_vc = {i : j for (i, j) in zip(yc_df['class'], yc_df['count'])}
        y_vc_k = list(y_vc.keys())

        for col in self.X_train:
            each_col_dict = {}
            x_col_vals = self.X_train[col].value_counts().to_frame().index.to_list()
            fydf = pd.DataFrame(data={col : self.X_train[col], 'y' : self.y_train})

            for ex in x_col_vals:
                each_x_dict = {}
                x_ex_df = fydf[fydf[col] == ex]

                for ey in y_vc_k:
                    x_y_df = x_ex_df[x_ex_df['y'] == ey]
                    each_x_dict[ey] = len(x_y_df) / y_vc[ey]

                each_col_dict[ex] = each_x_dict
            X_likelihood[col] = each_col_dict
        y_likelihood = {i : j / sum(list(y_vc.values())) for (i, j) in y_vc.items()}

        return X_likelihood, y_likelihood

    def predictor(self, X_new):
        cols = list(self.X_likelihood.keys())
        col_new = {i : j for (i, j) in zip(cols, X_new)}

        lprobs = {}
        for l, v in self.y_likelihood.items():
            cate_v = [self.X_likelihood[cn][cl][l] for (cn, cl) in col_new.items()]
            lprobs[l] = round((np.prod(cate_v) * v), 4)

        prob_ks = list(lprobs.keys())
        prob_vs = list(lprobs.values())

        return prob_ks[np.argmax(prob_vs)]

    def predict(self):
        if len(self.X_test_vals) == 1:
            return self.predictor(X_new=self.X_test_vals[0])
        preds = [self.predictor(X_new=i) for i in self.X_test_vals]
        return preds

    def accuracy_score(self, preds):
        actual_vals = np.array(self.y_test_vals)
        preds = np.array(preds)
        corrects = np.count_nonzero(np.where((actual_vals == preds), 1, 0))
        return corrects / len(actual_vals)

In [50]:
def splitter(dframe, percentage=0.75, random_state=True):
    if random_state:
        dframe = dframe.sample(frac=1)

    thresh = round(len(dframe) * percentage)
    train_df = dframe.iloc[:thresh]
    test_df = dframe.iloc[thresh:]

    return train_df, test_df

In [51]:
X_train, X_test = splitter(dframe=cdf)
XTRAIN,XTEST=X_train, X_test
print(len(XTRAIN))
print(len(XTEST))
print(len(cdf))


1296
432
1728


In [52]:
nb = CategoricalNB(
    train_df=X_train, 
    test_df=X_test, 
    label='class'
)

In [53]:
preds = nb.predict()

In [54]:
acc = nb.accuracy_score(preds=preds)
print(acc)

0.8310185185185185


In [55]:
mapping = {'vhigh':0,'high':1,'med':2,'low':3}
XTRAIN['buying'] = XTRAIN['buying'].map(mapping)
XTEST['buying'] = XTEST['buying'].map(mapping)

XTRAIN['maint'] = XTRAIN['maint'].map(mapping)
XTEST['maint'] = XTEST['maint'].map(mapping)

mapping_1 = {'2':0,'3':1,'4':2,'5more':3}
XTRAIN['doors'] = XTRAIN['doors'].map(mapping_1)
XTEST['doors'] = XTEST['doors'].map(mapping_1)

mapping_2 = {'2':0,'4':1,'more':2}
XTRAIN['persons'] = XTRAIN['persons'].map(mapping_2)
XTEST['persons'] = XTEST['persons'].map(mapping_2)

mapping_3 = {'small':0,'med':1,'big':2}
XTRAIN['lug_boot'] = XTRAIN['lug_boot'].map(mapping_3)
XTEST['lug_boot'] = XTEST['lug_boot'].map(mapping_3)

mapping_4 = {'low':0,'med':1,'high':2}
XTRAIN['safety'] = XTRAIN['safety'].map(mapping_4)
XTEST['safety'] = XTEST['safety'].map(mapping_4)


In [56]:
def split_features(df, label):
    X = df.drop(columns=[label], axis=1)
    y = df[label]
    return X, y

In [57]:
X_T,y_T=split_features(XTRAIN,'class')
X_Test,y_Test=split_features(XTEST,'class')

In [58]:
print("X_train:")
print(X_T)
print("\ny_train:")
print(y_T)

X_train:
      buying  maint  doors  persons  lug_boot  safety
1353       3      0      2        0         1       0
206        0      1      3        1         2       2
544        1      1      0        0         1       1
990        2      1      0        2         0       0
676        1      2      1        0         0       1
...      ...    ...    ...      ...       ...     ...
446        1      0      0        1         1       2
1703       3      3      3        0         0       2
1050       2      1      2        2         2       0
1727       3      3      3        2         2       2
871        2      0      0        0         2       1

[1296 rows x 6 columns]

y_train:
1353    unacc
206     unacc
544     unacc
990     unacc
676     unacc
        ...  
446     unacc
1703    unacc
1050    unacc
1727    vgood
871     unacc
Name: class, Length: 1296, dtype: object


In [59]:
X_train= np.array(X_T)
X_test = np.array(X_Test)
y_train= np.array(y_T)
y_test = np.array(y_Test)

In [60]:
def eucledian(p1,p2):
    dist = np.sqrt(np.sum((p1-p2)**2))
    return dist


# Function to calculate KNN
def predict(x_train, y_train, x_test, k):
    op_labels = []

    # Loop through the Datapoints to be classified
    for item in x_test:

        # Array to store distances
        point_dist = []

        # Loop through each training Data
        for j in range(len(x_train)):
            distances = eucledian(np.array(x_train[j, :]), item)
            # Calculating the distance
            point_dist.append(distances)
        point_dist = np.array(point_dist)

        # Sorting the array while preserving the index
        # Keeping the first K datapoints
        dist = np.argsort(point_dist)[
               :k]  # dist are the indices of the train sample that has the minimum distance # k is the nearest train samples to this test sample
        # print("dist",dist)
        # Labels of the K datapoints from above
        labels = y_train[dist]  # here i get the class label of each train sample in k datapoints
        # print("labels",labels)

        # Majority voting
        lab = mode(labels)
        # print("most frequent label and its count",lab)
        lab = lab.mode[0]
        # print("final label",lab)
        op_labels.append(lab)
        # print("op_labels",op_labels)

    return op_labels

y_pred = predict(X_train,y_train,X_test , 4)

def accuracy_score(y_test, y_pred):
	return round(float(sum(y_pred == y_test))/float(len(y_test)) * 100 ,2)

accuracy = accuracy_score(y_test, y_pred)
print("KNN Accuracy = ",accuracy)

KNN Accuracy =  92.82
