In [2]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split


In [116]:
header = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df = pd.read_csv('data/cars/car.data' ,names=header)

df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [117]:
le = preprocessing.LabelEncoder()
original_classes = np.unique(df['class'].values)
for column_name in df.columns:
    df[column_name] = le.fit_transform(df[column_name])
y = df['class']
X = df.loc[:, df.columns != 'class']

In [118]:
X = X.values
Y = y.values.astype(int)
feature_num = X.shape[1]
classes_num = len(pd.unique(y))
print('Numb of features: ', feature_num)
print('Numb of classes: ', classes_num)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print(x_train.shape)
print(y_test.shape)

Numb of features:  6
Numb of classes:  4
(1382, 6)
(346,)


In [119]:

# Функции расстояния
def minkowski_distance(x1, x2, p=3):
    minkowski = np.sum(np.abs(x1-x2)**p, axis=1)**(1/p)
    return minkowski

def euclidean_distance(x1, x2):
    euclidean = np.sqrt(np.sum((x1-x2)**2, axis=1))
    return euclidean

def manhattan_distance(x1, x2):
    manhattan = np.sum(np.abs(x1-x2), axis=1)
    return manhattan

def def_dist(x_i: np.ndarray, X: np.ndarray) -> np.ndarray:
    distances = np.linalg.norm(X - x_i, axis=1)
    return distances

# Ядра
def gaussian_kernel(u):
    return np.exp(-0.5 * u**2) / ((2 * np.pi))

def epanechnikov_kernel(u):
    return 3/4 * (1 - u**2)

In [122]:
from multipledispatch import dispatch
from copy import copy

class ParzenRozenblatt:
    def __init__(self, kernel_func=gaussian_kernel, dist_func=euclidean_distance) -> None:
        self.kernel = kernel_func
        self.dist_func = dist_func

    @dispatch(np.ndarray, np.ndarray, list)
    def fit(self, x_train, y_train, h_candidates):
        self.x = x_train
        self.y = y_train
        self.h = self._find_best_h(h_candidates)
    
    @dispatch(np.ndarray, np.ndarray, float)
    def fit(self, x_train, y_train, h):
        self.x = x_train
        self.y = y_train
        self.h = h

    def _find_best_h(self, h_candidates):
        best_h = None
        best_acc = 0
        for h in h_candidates:
            acc = self.leave_one_out(copy(self.x), copy(self.y), h)
            if acc > best_acc:
                best_acc = acc
                best_h = h
        return best_h
    
    def leave_one_out(self, X, y, h:float) -> int:
        y_preds = np.array([])
        y_trues = np.array([])

        for i in range(len(self.x)):
            x_test_ = X[i]
            y_test_ = y[i]
            X_ = np.delete(X, i, 0)
            y_ = np.delete(y, i, 0)
            self.fit(X_, y_, h)
            y_pred = self.predict_single_row(x_test_, h)

            y_trues = np.append(y_trues, y_test_)
            y_preds = np.append(y_preds, y_pred)
        acc = accuracy_score(y_trues, y_preds)
        return acc

    def _get_V_h(self, distances):
        from scipy.integrate import quad as integrate
        a = np.max(distances)
        b = np.min(distances)
        V_h, _ = integrate(self.kernel, b, a)
        return V_h

    def predict_single_row(self, x_row, h=None):
        # Общая формула:
        # a = argmax[lam_y / V(h) * sum([if y_i == y] * self.kernel(dist(row, x_i)))]
        if h == None:
            h = self.h
        scores = np.zeros(np.unique(self.y).shape[0])
        distances_ = self.dist_func(x_row, self.x)

        # Нормировочный множитель
        V_h = self._get_V_h(distances_)

        for y_i in set(self.y):
            # p(x|y)
            indexes = np.where(self.y == y_i)[0]
            distances = distances_[indexes]
            weights = self.kernel(u=distances / h)
            scores[y_i] = sum(weights)
        
        prediction = np.argmax(scores/(len(self.x)*V_h), axis=0)
        return prediction
    
    def predict(self, x):
        predictions = np.array([])
        for row in x:
            prediction = self.predict_single_row(row)
            predictions = np.append(predictions, prediction)
        return predictions

    def get_h(self):
        return self.h
            

In [123]:
kernel, distance = gaussian_kernel, euclidean_distance
classifier = ParzenRozenblatt(kernel_func=kernel, dist_func=distance)
classifier.fit(x_train, y_train, [10.0, 1.0, 0.1])

preds = classifier.predict(x_test)
accuracy = accuracy_score(y_test, preds)
print(f'Accuracy: {accuracy}\nBest h: {classifier.get_h()}')

Accuracy: 0.9479768786127167
Best h: 0.1


## Использование различных функций ядра и дистанции

In [124]:
dist_funcs = {
    'euclidean_dist': euclidean_distance,
    'minkowski_dist': minkowski_distance,
    'manhattan_distance': manhattan_distance
}

kernel_funcs = {
    'epanechnikov_kernel': epanechnikov_kernel,
    'gaussian_kernel': gaussian_kernel
}

In [125]:
h_candidates = [10.0, 1.0, 0.1, 0.01]
results = []
for kernel_func in kernel_funcs.keys():
    for dist_func in dist_funcs.keys():

        dist = dist_funcs[dist_func]
        kernel = kernel_funcs[kernel_func]
        
        classifier = ParzenRozenblatt(kernel_func=kernel, dist_func=dist)
        classifier.fit(x_train, y_train, h_candidates)
        preds = classifier.predict(x_test)
        
        accuracy = accuracy_score(y_test, preds)
        results.append((kernel_func, dist_func, classifier.get_h(), accuracy))

df = pd.DataFrame(results, columns=['Distance function', 'Kernel function', 'Best h', 'Accuracy'])
df


Unnamed: 0,Distance function,Kernel function,Best h,Accuracy
0,epanechnikov_kernel,euclidean_dist,1.0,0.67052
1,epanechnikov_kernel,minkowski_dist,1.0,0.67052
2,epanechnikov_kernel,manhattan_distance,1.0,0.67052
3,gaussian_kernel,euclidean_dist,0.1,0.947977
4,gaussian_kernel,minkowski_dist,0.1,0.930636
5,gaussian_kernel,manhattan_distance,0.1,0.947977
