In [10]:
import pandas as pd
import numpy as np

from scipy.stats import mode
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
import seaborn as sns

# Загрузка и обработка данных

In [2]:
!wget https://archive.ics.uci.edu/static/public/52/ionosphere.zip
!mkdir ion_data
!unzip ionosphere.zip -d ion_data
!rm ionosphere.zip

--2023-10-06 17:51:37--  https://archive.ics.uci.edu/static/public/52/ionosphere.zip
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘ionosphere.zip’

ionosphere.zip          [  <=>               ]  29.28K   142KB/s    in 0.2s    

2023-10-06 17:51:39 (142 KB/s) - ‘ionosphere.zip’ saved [29983]

Archive:  ionosphere.zip
  inflating: ion_data/Index          
  inflating: ion_data/ionosphere.data  
  inflating: ion_data/ionosphere.names  


In [102]:
header = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df = pd.read_csv('data/cars/car.data' ,names=header)

Проверка типов данных признаков

In [9]:
pd.Series([str(df[name].dtype) for name in df.columns]).value_counts()

object    7
Name: count, dtype: int64

Проверка выборки на наличие нулей

In [6]:
percent_missing = df.isnull().sum() * 100 / len(df)
pd.DataFrame({'percent_missing': percent_missing})

Unnamed: 0,percent_missing
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0


Подсчёт примеров каждого класса

In [104]:
# подсчёт сэмплов каждого класса
def calc_classes_exmpls(dataframe):
    print('Разброс по классам:')
    for cls in dataframe['class'].unique():
        print(f"{cls}: {len(dataframe[dataframe['class'] == cls])}")
calc_classes_exmpls(df)

Разброс по классам:
unacc: 1210
acc: 384
vgood: 65
good: 69


Кодирование категориальных признаков

In [105]:
le = preprocessing.LabelEncoder()
original_classes = np.unique(df['class'].values)
for column_name in df.columns:
    df[column_name] = le.fit_transform(df[column_name])
y = df['class']
X = df.loc[:, df.columns != 'class']

In [106]:
X = X.values
Y = y.values.astype(int)
feature_num = X.shape[1]
classes_num = len(pd.unique(y))
print('Numb of features: ', feature_num)
print('Numb of classes: ', classes_num)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print(x_train.shape)
print(y_test.shape)

Numb of features:  6
Numb of classes:  4
(1382, 6)
(346,)


# Реализация алгоритма и построение модели

*Метрики*

In [107]:
def minkowski_distance(x1, x2, p=3):
    minkowski = np.sum(np.abs(x1-x2)**p)**(1/p)
    return minkowski

def euclidean_distance(x1, x2):
    euclidean = np.sqrt(np.sum((x1-x2)**2))
    return euclidean

def manhattan_distance(x1, x2):
    manhattan = np.sum(np.abs(x1-x2))
    return manhattan

*Реализация класса k-Nearest Neighbours*

In [108]:
class KNN_classifier():
    def __init__(self, k, metric=minkowski_distance, p=3) -> None:
        self.k = k
        self.metric = metric
        self.p = p

    def fit(self, X, y):
        self.X = X
        self.y = y
        return self
    
    def predict_one(self, x):
        neighbours = self._get_neighbours(x)
        predicted_class = max(set(neighbours), key=neighbours.count)
        return predicted_class

    def predict(self, x):
        return [self.predict_one(_) for _ in x]

    def score(self, x_test, y_test):
        predictions = []
        for x, y in zip(x_test, y_test):
            y_pred = self.predict_one(x)
            predictions.append(y_pred)
        
        accuracy = np.mean(y_test == predictions)
        return accuracy

    def _get_neighbours(self, x):
        distances = []
        # считаем дистанцию на всей тренировочной выборке
        for x_train, y_train in zip(self.X, self.y):
            distance = self.metric(x_train, x)
            distances.append((distance, y_train))

        # сортируем по возрастанию дистанции
        distances.sort(key=lambda d: d[0])

        # ищем соседей
        neighbours = [distances[i][1] for i in range(self.k)]
        return neighbours


Сравнение точности алгоритмов

In [109]:
class Scorer (object):
  def __init__(self, y_true, y_pred):
    tp, fp, _, fn = self._perf_measure(y_true, y_pred)
    self.tp = tp
    self.fp = fp
    self.fn = fn

  @staticmethod
  def _perf_measure(y_true, y_pred):
      TP = 0
      FP = 0
      TN = 0
      FN = 0

      for i in range(len(y_pred)): 
          if y_true[i] == y_pred[i] == 1:
            TP += 1
          if y_pred[i] == 1 and y_true[i] != y_pred[i]:
            FP += 1
          if y_true[i] == y_pred[i] == 0:
            TN += 1
          if y_pred[i] == 0 and y_true[i] != y_pred[i]:
            FN += 1

      return (TP, FP, TN, FN)

  def get_recall_score(self) -> float:
    return self.tp / (self.tp + self.fn)

  def get_precision_score(self) -> float:
    return self.tp / (self.tp + self.fp)

  def get_f1_score(self) -> float:
    return 2 * self.tp / (2 * self.tp + self.fp + self.fn)


In [110]:
metrics = {
  'Minkowski': minkowski_distance, 
  'Euclidean': euclidean_distance, 
  'Manhattan': manhattan_distance
}
neighbours_count = [3, 5, 8, 10]

In [111]:
def get_scores_df(model: KNN_classifier, neighbours: list, metrics: dict) -> pd.DataFrame:
  scores = pd.DataFrame(columns=metrics.keys(), index=neighbours)

  for k in neighbours:
    row = []
    for metric in metrics.values():
      KNN = model(k, metric=metric).fit(x_train, y_train)
      y_pred = KNN.predict(x_test)
      scorer = Scorer(y_test, y_pred)
      row.append(scorer.get_f1_score())
    scores.loc[k] = row

  return scores

## KNN + Оконный метод

In [112]:
def gaussian_kernel(u):
    return np.exp(-0.5 * u**2) / ((2 * np.pi))

def epanechnikov_kernel(u):
    return 3/4 * (1 - u**2)

In [114]:
class KNN_classifier_Parzen():
    def __init__(self, k, metric=minkowski_distance, p=3, kernel=epanechnikov_kernel) -> None:
        self.k = k
        self.metric = metric
        self.p = p
        self.kernel = kernel

    def fit(self, X, y):
        self.X = X
        self.y = y
        return self
    
    def set_k(self, k):
        self.k = k
    
    def predict_one(self, x):
        neighbours = self._get_neighbours(x)
        predicted_class = max(set(neighbours), key=neighbours.count)
        return predicted_class

    def predict(self, x):
        return [self.predict_one(_) for _ in x]

    def score(self, x_test, y_test):
        predictions = []
        for x, y in zip(x_test, y_test):
            y_pred = self.predict_one(x)
            predictions.append(y_pred)
        
        accuracy = np.mean(y_test == predictions)
        return accuracy

    def _get_neighbours(self, x):
        distances = []
        # считаем дистанцию на всей тренировочной выборке
        for x_train, y_train in zip(self.X, self.y):
            distance = self.metric(x_train, x)
            
            #weight = self.kernel(distance / self.h)
            distances.append((distance, y_train))
            #print(weight)

        # сортируем по возрастанию дистанции
        distances.sort(key=lambda d: d[0])

        # Применяем окно парзена
        max_dist = distances[self.k+1][0]
        weights = []
        for i, (distance, label) in enumerate(distances):
            if i == self.k:
                break
            weights.append((self.kernel(distance / max_dist), label))
        
        # Сортируем и ищем соседей
        weights.sort(key=lambda d: d[0])
        neighbours = [weights[i][1] for i in range(self.k)]
        return neighbours


In [97]:
neighbours_count = [2, 5, 8, 10]
get_scores_df(KNN_classifier, neighbours_count, metrics)


Unnamed: 0,Minkowski,Euclidean,Manhattan
2,0.128205,0.128205,0.128205
5,0.461538,0.461538,0.461538
8,0.545455,0.545455,0.5625
10,0.4375,0.4375,0.451613


In [98]:
get_scores_df(KNN_classifier_Parzen, neighbours_count, metrics)

Unnamed: 0,Minkowski,Euclidean,Manhattan
2,0.128205,0.128205,0.128205
5,0.461538,0.461538,0.461538
8,0.545455,0.545455,0.5625
10,0.4375,0.4375,0.451613


In [99]:
get_scores_df(KNeighborsClassifier, neighbours_count, metrics)

Unnamed: 0,Minkowski,Euclidean,Manhattan
2,0.029851,0.029851,0.0625
5,0.272727,0.272727,0.272727
8,0.592593,0.571429,0.62069
10,0.411765,0.375,0.466667


## LOO

In [87]:
def find_best_k(classifier, k_list):
    results = []
    for k in k_list:
        classifier.set_k(k=k)
        correct_sum = leave_one_out(classifier, x_train, y_train)
        results.append(correct_sum)

    best_k = k_list[(max(enumerate(results), key=(lambda x: x[1])))[0]]
    return best_k

def leave_one_out(model, X : np.array, y : np.array) -> int:
    correct_sum = 0

    for i in range(len(X)):
        x_test_ = X[i]
        y_test_ = y[i]
        X_ = np.delete(X, i, 0)
        y_ = np.delete(y, i, 0)
        model.fit(X_, y_)
        y_pred = model.predict_one(x_test_)
        if y_test_ == y_pred:
            correct_sum += 1

    return correct_sum

In [100]:
k_list = [i for i in range(3, 10, 1)]
classifier = KNN_classifier_Parzen(None, euclidean_distance, kernel=gaussian_kernel)
best_k = find_best_k(classifier, k_list)

get_scores_df(KNN_classifier_Parzen, [best_k], metrics)



Unnamed: 0,Minkowski,Euclidean,Manhattan
7,0.518519,0.518519,0.5
