In [1]:
import pandas as pd
import numpy as np
import random

In [155]:
class MyKNNClf:
    """k-nearest-neighbours classifier for binary targets encoded as 0 / 1.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples consulted for every query row.
    metric : {"euclidean", "manhattan", "chebyshev", "cosine"}
        Distance used to rank the training samples.
    weight : {"uniform", "rank", "distance"}
        Voting scheme: plain majority, ``1 / rank`` of the neighbour, or
        ``1 / distance`` to the neighbour.

    Notes
    -----
    ``metric`` and ``weight`` are validated lazily, i.e. an unknown value
    raises ``ValueError`` on the first prediction, not in the constructor.
    The "rank" and "distance" schemes treat every label different from 0 as
    class 1.
    """

    def __init__(self, k = 3, metric = "euclidean", weight = "uniform"):
        self.k = k
        self.train_size = None   # shape of the training matrix, set by fit()
        self.X_train = None
        self.y_train = None
        self.metric = metric
        self.weight = weight

    def __repr__(self):
        return f"MyKNNClf class: k={self.k}"

    def _calculate_distance(self, X_train, x_test):
        """Return the distance from ``x_test`` to every row of ``X_train``.

        Raises
        ------
        ValueError
            If ``self.metric`` is not one of the supported names.
        """
        if self.metric == "euclidean":
            return np.sqrt(np.sum((X_train - x_test) ** 2, axis = 1))
        elif self.metric == "manhattan":
            return np.sum(np.abs(X_train - x_test), axis = 1)
        elif self.metric == "chebyshev":
            return np.max(np.abs(X_train - x_test), axis = 1)
        elif self.metric == "cosine":
            dot = X_train @ x_test
            norm_X_train = np.linalg.norm(X_train, axis = 1)
            norm_x_test = np.linalg.norm(x_test)
            # NOTE(review): an all-zero vector on either side makes the
            # denominator 0 and the distance nan; not handled here.
            return 1 - dot / (norm_X_train * norm_x_test)
        else:
            raise ValueError("Неправильное название метрики!")

    @staticmethod
    def _inverse_distance_votes(distances):
        """Turn neighbour distances into ``1 / distance`` vote weights.

        If one or more neighbours coincide with the query (distance 0) they
        take the whole vote: dividing by zero would otherwise produce
        ``inf`` weights and ``nan`` class shares.
        """
        distances = np.asarray(distances, dtype = float)
        exact_match = distances == 0
        if exact_match.any():
            return exact_match.astype(float)
        return 1.0 / distances

    @staticmethod
    def _weighted_vote(votes, classes, pred_type):
        """Combine per-neighbour vote weights into a label or a probability.

        Returns the normalised share of class 1 for ``pred_type == "prob"``,
        otherwise 0 when class 0 holds the strictly larger share and 1 in
        every other case (ties go to class 1).
        """
        is_zero = np.asarray(classes).ravel() == 0
        # Plain Python floats keep the "no neighbours" case a loud
        # ZeroDivisionError instead of a silent nan.
        sum_zeros = float(np.sum(votes[is_zero]))
        sum_ones = float(np.sum(votes[~is_zero]))
        total = sum_zeros + sum_ones
        q0 = sum_zeros / total
        q1 = sum_ones / total
        if pred_type == "prob":
            return q1
        return 0 if q0 > q1 else 1

    def _calculate_weights(self, distances, classes, pred_type = "value"):
        """Aggregate the neighbours of one query into a prediction.

        Parameters
        ----------
        distances : array-like
            Distances to the selected neighbours, nearest first.
        classes : array-like
            Labels of the same neighbours, in the same order.
        pred_type : {"value", "prob"}
            "value" returns a class label, "prob" the share of class 1.

        Raises
        ------
        ValueError
            For an unknown ``pred_type`` or an unknown ``self.weight``.
        """
        if pred_type not in ("value", "prob"):
            raise ValueError("Неправильный тип предсказания!")

        if self.weight == "uniform":
            if pred_type == "prob":
                return np.mean(classes)
            unique, counts = np.unique(classes, return_counts = True)
            modes = unique[counts == np.max(counts)]
            # Several equally frequent labels: the tie is resolved as class 1.
            return 1 if len(modes) > 1 else modes[0]

        if self.weight == "rank":
            # The i-th nearest neighbour (1-based) votes with weight 1 / i.
            votes = 1.0 / np.arange(1, len(classes) + 1)
        elif self.weight == "distance":
            votes = self._inverse_distance_votes(distances)
        else:
            # The message names the weighting scheme; it previously reported
            # a wrong *metric* name for this case.
            raise ValueError("Неправильное название способа взвешивания!")
        return self._weighted_vote(votes, classes, pred_type)

    def fit(self, X, y):
        """Memorise the training data (k-NN has no training step).

        ``X`` and ``y`` may be pandas objects or anything ``np.asarray``
        accepts; a DataFrame ``y`` is flattened to one dimension.
        """
        if isinstance(X, pd.DataFrame):
            X_array = X.values
        else:
            X_array = np.asarray(X)
        if isinstance(y, pd.DataFrame):
            y_array = y.values.flatten()
        else:
            y_array = np.asarray(y)
        self.train_size = X_array.shape
        self.X_train = X_array
        self.y_train = y_array

    def _predict(self, X, pred_type):
        """Shared neighbour search behind predict() and predict_proba()."""
        if isinstance(X, pd.DataFrame):
            X_test = X.values
        else:
            X_test = np.asarray(X)
        assert X_test.shape[1] == self.train_size[1], \
            "X must have the same number of columns as the training data"
        predictions = []
        for x_test in X_test:
            distances = self._calculate_distance(self.X_train, x_test)
            nn_indexes = np.argsort(distances)[:self.k]
            predictions.append(
                self._calculate_weights(
                    distances[nn_indexes],
                    self.y_train[nn_indexes],
                    pred_type = pred_type,
                )
            )
        return np.array(predictions)

    def predict(self, X):
        """Return the predicted class label for every row of ``X``."""
        return self._predict(X, "value")

    def predict_proba(self, X):
        """Return the estimated probability of class 1 for every row of ``X``."""
        return self._predict(X, "prob")
        

In [156]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, roc_auc_score
# Fix the global NumPy seed for reproducibility; make_classification below
# is not given a random_state of its own.
np.random.seed(42)

# Synthetic binary classification data
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=0,
    n_classes=2
)

# Train / test split (30% of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [159]:
# Classifier under test: 10 neighbours, cosine distance, unweighted voting.
model = MyKNNClf(k = 10, metric = "cosine", weight = "uniform")
model.fit(X_train, y_train)

In [160]:
# NOTE: despite its name, y_pred holds predict_proba output, not hard labels.
y_pred = model.predict_proba(X_test)

In [161]:
# Display the whole result array (a slice such as y_pred[:10] would be shorter).
y_pred

array([0. , 0. , 1. , 0. , 1. , 0. , 1. , 0.1, 0.6, 1. , 0. , 0. , 0.6,
       0.8, 1. , 0. , 0. , 0.2, 0. , 0.1, 0.5, 1. , 1. , 0.7, 0.1, 0.8,
       1. , 1. , 1. , 1. , 0.2, 1. , 0.1, 0. , 1. , 0.8, 0.3, 0.9, 0.3,
       0.8, 0. , 0. , 0.1, 0.7, 0. , 1. , 1. , 0.4, 1. , 0. , 0.4, 0. ,
       0.8, 0.4, 1. , 0. , 0. , 0.4, 0.2, 0.4, 1. , 0.9, 1. , 0.7, 0. ,
       0.7, 0.1, 0.4, 0.7, 0.2, 0.6, 0. , 0. , 1. , 0.2, 0. , 0. , 0. ,
       1. , 0.1, 0.1, 0.5, 0.9, 1. , 1. , 1. , 0.7, 0.1, 0.3, 1. , 0. ,
       0. , 0. , 0.1, 0. , 1. , 0.9, 0.1, 0.4, 0.1, 1. , 0. , 0.9, 0.9,
       0.1, 0.9, 0. , 0.6, 0.6, 1. , 0.3, 0.3, 0.1, 0.1, 0.4, 0.3, 0.2,
       0.7, 0.3, 1. , 1. , 0.9, 0.1, 0.8, 0.6, 0.9, 0.2, 0. , 0.5, 1. ,
       0.1, 1. , 1. , 0.1, 1. , 1. , 0.1, 1. , 0.7, 0.1, 1. , 0.1, 0.6,
       0. , 0.6, 0.9, 0.9, 0.9, 1. , 1. , 0. , 1. , 0.4, 0. , 0.6, 1. ,
       1. , 0.3, 0.8, 0.1, 0.4, 0.2, 1. , 0.2, 0.1, 0. , 0.9, 0.3, 0.1,
       0.9, 1. , 0.6, 1. , 0.6, 0.4, 0.2, 0.8, 0.1, 0. , 0.9, 0.

In [235]:
class MyKNNReg:
    """k-nearest-neighbours regressor.

    Parameters
    ----------
    k : int, default 3
        Number of nearest training samples averaged for every query row.
    metric : {"euclidean", "manhattan", "chebyshev", "cosine"}
        Distance used to rank the training samples.
    weight : {"uniform", "rank", "distance"}
        Averaging scheme: plain mean, weights proportional to ``1 / rank``
        of the neighbour, or proportional to ``1 / distance``.

    Notes
    -----
    ``metric`` and ``weight`` are validated lazily: an unknown value raises
    ``ValueError`` on the first prediction, not in the constructor.
    """

    def __init__(self, k = 3, metric = "euclidean", weight = "uniform"):
        self.k = k
        self.train_size = None   # shape of the training matrix, set by fit()
        self.X_train = None
        self.y_train = None
        self.metric = metric
        self.weight = weight

    def __repr__(self):
        return f"MyKNNReg class: k={self.k}"

    def _calculate_weights(self, distances):
        """Return normalised neighbour weights (they sum to 1).

        Parameters
        ----------
        distances : array-like
            Distances to the selected neighbours, nearest first.

        Raises
        ------
        ValueError
            If ``self.weight`` is neither "rank" nor "distance" ("uniform"
            is handled directly in predict()).
        """
        distances = np.asarray(distances, dtype = float)
        if self.weight == "rank":
            # Sized by the neighbours actually found, not by self.k: when k
            # exceeds the training-set size fewer than k neighbours exist and
            # the weight vector must still match the target vector.
            raw = 1.0 / np.arange(1, len(distances) + 1)
        elif self.weight == "distance":
            exact_match = distances == 0
            if exact_match.any():
                # 1 / 0 would give inf weights and a nan prediction; let the
                # coinciding training samples decide the value on their own.
                raw = exact_match.astype(float)
            else:
                raw = 1.0 / distances
        else:
            raise ValueError("Неправильное назваени метода")
        return raw / raw.sum()

    def _calculate_distance(self, X_train, x_test):
        """Return the distance from ``x_test`` to every row of ``X_train``.

        Raises
        ------
        ValueError
            If ``self.metric`` is not one of the supported names.
        """
        if self.metric == "euclidean":
            return np.sqrt(np.sum((X_train - x_test) ** 2, axis = 1))
        elif self.metric == "manhattan":
            return np.sum(np.abs(X_train - x_test), axis = 1)
        elif self.metric == "chebyshev":
            return np.max(np.abs(X_train - x_test), axis = 1)
        elif self.metric == "cosine":
            dot = X_train @ x_test
            norm_X_train = np.linalg.norm(X_train, axis = 1)
            norm_x_test = np.linalg.norm(x_test)
            # NOTE(review): an all-zero vector on either side makes the
            # denominator 0 and the distance nan; not handled here.
            return 1 - dot / (norm_X_train * norm_x_test)
        else:
            raise ValueError("Неправильное название метрики!")

    def fit(self, X, y):
        """Memorise the training data (k-NN has no training step).

        ``X`` and ``y`` may be pandas objects or anything ``np.asarray``
        accepts; a DataFrame ``y`` is flattened to one dimension.
        """
        if isinstance(X, pd.DataFrame):
            X_array = X.values
        else:
            X_array = np.asarray(X)
        if isinstance(y, pd.DataFrame):
            y_array = y.values.flatten()
        else:
            y_array = np.asarray(y)
        self.train_size = X_array.shape
        self.X_train = X_array
        self.y_train = y_array

    def predict(self, X):
        """Return the predicted target for every row of ``X``."""
        if isinstance(X, pd.DataFrame):
            X_test = X.values
        else:
            X_test = np.asarray(X)
        assert X_test.shape[1] == self.train_size[1], \
            "X must have the same number of columns as the training data"
        predictions = []
        for x_test in X_test:
            distances = self._calculate_distance(self.X_train, x_test)
            nn_indexes = np.argsort(distances)[:self.k]
            nn_targets = self.y_train[nn_indexes]
            if self.weight == "uniform":
                predictions.append(np.mean(nn_targets))
            else:
                weights = self._calculate_weights(distances[nn_indexes])
                predictions.append(nn_targets @ weights)
        return np.array(predictions)
    

In [228]:
from sklearn.datasets import make_regression
import pandas as pd
import numpy as np

# Training set: 5000 synthetic samples, 5 features (all informative), noise 15.
x_train, y_train = make_regression(n_samples=5000,
                                   n_features=5,
                                   n_informative=5,
                                   noise=15,
                                   random_state=42)
x_train = pd.DataFrame(x_train)
y_train = pd.Series(y_train)
# Rename the integer column labels to col_0, col_1, ...
x_train.columns = [f'col_{col}' for col in x_train.columns]


# NOTE(review): the test set comes from a separate make_regression call with
# n_informative=2 (the training call uses 5), so its targets presumably do not
# follow the same relationship as the training data — confirm this is intended.
x_test, y_test = make_regression(n_samples=100,
                                   n_features=5,
                                   n_informative=2,
                                   noise=15,
                                   random_state=42)
x_test = pd.DataFrame(x_test)
y_test = pd.Series(y_test)
x_test.columns = [f'col_{col}' for col in x_test.columns]


# Regressor under test: 5 neighbours, Chebyshev distance, plain mean.
model = MyKNNReg(k = 5, metric = "chebyshev", weight = "uniform")
model.fit(x_train, y_train)

In [229]:
# Display the predictions for every test row (the full array is shown).
model.predict(x_test)

array([-287.10525735, -102.26095598, -102.86897594, -233.84082178,
       -217.1423544 ,  319.80383035,  -58.12403593,  -10.72873339,
         12.15204544,  -40.23297037,  172.04889358, -356.05582189,
        -18.0156314 , -172.56354289,  -47.33181865,  202.85977745,
         39.31412888,   92.84817889,  -26.44522324,  139.99581962,
         85.03237792,   32.7731269 ,  -47.61352412,  152.03004761,
         48.57819133,   52.55156814, -128.221536  ,  -61.42842838,
         46.21397302,   24.62029367,    1.24706839,   31.3574376 ,
        227.40889772,  139.06298996,   50.33366563,    1.07661103,
        -21.85635921, -161.25474104, -218.32318417,  108.60053884,
         10.10568976,   60.35478779,  -14.95771223,   -7.83642035,
         51.39329609,  -34.27227613,   14.21453793,  102.1338094 ,
         97.00623699,   71.90673976,   42.63326787, -306.08991243,
       -213.10625002,  152.60527496,   29.69175095,   20.29217571,
       -216.89015309,  275.35381525,  248.93240559,   26.83968