In [1]:
class KNN():
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    For every query sample the distance to each stored training sample is
    computed, the k closest training samples are selected, and the label
    that wins the vote among them is returned.
    """

    def __init__(self, k):
        """
        Hyperparameter k: int, the number of nearest training samples that
        take part in the vote.

        Raises:
            TypeError: if k is not an integer.
            ValueError: if k is smaller than 1.
        """
        # bool is a subclass of int; reject it explicitly. __index__ accepts
        # plain ints as well as integer-like objects (e.g. numpy integers).
        if isinstance(k, bool) or not hasattr(k, "__index__"):
            raise TypeError("k must be an integer, got %r" % (k,))
        if k < 1:
            # With k == 0 no neighbour would vote and a default label would be
            # returned silently, so fail loudly instead.
            raise ValueError("k must be at least 1, got %r" % (k,))
        self.k = k

    def fit(self, x_train, y_train):
        """
        Store the training data; all computation is deferred to predict().

        x_train: sequence of samples (list of lists or 2-D ndarray), where each
                 sample is a sequence of numeric features.
        y_train: sequence of target labels, one per sample in x_train.

        Raises:
            ValueError: if x_train and y_train have different lengths.
        """
        if len(x_train) != len(y_train):
            raise ValueError(
                "x_train and y_train must have the same length, got %d and %d"
                % (len(x_train), len(y_train))
            )
        self.x_train = x_train
        self.y_train = y_train

    def predict(self, x_test):
        """
        Predict a label for every sample in x_test.

        1. Compute the distance between the query sample and every training sample.
        2. Sort the training samples by that distance.
        3. Take the k nearest samples and count how often each label occurs.
        4. Return the label with the highest count.

        x_test: iterable of samples with the same number of features as the
                training samples.

        Returns:
            list with one predicted label per sample in x_test.

        Raises:
            AttributeError: if fit() has not been called yet.
            ValueError: if k is not in [1, number of training samples], or a
                sample's feature count differs from a training sample's.
        """
        if not hasattr(self, "x_train") or not hasattr(self, "y_train"):
            raise AttributeError(
                "This KNN instance is not fitted yet; call fit(x_train, y_train) first."
            )
        n_train = len(self.x_train)
        # Re-checked here because self.k may have been reassigned after __init__.
        if not 1 <= self.k <= n_train:
            raise ValueError(
                "k must be between 1 and the number of training samples (%d), got %r"
                % (n_train, self.k)
            )
        return [self._predict_one(sample) for sample in x_test]

    def _calcualte_dist(self, x_test):
        """
        Backward-compatible alias kept under its original (misspelled) name;
        it returns the same list of predictions as predict().
        """
        return self.predict(x_test)

    def _predict_one(self, sample):
        """Return the voted label of the k training samples nearest to `sample`."""
        distances = []    # (training index, Euclidean distance to `sample`)
        for idx, row in enumerate(self.x_train):
            if len(row) != len(sample):
                # zip() would silently truncate and yield a wrong distance.
                raise ValueError(
                    "feature count mismatch: training sample %d has %d features, "
                    "query sample has %d" % (idx, len(row), len(sample))
                )
            squared_sum = sum((a - b) ** 2 for a, b in zip(row, sample))
            distances.append((idx, squared_sum ** 0.5))

        # Stable sort: training samples at equal distance keep their original order.
        distances.sort(key=lambda pair: pair[1])

        # Count labels among the k nearest neighbours, walking from nearest to
        # farthest. A label only takes the lead when its count becomes strictly
        # greater than the current best, so on a tie the label that reached the
        # winning count first is returned.
        votes = {}    # label -> number of votes
        best_count, best_label = 0, None
        for idx, _ in distances[:self.k]:
            label = self.y_train[idx]
            votes[label] = votes.get(label, 0) + 1
            if votes[label] > best_count:
                best_count, best_label = votes[label], label

        return best_label

In [2]:
from sklearn.datasets import load_breast_cancer

# Load scikit-learn's built-in breast cancer dataset.
data = load_breast_cancer()

In [3]:
# Split the dataset into the feature matrix (x) and the target labels (y).
x = data['data']
y = data['target']

In [4]:
# Dataset dimensions as (n_samples, n_features).
x.shape

(569, 30)

In [5]:
from sklearn.model_selection import train_test_split

# Fix the seed so the split, and every accuracy computed from it, is the same
# on each Restart & Run All. Without random_state the data is shuffled
# differently on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

In [6]:
# Shape of the training split.
x_train.shape

(426, 30)

In [7]:
# Shape of the test split.
x_test.shape

(143, 30)

In [8]:
# Build the model: a hand-written KNN classifier that votes among the 5 nearest neighbours.

knn = KNN(5)

In [9]:
# Train the model: fit() stores the training data for use at prediction time.

knn.fit(x_train , y_train)

In [10]:
# Predict a label for every test sample; the result is a plain Python list.

y_pred = knn.predict(x_test)

In [11]:
# Ground-truth labels of the test split, shown for comparison with the predictions.
y_test

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1])

In [12]:
# Predicted labels for the test split, one entry per test sample.
y_pred

[1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1]

In [13]:
# Compute the accuracy of the hand-written KNN on the test split.

from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.9300699300699301

In [14]:
# Cross-check against scikit-learn's KNeighborsClassifier with the same k.
# In the recorded run the accuracy is identical to the hand-written KNN, which
# suggests the implementation is correct. Only the accuracy is compared here,
# not the individual predictions.

from sklearn.neighbors import KNeighborsClassifier

knn_sklearn = KNeighborsClassifier(5)
knn_sklearn.fit(x_train, y_train)
y_pred_sklearn = knn_sklearn.predict(x_test)

accuracy_score(y_test, y_pred_sklearn)

0.9300699300699301