# Phân loại với K-Nearest Neighbors
#### <span style="color: gold;"> Thầy/cô chỉ cần chạy cell này</span>

In [19]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix
)
import seaborn as sns
import numpy as np

# (train fraction, test fraction) pairs evaluated by knn(): the second element
# is passed to train_test_split as test_size, the first is only shown in the
# printed report.
SPLIT_RATIOS = [(0.8, 0.2), (0.7, 0.3), (0.6, 0.4)]

import os
# os.path.abspath() resolves the relative path against the current working
# directory without checking that the file exists. Stripping three path
# components ('preprocessing.ipynb', 'notebook', and the last component of the
# working directory) leaves the parent of the working directory, which serves
# as the base for the data paths below.
BASE_DIR = os.path.dirname(
    os.path.dirname(
        os.path.dirname(
            os.path.abspath('notebook/preprocessing.ipynb'))))

# Load the two pre-processed datasets (4-class and 9-class label variants,
# going by the file names).
df_4 = pd.read_csv(os.path.join(BASE_DIR, 'data/processed_data_4_classes.csv'))
df_9 = pd.read_csv(os.path.join(BASE_DIR, 'data/processed_data_9_classes.csv'))

# Features are every column except 'Classification'; that column is the target.
X_4 = df_4.drop(columns=['Classification'])
y_4 = df_4['Classification']
X_9 = df_9.drop(columns=['Classification'])
y_9 = df_9['Classification']

def distance(array, value):
    """Return the Manhattan (L1) distance from ``value`` to every row of ``array``.

    Both arguments are copied into NumPy arrays first, so lists, DataFrames and
    Series are accepted. ``array`` must be 2-D (one sample per row) because the
    norm is reduced along axis 1; ``value`` must broadcast against those rows.

    Returns:
        1-D array with one distance per row of ``array``.
    """
    points = np.array(array)
    query = np.array(value)
    offsets = points - query
    return np.linalg.norm(offsets, ord=1, axis=1)

def find_k_nearest_index(array, value, k):
    """Return the positions of the ``k`` rows of ``array`` closest to ``value``.

    Closeness is measured by ``distance``. Positions are ordered from nearest
    to farthest; if ``k`` exceeds the number of rows, every position is
    returned.
    """
    ranking = np.argsort(distance(array, value))
    return ranking[:k]

def knn(X_features, y_class, split_ratios, k_values=(1, 3, 5, 7, 9, 11, 13, 15)):
    """Evaluate a hand-written k-NN classifier over several k values and splits.

    For every ``k`` in ``k_values`` and every ``(train, test)`` pair in
    ``split_ratios`` the data is split (stratified, ``random_state=42``), each
    test row is labelled by majority vote among its ``k`` nearest training rows
    (as ranked by ``find_k_nearest_index``), and the accuracy is printed.

    Args:
        X_features: 2-D feature table (DataFrame or array-like), one row per
            sample.
        y_class: 1-D labels aligned with ``X_features``; integer codes or any
            other sortable label type (Series or array-like).
        split_ratios: non-empty sequence of ``(train_fraction, test_fraction)``
            pairs. Only the test fraction is passed to ``train_test_split``;
            the train fraction is used in the printed message.
        k_values: neighbour counts to evaluate.

    Returns:
        list of float: mean accuracy across ``split_ratios`` for each ``k``,
        in the order of ``k_values``.

    Raises:
        ValueError: if ``split_ratios`` is empty (the mean would be undefined).
    """
    split_ratios = list(split_ratios)
    if not split_ratios:
        raise ValueError("split_ratios must contain at least one (train, test) pair")

    # Work on plain arrays so positional indexing is valid for any input type.
    X_all = np.asarray(X_features)
    y_all = np.asarray(y_class)
    # Size the vote counts from the labels actually present, not a fixed 12.
    classes = np.unique(y_all)

    # A split depends only on the ratio, not on k, so compute each one once.
    splits = [
        train_test_split(
            X_all, y_all, test_size=ratio[1], random_state=42, stratify=y_all
        )
        for ratio in split_ratios
    ]

    acc_list = []
    for k in k_values:
        total_acc = 0
        for ratio, (X_train, X_test, y_train, y_test) in zip(split_ratios, splits):
            # Position of every training label inside the sorted ``classes``.
            train_codes = np.searchsorted(classes, y_train)

            y_pred = np.empty(len(y_test), dtype=classes.dtype)
            for i, row in enumerate(X_test):
                neighbours = find_k_nearest_index(X_train, row, k=k)
                votes = np.bincount(train_codes[neighbours], minlength=len(classes))
                # argmax returns the first maximum, so ties go to the smallest label.
                y_pred[i] = classes[np.argmax(votes)]

            knn_raw_acc = accuracy_score(y_test, y_pred)
            total_acc += knn_raw_acc
            print(f'Accuracy \033[31m{k}-NN\033[0m bằng dữ liệu gốc với train/test = \033[31m{ratio[0]}/{ratio[1]}\033[0m: \033[31m{knn_raw_acc}\033[0m')

        # Average over the number of splits actually evaluated, not a literal 3.
        avg_acc = total_acc / len(split_ratios)
        print(f'Avg accuracy với k = \033[31m{k}\033[0m: \033[31m{avg_acc}\033[0m', end='\n')
        acc_list.append(avg_acc)

    return acc_list

# pd.factorize returns (codes, uniques); [0] keeps the integer codes, so each
# distinct class label becomes an integer id held in a NumPy array.
y_4_encoded = pd.factorize(y_4)[0]
y_9_encoded = pd.factorize(y_9)[0]
# Run the k-NN evaluation on both datasets. Each call prints per-split and
# averaged accuracies and returns the list of averages, one entry per k.
knn_acc_4 = knn(X_4, y_4_encoded, SPLIT_RATIOS)
knn_acc_9 = knn(X_9, y_9_encoded, SPLIT_RATIOS)

Accuracy [31m1-NN[0m bằng dữ liệu gốc với train/test = [31m0.8/0.2[0m: [31m0.9439252336448598[0m
Accuracy [31m1-NN[0m bằng dữ liệu gốc với train/test = [31m0.7/0.3[0m: [31m0.9439252336448598[0m
Accuracy [31m1-NN[0m bằng dữ liệu gốc với train/test = [31m0.6/0.4[0m: [31m0.955503512880562[0m
Avg accuracy với k = [31m1[0m: [31m0.9477846600567604[0m
Accuracy [31m3-NN[0m bằng dữ liệu gốc với train/test = [31m0.8/0.2[0m: [31m0.9766355140186916[0m
Accuracy [31m3-NN[0m bằng dữ liệu gốc với train/test = [31m0.7/0.3[0m: [31m0.956386292834891[0m
Accuracy [31m3-NN[0m bằng dữ liệu gốc với train/test = [31m0.6/0.4[0m: [31m0.9531615925058547[0m
Avg accuracy với k = [31m3[0m: [31m0.9620611331198125[0m
Accuracy [31m5-NN[0m bằng dữ liệu gốc với train/test = [31m0.8/0.2[0m: [31m0.9766355140186916[0m
Accuracy [31m5-NN[0m bằng dữ liệu gốc với train/test = [31m0.7/0.3[0m: [31m0.9532710280373832[0m
Accuracy [31m5-NN[0m bằng dữ liệu gốc với train/test =

# Các thử nghiệm

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import os
# os.path.abspath() resolves the relative path against the current working
# directory without checking that the file exists; stripping three path
# components leaves the parent of the working directory, which serves as the
# base for the data paths below.
BASE_DIR = os.path.dirname(
    os.path.dirname(
        os.path.dirname(
            os.path.abspath('notebook/preprocessing.ipynb'))))

# Load the two pre-processed datasets (4-class and 9-class label variants,
# going by the file names).
df_4 = pd.read_csv(os.path.join(BASE_DIR, 'data/processed_data_4_classes.csv'))
df_9 = pd.read_csv(os.path.join(BASE_DIR, 'data/processed_data_9_classes.csv'))

Unnamed: 0,gwl,pH,TDS,HCO3,Cl,F,NO3,SO4,Na,K,Ca,Mg,T.H,SAR,Classification
0,5.09,8.28,476.80,220.0,60,0.44,42.276818,46.0,49.00,4.00,48.0,38.896,279.934211,1.273328,C2S1
1,5.10,8.29,589.44,230.0,80,0.56,100.659091,68.0,42.00,5.00,56.0,63.206,399.893092,0.913166,C3S1
2,4.98,7.69,326.40,200.0,30,0.66,41.471545,44.0,45.00,2.00,24.0,38.896,219.934211,1.319284,C2S1
3,5.75,8.09,270.08,160.0,10,0.58,10.669864,35.0,27.00,1.00,32.0,19.448,159.967105,0.928155,C2S1
4,2.15,8.21,1485.44,300.0,340,2.56,128.843636,280.0,298.00,5.00,56.0,92.378,519.843750,5.682664,C4S2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1062,9.90,7.80,1487.36,370.0,370,0.58,336.161100,33.0,169.30,2.60,160.0,97.240,799.835526,2.602728,C4S1
1063,5.74,8.26,1349.76,430.0,260,1.08,332.175000,33.0,211.30,43.30,48.0,116.688,599.802632,3.751176,C3S1
1064,1.72,8.77,713.60,180.0,220,0.34,44.201420,15.0,60.44,3.04,80.0,53.482,419.909539,1.282386,C3S1
1065,1.65,7.76,3233.92,280.0,1360,0.44,76.355960,109.0,465.20,3.30,400.0,92.378,1379.843750,5.444988,C4S1


In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except 'Classification'; that column is the target.
X_4 = df_4.drop(columns=['Classification'])
y_4 = df_4['Classification']
X_9 = df_9.drop(columns=['Classification'])
y_9 = df_9['Classification']

# Hold out 20% of each dataset for testing. stratify keeps the class
# proportions of y in both parts, and random_state=42 makes the split
# repeatable across runs.
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(
    X_4, y_4, test_size=0.2, random_state=42, stratify=y_4)
X_train_9, X_test_9, y_train_9, y_test_9 = train_test_split(
    X_9, y_9, test_size=0.2, random_state=42, stratify=y_9)

