<a href="https://colab.research.google.com/github/FarisIftikharAlfarisi/knn-experiments/blob/main/KNNPengembangan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Menerapkan 4 variasi algoritma KNN

- Weight Adjusted K-Nearest Neighbour
- K Support Vector Nearest Neighbour
- Modified K Nearest Neighbour
- Fuzzy K Nearest Neighbour

### Data yang dipakai:
- data diabetes
- data bunga iris

WAKNN (Weight Adjusted K Nearest Neighbour)

collaborator : ridwan

In [None]:
import numpy as np

class WAKNearestNeighbour:
    """Weight Adjusted K-Nearest Neighbour (WAKNN) classifier.

    Every feature gets a weight derived from its information gain with
    respect to the class labels.  Neighbours are ranked by a weighted cosine
    similarity and the predicted class is the majority label among the ``k``
    most similar training samples.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote.  Must be at least 1.
    max_iter : int, default 10
        Stored on the instance; no method of this class reads it.
    factors : sequence of float, default (0.2,)
        Multipliers tried on each weight by ``optimize_weights``.  When
        ``None`` or empty, a single factor is drawn at random from
        ``[0.2, 0.8, 1.5, 2.0, 4.0]``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=3, max_iter=10, factors=(0.2,)):
        if k < 1:
            # A slice such as ``[-0:]`` would silently select *every* sample.
            raise ValueError("k must be at least 1")
        self.k = k
        self.max_iter = max_iter
        self.weights = None
        default_factors = np.array([0.2, 0.8, 1.5, 2.0, 4.0])
        # The default is an immutable tuple and is copied into a fresh list so
        # instances never share (and cannot corrupt) one default object.
        # ``len`` is used instead of truthiness because multi-element NumPy
        # arrays raise on ``bool(array)``.
        if factors is not None and len(factors) > 0:
            self.factors = list(factors)
        else:
            self.factors = np.random.choice(default_factors, size=1)

    @staticmethod
    def entropy(y):
        """Return the Shannon entropy (base 2) of the label array ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return -np.sum(probs * np.log2(probs))

    @staticmethod
    def _majority_vote(labels):
        """Return the most frequent label; ties go to the smallest label.

        ``np.unique`` is used instead of ``np.bincount`` so that labels do not
        have to be non-negative integers.
        """
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def information_gain(self, X, y):
        """Return the information gain of every column of ``X`` w.r.t. ``y``.

        Each distinct value of a column is treated as its own partition, so a
        column in which every value is unique yields pure partitions and a
        gain equal to the entropy of ``y``.

        Returns
        -------
        numpy.ndarray
            One gain per column of ``X``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        S_entropy = self.entropy(y)
        gains = np.zeros(X.shape[1])

        for i in range(X.shape[1]):
            column = X[:, i]
            values, counts = np.unique(column, return_counts=True)
            weighted_entropy = np.sum([
                (count / len(y)) * self.entropy(y[column == value])
                for value, count in zip(values, counts)
            ])
            gains[i] = S_entropy - weighted_entropy

        return gains

    @staticmethod
    def weighted_cosine_similarity_matrix(X1, X2, weights):
        """Return the pairwise weighted cosine similarity between rows.

        Both inputs are scaled column-wise by ``sqrt(weights)`` before the
        cosine is taken.  The result has shape ``(len(X1), len(X2))``.
        """
        weights_sqrt = np.sqrt(weights)
        X1_weighted = X1 * weights_sqrt
        X2_weighted = X2 * weights_sqrt

        dot_products = np.dot(X1_weighted, X2_weighted.T)
        norm_X1 = np.linalg.norm(X1_weighted, axis=1, keepdims=True)
        norm_X2 = np.linalg.norm(X2_weighted, axis=1, keepdims=True)
        # The small constant avoids a division by zero for all-zero rows.
        return dot_products / (norm_X1 @ norm_X2.T + 1e-10)

    def predict_single(self, X_train, y_train, x_test):
        """Predict the label of one sample ``x_test`` (a 1-D feature vector)."""
        batch = np.expand_dims(np.asarray(x_test), axis=0)
        return self.predict(X_train, y_train, batch)[0]

    def fit(self, X_train, y_train):
        """Compute the feature weights from the training data.

        The weights are the information gains divided by the largest gain.
        When no feature has a positive gain (for example when ``y_train``
        holds a single class) uniform weights of 1 are used instead of
        dividing by zero.
        """
        gains = self.information_gain(np.asarray(X_train), np.asarray(y_train))
        # Rounding can push a zero gain marginally below zero, and the
        # similarity later takes the square root of the weights.
        gains = np.clip(gains, 0.0, None)
        peak = gains.max() if gains.size else 0.0
        self.weights = gains / peak if peak > 0 else np.ones_like(gains)

    def optimize_weights(self, X_train, y_train):
        """Try each factor on each weight and keep the best single change.

        One weight at a time is multiplied by every value in ``self.factors``;
        the weight vector with the lowest ``evaluate_obj_function`` value seen
        is kept.  ``y_train`` is accepted but not used.
        """
        # Copy so the running best never aliases the vector mutated below.
        best_weights = self.weights.copy()
        best_obj_value = self.evaluate_obj_function(X_train)

        for i in range(len(self.weights)):
            original_weight = self.weights[i]
            for factor in self.factors:
                self.weights[i] = original_weight * factor
                obj_value = self.evaluate_obj_function(X_train)

                if obj_value < best_obj_value:
                    best_obj_value = obj_value
                    best_weights = self.weights.copy()

            # Restore before moving on so only one weight differs at a time.
            self.weights[i] = original_weight

        self.weights = best_weights

    def evaluate_obj_function(self, X_train):
        """Return the mean weighted Euclidean distance to the k nearest rows.

        Distances are computed between all pairs of rows of ``X_train``; each
        row's zero distance to itself is part of its ``k`` smallest distances.
        """
        X_train = np.asarray(X_train)
        squared_diff = (X_train[:, None, :] - X_train[None, :, :]) ** 2
        distances = np.sqrt(np.sum(squared_diff * self.weights, axis=-1))
        nearest = np.sort(distances, axis=1)[:, :self.k]
        return np.mean(nearest)

    def predict(self, X_train, y_train, X_test):
        """Predict a label for every row of ``X_test``.

        Parameters
        ----------
        X_train, y_train : array-like
            Reference samples and their labels.
        X_test : array-like
            Samples to classify, one per row.

        Returns
        -------
        numpy.ndarray
            Majority label among the ``k`` most similar training samples for
            each test row.
        """
        X_train = np.asarray(X_train)
        X_test = np.asarray(X_test)
        y_train = np.asarray(y_train)

        similarities = self.weighted_cosine_similarity_matrix(X_test, X_train, self.weights)
        # argsort is ascending, so the last k columns are the most similar.
        k_indices = np.argsort(similarities, axis=1)[:, -self.k:]
        k_labels = y_train[k_indices]
        return np.array([self._majority_vote(labels) for labels in k_labels])

    def get_weights(self):
        """Return the current feature weights (``None`` before ``fit``)."""
        return self.weights


In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load Iris and hold out 20% of the samples for testing.
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the features; the statistics are learned on the training split
# only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# fit() derives the feature weights; predict() needs the training data again.
waknn = WAKNearestNeighbour(k=3)
waknn.fit(X_train, y_train)
predictions = waknn.predict(X_train, y_train, X_test)

# Share of test samples whose predicted label equals the true label.
accuracy = (predictions == y_test).mean()
print(f"Accuracy WAKNN: {accuracy * 100:.0f}%")

Accuracy WAKNN: 90%


KSVNN (K Support Vector Nearest Neighbour)

collaborator : parrriiisss

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

class KSVNearestNeighbour:
    """K Support Vector Nearest Neighbour classifier.

    An SVM is trained first; afterwards only its support vectors serve as the
    reference set for a k-nearest-neighbour majority vote.

    Parameters
    ----------
    k : int, default 3
        Number of nearest support vectors that vote.
    C : float, default 1.0
        Regularisation parameter forwarded to ``SVC``.
    kernel : str, default 'linear'
        Kernel name forwarded to ``SVC``.
    """

    def __init__(self, k=3, C=1.0, kernel='linear'):
        self.k = k
        self.C = C
        self.kernel = kernel
        # probability=True is kept from the original configuration; no method
        # of this class calls predict_proba.
        self.svm = SVC(C=C, kernel=kernel, probability=True)

    @staticmethod
    def _majority_vote(labels):
        """Return the most frequent label; ties go to the smallest label.

        ``np.unique`` is used instead of ``np.bincount`` so labels are not
        restricted to non-negative integers.
        """
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def fit(self, X_train, y_train):
        """Train the SVM and cache its support vectors and their labels.

        The label attached to each support vector is the SVM's own prediction
        for that vector.  It is computed once here because it does not depend
        on the test data.
        NOTE(review): the true training labels (``y_train[self.svm.support_]``)
        may have been intended instead - confirm.
        """
        self.svm.fit(X_train, y_train)
        self.support_vectors = self.svm.support_vectors_
        self.support_indices = self.svm.support_
        self.support_labels = np.asarray(self.svm.predict(self.support_vectors))

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Each test row is assigned the majority label among its ``k`` nearest
        support vectors (Euclidean distance).

        Returns
        -------
        numpy.ndarray
            One predicted label per test row.
        """
        X_test = np.asarray(X_test)
        # Pairwise distances, shape (n_test, n_support_vectors).
        distances = np.linalg.norm(X_test[:, np.newaxis] - self.support_vectors, axis=2)
        nearest_indices = np.argsort(distances, axis=1)[:, :self.k]
        neighbour_labels = self.support_labels[nearest_indices]
        return np.array([self._majority_vote(row) for row in neighbour_labels])

    def score(self, X_test, y_test):
        """Return the accuracy of the model on ``X_test`` / ``y_test``."""
        predictions = self.predict(X_test)
        return accuracy_score(y_test, predictions)


In [None]:
# Reload Iris: feature matrix and class labels.
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the SVM-backed classifier; the features are used unscaled here.
ksvnn = KSVNearestNeighbour(k=3, C=1.0, kernel='linear')
ksvnn.fit(X_train, y_train)

# score() returns the accuracy on the held-out split.
accuracy = ksvnn.score(X_test, y_test)
print(f"akurasi KSVNN : {accuracy * 100 :.0f}%")

akurasi KSVNN : 93%


MKNN (Modified K Nearest Neighbour)

collaborator : parrriiisss

In [None]:
class MKNN:
    """K-nearest-neighbour classifier with a selectable distance metric.

    The prediction is a plain majority vote among the ``k`` closest training
    samples; no additional validity or weighting term is computed.

    Parameters
    ----------
    k : int, default 5
        Number of neighbours that vote.  Must be at least 1.
    distance_metric : str, default 'euclidean'
        ``'euclidean'``, ``'manhattan'`` or ``'minkowski'``.  Any other value
        falls back to the Euclidean distance.
    p : float, default 2
        Order of the Minkowski distance (only used by ``'minkowski'``).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k=5, distance_metric='euclidean', p=2):
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k
        self.distance_metric = distance_metric
        self.p = p

    def fit(self, X, y):
        """Store the training data.

        Inputs are converted to NumPy arrays so that later indexing is purely
        positional; lists and pandas objects are therefore accepted.
        """
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)

    def predict(self, X_test):
        """Return the predicted label for every row of ``X_test``."""
        y_pred = []
        for x_test in np.asarray(X_test, dtype=float):
            distances = self.calculate_distance(x_test, self.distance_metric)
            nearest = np.argsort(distances)[:self.k]
            y_pred.append(self.majority_vote(self.y_train[nearest]))
        return np.array(y_pred)

    def calculate_distance(self, x_test, distance_metric):
        """Return the distance from ``x_test`` to every training sample.

        The computation is vectorised over the training rows.  Unknown metric
        names fall back to the Euclidean distance.
        """
        abs_diff = np.abs(self.X_train - np.asarray(x_test, dtype=float))
        if distance_metric == 'manhattan':
            return abs_diff.sum(axis=1)
        if distance_metric == 'minkowski':
            return np.power(np.power(abs_diff, self.p).sum(axis=1), 1 / self.p)
        # 'euclidean' and every unrecognised name end up here.
        return np.sqrt((abs_diff ** 2).sum(axis=1))

    def euclidean_distance(self, x1, x2):
        """Return the Euclidean distance between two vectors."""
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def manhattan_distance(self, x1, x2):
        """Return the Manhattan (L1) distance between two vectors."""
        return np.sum(np.abs(x1 - x2))

    def minkowski_distance(self, x1, x2, p=2):
        """Return the Minkowski distance of order ``p`` between two vectors."""
        return np.power(np.sum(np.power(np.abs(x1 - x2), p)), 1/p)

    def majority_vote(self, labels):
        """Return the most frequent label; ties go to the smallest label.

        ``np.unique`` is used instead of ``np.bincount`` so labels are not
        restricted to non-negative integers.
        """
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def score(self, X_test, y_test):
        """Return the accuracy of the predictions for ``X_test``."""
        y_pred = self.predict(X_test)
        return accuracy_score(y_test, y_pred)

In [None]:
# Reload Iris and recreate the 80/20 split.
iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MKNN takes only `k` and `distance_metric`; the stray `kernel='linear'`
# argument raised a TypeError and has been removed.
mknn = MKNN(k=5, distance_metric='euclidean')

mknn.fit(X_train, y_train)

y_pred = mknn.predict(X_test)

accuracy = mknn.score(X_test, y_test)
print(f"Akurasi: {accuracy*100 :.0f}%")

Akurasi: 100%


Fuzzy K Nearest Neighbour

collaborator : pariss

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Menggunakan kelas FuzzyKNN yang sudah dibuat sebelumnya
class FuzzyKNN:
    """Fuzzy K-Nearest Neighbour classifier.

    Every one of the ``k`` nearest neighbours contributes a membership degree
    that decreases with its distance; the class with the largest summed
    membership is predicted.  Features are min-max scaled with statistics
    learned from the training data.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours considered.
    m : float, default 2
        Fuzziness exponent; the distance is raised to ``2 / (m - 1)``.
        Must be greater than 1.

    Raises
    ------
    ValueError
        If ``m`` is not greater than 1.
    """

    def __init__(self, k=3, m=2):
        if m <= 1:
            # The exponent 2 / (m - 1) is undefined for m == 1.
            raise ValueError("m must be greater than 1")
        self.k = k  # number of neighbours
        self.m = m  # fuzziness exponent of the membership weights
        self.X_train = None
        self.y_train = None
        self.classes = None
        self.scaler = None  # MinMaxScaler fitted on the training data

    @staticmethod
    def normalize_data(X):
        """Return ``X`` min-max scaled with a scaler fitted on ``X`` itself.

        Kept for backward compatibility; ``fit`` and ``predict`` use the
        scaler stored on the instance instead.
        """
        scaler = MinMaxScaler()
        return scaler.fit_transform(X)

    def fit(self, X_train, y_train):
        """Fit the scaler on the training data and store the scaled samples."""
        self.scaler = MinMaxScaler()
        self.X_train = self.scaler.fit_transform(X_train)
        # Array conversion keeps the later fancy indexing positional.
        self.y_train = np.asarray(y_train)
        self.classes = np.unique(self.y_train)

    def membership_degrees(self, distances):
        """Return the membership degree of each neighbour.

        When at least one distance is exactly zero, those neighbours get a
        membership of 1 and all others 0.  Otherwise the memberships are the
        inverse-distance weights normalised to sum to 1.
        """
        exact_match = distances == 0
        if np.any(exact_match):
            memberships = np.zeros_like(distances)
            memberships[exact_match] = 1
            return memberships

        # The small constant guards the division.
        weights = 1 / (distances ** (2 / (self.m - 1)) + 1e-10)
        return weights / np.sum(weights)

    def predict_single(self, x):
        """Predict the class of one already-scaled sample ``x``."""
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        k_indices = np.argsort(distances)[:self.k]  # the k nearest neighbours
        k_distances = distances[k_indices]
        k_labels = self.y_train[k_indices]

        memberships = self.membership_degrees(k_distances)
        class_memberships = np.zeros(len(self.classes))

        for idx, cls in enumerate(self.classes):
            class_memberships[idx] = np.sum(memberships[k_labels == cls])

        return self.classes[np.argmax(class_memberships)]

    def predict(self, X_test):
        """Predict a class for every row of ``X_test``.

        The test data is transformed with the scaler fitted in ``fit``.
        Re-fitting a scaler on the test set would map it into a different
        feature space from the stored training samples.
        """
        X_test = self.scaler.transform(X_test)
        predictions = [self.predict_single(x) for x in X_test]
        return np.array(predictions)

    def evaluate(self, X_test, y_test):
        """Print the accuracy of the predictions for ``X_test``."""
        predictions = self.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        print(f"Accuracy: {accuracy * 100:.0f}%")


In [None]:
# Reload Iris and recreate the 80/20 split.
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fuzzy KNN with 5 neighbours and m=2; evaluate() prints the accuracy.
fuzzy_knn = FuzzyKNN(k=5, m=2)
fuzzy_knn.fit(X_train, y_train)
fuzzy_knn.evaluate(X_test, y_test)

Accuracy: 97%


# Kita terapkan ke dataset sendiri

link dataset : https://www.kaggle.com/datasets/hasibur013/diabetes-dataset


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Directory that holds the CSV, given relative to the working directory
# (presumably a mounted Google Drive - confirm).
path = "drive/MyDrive"
data = pd.read_csv(f"{path}/diabetes_dataset.csv")

# Show the first rows as the cell output.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# EDA step: print the column names, non-null counts and dtypes of the frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
ss = StandardScaler()

# Standardise every column except the last one, which is the label.
# The scaled values are assigned column-wise so that the integer feature
# columns are *replaced* by float columns.  Writing floats into the existing
# int64 columns through ``iloc`` is deprecated by pandas (incompatible-dtype
# FutureWarning).
# NOTE(review): the scaler is fitted on the full data set before the later
# train/test splits, so test-set statistics leak into the scaling - confirm
# whether fitting on the training split only is wanted.
feature_columns = data.columns[:-1]
norm_data = data.copy()
norm_data[feature_columns] = ss.fit_transform(data[feature_columns])

 -0.25095213  1.82781311 -0.54791859  1.23388019  0.04601433  1.82781311
  1.82781311 -0.84488505  0.3429808   0.93691372 -1.14185152  0.93691372
 -0.84488505 -0.84488505 -0.25095213  1.23388019  0.93691372  1.53084665
  2.12477957  1.82781311  0.93691372 -0.84488505  2.7187125   0.3429808
  0.3429808  -0.25095213 -0.25095213  0.63994726  1.82781311  0.04601433
  2.12477957  1.53084665 -0.54791859  0.04601433 -0.25095213  0.93691372
  0.93691372  1.53084665  0.93691372 -1.14185152 -0.84488505 -0.54791859
  0.93691372  0.93691372 -0.84488505 -0.84488505  0.3429808   1.23388019
  0.93691372 -0.84488505  0.93691372 -1.14185152 -1.14185152 -1.14185152
 -0.54791859  1.23388019  0.3429808  -0.54791859  0.93691372  0.3429808
 -1.14185152 -0.54791859 -0.84488505  0.04601433 -0.54791859  0.3429808
  2.7187125   0.04601433 -0.84488505 -0.84488505  0.93691372  0.3429808
 -1.14185152 -0.54791859 -0.25095213 -0.54791859  0.93691372 -1.14185152
  0.3429808  -0.54791859  2.7187125  -0.54791859  3.312

In [None]:
import time
from sklearn.model_selection import train_test_split

# Features are every column but the last; the last column is the label.
X = norm_data.iloc[:, :-1].values
y = norm_data.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# NOTE: `factors` is only read by optimize_weights(), which is never called
# in this cell.
waknn = WAKNearestNeighbour(k=7, max_iter=50, factors=[1.5])

# Wall-clock time of the weight computation.
start_fit = time.time()
waknn.fit(X_train, y_train)
end_fit = time.time()
fit_duration = end_fit - start_fit
print(f"Waktu komputasi untuk training (fit): {fit_duration:.2f} detik")

# Wall-clock time of classifying the test split.
start_predict = time.time()
predictions = waknn.predict(X_train, y_train, X_test)
end_predict = time.time()
predict_duration = end_predict - start_predict
print(f"Waktu komputasi untuk prediksi: {predict_duration:.2f} detik")

# Accuracy: share of test samples predicted correctly.
accuracy = (predictions == y_test).mean()
print(f"Accuracy WAKNN pada data Diabetes: {accuracy * 100:.0f}%")


Waktu komputasi untuk training (fit): 0.12 detik
Waktu komputasi untuk prediksi: 0.03 detik
Accuracy WAKNN pada data Diabetes: 71%


In [None]:
#Model KSVNN

import time
from sklearn.model_selection import train_test_split

# Features are every column but the last; the last column is the label.
X = norm_data.iloc[:, :-1].values
y = norm_data.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Only k is set; C and kernel keep the class defaults.
ksvnn = KSVNearestNeighbour(k=7)
ksvnn.fit(X_train, y_train)

# score() returns the accuracy on the held-out split.
accuracy = ksvnn.score(X_test, y_test)
print(f"akurasi KSVNN : {accuracy * 100 :.0f}%")


akurasi KSVNN : 75%


In [None]:
#Model MKNN

import time
from sklearn.model_selection import train_test_split

# Features are every column but the last; the last column is the label.
X = norm_data.iloc[:, :-1].values
y = norm_data.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 7 neighbours ranked by Euclidean distance.
mknn = MKNN(k=7, distance_metric='euclidean')
mknn.fit(X_train, y_train)

# Predictions for the held-out split, kept for later inspection.
y_pred = mknn.predict(X_test)

# score() predicts again internally and returns the accuracy.
accuracy = mknn.score(X_test, y_test)
print(f"Akurasi: {accuracy*100 :.0f}%")

Akurasi: 68%


In [None]:
# model FuzzyK

from sklearn.model_selection import train_test_split

# Features are every column but the last; the last column is the label.
X = norm_data.iloc[:, :-1].values
y = norm_data.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fuzzy KNN with 7 neighbours and m=2; evaluate() prints the accuracy.
fknn = FuzzyKNN(k=7, m=2)
fknn.fit(X_train, y_train)
fknn.evaluate(X_test, y_test)

Accuracy: 73%


In [None]:
#kita komparasi dengan KNN biasa
from sklearn.neighbors import KNeighborsClassifier

# Features are every column but the last; the last column is the label.
X = norm_data.iloc[:, :-1].values
y = norm_data.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: scikit-learn's KNN with the same k and Euclidean distance.
KNN = KNeighborsClassifier(n_neighbors=7, metric='euclidean')
KNN.fit(X_train, y_train)
predictions = KNN.predict(X_test)

# Share of test samples predicted correctly.
accuracy = (predictions == y_test).mean()
print(f"Accuracy KNN pada data Diabetes: {accuracy * 100:.0f}%")

Accuracy KNN pada data Diabetes: 68%
