In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime; the CSV
# paths read later in this notebook live under that mount point.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
from collections import Counter

In [None]:
# Load the train and test CSV files from the mounted Drive folder.
# index_col=0 makes the first CSV column the row index of each frame.
train = pd.read_csv("/content/drive/MyDrive/112_1/MachineLearning/train_new.csv", index_col=0)
test = pd.read_csv("/content/drive/MyDrive/112_1/MachineLearning/test_new.csv", index_col=0)

In [None]:
# Draw a fixed-seed subset of each class: 1000 rows with is_claim == 0 and
# 500 rows with is_claim == 1.
train0 = train.loc[train["is_claim"] == 0].sample(n=1000, random_state=42)
train1 = train.loc[train["is_claim"] == 1].sample(n=500, random_state=42)

# Stack both subsets, then shuffle every row (frac=1) with the same seed.
# The original index labels are kept in both steps.
train_new = pd.concat([train0, train1]).sample(frac=1, random_state=42)

In [None]:
# Separate the sampled frame into the target column and the feature columns.
y_train = train_new["is_claim"]
x_train = train_new.drop(columns=["is_claim"])

In [None]:
def split(x, y, test_size=0.2, random_state=None):
    """Shuffle ``x`` and ``y`` together and split them into train/test parts.

    Parameters
    ----------
    x : array-like or pandas DataFrame
        Samples, one per row.
    y : array-like or pandas Series
        Labels aligned with the rows of ``x``.
    test_size : float, default 0.2
        Fraction of rows, within [0, 1], placed in the test part. The train
        part receives ``int((1 - test_size) * n_rows)`` rows and the test part
        receives the remainder.
    random_state : int or None, default None
        Seed for the shuffle. ``None`` draws from NumPy's global random state
        (so a prior ``np.random.seed`` call still applies); an integer makes
        the split reproducible regardless of that global state.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``. pandas inputs are returned as
        pandas objects with their original index labels; any other input is
        returned as NumPy arrays.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    n_rows = len(x)
    if len(y) != n_rows:
        raise ValueError(
            "x and y must have the same number of rows, got {} and {}".format(
                n_rows, len(y)
            )
        )
    if not 0 <= test_size <= 1:
        raise ValueError(
            "test_size must be within [0, 1], got {!r}".format(test_size)
        )

    n_train = int((1 - test_size) * n_rows)

    if random_state is None:
        # Keep using the global RNG so existing np.random.seed() callers get
        # the same permutation as before.
        order = np.random.permutation(n_rows)
    else:
        order = np.random.default_rng(random_state).permutation(n_rows)

    def take_rows(data, positions):
        # Select rows by position. pandas objects go through .iloc instead of
        # np.split, which routes DataFrames through an API pandas has
        # deprecated.
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    train_rows = order[:n_train]
    test_rows = order[n_train:]
    return (
        take_rows(x, train_rows),
        take_rows(x, test_rows),
        take_rows(y, train_rows),
        take_rows(y, test_rows),
    )

In [None]:
# Seed NumPy's global RNG first: split() shuffles through np.random, so
# without a fixed seed every run produces a different train/validation
# partition and the accuracies reported in this notebook cannot be reproduced.
np.random.seed(42)
x_knn_train, x_knn_valid, y_knn_train, y_knn_valid = split(x_train, y_train, test_size=0.2)

In [None]:
# Convert all four split parts to plain NumPy arrays in one step; the KNN
# classes below index rows positionally.
x_knn_train, x_knn_valid, y_knn_train, y_knn_valid = [
    np.array(part)
    for part in (x_knn_train, x_knn_valid, y_knn_train, y_knn_valid)
]

In [None]:
def euclidean_distance(p, q):
  """Return the Euclidean (L2) distance between points ``p`` and ``q``.

  Parameters
  ----------
  p, q : array-like
      Coordinates of the two points; they must be broadcast-compatible.

  Returns
  -------
  float
      ``sqrt(sum((p - q) ** 2))`` taken over all coordinates.
  """
  diff = np.asarray(p) - np.asarray(q)
  # The squared differences must be summed before the square root; taking the
  # root element-wise only yields |p - q| per coordinate, not a distance.
  return np.sqrt(np.sum(diff ** 2))

def manhattan_distance(p, q):
  """Return the Manhattan (L1) distance: the sum of absolute differences
  between the coordinates of ``p`` and ``q``."""
  abs_diff = np.absolute(p - q)
  return np.sum(abs_diff)

def chebyshev_distance(p, q):
  """Return the Chebyshev (L-infinity) distance: the largest absolute
  difference between the coordinates of ``p`` and ``q``."""
  abs_diff = np.absolute(p - q)
  return np.max(abs_diff)

In [None]:
class KNN_euclidian:
    """k-nearest-neighbours classifier based on the Euclidean (L2) distance.

    ``fit`` memorises the training set; ``predict`` assigns each query row the
    most frequent label among its ``k`` closest training rows.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote on each prediction.
    """

    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        """Store the training samples and their labels as NumPy arrays.

        Converting here lets DataFrames, Series and lists be passed in;
        iterating a DataFrame directly would yield column labels, not rows.

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` have different lengths.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError(
                "X_train and y_train must have the same length, got {} and {}".format(
                    len(X_train), len(y_train)
                )
            )
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Return an array holding one predicted label per row of ``X_test``."""
        return np.array([self._predict(x) for x in np.asarray(X_test)])

    def _predict(self, x):
        """Predict the label of a single sample ``x`` by majority vote.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or the training set is empty.
        """
        if self.k < 1:
            raise ValueError("k must be at least 1, got {!r}".format(self.k))
        train = np.asarray(self.X_train)
        if train.shape[0] == 0:
            raise ValueError("cannot predict: the training set is empty")

        # One flat row per training sample, so every distance comes from a
        # single vectorised call instead of a Python loop over the rows.
        train = train.reshape(train.shape[0], -1)
        distances = self.euclidean_distance(train, np.ravel(x), axis=1)

        # Positions of the k closest training rows (all rows when k exceeds
        # the size of the training set).
        nearest = np.argsort(distances)[:self.k]

        # Majority vote. np.unique returns the labels sorted and argmax picks
        # the first maximum, so ties go to the smallest label, exactly like
        # np.bincount(...).argmax(), but this also works for labels that are
        # not non-negative integers.
        labels, votes = np.unique(
            np.asarray(self.y_train)[nearest], return_counts=True
        )
        return labels[votes.argmax()]

    def euclidean_distance(self, p, q, axis=None):
        """Return the Euclidean distance between ``p`` and ``q``.

        With the default ``axis=None`` the squared differences are summed over
        every element, giving a single distance. Passing ``axis`` reduces along
        that axis only, e.g. ``axis=1`` gives one distance per row of a 2-D
        ``p`` against a single point ``q``.
        """
        diff = np.asarray(p) - np.asarray(q)
        return np.sqrt((diff ** 2).sum(axis=axis))

In [None]:
def accuracy(y_true, y_pred):
  """Print and return the percentage of predictions that match the labels.

  Parameters
  ----------
  y_true : array-like
      Reference labels.
  y_pred : array-like
      Predicted labels, one per entry of ``y_true``.

  Returns
  -------
  float
      Share of positions where ``y_true`` and ``y_pred`` are equal, as a
      percentage in [0, 100]. The same value is printed.

  Raises
  ------
  ValueError
      If the inputs differ in length or are empty.
  """
  y_true = np.asarray(y_true)
  y_pred = np.asarray(y_pred)
  n_total = y_true.shape[0]
  if y_pred.shape[0] != n_total:
    raise ValueError(
        "y_true and y_pred must have the same length, got {} and {}".format(
            n_total, y_pred.shape[0]
        )
    )
  if n_total == 0:
    raise ValueError("cannot compute accuracy of empty inputs")

  # Count matches in one vectorised comparison; converting to a Python int
  # keeps the division and the printed value identical to a manual counter.
  n_correct = int(np.sum(y_true == y_pred))
  accuracy_pct = (n_correct / n_total) * 100
  print("Test Accuracy = ",accuracy_pct, "%")
  return accuracy_pct

In [None]:
# Fit the Euclidean-distance KNN (k=3) on the training split and predict
# labels for the validation split. y_knn_pred is overwritten by each later
# model run, so evaluate it before running the next model.
knn_euclidian = KNN_euclidian(k=3)
knn_euclidian.fit(x_knn_train, y_knn_train)
y_knn_pred = knn_euclidian.predict(x_knn_valid)

In [None]:
# Print the accuracy of the current y_knn_pred against the validation labels
# (the Euclidean model's predictions when cells are run top to bottom).
accuracy(y_knn_valid, y_knn_pred)

Test Accuracy =  61.66666666666667 %


In [None]:
class KNN_manhattan:
    """k-nearest-neighbours classifier based on the Manhattan (L1) distance.

    ``fit`` memorises the training set; ``predict`` assigns each query row the
    most frequent label among its ``k`` closest training rows.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote on each prediction.
    """

    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        """Store the training samples and their labels as NumPy arrays.

        Converting here lets DataFrames, Series and lists be passed in;
        iterating a DataFrame directly would yield column labels, not rows.

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` have different lengths.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError(
                "X_train and y_train must have the same length, got {} and {}".format(
                    len(X_train), len(y_train)
                )
            )
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Return an array holding one predicted label per row of ``X_test``."""
        return np.array([self._predict(x) for x in np.asarray(X_test)])

    def _predict(self, x):
        """Predict the label of a single sample ``x`` by majority vote.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or the training set is empty.
        """
        if self.k < 1:
            raise ValueError("k must be at least 1, got {!r}".format(self.k))
        train = np.asarray(self.X_train)
        if train.shape[0] == 0:
            raise ValueError("cannot predict: the training set is empty")

        # One flat row per training sample, so every distance comes from a
        # single vectorised call instead of a Python loop over the rows.
        train = train.reshape(train.shape[0], -1)
        distances = self.manhattan_distance(train, np.ravel(x), axis=1)

        # Positions of the k closest training rows (all rows when k exceeds
        # the size of the training set).
        nearest = np.argsort(distances)[:self.k]

        # Majority vote. np.unique returns the labels sorted and argmax picks
        # the first maximum, so ties go to the smallest label, exactly like
        # np.bincount(...).argmax(), but this also works for labels that are
        # not non-negative integers.
        labels, votes = np.unique(
            np.asarray(self.y_train)[nearest], return_counts=True
        )
        return labels[votes.argmax()]

    def manhattan_distance(self, p, q, axis=None):
        """Return the Manhattan distance between ``p`` and ``q``.

        With the default ``axis=None`` the absolute differences are summed
        over every element, giving a single distance. Passing ``axis`` reduces
        along that axis only, e.g. ``axis=1`` gives one distance per row of a
        2-D ``p`` against a single point ``q``.
        """
        diff = np.asarray(p) - np.asarray(q)
        return np.sum(np.abs(diff), axis=axis)

In [None]:
# Fit the Manhattan-distance KNN (k=3) on the training split and predict
# labels for the validation split. This overwrites y_knn_pred from any
# earlier model run.
knn_manhattan = KNN_manhattan(k=3)
knn_manhattan.fit(x_knn_train, y_knn_train)
y_knn_pred = knn_manhattan.predict(x_knn_valid)

In [None]:
# Print the accuracy of the current y_knn_pred against the validation labels
# (the Manhattan model's predictions when cells are run top to bottom).
accuracy(y_knn_valid, y_knn_pred)

Test Accuracy =  61.33333333333333 %


In [None]:
class KNN_chebyshev:
    """k-nearest-neighbours classifier based on the Chebyshev (L-infinity) distance.

    ``fit`` memorises the training set; ``predict`` assigns each query row the
    most frequent label among its ``k`` closest training rows.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote on each prediction.
    """

    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        """Store the training samples and their labels as NumPy arrays.

        Converting here lets DataFrames, Series and lists be passed in;
        iterating a DataFrame directly would yield column labels, not rows.

        Raises
        ------
        ValueError
            If ``X_train`` and ``y_train`` have different lengths.
        """
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(X_train) != len(y_train):
            raise ValueError(
                "X_train and y_train must have the same length, got {} and {}".format(
                    len(X_train), len(y_train)
                )
            )
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        """Return an array holding one predicted label per row of ``X_test``."""
        return np.array([self._predict(x) for x in np.asarray(X_test)])

    def _predict(self, x):
        """Predict the label of a single sample ``x`` by majority vote.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or the training set is empty.
        """
        if self.k < 1:
            raise ValueError("k must be at least 1, got {!r}".format(self.k))
        train = np.asarray(self.X_train)
        if train.shape[0] == 0:
            raise ValueError("cannot predict: the training set is empty")

        # One flat row per training sample, so every distance comes from a
        # single vectorised call instead of a Python loop over the rows.
        train = train.reshape(train.shape[0], -1)
        distances = self.chebyshev_distance(train, np.ravel(x), axis=1)

        # Positions of the k closest training rows (all rows when k exceeds
        # the size of the training set).
        nearest = np.argsort(distances)[:self.k]

        # Majority vote. np.unique returns the labels sorted and argmax picks
        # the first maximum, so ties go to the smallest label, exactly like
        # np.bincount(...).argmax(), but this also works for labels that are
        # not non-negative integers.
        labels, votes = np.unique(
            np.asarray(self.y_train)[nearest], return_counts=True
        )
        return labels[votes.argmax()]

    def chebyshev_distance(self, p, q, axis=None):
        """Return the Chebyshev distance between ``p`` and ``q``.

        With the default ``axis=None`` the maximum absolute difference is
        taken over every element, giving a single distance. Passing ``axis``
        reduces along that axis only, e.g. ``axis=1`` gives one distance per
        row of a 2-D ``p`` against a single point ``q``.
        """
        diff = np.asarray(p) - np.asarray(q)
        return np.max(np.abs(diff), axis=axis)

In [None]:
# Fit the Chebyshev-distance KNN (k=3) on the training split and predict
# labels for the validation split. This overwrites y_knn_pred from any
# earlier model run.
knn_chebyshev = KNN_chebyshev(k=3)
knn_chebyshev.fit(x_knn_train, y_knn_train)
y_knn_pred = knn_chebyshev.predict(x_knn_valid)

In [None]:
# Print the accuracy of the current y_knn_pred against the validation labels
# (the Chebyshev model's predictions when cells are run top to bottom).
accuracy(y_knn_valid, y_knn_pred)

Test Accuracy =  58.333333333333336 %
