# KNN implementation on Kaggle dataset

For this implementation a kaggle dataset is used, with the aim of predicting a students droupout or academic success rates. The link to the data set is: [link](https://www.kaggle.com/datasets/thedevastator/higher-education-predictors-of-student-retention). In order for the code to run, upload the dataset to the colab workspace, or into google drive.


## Libraries

In [2]:
import pandas as pd
import numpy as np

## External methods

In [3]:
#Other methods that don't need to belong to the class

def euclidean_distance(x1,x2): #for euclidean distance
  return np.sqrt(np.sum((x1-x2)**2))

def manhattan_distance(x1,x2): #for manhattan distance
  return np.sum(np.abs(x1-x2))

def mahalanobis_distance(x1,x2,cov_inv): #mahalanobis distance
  #cov_inv = np.linalg.inv(np.cov(X_train, rowvar=False)) passed directly from fit so its not calculated each time
  return np.sqrt(np.dot(np.dot((x1-x2), cov_inv), (x1-x2)))

def cosine_similarity(x1,x2):
  return np.dot(x1,x2)/(np.linalg.norm(x1)*np.linalg.norm(x2))

def most_common(values): #for classification target
  counts={}
  for item in values:
    counts[item]=counts.get(item, 0)+1
  sorted_counts=sorted(counts.items(),key=lambda x: x[1], reverse=True)
  return sorted_counts[0][0]

def calculate_distance(x1,x2,metric,cov_inv):
  if metric=='euclidean':
    return euclidean_distance(x1,x2)
  if metric=='manhattan':
    return manhattan_distance(x1,x2)
  if metric=='mahalanobis':
    return mahalanobis_distance(x1,x2,cov_inv)
  if metric=='cosine similarity':
    return cosine_similarity(x1,x2)
  if callable(metric):
    try:
      return metric(x1, x2)
    except ValueError as e:
      print('Error in callable function: ',e)


def find_target(nearest_y_value,target):
  if target=='classification':
    return most_common(nearest_y_value)
  if target=='regression':
    return np.mean(nearest_y_value)

def class_accuracy(y_pred, y_test):
    error=0
    for i in range(len(y_pred)):
      if y_pred[i]!=y_test[i]:
        error=error+1
    return (len(y_pred)-error)/len(y_pred)*100

def reg_accuracy(y_pred, y_test, dev):
  error=0
  for i in range(len(y_pred)):
    if abs(y_pred[i]-y_test[i])>dev:
      error=error+1
  return (len(y_pred)-error)/len(y_pred)*100

def mean_squared_error(y_pred, y_test):
  squared_errors = (y_test - y_pred) ** 2
  mse = np.mean(squared_errors)
  return mse


## Class definition

In [4]:
class KNN:

  def __init__(self, k=5, metric='euclidean',target='classification', dev=None):
    #Parameter fail safes
    if not isinstance(k, int) or k <0:
      raise ValueError('k must be a non-negative integer')
    possible_metrics={'euclidean', 'manhattan','mahalanobis', 'cosine similarity'}
    if not (isinstance(metric, str) and metric in {'euclidean', 'manhattan', 'mahalanobis', 'cosine similarity'} or callable(metric)):
      raise ValueError('Invalid metric. Allowed metrics are \'euclidean\' , \'manhattan\',\'mahalanobis\'or \'cosine similarity\'; or a callable function')
    possible_targets={'classification', 'regression'}
    if target not in possible_targets:
      raise ValueError('Invalid target. Allowed metrics are \'classification\' or \'regression\'')
    if (not isinstance(dev, int) or dev <0) and (dev is not None):
      raise ValueError('Standard deviation must be a non-negative integer')



    self.k=k #parameter amount of neighbors
    self.metric=metric #parameter type of metric
    self.target=target #parameter target (classification or regression)
    try: #parameter deviation for accepted deviation in regression models, if it isnt regression this parameter doesnt exist
      if target=='classification' and dev!=0 and (dev is not None):
        self.dev=None
        raise ValueError("Deviation won't be used in classification models, parameter ignored")
      else:
        self.dev=dev
    except ValueError as e:
      print(e)
    self.cov_inv=0

  def fit(self, X, y): #fit in KNN stores the data to compare the given point
    self.X_train=X
    self.y_train=y
    if self.metric=='mahalanobis':
      self.cov_inv=np.linalg.inv(np.cov(X, rowvar=False))
    #check that k<than amount of data points
    if self.k>len(X):
      raise ValueError("k is bigger than the amount of data points, recreate KNN with a different k, or add more data")



  def predict(self, X): # calls make prediction method for each point of X
    y_pred=[self.make_prediction(x) for x in X]
    return np.array(y_pred)


  def make_prediction(self, x):
    #calculates the distance depending on the metric chosen
    distances=[calculate_distance(x, x_train, self.metric, self.cov_inv) for x_train in self.X_train]
    #sort distance, returns indexes of the closest k
    nearest_index=np.argsort(distances)[:self.k]
    #supervised learning compares with true value, get value of closest
    nearest_y_value=[self.y_train[i] for i in nearest_index]
    if self.target=='regression' and self.dev==None:
      self.dev=np.std(nearest_y_value)
    #get y value depending on the target
    return find_target(nearest_y_value, self.target)

  def evaluate(self, y_pred, y_test, eval='accuracy', custom_dev=None):
    if self.target == 'classification':
      try:
        if eval != 'accuracy':
            raise ValueError("eval parameter is ignored in classification models, evaluation method is accuracy")
      except ValueError as e:
        print(e)
      try:
        if custom_dev is not None:
          raise ValueError("Deviation won't be used in classification models, parameter ignored")
      except ValueError as e:
        print(e)
      return class_accuracy(y_pred, y_test)
    elif self.target == 'regression':
        if eval == 'accuracy':
            if custom_dev is not None:
              if (not isinstance(custom_dev, int) or custom_dev <0) and (custom_dev is not None):
                raise ValueError('Standard deviation must be a non-negative integer')
              elif custom_dev >= (max(y_test) - min(y_test)):
                raise ValueError("Deviation higher than y range, guaranteed 100% accuracy, no significance in results")
              self.dev=custom_dev
            elif self.dev is None:
                raise ValueError("For regression evaluation with 'accuracy', you must specify a non-negative deviation (dev).")
            elif self.dev >= (max(y_test) - min(y_test)):
                raise ValueError("Deviation higher than y range, guaranteed 100% accuracy, no significance in results")
            else:
              self.dev = self.dev
            return reg_accuracy(y_pred, y_test, self.dev)
        elif eval == 'score':
            return mean_squared_error(y_pred, y_test)
        else:
            raise ValueError("Invalid eval. Allowed evaluation methods are: 'accuracy' or 'score'")
    else:
        raise ValueError("Invalid target. Allowed targets are 'classification' or 'regression'")



  def cross_validation(self, X, y, folds=5, eval='accuracy'): #cross validation for how well the model does splitting the data in different ways
    shuffle_index=np.random.permutation(len(X))
    fold_size=len(X)//folds
    fold_index=[shuffle_index[i:i+fold_size] for i in range(0, len(X), fold_size)]
    metrics=[]
    for i in range(folds):
      test_indices = fold_index[i]
      train_indices = np.concatenate([fold_index[j] for j in range(folds) if j != i])
      X_train = X[train_indices]
      y_train = y[train_indices]
      X_test = X[test_indices]
      y_test = y[test_indices]
      self.fit(X_train, y_train)
      y_pred = self.predict(X_test)
      metrics.append(self.evaluate(y_pred, y_test, eval))
    return np.mean(metrics)



## Education dataset
kaggle classification set

In [5]:
#Dataset can be manually uploaded by downloading it from kaggle and uploading it into colab
#path='dataset.csv' #uncomment this code if dataset is in colab and check correct path
#or with the following code if the dataset is saved in drive, if using previous code, comment following lines
from google.colab import drive
drive.mount('/content/drive')
path='/content/drive/MyDrive/dataset.csv'

Mounted at /content/drive


In [6]:
df=pd.read_csv(path) #dataset saved in the df variable

In [8]:
df.dtypes

Marital status                                      int64
Application mode                                    int64
Application order                                   int64
Course                                              int64
Daytime/evening attendance                          int64
Previous qualification                              int64
Nacionality                                         int64
Mother's qualification                              int64
Father's qualification                              int64
Mother's occupation                                 int64
Father's occupation                                 int64
Displaced                                           int64
Educational special needs                           int64
Debtor                                              int64
Tuition fees up to date                             int64
Gender                                              int64
Scholarship holder                                  int64
Age at enrollm

Encode the object variable so that the type instead of being an object is an int. And separate data frame into x values and y values

In [9]:
y_encoder={'Target':{'Dropout':0, 'Graduate':1,'Enrolled':2}}
y_df=df.replace(y_encoder)
y_c=y_df['Target']
X_c=df.drop(['Target'],axis=1)

In [10]:
set_distribution=0.75
len_data=df.shape[0]
len_train=int(len_data*set_distribution) #rounded
index=np.arange(0,len_data)
np.random.shuffle(index)
X_train_c = X_c.iloc[index[:len_train]].to_numpy() #start 'till len_train
y_train_c = y_c.iloc[index[:len_train]].to_numpy()
X_test_c = X_c.iloc[index[len_train:]].to_numpy() #from len_train on
y_test_c = y_c.iloc[index[len_train:]].to_numpy()

# Using the model for a non toy dataset

## Variations in k

In [11]:
knn3=KNN(k=3)
knn3.fit(X_train_c, y_train_c)
y_pred=knn3.predict(X_test_c)
print(knn3.evaluate(y_pred,y_test_c))

knn5=KNN(k=5)
knn5.fit(X_train_c, y_train_c)
y_pred=knn5.predict(X_test_c)
print(knn5.evaluate(y_pred,y_test_c))

knn10=KNN(k=10)
knn10.fit(X_train_c, y_train_c)
y_pred=knn10.predict(X_test_c)
print(knn10.evaluate(y_pred,y_test_c))


66.2748643761302
67.26943942133815
67.99276672694394
Exception caught: k is bigger than the amount of data points, recreate KNN with a different k, or add more data


## Variations in metrics

In [12]:
knn1=KNN(k=5,metric='euclidean') #euclidean distance
knn1.fit(X_train_c, y_train_c)
y_pred=knn1.predict(X_test_c)
print(knn1.evaluate(y_pred,y_test_c))

knn2=KNN(k=5,metric='manhattan') #manhattan distance
knn2.fit(X_train_c, y_train_c)
y_pred=knn2.predict(X_test_c)
print(knn2.evaluate(y_pred,y_test_c))

knn3=KNN(k=5,metric='mahalanobis') #mahalanobis distance
knn3.fit(X_train_c, y_train_c)
y_pred=knn3.predict(X_test_c)
print(knn3.evaluate(y_pred,y_test_c))

knn4=KNN(k=5,metric='cosine similarity') #cosine similarity distance
knn4.fit(X_train_c, y_train_c)
y_pred=knn4.predict(X_test_c)
print(knn4.evaluate(y_pred,y_test_c))



67.26943942133815
69.16817359855335
70.97649186256781
25.85895117540687


Performing cross validation for the best parameters found

In [16]:
X=X_c.to_numpy()
y=y_c.to_numpy()

In [18]:
knn_cv=KNN(k=10, metric='mahalanobis')
knn_cv.cross_validation(X, y)

70.74660633484163