# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Data preprocessing

In [2]:
# Load the whitespace-delimited yeast table. The file has no header row, so
# pandas assigns integer column labels (0..9) that later cells rely on.
# `delim_whitespace=True` is deprecated in recent pandas; `sep=r'\s+'` is the
# documented equivalent.
data = pd.read_csv('yeast.data', sep=r'\s+', header=None)

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,ADT1_YEAST,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,ADT2_YEAST,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,ADT3_YEAST,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT
3,AAR2_YEAST,0.58,0.44,0.57,0.13,0.5,0.0,0.54,0.22,NUC
4,AATM_YEAST,0.42,0.44,0.48,0.54,0.5,0.0,0.48,0.22,MIT


In [4]:
#dropping the sequence-name column (label 0): it is an identifier, not a feature
#re-assigning with errors='ignore' keeps the cell idempotent, so re-running it
#after the column is already gone no longer raises KeyError
data = data.drop(columns=0, errors='ignore')

In [5]:
#count the missing values in every column (all zeros means nothing to impute)
data.isnull().sum()

1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

In [6]:
#separating features and labels: column 9 holds the class label,
#every remaining column is a numeric feature
y = data[9]
X = data.drop(columns=[9])

In [7]:
#hold out a validation split; test_size=0.25 is train_test_split's default,
#spelled out here, and the seed is fixed so the split is reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Modelling

In [8]:
#Built using OOP
class KNN():
    """Brute-force k-nearest-neighbours classifier on z-score normalised features.

    ``fit`` stores the normalised training matrix and the labels; ``predict``
    computes the Euclidean distance from each query row to every training row
    and returns the most frequent label among the ``k`` closest rows.
    """

    def __init__(self):
        #initialize the params and set them all to None
        self.k = None
        self.mu = None
        self.std = None
        self.X = None
        self.y = None

    def fit(self, X, y, k):
        """Store the normalised training data.

        Parameters
        ----------
        X : DataFrame or array-like, shape (n_samples, n_features)
            Numeric feature matrix. Columns are used by position.
        y : Series or array-like, length n_samples
            Class labels.
        k : int
            Number of neighbours, 1 <= k <= n_samples.

        Raises
        ------
        ValueError
            If ``k`` is out of range, ``X`` is not 2-D, or ``X`` and ``y``
            differ in length.
        """
        #convert first so DataFrame and ndarray inputs follow one code path
        X = np.asarray(X, dtype=float)
        if X.ndim != 2: raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        #error Handling
        if k < 1: raise ValueError("K cannot be less than 1")
        if k > X.shape[0]: raise ValueError("K cannot be more than the number of training samples")
        labels = y if isinstance(y, pd.Series) else pd.Series(y)
        if len(labels) != X.shape[0]: raise ValueError("X and y must contain the same number of samples")
        self.k = k
        #normalize the data per feature column (axis=0); a bare ndarray.mean()
        #would collapse everything into one global scalar
        self.mu = X.mean(axis=0)
        if X.shape[0] > 1:
            #ddof=1 matches the sample std that pandas' DataFrame.std() reports
            std = X.std(axis=0, ddof=1)
        else:
            std = np.ones(X.shape[1])
        #constant features have zero spread; divide by 1 instead of producing NaN
        self.std = np.where(std == 0, 1.0, std)
        self.X = (X - self.mu) / self.std
        self.y = labels

    def predict(self, X):
        """Return a list with the predicted label for every row of ``X``.

        ``X`` must have the same number of feature columns, in the same order,
        as the data given to ``fit``. Ties between equally frequent labels are
        resolved by ``Series.mode()`` (first value of its sorted result).

        Raises
        ------
        ValueError
            If the model has not been fitted or ``X`` has the wrong shape.
        """
        if self.X is None: raise ValueError("fit must be called before predict")
        X = np.asarray(X, dtype=float)
        if X.size == 0: return []
        if X.ndim != 2 or X.shape[1] != self.X.shape[1]:
            raise ValueError("X must be 2-D with the same number of features as the training data")
        #normalize the data with the training statistics
        X = (X - self.mu) / self.std
        #argpartition needs kth < n_samples; clamping keeps k == n_samples valid
        #(then every training row is a neighbour) and leaves k < n_samples unchanged
        kth = min(self.k, self.X.shape[0] - 1)
        op = []
        for a in X:
            dist = self.getDist(self.X - a)
            #get k smallest values
            kidx = np.argpartition(dist, kth)[:self.k]
            op.append(self.y.iloc[kidx].mode().iloc[0])
        return op

    def getDist(self, a):
        """Return the Euclidean norm of each row of the 2-D difference array ``a``."""
        return np.sqrt(np.sum(np.square(a), axis = 1))

# Training the model

In [9]:
#baseline model: fit on the training split with 5 neighbours
knn = KNN()
knn.fit(X_train, y_train, k=5)

# Testing the model's performance

In [10]:
#score the held-out split: per-class precision / recall / F1 plus overall accuracy
pred = knn.predict(X_val)
print(classification_report(y_true=y_val, y_pred=pred))

              precision    recall  f1-score   support

         CYT       0.55      0.63      0.59       119
         EXC       0.50      0.44      0.47         9
         ME1       0.69      0.92      0.79        12
         ME2       0.40      0.22      0.29         9
         ME3       0.81      0.81      0.81        42
         MIT       0.61      0.62      0.62        64
         NUC       0.55      0.50      0.52       105
         POX       1.00      0.67      0.80         3
         VAC       0.00      0.00      0.00         8

    accuracy                           0.59       371
   macro avg       0.57      0.53      0.54       371
weighted avg       0.58      0.59      0.58       371



# Hyperparameter tuning

In [11]:
def kcv(kfold, X, y, knn):
    """Choose the neighbour count with the best mean cross-validated accuracy.

    The rows are cut into ``kfold`` contiguous folds of ``n_samples // kfold``
    rows each (any remainder rows are only ever used for training). Every
    candidate is fitted on the rows outside a fold and scored on the fold;
    the accuracies are summed over the folds so that the winner is the
    candidate that does best on average, not on a single lucky fold.

    Parameters
    ----------
    kfold : int
        Number of folds, 2 <= kfold <= n_samples.
    X : DataFrame or array-like, shape (n_samples, n_features)
        Feature matrix.
    y : Series or array-like, length n_samples
        Class labels.
    knn : iterable of int
        Candidate values of k (despite the name, not a model instance).

    Returns
    -------
    The candidate with the highest summed accuracy; the earliest candidate
    wins ties.

    Raises
    ------
    ValueError
        If ``knn`` is empty or ``kfold`` is out of range.
    """
    candidates = list(knn)
    if not candidates:
        raise ValueError("knn must contain at least one candidate value of k")
    #keep pandas containers: rows are then sliced by position via .iloc and the
    #model receives a DataFrame, so its statistics are computed per column
    X = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)
    y = y if isinstance(y, pd.Series) else pd.Series(y)
    n_samples = X.shape[0]
    if kfold < 2 or kfold > n_samples:
        raise ValueError("kfold must be between 2 and the number of samples")
    fold_size = n_samples // kfold
    totals = np.zeros(len(candidates))
    for i in range(kfold):
        lo, hi = i * fold_size, (i + 1) * fold_size
        X_val, y_val = X.iloc[lo:hi], y.iloc[lo:hi]
        X_train = pd.concat([X.iloc[:lo], X.iloc[hi:]])
        y_train = pd.concat([y.iloc[:lo], y.iloc[hi:]])
        for j, k in enumerate(candidates):
            model = KNN()
            model.fit(X_train, y_train, k)
            pred = model.predict(X_val)
            totals[j] += accuracy_score(y_val, pred)
    #argmax returns the first maximum, i.e. the earliest candidate on ties
    return candidates[int(np.argmax(totals))]

In [12]:
#Initialize different values of k for tuning
k1 = [i for i in range(1, 6)]
k5 = [i for i in range(10, 49, 5)]
k10 = [i for i in range(50, 101, 10)]
k = k1 + k5 + k10

In [13]:
#5-fold cross-validation on the training split to choose k.
#The candidates are rebuilt from k1/k5/k10 rather than read from `k`: this
#cell overwrites `k` with the chosen integer, so re-running it would otherwise
#hand that integer to kcv as the candidate list and fail.
k = kcv(5, X_train, y_train, k1 + k5 + k10)

# Using optimal K and training the model

In [14]:
#refit on the training split with the k chosen by cross-validation
knn = KNN()
knn.fit(X_train, y_train, k)
pred = knn.predict(X_val)
#zero_division=0 (scikit-learn >= 0.22) reports the same 0.00 as the default
#for metrics with a zero denominator, but without emitting the
#UndefinedMetricWarning lines that otherwise clutter the cell output
print(classification_report(y_val, pred, zero_division=0))

              precision    recall  f1-score   support

         CYT       0.54      0.69      0.61       119
         EXC       0.36      0.44      0.40         9
         ME1       0.83      0.83      0.83        12
         ME2       0.67      0.44      0.53         9
         ME3       0.82      0.79      0.80        42
         MIT       0.70      0.61      0.65        64
         NUC       0.55      0.50      0.52       105
         POX       0.00      0.00      0.00         3
         VAC       0.00      0.00      0.00         8

    accuracy                           0.60       371
   macro avg       0.50      0.48      0.48       371
weighted avg       0.60      0.60      0.59       371



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
