In [2]:
import numpy as np
import pandas as pd

## Assignments

### 1. train test split from scratch

Create a function my_train_test_split() that takes ipnput X, y and fraction of train. And ouputs the list or tuple containing splits

In [3]:
data = np.array([[1, 2, 0], [3, 4, 1], [5, 6, 1], [7, 8, 0], [9, 10, 1], [11, 12, 0]])
print('data:')
print(data)

data:
[[ 1  2  0]
 [ 3  4  1]
 [ 5  6  1]
 [ 7  8  0]
 [ 9 10  1]
 [11 12  0]]


In [4]:
# Many issues with this function of yours

# You were passing y as argument in your original code.
# But the function body was never really using y

# The split logic is using the variable named "data".
# But the data variable is not a argument to this function
# It was defined outside the function.
# Hence, in essence you were passing something to be split into the function
# but splitting on something totally different
def my_train_test_split(data, split_percent= 0.8):  # I removed the X and y and kept data as the only input
    split = int(data.shape[0] * split_percent)  #Your original code for this line was using X.
    # X is not an input argument to this function. I changed it to data

    # Split the data into training and testing sets
    X_train = data[:split, :-1]
    y_train = data[:split, -1]
    X_test = data[split:, :-1]
    y_test = data[split:, -1]

    return X_train, y_train, X_test, y_test

In [5]:
# Enhanced version of splitting that shuffles the input data
def my_train_test_split(data, split_percent= 0.8):  # includes shuffling
    split = int(data.shape[0] * split_percent)

    # Split the data into training and testing sets
    #X_train = data[:split, :-1]
    #y_train = data[:split, -1]
    #X_test = data[split:, :-1]
    #y_test = data[split:, -1]

    num_samples = data.shape[0]
    shuffled_indices = np.random.choice(num_samples, num_samples, replace = False)
    train_indices_split = shuffled_indices[:split]
    test_indices_split = shuffled_indices[split:]

    X_train = data[train_indices_split, :-1]
    y_train = data[train_indices_split, -1]
    X_test = data[test_indices_split, :-1]
    y_test = data[test_indices_split, -1]

    return X_train, y_train, X_test, y_test

In [6]:
train_frac = 0.8  # 80% for training, 20% for testing

X_train, y_train, X_test, y_test = my_train_test_split(data, train_frac)

# Printing the results
print("X_train:")
print(X_train)
print("y_train:")
print(y_train)
print("X_test:")
print(X_test)
print("y_test:")
print(y_test)

X_train:
[[ 7  8]
 [ 3  4]
 [ 1  2]
 [11 12]]
y_train:
[0 1 0 0]
X_test:
[[ 5  6]
 [ 9 10]]
y_test:
[1 1]


### 2. kNN from scratch

In [7]:
# KNN class that allows setting the number of neighbours and weight=uniform or distance
class KNN:
    def __init__(self, k = 7): #Fill this out
        self.k = k
        #pass - After filling put the function, you have to remove this pass 

    def fit(self, X_train, y_train): # What is missing in function definition?
        self.X_train = X_train
        self.y_train = y_train
        #pass - After filling put the function, you have to remove this pass 

    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X_test): # What is missing in function definition?
        # Corrected  tab indentation. Python is very picky about it 
        # is one of the top causes for bugs
        y_pred = [self._predict(x) for x in X_test]
        return np.array(y_pred)
        #pass - After filling put the function, you have to remove this pass 
    
    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [self.euclidean_distance(x, x_train) for x_train in self.X_train]
        
        # Sort by distance and return indices of the first k neighbors
        k_indices = np.argsort(distances)[:self.k]
        
        # Extract the labels of the k nearest neighbor training samples
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        
        # Return the most common class label
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common
    
    def evaluate(self, X_test, y_test):
         y_pred = self.predict(X_test)
         accuracy = sum(y_pred == y_test) / len(y_test)
         return accuracy

In [8]:
knn = KNN(k=7)
knn.fit(X_train, y_train)

In [9]:
y_pred = knn.predict(X_test)

In [10]:
print("Predicted Labels:")
print(y_pred)
print("True Labels:")
print(y_test)

Predicted Labels:
[0 0]
True Labels:
[1 1]


In [11]:
accuracies = []
ks = range(1,30)
for k in ks:
    knn.fit(X_train, y_train)
    accuracy = knn.evaluate(X_test, y_test)
    accuracies.append(accuracy)

print(accuracies)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


##### Using IRIS data with your custom KNN

In [13]:
from sklearn.datasets import load_iris

# save "bunch" object containing iris dataset and its attributes
iris = load_iris()

#store feature matrix in dataframe
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Add the target variable to the dataframe
df['target'] = iris.target

# Print the first 5 rows of the dataframe
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [14]:
train_frac = 0.8  # 80% for training, 20% for testing

X_train, y_train, X_test, y_test = my_train_test_split(df.to_numpy(), train_frac)
X_train, y_train

(array([[5.1, 3.7, 1.5, 0.4],
        [5.6, 2.7, 4.2, 1.3],
        [6.1, 3. , 4.6, 1.4],
        [7.4, 2.8, 6.1, 1.9],
        [5.1, 3.5, 1.4, 0.3],
        [6.8, 3.2, 5.9, 2.3],
        [6. , 2.7, 5.1, 1.6],
        [5.2, 4.1, 1.5, 0.1],
        [6.9, 3.1, 5.1, 2.3],
        [5. , 3.3, 1.4, 0.2],
        [6.1, 2.9, 4.7, 1.4],
        [6.7, 3. , 5.2, 2.3],
        [5.9, 3. , 4.2, 1.5],
        [6.2, 2.2, 4.5, 1.5],
        [5.9, 3. , 5.1, 1.8],
        [7.3, 2.9, 6.3, 1.8],
        [5.1, 3.8, 1.6, 0.2],
        [5.1, 3.5, 1.4, 0.2],
        [6.5, 2.8, 4.6, 1.5],
        [4.7, 3.2, 1.6, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [6.5, 3.2, 5.1, 2. ],
        [4.6, 3.4, 1.4, 0.3],
        [4.9, 2.5, 4.5, 1.7],
        [6.4, 3.1, 5.5, 1.8],
        [7.9, 3.8, 6.4, 2. ],
        [4.8, 3. , 1.4, 0.1],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.6, 1. , 0.2],
        [6.3, 2.5, 5. , 1.9],
        [7.6, 3. , 6.6, 2.1],
        [6.3, 2.3, 4.4, 1.3],
        [4.8, 3.4, 1.9, 0.2],
        [4

In [15]:
knn = KNN(k=7)
knn.fit(X_train, y_train)

In [16]:
y_pred = knn.predict(X_test)
print("Predicted Labels:")
print(y_pred)
print("True Labels:")
print(y_test)

Predicted Labels:
[0 1 2 1 2 1 1 2 0 1 0 0 2 2 0 1 2 0 1 2 0 1 2 1 2 2 0 1 0 0]
True Labels:
[0. 1. 2. 1. 2. 1. 1. 2. 0. 1. 0. 0. 2. 2. 0. 1. 2. 0. 1. 2. 0. 1. 2. 1.
 2. 2. 0. 1. 0. 0.]


In [17]:
accuracies = []
ks = range(1,30)
for k in ks:
    knn.fit(X_train, y_train)
    accuracy = knn.evaluate(X_test, y_test)
    accuracies.append(accuracy)

print(accuracies)

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


### 3. GridSearch from scratch

1. Grid search should happen over two hyperparameters - k and weight.
2. This means the k and weights should really be defined as instance variables for KNN class 

In [18]:
k_values = list(range(3, 8, 2))
weight_values = ['uniform', 'distance']

best_accuracy = 0
best_k = None
best_metric = None

In [19]:
# KNN class that allows setting the number of neighbours and weight=uniform or distance
class KNN:
    def __init__(self, k = 7, weight="uniform"):
        self.k = k
        self.weight = weight
        self

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def euclidean_distance(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2))

    def predict(self, X_test):
            y_pred = [self._predict(x) for x in X_test]
            return np.array(y_pred)
    
    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [self.euclidean_distance(x, x_train) for x_train in self.X_train]
        
        # Sort by distance and return indices of the first k neighbors
        k_indices = np.argsort(distances)[:self.k]
        
        # Extract the labels of the k nearest neighbor training samples
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        
        # TODO: This line of evaluating the k will change when you have to support 
        # hyperparameters for weight=uniform and distance
        # the code you have only supports uniform.
        # Add the code to support distance
        # Weight each k nearest point by 1/distance-squared

        # Return the most common class label
        most_common = np.bincount(k_nearest_labels).argmax()
        return most_common
    
    def evaluate(self, X_test, y_test):
         y_pred = self.predict(X_test)
         accuracy = sum(y_pred == y_test) / len(y_test)
         return accuracy

In [20]:

from itertools import product
        
for k, weight in product(k_values, weight_values):
    knn = KNN(k = 5, weight = 'distance')
    knn.fit(X_train, y_train)
    accuracy = knn.evaluate(X_test, y_test)

    if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_k = k
            best_metric = weight

print(f"Best k: {best_k}")
print(f"Best weight: {best_metric}")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")  

Best k: 3
Best weight: uniform
Best accuracy: 96.67%
