# Import libraries

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

# Set the random seed
np.random.seed(1234)

# Set the device into GPU

In [18]:
# if torch.cuda.is_available():
#   device = torch.device("cuda")
# print(f"Device using: {device}")

# Install the ucimlrepo package

In [None]:
# pip install ucimlrepo

# Load the dataframe 1 (df1)

*   Shape of X: (2278, 10)
*   Shape of y: (2278, 1)



In [24]:
# Dataset 1: NHANES age prediction.csv
#(National Health and Nutrition Health Sur- vey 2013-2014 (NHANES) Age Prediction Subset)
df1 = pd.read_csv('./dataset/NHANES_age_prediction.csv') # Change it to wherever you store your dataset

# Load and preprocess the dataset 2 (df2)

*   Shape of X: (499, 9)
*   Shape of y: (449, 1)

In [30]:
from ucimlrepo import fetch_ucirepo

dataset2 = fetch_ucirepo(id=15)
X2 = dataset2.data.features
y2 = dataset2.data.targets
df2 = pd.concat([X2,y2], axis = 1)
df2 = df2.drop_duplicates()
df2 = df2.dropna()


X = df2.iloc[:, :-1].values
y = pd.get_dummies(df2.iloc[:, -1]).values

# .values: Change the panda dataframe to numpy array

# Print the feature shape and classes of dataset
(N,D), C = X.shape, np.unique(y)
print(f'instances (N) \t {N} \n features (D) \t {D} \n classes (C) \t {C}')


#generates an indices array from 0 to N-1 and permutes it
inds = np.random.permutation(N)

train_split, validate_split, test_split = 0.33, 0.33, 0.33

# Calculate the indices for each split
train_end = int(len(X) * train_split)
validate_end = int(len(X) * (train_split + validate_split))

# Split the data
x_train, y_train = X[inds[:train_end]], y[inds[:train_end]]
x_validate, y_validate = X[inds[train_end:validate_end]], y[inds[train_end:validate_end]]
x_test, y_test = X[inds[validate_end:]], y[inds[validate_end:]]

# Calculate the mean and standard deviation of each feature in the training set
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)

# Standardize the training data
x_train = (x_train - mean) / std

# Standardize the validation and test data using the same mean and std
x_validate = (x_validate - mean) / std
x_test = (x_test - mean) / std

instances (N) 	 449 
 features (D) 	 9 
 classes (C) 	 [False  True]


# Preprocess the dataset 1


In [25]:
# Basic information of df1
df1.info()

# Clean the dataset
df1.isnull().sum()
print("Note: There is no missing value.\n")

# Drop duliplicate
df1.drop_duplicates(inplace=True)

# As this task is for classification the input feature and the Target variable will be as follows:
# Input Feature : ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']
# Target : ['age_group']
X = df1[['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']].values
y = pd.get_dummies(df1['age_group']).values # One-hot encoding (Change the categorial y into integer array)

# .values: Change the panda dataframe to numpy array

# Print the feature shape and classes of dataset
(N,D), C = X.shape, np.unique(y)
print(f'instances (N) \t {N} \n features (D) \t {D} \n classes (C) \t {C}')


#generates an indices array from 0 to N-1 and permutes it
inds = np.random.permutation(N)

train_split, validate_split, test_split = 0.33, 0.33, 0.33

# Calculate the indices for each split
train_end = int(len(X) * train_split)
validate_end = int(len(X) * (train_split + validate_split))

# Split the data
x_train, y_train = X[inds[:train_end]], y[inds[:train_end]]
x_validate, y_validate = X[inds[train_end:validate_end]], y[inds[train_end:validate_end]]
x_test, y_test = X[inds[validate_end:]], y[inds[validate_end:]]

# Calculate the mean and standard deviation of each feature in the training set
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)

# Standardize the training data
x_train = (x_train - mean) / std

# Standardize the validation and test data using the same mean and std
x_validate = (x_validate - mean) / std
x_test = (x_test - mean) / std

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2278 entries, 0 to 2277
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SEQN       2278 non-null   float64
 1   age_group  2278 non-null   object 
 2   RIDAGEYR   2278 non-null   float64
 3   RIAGENDR   2278 non-null   float64
 4   PAQ605     2278 non-null   float64
 5   BMXBMI     2278 non-null   float64
 6   LBXGLU     2278 non-null   float64
 7   DIQ010     2278 non-null   float64
 8   LBXGLT     2278 non-null   float64
 9   LBXIN      2278 non-null   float64
dtypes: float64(9), object(1)
memory usage: 178.1+ KB
Note: There is no missing value.

instances (N) 	 2278 
 features (D) 	 7 
 classes (C) 	 [False  True]


# KNN model

In [26]:
class KNN:
    def __init__(self, K, distance_fn):
        self.K = K
        self.distance_fn = distance_fn
        return

    def fit(self, x, y):
        self.x = x
        self.y = y

        # Number of labels
        self.C = len(np.unique(y))
        return self

    def predict(self, x_test):
        # Calculate distances using the distance function
        distances = self.distance_fn(self.x[None,:,:], x_test[:,None,:])

        num_test = len(x_test)
        # Stores the indices of k closest training samples to each test sample
        knns = np.zeros((num_test, self.K), dtype=int)
        # Stores the probability distribution over C classes
        y_prob = np.zeros((num_test, self.C))

        for i in range(num_test):
            knn_indices = np.argsort(distances[i])[:self.K]
            for k in knn_indices:
                neighbor_label_vector = self.y[k]
                weight = 1 / (distances[i][k] + 1e-5)  # inverse distance as weight
                y_prob[i] += weight * neighbor_label_vector

        y_pred = np.argmax(y_prob, axis=1)
        return y_pred


# Choose the best hyperparameter K

In [31]:
best_k = None
best_accuracy = 0
euclidean = lambda x1, x2: np.sqrt(np.sum((x1 - x2)**2, axis=-1))

# Convert y_validate from one-hot encoding to class indices if necessary
y_validate_indices = np.argmax(y_validate, axis=1)

# Try different values of K
for K in range(1, 30):  # Assuming we are testing K from 1 to 19
    model = KNN(K=K, distance_fn=euclidean)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_validate)

    # Calculate accuracy
    accuracy = np.sum(y_pred == y_validate_indices) / len(y_validate_indices)
    print(f'K = {K}, Validation Accuracy = {accuracy * 100:.2f}%')

    # Update best K if current accuracy is better
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = K

print(f'Best K: {best_k} with Validation Accuracy: {best_accuracy * 100:.2f}%')


K = 1, Validation Accuracy = 93.24%
K = 2, Validation Accuracy = 93.24%
K = 3, Validation Accuracy = 95.27%
K = 4, Validation Accuracy = 95.27%
K = 5, Validation Accuracy = 95.95%
K = 6, Validation Accuracy = 95.95%
K = 7, Validation Accuracy = 95.95%
K = 8, Validation Accuracy = 96.62%
K = 9, Validation Accuracy = 95.95%
K = 10, Validation Accuracy = 96.62%
K = 11, Validation Accuracy = 95.27%
K = 12, Validation Accuracy = 95.95%
K = 13, Validation Accuracy = 95.27%
K = 14, Validation Accuracy = 95.27%
K = 15, Validation Accuracy = 95.27%
K = 16, Validation Accuracy = 95.27%
K = 17, Validation Accuracy = 95.27%
K = 18, Validation Accuracy = 95.95%
K = 19, Validation Accuracy = 95.27%
K = 20, Validation Accuracy = 95.95%
K = 21, Validation Accuracy = 94.59%
K = 22, Validation Accuracy = 95.27%
K = 23, Validation Accuracy = 94.59%
K = 24, Validation Accuracy = 93.92%
K = 25, Validation Accuracy = 93.92%
K = 26, Validation Accuracy = 93.92%
K = 27, Validation Accuracy = 93.92%
K = 28, Va

# Calculate the final evaluation metric

In [21]:
def evaluate_acc(y_pred, y_test_indices):
    return np.sum(y_pred == y_test_indices)/y_test_indices.shape[0]

In [32]:
# Choosing the best K according to the validation set above
myK = 8
euclidean = lambda x1, x2: np.sqrt(np.sum((x1 - x2)**2, axis=-1))

model = KNN(K=myK, distance_fn=euclidean)

y_pred = model.fit(x_train, y_train).predict(x_test)

# This step is converting y_test from one-hot encoding back to class index
y_test_indices = np.argmax(y_test, axis=1)

accuracy = evaluate_acc(y_pred, y_test_indices)
print(f'accuracy is {accuracy*100:.1f}.')



accuracy is 94.1.
