In [None]:
# Mount Google Drive at /content/drive; the dataset is read from a path under
# this mount point further down.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# importing library
import pickle
import statistics

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

# kNN Classifier

In [None]:
class KNN_Classifier1():
    """k-nearest-neighbours classifier (Euclidean distance, majority vote).

    The training set is memorised at construction time; `predict` labels each
    query point with the most common label among its `k` closest training
    rows.

    Attributes:
        X_train: the features exactly as passed to the constructor.
        y_train: the labels exactly as passed to the constructor.
        k: number of neighbours that vote.
        training_data1: 2-D array with one row per training sample laid out
            as ``[feature_0, ..., feature_n, label]``.
    """

    # initiating the parameters
    def __init__(self, k, X, y):
        """Store the training data.

        Args:
            k: number of neighbours to consult (must be >= 1).
            X: 2-D array-like of numeric features, shape (n_samples, n_features).
               NumPy arrays and pandas DataFrames are both accepted.
            y: 1-D array-like of labels, one per row of X. Rows are paired
               with X by position, not by pandas index.

        Raises:
            ValueError: if X is not 2-D, X is empty, X and y disagree on the
                number of samples, or k < 1.
        """
        # Use the arguments that were passed in (not same-named globals) and
        # go through NumPy so ndarray inputs such as scaler output work too.
        features = np.asarray(X, dtype=float)
        labels = np.asarray(y).reshape(-1)

        if features.ndim != 2:
            raise ValueError("X must be 2-dimensional: (n_samples, n_features)")
        if features.shape[0] == 0:
            raise ValueError("X must contain at least one training sample")
        if features.shape[0] != labels.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if k < 1:
            raise ValueError("k must be at least 1")

        self.X_train = X
        self.y_train = y
        self.k = k

        # Features and labels are kept apart for the vectorised search so that
        # non-numeric labels never end up in distance arithmetic.
        self._features = features
        self._labels = labels

        # Combined "features + label" table, as exposed by nearest_neighbors.
        if labels.dtype.kind in "biuf":
            self.training_data1 = np.column_stack([features, labels])
        else:
            # Object dtype keeps numeric features and arbitrary labels intact
            # instead of coercing everything to strings.
            self.training_data1 = np.column_stack(
                [features.astype(object), labels.astype(object)]
            )

    # getting the  euclidean distance
    def get_distance(self, training_data_point, test_data_point):
        """Return the Euclidean distance between a training row and a test point.

        Args:
            training_data_point: one row laid out like `training_data1`
                (features followed by the label); the trailing label is
                excluded from the distance.
            test_data_point: feature vector without a label.
        """
        difference = training_data_point[:-1] - test_data_point
        return np.sqrt(np.sum(difference ** 2))

    def _neighbor_indices(self, test_data):
        """Return the indices of the k closest training rows, nearest first."""
        difference = self._features - np.asarray(test_data, dtype=float)
        distances = np.sqrt(np.sum(difference ** 2, axis=1))
        # A stable sort keeps equally distant rows in training order.
        return np.argsort(distances, kind="stable")[: self.k]

    # getting the nearest neighbors
    def nearest_neighbors(self, test_data):
        """Return the k training rows closest to `test_data`, nearest first.

        Each returned row has the layout ``[features..., label]``.
        """
        return [self.training_data1[index] for index in self._neighbor_indices(test_data)]

    # predict the class of the new data point:
    def predict(self, X_test):
        """Predict a label for every row of `X_test`.

        Args:
            X_test: 2-D array-like of numeric features with the same columns
                (and the same preprocessing) as the training features.

        Returns:
            list with one predicted label per row of X_test. Vote ties are
            resolved by `statistics.mode`.
        """
        y_pred = []

        # np.asarray makes row iteration work for DataFrames as well; iterating
        # a DataFrame directly would yield column names.
        for test_data in np.asarray(X_test, dtype=float):
            votes = self._labels[self._neighbor_indices(test_data)].tolist()
            y_pred.append(statistics.mode(votes))

        return y_pred





In [None]:
# Load the dataset from the mounted Drive folder. The path has no placeholders,
# so a plain string literal is used instead of an f-string.
df = pd.read_csv('/content/drive/MyDrive/ML DATASETS/BRFSS22CleanedSelected.csv')

In [None]:

# The DiabetesBinary column is the target; every other column is a feature.
y = df['DiabetesBinary']
X = df.drop(columns='DiabetesBinary')

In [None]:
# Number of rows in the loaded dataset.
len(df)

253791

**Train, validation, and test split**

In [None]:
# Hold out 20% of the rows; the remaining 80% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the hold-out portion again (80% validation, 20% test) so that the
# X_test / y_test names used by later cells are actually defined.
# NOTE(review): this second split was missing from the notebook. test_size=0.2
# reproduces the recorded X_test shape of (10152, 9), but the random_state of
# the original split is unknown - confirm.
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=42)



In [None]:
# Display the (rows, columns) shape of the training features.
X_train.shape

(203032, 9)

In [None]:
# Display the (rows, columns) shape of the test features.
X_test.shape



(10152, 9)

**Handling Imbalanced Data**

In [None]:
# Oversample with SMOTE using a fixed seed. Only the training split
# (X_train / y_train) is passed in; the test split is not resampled here.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Put the resampled target next to the resampled features (target column first)
# so both can be inspected in a single frame.
dfResampled = pd.concat([y_train_resampled, X_train_resampled], axis='columns')

In [None]:
# Summary statistics of the resampled training frame (target + features).
dfResampled.describe()

Unnamed: 0,DiabetesBinary,BMI,Stroke,HeartDiseaseorAttack,RoutineCheckupSpan,GenHlth,PhysHlth,DiffWalk,Age,Income
count,345770.0,345770.0,345770.0,345770.0,345770.0,345770.0,345770.0,345770.0,345770.0,345770.0
mean,0.5,30.166168,0.039975,0.108494,1.233765,2.796851,5.866894,0.202392,8.234916,6.472123
std,0.500001,7.07074,0.1959,0.311004,0.656725,1.029327,9.934843,0.401783,3.244868,2.451615
min,0.0,12.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
25%,0.0,25.0,0.0,0.0,1.0,2.0,0.0,0.0,6.0,5.0
50%,0.5,29.0,0.0,0.0,1.0,3.0,0.0,0.0,9.0,7.0
75%,1.0,34.0,0.0,0.0,1.0,3.0,6.0,0.0,11.0,8.0
max,1.0,98.0,1.0,1.0,4.0,5.0,30.0,1.0,13.0,11.0


**Feature Scaling**

In [None]:

# Fit the scaler on the resampled training features only, then apply that same
# fitted transform to the test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)



X_train_scaled --> scaled training features

y_train_resampled   --> target values of the training data

X_test  --> test data without target

Training the classifier

In [None]:
# Build the from-scratch kNN classifier with k=5, passing the scaled training
# features and the resampled training labels.
classifier1 = KNN_Classifier1(k=5,X=X_train_scaled,y=y_train_resampled)

In [None]:
# Rebind X_test to a NumPy array built from its current value.
X_test=np.array(X_test)

In [None]:
# Predict the whole test split. classifier1 was constructed from
# X_train_scaled, so the query points must be the test features transformed by
# the same fitted scaler (X_test_scaled), not the raw X_test.
y_pred_test = classifier1.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred_test)
print(accuracy * 100)  # accuracy score in %




85.4117415287628


# **Saving The Model**

In [None]:
filename = 'knnTrainedModelFromScratch.sav'
# A context manager guarantees the file is flushed and closed after the dump;
# a bare open() passed straight to pickle.dump leaves the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier1, model_file)