In [20]:
#TRAIN/TEST SPLIT FROM QUESTION 4
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer #Added so we can deal with NaN values

# Load the CSV, drop the 'id' column (identifiers only, not a feature) and strip
# leading/trailing whitespace from the class labels, all in a single chain.
# NOTE(review): the path is an absolute local path; consider a configurable data directory.
kidney_disease_data_frame = (
    pd.read_csv("D:/datasets/kidney_disease.csv")
    .drop(columns=['id'], errors='ignore')  # errors='ignore': no error if 'id' is absent
    .assign(classification=lambda frame: frame['classification'].str.strip())
)

# Labels are the 'classification' column.
target_labels = kidney_disease_data_frame['classification']

# Features are every column other than 'classification'. String-valued columns are
# expanded into indicator columns (first level of each dropped) so the model
# receives purely numeric input.
# NOTE(review): any numeric column that pandas read as text would be one-hot
# encoded here as well -- verify the column dtypes before relying on this.
feature_matrix = pd.get_dummies(
    kidney_disease_data_frame.drop(columns=['classification']),
    drop_first=True,
)

# 70/30 train/test split; the fixed random_state keeps the split repeatable.
(
    model_features_train,
    model_features_test,
    model_labels_train,
    model_labels_test,
) = train_test_split(
    feature_matrix,
    target_labels,
    test_size=0.3,
    random_state=69,
)

# Replace missing values with per-column medians instead of dropping rows.
# The medians are learned from the training rows only and then applied to both
# the training and the test rows.
imputer = SimpleImputer(strategy='median')
imputer.fit(model_features_train)
model_features_train = imputer.transform(model_features_train)
model_features_test = imputer.transform(model_features_test)
# NOTE(review): these features are intended for a KNN model, which is distance-based;
# the columns are not rescaled here, so wide-ranged columns may dominate -- confirm
# whether standardizing (fitted on the training rows only) is wanted.

In [21]:
#FINDING ACCURACY FOR DIFFERENT K VALUES
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


# Candidate values of k (number of neighbors) to evaluate.
possible_k_values = [1, 3, 5, 7, 9]

# Test-set accuracy for each entry of possible_k_values, in the same order.
accuracy_list = []

for neighbor_count in possible_k_values:
    # fit() returns the fitted estimator, so construction and training are chained.
    knn_model = KNeighborsClassifier(n_neighbors=neighbor_count).fit(
        model_features_train, model_labels_train
    )

    # Predict the held-out rows, then score the predictions against the true labels.
    predicted_labels = knn_model.predict(model_features_test)
    accuracy = accuracy_score(model_labels_test, predicted_labels)

    accuracy_list += [accuracy]



In [22]:
#CREATING A TABLE FROM ACCURACY LIST

# Assemble the table column by column: one row per tested k, pairing the k value
# with the test-set accuracy measured for it.
results_columns = {}
results_columns['K-Value'] = possible_k_values
results_columns['Accuracy'] = accuracy_list

results_table = pd.DataFrame(results_columns)

print(results_table)

   K-Value  Accuracy
0        1  0.808333
1        3  0.808333
2        5  0.783333
3        7  0.808333
4        9  0.800000


In [None]:
#How changing k affects the behavior of the model
#Changing k changes the behavior of the model because "k" is the number of nearest neighbors, i.e. the training data points that are mathematically closest to a given data point, whose labels are used to make a prediction. Choosing k poorly can therefore lead to overfitting or underfitting, because too few or too many neighbors influence each prediction.

#Why very small values of k may cause overfitting
#Very small values of k may cause overfitting because only a small number of other data points are taken into consideration when making a prediction. The prediction then relies on just those few points, so the model is more sensitive to noise and outliers and effectively memorizes the training data instead of generalizing.

#Why very large values of k may cause underfitting
#Very large values of k make the model oversimplify because they greatly broaden the set of neighbors used to predict each value. Specific local patterns in the data may get washed out, and predictions may simply reflect whichever class is the majority among the neighbors. This is problematic in situations like this one, where we are looking for potential kidney failure: we do not want the model to automatically claim a patient does not have kidney disease just because the majority of the neighboring data points are patients without kidney disease.