In [None]:
# This code block will contain implementations of the KNN algorithm and some of its variants.
import os

import numpy as np
import pandas as pd
class KNN:
  """
  A k-nearest-neighbours classifier backed by a pandas DataFrame.

  Distance between a testing instance and a training row is the sum of the
  absolute per-feature differences (Manhattan / L1 distance). Every column of
  the training set other than the label column is used as a feature.
  """

  def __init__(self, k, label_column="Outcome"):
    """
    Takes in:
      k - the number of nearest neighbors to consider. Must be at least 1.
      label_column - name of the column that holds the class label.
        Defaults to "Outcome".

    Raises:
      ValueError if k is smaller than 1.

    You have access to the following variables:
      self.k - the number of nearest neighbors to consider
      self.label_column - the name of the label column
      self.training_set - the training set; an empty Pandas Dataframe until train() is called
      self.label_set - the distinct labels seen in the training set; empty until train() is called
    """
    # k = 0 would always "predict" the first label and a negative k would
    # slice the neighbor list from the wrong end, so reject both up front.
    if k < 1:
      raise ValueError(f"k must be at least 1, got {k}.")
    self.k = k
    self.label_column = label_column
    self.training_set = pd.DataFrame()
    self.label_set = []

  def train(self, csvfile):
    """
    Takes in:
      csvfile - A path to the csv file we want to take in (anything pandas.read_csv accepts).

    Outputs:
      Nothing

    Raises:
      KeyError if the csv file has no column named self.label_column.

    Side effects:
      self.training_set = the contents of the csv file as a pandas dataframe
      self.label_set = the distinct values of the label column, in order of first appearance
    """
    csv_in_panda_form = pd.read_csv(csvfile)
    # Validate before touching self.* so a bad file cannot leave the classifier half-updated.
    if self.label_column not in csv_in_panda_form.columns:
      raise KeyError(f"Label column {self.label_column!r} not found in the training data.")
    self.training_set = csv_in_panda_form
    self.label_set = csv_in_panda_form[self.label_column].unique()

  def get_training_set(self):
    """
    Returns the training set
    """
    return self.training_set

  def nearest_neighbors(self, x):
    """
    Finds the k nearest neighbors of testing instance x.

    Takes in:
      x - a pandas Series that can be indexed by every feature column of the training set.
        A label entry is allowed but ignored.

    Outputs:
      A tuple of training rows (pandas Series), closest first. It holds fewer than k rows when
      the training set has fewer than k rows, and is empty when there is no training data.
      Rows at equal distance keep their training-set order.
    """
    if len(self.training_set) == 0:
      return ()

    feature_columns = self.training_set.columns.drop(self.label_column)
    # One vectorised pass instead of a Python-level loop over iterrows():
    # subtract x from every row, then sum the absolute differences per row.
    distances = (
      self.training_set[feature_columns]
      .sub(x[feature_columns], axis="columns")
      .abs()
      .sum(axis=1)
    )
    # A stable sort keeps tied rows in their original order.
    nearest_positions = np.argsort(distances.to_numpy(), kind="stable")[:self.k]
    return tuple(self.training_set.iloc[position] for position in nearest_positions)

  def predict(self, x):
    """
    Predicts a label, y, for the testing instance, x, based on the k nearest neighbors.

    The label with the most votes among the neighbors wins; on a tie, the label that
    appears first in self.label_set wins. When the training set holds fewer than k rows,
    all of its rows vote.

    Raises:
      ValueError if there is no training data to vote with.
    """
    nearest_neighbors_of_x = self.nearest_neighbors(x)
    if len(nearest_neighbors_of_x) == 0:
      raise ValueError("Cannot predict without training data; call train() first.")

    label_counts = np.array([
      sum(1 for neighbor in nearest_neighbors_of_x if neighbor[self.label_column] == label)
      for label in self.label_set
    ])

    # Compare against the neighbors actually returned, not self.k: there can be
    # fewer than k of them when the training set is small.
    assert label_counts.sum() == len(nearest_neighbors_of_x), "Total number of labels assigned and number of nearest neighbors picked do not align."
    return self.label_set[label_counts.argmax()]
    



In [23]:
# This code block acts as a kind of main function, running code to use the larger class.

# Location of the diabetes dataset. Set the DIABETES_CSV environment variable to point at
# your own copy; the fallback is the path this notebook was originally run with.
DIABETES_CSV = os.environ.get(
  "DIABETES_CSV",
  "/Users/alifabdullah/Collaboration/Kaggle-ML-Algorithm-Musings/datasets/diabetes.csv",
)
# Number of neighbors that vote on each prediction.
NUM_NEIGHBORS = 16

knn_classifier = KNN(k=NUM_NEIGHBORS)
knn_classifier.train(DIABETES_CSV)
pandafied_csv = knn_classifier.get_training_set()
# 1 means they have diabetes

# Smoke test on the first row. This row is itself part of the training set, so it is
# always one of its own nearest neighbors (distance 0); this is not a held-out evaluation.
first_instance = pandafied_csv.iloc[0]
print(first_instance)
print(knn_classifier.predict(first_instance))
print(len(pandafied_csv))



Pregnancies                   6.000
Glucose                     148.000
BloodPressure                72.000
SkinThickness                35.000
Insulin                       0.000
BMI                          33.600
DiabetesPedigreeFunction      0.627
Age                          50.000
Outcome                       1.000
Name: 0, dtype: float64
1
768
