In [2]:
import pandas as pd
import math

In [3]:
def minmax_normalize(serie: pd.Series):
    return (serie - serie.min()) / (serie.max() - serie.min())

In [4]:
def normalize_numerical_attributes(dataset: pd.DataFrame, target_attribute: str):
    normalized = dataset
    for attribute in [attr for attr in dataset.columns if dataset[attr].dtype.name != 'object' and attr != target_attribute]: # only numerical values
        normalized[attribute] = minmax_normalize(dataset[attribute])
    return normalized

In [5]:
class KNNPoint:
    def __init__(self, vector, distance):
        self.vector = vector
        self.distance = distance

In [6]:
class KNN:
    def fit(self, train_dataset: pd.DataFrame, target_attribute):
        normalized_data = normalize_numerical_attributes(train_dataset, target_attribute)
        self.data = normalized_data
        self.target_attribute = target_attribute
    def predict(self, k, test_tuple: pd.Series):
        vector_distances = []
        
        for index, row in self.data.iterrows(): # Setting distance
            sum = 0
            for column in self.data.columns:
                if self.data[column].dtype == 'object': # nominal field
                    if row[column] is None or test_tuple[column] is None:
                        sum += 1
                else: # numerical
                    if row[column] is None and test_tuple[column] is None: # both null
                        sum += 1
                    elif row[column] is not None and test_tuple[column] is not None: # both not-null
                        sum += pow(row[column] - test_tuple[column], 2)# euclidean
                    else: # one of them is null
                        sum += max(1 - row[column], 1 - test_tuple[column])

            vector_distances.append(KNNPoint(row, math.sqrt(sum)))

        vector_distances = sorted(vector_distances, key=lambda x: x.distance)
        nearest_k = vector_distances[:k]

        return pd.Series([item.vector[self.target_attribute] for item in nearest_k]).mode()[0] # major voting

In [7]:
df = pd.read_csv('./datasets/student_admission_record_dirty.csv')
knn_classifier = KNN()

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Name                    147 non-null    object 
 1   Age                     147 non-null    float64
 2   Gender                  147 non-null    object 
 3   Admission Test Score    146 non-null    float64
 4   High School Percentage  146 non-null    float64
 5   City                    147 non-null    object 
 6   Admission Status        147 non-null    object 
dtypes: float64(3), object(4)
memory usage: 8.7+ KB


In [9]:
df.drop(columns=['Name'],inplace=True)

In [10]:
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)


In [11]:
def partition_set(dataset: pd.DataFrame, number_of_sets: int):
    dataset_length = len(dataset)
    partition_size = int(dataset_length / number_of_sets)
    return [dataset[counter * partition_size:(counter + 1) * partition_size] for counter in range(0, number_of_sets)]

In [12]:
def perform_k_fold_cross_validation(classifier: KNN, dataset: pd.DataFrame,k: int, candidate_neighbor_numbers: list):
    data_partitions = partition_set(dataset, k)
    best_neighbor_number, best_accuracy = 0, 0
    for neighbor_number in candidate_neighbor_numbers:
        for index in range(0, k):
            test_dataset = data_partitions[index]
            training_dataset = pd.concat([partition for partition in data_partitions if partition is not test_dataset], axis=0)
            classifier.fit(training_dataset, 'Admission Status')
            tn, tp, fn, fp = 0, 0, 0, 0
            for test_index, test_tuple in test_dataset.iterrows():
                prediction = classifier.predict(neighbor_number, test_tuple)
                if prediction == 'Accepted' and test_tuple['Admission Status'] == 'Accepted':
                    tp += 1
                elif prediction == 'Rejected' and test_tuple['Admission Status'] == 'Rejected':
                    tn += 1
                elif prediction == 'Accepted' and test_tuple['Admission Status'] == 'Rejected':
                    fp += 1
                elif prediction == 'Rejected' and test_tuple['Admission Status'] == 'Accepted':
                    fn += 1

            accuracy = (tp + tn) / len(test_dataset)
            if accuracy >= best_accuracy:
                best_accuracy = accuracy
                best_neighbor_number = neighbor_number

    return best_neighbor_number

In [13]:
print(perform_k_fold_cross_validation(knn_classifier, df, 5, [n for n in range(2, 20)]))

18
