# KNN

In [110]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
import os
import math
import tqdm

In [111]:
# Move up 2 directories
data_directory = '../..' 

# Load the CSV files
asthma_df = pd.read_csv(os.path.join(data_directory, 'Data', 'astma.csv'))

In [112]:
# In the asthma dataset there seems to be a problem with HR00 to HR23
# The heart rate goes above a million sometimes
# It seems like the first couple digits correspond with the expected heart rate.
# 2 cases: HR below 100 and HR above 100

# Loop through each column in the dataframe
def preprocess(asthma_df):
    for col in asthma_df.loc[:, "HR00":"HR23"]:
        for i in range(len(asthma_df[col])):
            if (asthma_df.loc[i, col] > 1000):
                # Take first 2 digits of float
                first_2_digits = str(asthma_df.loc[i, col])[:2]
                temp_number = float(first_2_digits)
                # If heart rate lower than 20 it means it should be greater than 100 (assuming heart rates < 200)
                if temp_number < 20:
                    val = str(asthma_df.loc[i, col])[:3] + '.' + str(asthma_df.loc[i, col])[3]
                    asthma_df.loc[i, col] = float(val)
                else:
                    val = str(asthma_df.loc[i, col])[:2] + '.' + str(asthma_df.loc[i, col])[2]
                    asthma_df.loc[i, col] = float(val)
                pass
    return asthma_df


In [113]:
# get index of a random element in a dataframe
def get_random_index(df, columns):
    # select a random element that is not Nan
    # take a random column in columns
    random_index = np.random.randint(0, len(df))
    random_column = columns[np.random.randint(0, len(columns))]

    df.loc[random_index, random_column]

    while pd.isna(df.loc[random_index, random_column]):
        random_index = np.random.randint(0, len(df))
        random_column = columns[np.random.randint(0, len(columns))]

    return [random_index, random_column]

In [115]:
# Play with the number of neighbors
# Look at the kids with the fewest missing data
k = []
for j in range(1, 6):
    difference = []
    for i in tqdm.tqdm(range(1000)):
        asthma_df = preprocess(pd.read_csv(os.path.join(data_directory, 'Data', 'astma.csv')))
        random_index = get_random_index(asthma_df, ["HR00", "HR01", "HR02", "HR03", "HR04", "HR05", "HR06", "HR07", "HR08", "HR09", "HR10", "HR11", "HR12", "HR13",
                    "HR14", "HR15", "HR16", "HR17", "HR18", "HR19", "HR20", "HR21", "HR22", "HR23"])
        tmp = asthma_df.loc[random_index[0], random_index[1]]
        asthma_df.loc[random_index[0], random_index[1]] = np.nan
        imputer = KNNImputer(n_neighbors = j)          # n_neighbors=2
        asthma_df.loc[:, "HR00":"HR23"] = imputer.fit_transform(asthma_df.loc[:, "HR00":"HR23"])
        # get the absolute value of the difference between the original value and the imputed value
        difference.append(abs(tmp - asthma_df.loc[random_index[0], random_index[1]]))
    k.append(sum(difference))


print(k)

100%|██████████| 1000/1000 [54:15<00:00,  3.26s/it]
100%|██████████| 1000/1000 [51:19<00:00,  3.08s/it]
100%|██████████| 1000/1000 [52:11<00:00,  3.13s/it] 
100%|██████████| 1000/1000 [38:29<00:00,  2.31s/it]
100%|██████████| 1000/1000 [38:37<00:00,  2.32s/it]

[9886.800000000008, 8846.235000000004, 7750.6833333333425, 7681.775000000005, 7417.959999999996]





In [116]:
print(k)

[9886.800000000008, 8846.235000000004, 7750.6833333333425, 7681.775000000005, 7417.959999999996]


In [102]:
k = []
for j in range(10, 20):
    difference = []
    for i in tqdm.tqdm(range(10000)):
        random_index = get_random_index(asthma_df, ["HR00", "HR01", "HR02", "HR03", "HR04", "HR05", "HR06", "HR07", "HR08", "HR09", "HR10", "HR11", "HR12", "HR13",
                    "HR14", "HR15", "HR16", "HR17", "HR18", "HR19", "HR20", "HR21", "HR22", "HR23"])
        tmp = asthma_df.loc[random_index[0], random_index[1]]
        asthma_df.loc[random_index[0], random_index[1]] = np.nan
        imputer = KNNImputer(n_neighbors = j)          # n_neighbors=2
        asthma_df.loc[:, "HR00":"HR23"] = imputer.fit_transform(asthma_df.loc[:, "HR00":"HR23"])
        # get the absolute value of the difference between the original value and the imputed value
        difference.append(abs(tmp - asthma_df.loc[random_index[0], random_index[1]]))
    k.append(sum(difference))

100%|██████████| 10000/10000 [02:11<00:00, 76.20it/s]
100%|██████████| 10000/10000 [01:59<00:00, 83.39it/s]
100%|██████████| 10000/10000 [02:12<00:00, 75.40it/s]
100%|██████████| 10000/10000 [02:58<00:00, 56.15it/s]
100%|██████████| 10000/10000 [02:25<00:00, 68.87it/s]
100%|██████████| 10000/10000 [01:53<00:00, 88.20it/s]
100%|██████████| 10000/10000 [01:49<00:00, 91.11it/s]
100%|██████████| 10000/10000 [01:48<00:00, 92.39it/s]
100%|██████████| 10000/10000 [01:54<00:00, 87.47it/s]
100%|██████████| 10000/10000 [01:51<00:00, 90.05it/s]


In [103]:
print(k)

[20215.63292948599, 18617.698144328155, 17360.77517638567, 16301.231740872949, 15287.57299783265, 14196.36419843131, 13318.431648314765, 12265.154200059853, 11843.357969773851, 10958.839494960448]


## Compute the kids with the fewest missing values

In [104]:
def extractChild(df):

    # Create a new database per children and save it into a list of all the same subject numbers.
    df_children = []    # List of all the children
    
    # Loop through all the subject numbers
    for subject in df['SubjectNr'].unique():
        df_children.append(df[df['SubjectNr'] == subject])
    
    return df_children

In [109]:
# Move up 2 directories
data_directory = '../..' 

# Load the CSV files
asthma_df = pd.read_csv(os.path.join(data_directory, 'Data', 'astma.csv'))

asthma_df_children = extractChild(asthma_df)

counter = []
#for every child compute the number of Nan in the HR columns
for i in range(len(asthma_df_children)):
    counter.append((i, asthma_df_children[i].loc[:, "HR00":"HR23"].isna().sum().sum()))

print(counter)

# sort the counter list by the number of Nan
counter.sort(key=lambda x: x[1])

print(counter)

[(0, 24), (1, 91), (2, 72), (3, 15), (4, 23), (5, 16), (6, 76), (7, 61), (8, 25), (9, 53), (10, 85), (11, 273), (12, 56), (13, 230), (14, 42), (15, 613), (16, 108), (17, 36), (18, 16), (19, 13), (20, 18), (21, 100), (22, 98), (23, 108), (24, 42), (25, 39), (26, 40), (27, 163), (28, 47), (29, 420), (30, 109), (31, 21), (32, 38), (33, 36), (34, 37), (35, 460), (36, 24), (37, 23), (38, 22), (39, 93), (40, 36), (41, 47), (42, 345), (43, 28), (44, 27), (45, 28), (46, 431), (47, 52), (48, 156), (49, 130), (50, 29), (51, 39), (52, 76), (53, 411), (54, 92), (55, 225), (56, 44), (57, 190), (58, 41), (59, 284), (60, 19), (61, 19), (62, 72), (63, 38), (64, 20), (65, 54), (66, 43), (67, 69), (68, 42), (69, 26), (70, 82), (71, 26), (72, 124), (73, 67), (74, 38), (75, 40), (76, 28), (77, 27), (78, 60), (79, 213), (80, 131), (81, 42), (82, 102), (83, 32), (84, 32), (85, 293), (86, 117), (87, 59), (88, 46), (89, 20)]
[(19, 13), (3, 15), (5, 16), (18, 16), (20, 18), (60, 19), (61, 19), (64, 20), (89, 2

In [None]:
k = []
for j in range(10, 20):
    difference = []
    for i in tqdm.tqdm(range(10000)):
        random_index = get_random_index(asthma_df, ["HR00", "HR01", "HR02", "HR03", "HR04", "HR05", "HR06", "HR07", "HR08", "HR09", "HR10", "HR11", "HR12", "HR13",
                    "HR14", "HR15", "HR16", "HR17", "HR18", "HR19", "HR20", "HR21", "HR22", "HR23"])
        tmp = asthma_df.loc[random_index[0], random_index[1]]
        asthma_df.loc[random_index[0], random_index[1]] = np.nan
        imputer = KNNImputer(n_neighbors = j)          # n_neighbors=2
        asthma_df.loc[:, "HR00":"HR23"] = imputer.fit_transform(asthma_df.loc[:, "HR00":"HR23"])
        # get the absolute value of the difference between the original value and the imputed value
        difference.append(abs(tmp - asthma_df.loc[random_index[0], random_index[1]]))
    k.append(sum(difference))