# KNN

In [128]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
import os
import math
import tqdm

In [129]:
# Move up 2 directories
data_directory = '../..'

# Build the location of the raw asthma measurements first, then read the
# file into the working dataframe.
asthma_csv_path = os.path.join(data_directory, 'Data', 'astma.csv')
asthma_df = pd.read_csv(asthma_csv_path)

In [130]:
# In the asthma dataset there seems to be a problem with HR00 to HR23
# The heart rate goes above a million sometimes
# It seems like the first couple digits correspond with the expected heart rate.
# 2 cases: HR below 100 and HR above 100

# Loop through each column in the dataframe
def _repair_heart_rate(value):
    """Rebuild a heart rate from a reading above 1000 whose digits ran together.

    The leading digits of the corrupted reading are taken to be the real
    heart rate: if the first two digits are below 20 the rate is assumed to be
    three digits long (100-199), otherwise two digits long. One further digit
    is kept as the first decimal place.
    """
    if value == float("inf"):
        # No digits to recover from an infinite reading; leave it untouched.
        return value
    # Use the integer part so very large floats (whose str() is in scientific
    # notation, e.g. '1e+16') still yield plain leading digits.
    digits = str(int(value))
    if int(digits[:2]) < 20:
        return float(digits[:3] + '.' + digits[3])
    return float(digits[:2] + '.' + digits[2])


def preprocess(asthma_df):
    """Repair corrupted heart-rate readings in columns "HR00" through "HR23".

    Every value greater than 1000 in those columns is replaced by the heart
    rate rebuilt from its leading digits (see ``_repair_heart_rate``). NaN
    values and values of at most 1000 are left as they are.

    Parameters:
        asthma_df: dataframe containing the columns "HR00" .. "HR23".

    Returns:
        The same dataframe object, modified in place.
    """
    for col in asthma_df.loc[:, "HR00":"HR23"].columns:
        readings = asthma_df[col]
        # NaN compares False, so missing readings are never flagged.
        corrupted = (readings > 1000).to_numpy()
        if not corrupted.any():
            continue
        # Work positionally so the repair does not depend on the dataframe
        # having a default 0..n-1 index.
        repaired = readings.to_numpy(dtype=float, copy=True)
        repaired[corrupted] = [_repair_heart_rate(v) for v in repaired[corrupted]]
        asthma_df[col] = repaired
    return asthma_df


In [131]:
# get index of a random element in a dataframe
def get_random_index(df, columns):
    """Pick a random non-NaN cell of ``df`` restricted to ``columns``.

    Every non-NaN cell in the given columns is equally likely to be chosen.

    Parameters:
        df: dataframe to sample from.
        columns: sequence of column names that may be sampled.

    Returns:
        A two-element list ``[row_label, column_name]`` usable as
        ``df.loc[row_label, column_name]``.

    Raises:
        ValueError: if the selected columns contain no non-NaN value (the
            previous retry loop would never have terminated in that case).
    """
    # Locate every candidate cell once instead of redrawing until a non-NaN
    # cell happens to be hit.
    present = df.loc[:, list(columns)].notna().to_numpy()
    row_positions, column_positions = np.nonzero(present)
    if row_positions.size == 0:
        raise ValueError("no non-NaN value available in the requested columns")

    pick = np.random.randint(0, row_positions.size)
    # Translate the row position into the dataframe's own index label so the
    # result is valid for .loc even when the index is not 0..n-1.
    return [df.index[row_positions[pick]], columns[column_positions[pick]]]

In [132]:
# Load the raw measurements, repair the corrupted heart-rate columns and keep
# the cleaned table on disk for the experiments below.
raw_asthma = pd.read_csv(os.path.join(data_directory, 'Data', 'astma.csv'))
asthma_tmp = preprocess(raw_asthma)
asthma_tmp.to_csv('asthma_tmp.csv', index=False)

In [133]:
# Play with the number of neighbors
# Look at the kids with the fewest missing data
# Hold-out experiment: in every trial hide one known heart-rate value, let the
# KNN imputer fill it in, and record the absolute error of the imputed value.

# The neighbour count used to come from a variable `j` that is only defined by
# a later cell, so a fresh top-to-bottom run failed with NameError. The value
# 2 follows the original inline note ("n_neighbors=2").
n_neighbors = 2
hr_columns = ["HR{:02d}".format(hour) for hour in range(24)]  # "HR00" .. "HR23"

# Read the preprocessed table once; each trial works on its own copy so that
# imputed values from one trial never leak into the next.
preprocessed_df = pd.read_csv('asthma_tmp.csv')

difference = []
for i in tqdm.tqdm(range(1000)):
    asthma_df = preprocessed_df.copy()
    row, column = get_random_index(asthma_df, hr_columns)
    tmp = asthma_df.loc[row, column]
    asthma_df.loc[row, column] = np.nan
    imputer = KNNImputer(n_neighbors=n_neighbors)
    asthma_df.loc[:, "HR00":"HR23"] = imputer.fit_transform(asthma_df.loc[:, "HR00":"HR23"])
    # get the absolute value of the difference between the original value and the imputed value
    difference.append(abs(tmp - asthma_df.loc[row, column]))

print(difference)

100%|██████████| 1000/1000 [16:19<00:00,  1.02it/s]

[1.9449999999999932, 10.349999999999994, 8.905000000000015, 9.570000000000007, 8.209999999999994, 0.09999999999999432, 23.75, 0.17500000000001137, 15.974999999999994, 15.294999999999987, 20.900000000000006, 6.769999999999996, 7.75, 6.009999999999991, 11.21, 11.959999999999994, 19.99000000000001, 2.5449999999999875, 14.245000000000005, 2.240000000000009, 9.25, 6.1499999999999915, 1.0049999999999955, 11.180000000000021, 6.160000000000011, 2.1500000000000057, 3.769999999999996, 3.740000000000009, 48.49000000000001, 5.144999999999996, 8.02000000000001, 6.089999999999989, 1.7800000000000011, 5.179999999999993, 3.0150000000000006, 10.920000000000002, 3.3599999999999994, 8.08999999999999, 20.710000000000008, 6.8999999999999915, 13.414999999999992, 3.7700000000000102, 6.530000000000001, 2.9550000000000125, 6.960000000000008, 13.424999999999997, 10.100000000000009, 0.29999999999999716, 15.200000000000003, 4.280000000000015, 1.2049999999999983, 3.0250000000000057, 5.4599999999999795, 4.420000000




In [134]:
# Show the absolute imputation errors collected in `difference`, one per trial.
print(difference)

[1.9449999999999932, 10.349999999999994, 8.905000000000015, 9.570000000000007, 8.209999999999994, 0.09999999999999432, 23.75, 0.17500000000001137, 15.974999999999994, 15.294999999999987, 20.900000000000006, 6.769999999999996, 7.75, 6.009999999999991, 11.21, 11.959999999999994, 19.99000000000001, 2.5449999999999875, 14.245000000000005, 2.240000000000009, 9.25, 6.1499999999999915, 1.0049999999999955, 11.180000000000021, 6.160000000000011, 2.1500000000000057, 3.769999999999996, 3.740000000000009, 48.49000000000001, 5.144999999999996, 8.02000000000001, 6.089999999999989, 1.7800000000000011, 5.179999999999993, 3.0150000000000006, 10.920000000000002, 3.3599999999999994, 8.08999999999999, 20.710000000000008, 6.8999999999999915, 13.414999999999992, 3.7700000000000102, 6.530000000000001, 2.9550000000000125, 6.960000000000008, 13.424999999999997, 10.100000000000009, 0.29999999999999716, 15.200000000000003, 4.280000000000015, 1.2049999999999983, 3.0250000000000057, 5.4599999999999795, 4.420000000

## Compute the kids with the fewest missing values

In [135]:
def extractChild(df):
    """Split ``df`` into one dataframe per distinct value of 'SubjectNr'.

    The result is a list with one entry per subject number, ordered by the
    subject's first appearance in ``df``; each entry holds exactly the rows
    belonging to that subject.
    """
    subject_numbers = df['SubjectNr']
    # One boolean-mask selection per distinct subject number.
    return [df[subject_numbers == current] for current in subject_numbers.unique()]

In [136]:
# Move up 2 directories
data_directory = '../..'

# Reload the raw measurements and split them into one dataframe per child.
asthma_df = pd.read_csv(os.path.join(data_directory, 'Data', 'astma.csv'))
asthma_df_children = extractChild(asthma_df)

# Pair every child's position in the list with the total number of NaN
# entries across its HR00..HR23 columns.
counter = [
    (child_position, child.loc[:, "HR00":"HR23"].isna().sum().sum())
    for child_position, child in enumerate(asthma_df_children)
]

print(counter)

# Order the children from fewest to most missing heart-rate values.
counter.sort(key=lambda entry: entry[1])

print(counter)

[(0, 24), (1, 91), (2, 72), (3, 15), (4, 23), (5, 16), (6, 76), (7, 61), (8, 25), (9, 53), (10, 85), (11, 273), (12, 56), (13, 230), (14, 42), (15, 613), (16, 108), (17, 36), (18, 16), (19, 13), (20, 18), (21, 100), (22, 98), (23, 108), (24, 42), (25, 39), (26, 40), (27, 163), (28, 47), (29, 420), (30, 109), (31, 21), (32, 38), (33, 36), (34, 37), (35, 460), (36, 24), (37, 23), (38, 22), (39, 93), (40, 36), (41, 47), (42, 345), (43, 28), (44, 27), (45, 28), (46, 431), (47, 52), (48, 156), (49, 130), (50, 29), (51, 39), (52, 76), (53, 411), (54, 92), (55, 225), (56, 44), (57, 190), (58, 41), (59, 284), (60, 19), (61, 19), (62, 72), (63, 38), (64, 20), (65, 54), (66, 43), (67, 69), (68, 42), (69, 26), (70, 82), (71, 26), (72, 124), (73, 67), (74, 38), (75, 40), (76, 28), (77, 27), (78, 60), (79, 213), (80, 131), (81, 42), (82, 102), (83, 32), (84, 32), (85, 293), (86, 117), (87, 59), (88, 46), (89, 20)]
[(19, 13), (3, 15), (5, 16), (18, 16), (20, 18), (60, 19), (61, 19), (64, 20), (89, 2

In [137]:
# Compare KNN neighbourhood sizes: for every candidate size, repeatedly hide
# one known heart-rate value, impute it, and accumulate the absolute error.
hr_columns = ["HR{:02d}".format(hour) for hour in range(24)]  # "HR00" .. "HR23"

# Work on the preprocessed readings (the file the single-k experiment reads)
# instead of the raw frame loaded for the NaN count, whose corrupted heart
# rates would dominate both the neighbour distances and the error sums.
hr_frame = pd.read_csv('asthma_tmp.csv').loc[:, "HR00":"HR23"].copy()

k = []
for j in range(10, 20):
    difference = []
    for i in tqdm.tqdm(range(10000)):
        row, column = get_random_index(hr_frame, hr_columns)
        tmp = hr_frame.loc[row, column]
        hr_frame.loc[row, column] = np.nan
        try:
            # Fit on the frame with the value hidden, then impute only the
            # affected row: that row's result does not depend on the other
            # rows being imputed, so this avoids transforming every row.
            imputer = KNNImputer(n_neighbors=j).fit(hr_frame)
            imputed_row = imputer.transform(hr_frame.loc[[row]])
        finally:
            # Always put the true value back (even on an interrupt) so every
            # trial starts from the same data and imputed values are never
            # later sampled as if they were measurements.
            hr_frame.loc[row, column] = tmp
        if imputed_row.shape[1] != hr_frame.shape[1]:
            # The imputer drops columns that hold no observed value at fit
            # time, which would shift the column positions used below.
            raise ValueError("KNNImputer dropped an all-NaN heart-rate column")
        imputed_value = imputed_row[0, hr_frame.columns.get_loc(column)]
        # get the absolute value of the difference between the original value and the imputed value
        difference.append(abs(tmp - imputed_value))
    k.append(sum(difference))

 40%|████      | 4000/10000 [02:14<03:22, 29.66it/s]


KeyboardInterrupt: 