In [130]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
import os

In [131]:
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 5000)
pd.set_option('display.max_colwidth', 5000)

In [132]:
# Move up 2 directories
data_directory = '../..' 

# Load the CSV files
asthma_df = pd.read_csv(os.path.join(data_directory, 'Data', 'astma.csv'))

# Drop a kid

That kid has more than 24 missing columns. The decision has been to delete him from the database.

In [133]:
# delete rows with subjectNr
asthma_df = asthma_df.drop(asthma_df[asthma_df['SubjectNr'] == 2429672].index)

Detect the columns with lots of Nan

In [134]:
# Compute for all the columns, the percentages of Nan per column
# We will drop the columns that have more than 50% Nan
percentNan = []
for col in asthma_df.columns:
    nan_percentage = asthma_df[col].isnull().sum() / len(asthma_df[col]) * 100
    percentNan.append((nan_percentage, col))

percentNan.sort(reverse=True)

for el in percentNan:
    if el[0] > 50:
        print(el)

(98.33526906697638, 'steps03')
(98.29655439411536, 'steps04')
(97.67711962833914, 'steps02')
(96.36082075106465, 'steps05')
(96.36082075106465, 'steps01')
(92.1796360820751, 'EventDay')
(90.59233449477352, 'steps00')
(79.90708478513356, 'steps23')
(77.66163375919474, 'steps06')
(76.38404955478126, 'EOS_weight_tot')
(76.38404955478126, 'EOS_treatmentburden_tot')
(76.38404955478126, 'EOS_school_tot')
(76.38404955478126, 'EOS_respiratory_tot')
(76.38404955478126, 'EOS_physical_tot')
(76.38404955478126, 'EOS_health_tot')
(76.38404955478126, 'EOS_emotional_tot')
(76.38404955478126, 'EOS_digestion_tot')
(76.38404955478126, 'EOS_bodyimage_tot')
(75.30003871467285, 'Eosinophils')
(73.94502516453736, 'Symptomscore')
(69.68641114982579, 'Serum_IgE')
(69.26054974835463, 'ACD6')
(67.36353077816493, 'Pseudomonas')
(67.36353077816493, 'BASELINE_weight_tot')
(67.36353077816493, 'BASELINE_treatmentburden_tot')
(67.36353077816493, 'BASELINE_school_tot')
(67.36353077816493, 'BASELINE_respiratory_tot')
(

#### Technique
About technique 2 Matthijs wrote this:

Is it possible to do your analyses with both options to see whether it impacts the performance of your models? If not, I would just use the ones marked ‘2’ anyway to include as much data as possible.


In [135]:
asthma_df['Technique'].value_counts()
asthma_df = asthma_df.drop(asthma_df[asthma_df['Technique'] == 1].index).reset_index().drop('index', axis=1)
# IF WE DELETE THE WRONG TECHNIQUE HERE, ANY dadaset.loc[i] WILL GIVE AN ERROR AT THE FIRST MISSING ROW
# We need to reset the indicies to avoid this

## Child exrtaction

Let's first separate the databese into chilferen. This will create on database per children in order to identify them more easily. This can be useful if one data is repeated only once per child. We could then spread the data through the child.£

In [136]:
def extractChild(df):

    # Create a new database per children and save it into a list of all the same subject numbers.
    df_children = []    # List of all the children
    
    # Loop through all the subject numbers
    for subject in df['SubjectNr'].unique():
        df_children.append(df[df['SubjectNr'] == subject])
    
    return df_children

# Dealing with Missing Values
## Issue with HR columns

In the asthma dataset there seems to be a problem with HR00 to HR23. Sometimes, the heart rate goes above a million sometimes. It seems like the first couple digits correspond with the expected heart rate. 

In [137]:
# In the asthma dataset there seems to be a problem with HR00 to HR23
# The heart rate goes above a million sometimes
# It seems like the first couple digits correspond with the expected heart rate.
# 2 cases: HR below 100 and HR above 100

# Loop through each column in the dataframe
for col in asthma_df.loc[:, "HR00":"HR23"]:
    for i in range(len(asthma_df[col])):
        if (asthma_df.loc[i, col] > 1000):
            # Take first 2 digits of float
            first_2_digits = str(asthma_df.loc[i, col])[:2]
            temp_number = float(first_2_digits)
            # If heart rate lower than 20 it means it should be greater than 100 (assuming heart rates < 200)
            if temp_number < 20:
                val = str(asthma_df.loc[i, col])[:3] + '.' + str(asthma_df.loc[i, col])[3]
                asthma_df.loc[i, col] = float(val)
            else:
                val = str(asthma_df.loc[i, col])[:2] + '.' + str(asthma_df.loc[i, col])[2]
                asthma_df.loc[i, col] = float(val)
            pass

asthma_children = extractChild(asthma_df)

In [138]:
asthma_children[10].loc[:, "HR00":"HR23"]

Unnamed: 0,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23
287,,,,,,,,,,,,,,,,95.3,104.7,88.6,91.7,94.25,83.6,70.0,69.8,57.3
288,64.6,54.0,58.8,59.5,60.0,74.2,83.7,90.0,103.8,96.8,95.0,94.2,97.1,91.3,92.5,82.1,88.0,96.8,94.0,78.1,76.2,68.4,61.5,68.7
289,59.6,62.3,59.4,62.5,66.4,69.75,78.5,79.1,82.5,95.5,107.1,88.5,105.0,86.6,93.1,95.5,85.0,82.6,85.4,90.0,85.0,80.0,70.2,60.8
290,54.8,64.0,60.0,70.0,67.0,76.7,77.8,84.0,87.1,72.4,74.3,98.8,87.4,89.1,92.5,92.1,88.1,92.8,100.1,94.8,78.1,79.3,78.1,79.8
291,78.0,82.2,76.3,87.75,81.6,71.5,84.3,83.0,91.6,79.75,99.4,103.6,94.8,96.8,114.0,114.1,96.6,100.8,104.5,103.7,95.5,95.8,95.8,93.1
292,86.2,81.1,78.5,81.1,73.8,73.2,86.8,95.4,98.3,84.0,96.3,94.1,108.5,101.0,105.8,100.4,94.3,114.5,110.8,99.2,82.8,76.4,74.5,71.5
293,75.0,72.8,70.3,68.6,72.0,68.3,82.0,98.1,98.3,95.3,97.2,91.6,90.5,101.8,99.5,88.0,81.8,112.4,114.0,100.6,107.5,90.0,91.7,89.0
294,78.7,77.0,76.4,74.4,70.0,76.5,87.3,82.4,100.1,97.2,100.3,,103.8,98.1,104.25,100.75,96.6,114.7,103.5,104.4,104.1,97.0,89.7,87.6
295,73.7,70.3,70.8,68.6,64.6,66.5,67.6,81.3,89.8,83.1,89.0,75.5,80.1,97.0,,,,100.0,102.0,95.5,97.1,78.4,71.6,67.1
296,69.1,67.4,68.5,70.7,69.1,70.0,69.8,85.6,100.5,86.8,96.4,84.5,88.8,93.2,80.8,90.4,87.2,95.1,99.0,100.3,83.25,90.6,,


### Heart Rate

In [139]:
# For everty sick child, go through the heart rate and replace the Nan values with KNN
# For the documentation of interpolation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html

# # Loop through all the children
# for child in asthma_children:
#     # Replace the missing values in the Heart rate using KNN
#     child.loc[:, "HR00":"HR23"] = child.loc[:, "HR00":"HR23"].interpolate(method='nearest',order = 5, limit_direction='forward', axis=0, limit = 5)
#     # pass

# child = asthma_children[0].interpolate(method='linear', limit_direction='forward', axis=1, limit = 3)


    

In [140]:
for child in asthma_children:
    imputer = KNNImputer()          # n_neighbors=2
    child.loc[:, "HR00":"HR23"] = imputer.fit_transform(child.loc[:, "HR00":"HR23"])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  child.loc[:, "HR00":"HR23"] = imputer.fit_transform(child.loc[:, "HR00":"HR23"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  child.loc[:, "HR00":"HR23"] = imputer.fit_transform(child.loc[:, "HR00":"HR23"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  child.loc[:, "HR00":"HR23"] = imputer.fit_t

In [141]:
# print the HR columns
asthma_children[10].loc[:, "HR00":"HR23"]

asthma_df = pd.concat(asthma_children)

## AVGHR_daily

In [142]:
# print the indexes of the nans in AVGHR_daily
index = asthma_df[asthma_df['AVGHR_daily'].isnull()].index

print(index)

for ind in index:
    # compute the average number of steps
    asthma_df.loc[ind, 'AVGHR_daily'] = (asthma_df.loc[ind, "HR00":"HR23"].mean())

asthma_df[asthma_df['AVGHR_daily'].isnull()].index

Int64Index([  83,  341,  343,  344,  401,  436,  437,  438,  439,  440,
            ...
            2267, 2341, 2407, 2408, 2409, 2410, 2411, 2412, 2429, 2447], dtype='int64', length=141)


Int64Index([], dtype='int64')

## HR05Perc & HR95Perc

In [143]:
index05 = asthma_df[asthma_df['HR05Perc'].isnull()].index
index95 = asthma_df[asthma_df['HR95Perc'].isnull()].index

print(index05)

for ind in index05:
    # compute the average number of steps
    asthma_df.loc[ind, 'HR05Perc'] = (asthma_df.loc[ind, "HR00":"HR23"].quantile(0.05))

for ind in index95:
    # compute the average number of steps
    asthma_df.loc[ind, 'HR95Perc'] = (asthma_df.loc[ind, "HR00":"HR23"].quantile(0.95))

Int64Index([  83,  341,  343,  344,  401,  436,  437,  438,  439,  440,
            ...
            2267, 2341, 2407, 2408, 2409, 2410, 2411, 2412, 2429, 2447], dtype='int64', length=141)


### Steps

In [144]:
# asthma_children[10].loc[:, "SubjectNr"]

asthma_children = extractChild(asthma_df)
asthma_children[10].loc[:, "steps00":"steps23"]

Unnamed: 0,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23
287,,,,,,,,,,,6.0,13.0,,,1017.0,1474.0,1419.0,397.0,472.0,132.0,449.0,,,
288,,,,,,88.0,605.0,687.0,1265.0,976.0,758.0,107.0,1157.0,615.0,1264.0,494.0,42.0,283.0,323.0,25.0,128.0,,,
289,,,,,,,63.0,101.0,265.0,1175.0,1057.0,1094.0,369.0,248.0,138.0,322.0,35.0,183.0,211.0,301.0,279.0,79.0,50.0,7.0
290,,,,,,72.0,92.0,766.0,451.0,190.0,274.0,309.0,224.0,125.0,279.0,47.0,128.0,173.0,238.0,352.0,432.0,,,
291,,,,,,,,,296.0,449.0,258.0,698.0,82.0,176.0,374.0,1313.0,825.0,1037.0,80.0,158.0,234.0,103.0,131.0,
292,,,,,,,130.0,462.0,406.0,70.0,973.0,166.0,1593.0,359.0,352.0,644.0,317.0,601.0,1066.0,336.0,191.0,,,
293,,,,,,,97.0,408.0,36.0,108.0,197.0,18.0,94.0,59.0,182.0,114.0,172.0,342.0,730.0,1217.0,492.0,243.0,207.0,42.0
294,,,,,,20.0,57.0,194.0,409.0,162.0,187.0,67.0,60.0,678.0,494.0,1135.0,102.0,326.0,362.0,425.0,27.0,23.0,,
295,,,,,,,27.0,124.0,102.0,11.0,120.0,,327.0,188.0,,,76.0,329.0,765.0,70.0,579.0,6.0,,
296,,,,,,,,346.0,355.0,12.0,224.0,81.0,92.0,96.0,112.0,952.0,583.0,1366.0,563.0,310.0,45.0,6.0,,


In [145]:
for child in asthma_children:
    child.loc[:, "steps00":"steps06"] = child.loc[:, "steps00":"steps06"].fillna(0.0)
    child.loc[:, "steps22":"steps23"] = child.loc[:, "steps22":"steps23"].fillna(0.0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  child.loc[:, "steps00":"steps06"] = child.loc[:, "steps00":"steps06"].fillna(0.0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  child.loc[:, "steps22":"steps23"] = child.loc[:, "steps22":"steps23"].fillna(0.0)


In [146]:
asthma_children[59].loc[:, "steps00":"steps23"]

Unnamed: 0,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23
1696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,30.0,1376.0,511.0,570.0,284.0,270.0,1834.0,1708.0,349.0,276.0,0.0,0.0
1697,0.0,0.0,0.0,0.0,0.0,0.0,29.0,137.0,454.0,714.0,1963.0,419.0,207.0,469.0,76.0,102.0,944.0,330.0,70.0,418.0,218.0,20.0,0.0,0.0
1698,0.0,0.0,0.0,0.0,0.0,0.0,0.0,255.0,60.0,504.0,493.0,164.0,108.0,911.0,2705.0,2613.0,1065.0,467.0,60.0,327.0,277.0,9.0,0.0,0.0
1699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.0,345.0,283.0,1425.0,125.0,783.0,318.0,275.0,827.0,105.0,40.0,178.0,393.0,198.0,,0.0,0.0
1700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,190.0,483.0,132.0,315.0,176.0,243.0,939.0,1381.0,942.0,251.0,1838.0,4310.0,2465.0,112.0,117.0,9.0,0.0
1701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,116.0,723.0,996.0,369.0,315.0,766.0,184.0,1009.0,100.0,1864.0,553.0,197.0,299.0,258.0,,0.0,0.0
1702,0.0,0.0,0.0,0.0,0.0,26.0,31.0,144.0,1339.0,533.0,211.0,2159.0,943.0,459.0,328.0,1475.0,217.0,1059.0,415.0,662.0,307.0,,13.0,0.0
1703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,137.0,481.0,179.0,474.0,277.0,1016.0,670.0,954.0,257.0,529.0,384.0,264.0,648.0,21.0,307.0,0.0,0.0
1704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0,95.0,446.0,441.0,382.0,211.0,126.0,133.0,402.0,124.0,132.0,179.0,39.0,185.0,134.0,0.0,0.0
1705,0.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,79.0,973.0,252.0,411.0,171.0,92.0,951.0,529.0,106.0,217.0,49.0,272.0,319.0,471.0,0.0,0.0


In [147]:
print(len(asthma_children))

89


In [148]:
for child in asthma_children:
    try:
        imputer = KNNImputer()          # n_neighbors=2
        child.loc[:, "steps00":"steps23"] = imputer.fit_transform(child.loc[:, "steps00":"steps23"])
# asthma_children[59][10:].loc[:, "steps00":"steps23"] = imputer.fit_transform(asthma_children[59][10:].loc[:, "steps00":"steps23"])
    except:
        pass

# combine the child dataframes into one dataframe
asthma_df = pd.concat(asthma_children)
asthma_df.loc[:, "steps00":"steps23"] = imputer.fit_transform(asthma_df.loc[:, "steps00":"steps23"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  child.loc[:, "steps00":"steps23"] = imputer.fit_transform(child.loc[:, "steps00":"steps23"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  child.loc[:, "steps00":"steps23"] = imputer.fit_transform(child.loc[:, "steps00":"steps23"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  child.loc[:, "steps

In [149]:
# print the number of Nans in the HR columns
print(asthma_df.loc[:, "HR00":"HR23"].isnull().sum().sum())

0


TODO: Do the KNN with steps and heart rate at the same time.

TODO: Update the total step number.

## Step total daily & stepsTotalDetailed

In [150]:
# print the indexes of the nans in stepsTotalDaily
index = asthma_df[asthma_df['stepsTotalDaily'].isnull()].index

for ind in index:
    # compute the total number of steps
    asthma_df.loc[ind, 'stepsTotalDaily'] = (asthma_df.loc[ind, "steps00":"steps23"].sum())

In [151]:
# print the indexes of the nans in stepsTotalDetailed
index = asthma_df[asthma_df['stepsTotalDetailed'].isnull()].index

for ind in index:
    # compute the total number of steps
    asthma_df.loc[ind, 'stepsTotalDetailed'] = asthma_df.loc[ind]["stepsTotalDaily"]

## steps_hour_max

In [152]:
# print the indexes of the nans in stepsTotalDetailed
index = asthma_df[asthma_df['steps_hour_max'].isnull()].index

for ind in index:
    # compute the max number of steps
    asthma_df.loc[ind, 'steps_hour_max'] = asthma_df.loc[ind]["steps00":"steps23"].max()

## steps15_19

In [153]:
# print the indexes of the nans in steps15_19
index = asthma_df[asthma_df['steps15_19'].isnull()].index

for ind in index:
    # compute the max number of steps
    asthma_df.loc[ind, 'steps15_19'] = asthma_df.loc[ind]["steps15":"steps18"].sum()

## Weight and height

I assume that the child is a man. The assumption could have been done because the two sex on average have the same height, but only 1.5 kg of difference.

In [154]:
asthma_df.loc[np.isnan(asthma_df['Weight']), ['Age','Weight','Height','Gender']]

index = asthma_df.loc[np.isnan(asthma_df['Weight']), ['Weight']].index

for ind in index:
    asthma_df.loc[ind, 'Weight'] = 39.92
    asthma_df.loc[ind, 'Height'] = 149.1

## PM25

The average value has been proven to be the best. (see link below)

In [155]:
asthma_df[['PM25']].isna().sum()

# print the indexes where PM25 is Nan
asthma_df.loc[np.isnan(asthma_df['PM25']), ['PM25']]

# print Pm25 column
asthma_df['PM25']

0        13.04
1         3.09
2         4.94
3        12.79
4         3.31
5         2.83
6         5.36
7         8.23
8        10.17
9        16.34
10       22.08
11       20.55
12       18.76
13        3.85
14       11.30
15       20.79
16       32.33
17       24.28
18       25.54
19       25.07
20       26.89
21       27.63
22       11.76
23        3.96
24        4.60
25        5.59
26        1.93
27        6.06
28        1.50
29       20.79
30       32.33
31       24.28
32       25.54
33       25.07
34       26.89
35       27.63
36       11.76
37        3.96
38        4.60
39        5.59
40        1.93
41        6.06
42        1.50
43       11.45
44        3.47
45        0.70
46        6.11
47        2.84
48        3.42
49        8.81
50       10.94
51       16.37
52       19.59
53       17.61
54        7.71
55       10.98
56        5.29
57        8.23
58       11.30
59       24.28
60       25.54
61       25.07
62       26.89
63       27.63
64       11.76
65        3.96
66        

In [156]:
asthma_children = extractChild(asthma_df)

In [157]:
# https://www.researchgate.net/publication/237537115_Estimation_of_missing_values_in_air_pollution_data_using_single_imputation_techniques
# for child in asthma_children:
#     # get index of Nan in child
#     index = child.loc[np.isnan(child['PM25']), ['PM25']].index
#     if index != 0:
#     # Replace the missing values in the PM25 with the mean of its neighbours
#         child.loc[index, 'PM25'] = (child.loc[index-1, 'PM25'].values + child.loc[index+1, 'PM25'].values) / 2

index = asthma_df.loc[np.isnan(asthma_df['PM25']), ['PM25']].index

for ind in index:
    asthma_df.loc[ind, 'PM25'] = (asthma_df.loc[ind-1, 'PM25'] + asthma_df.loc[ind+1, 'PM25']) / 2


In [158]:
print(asthma_df['PM25'].isnull().sum())

0


## PM10

We do the same as above

In [159]:
# Print the number of Missing values in the PM10 column
print(asthma_df[['PM10']].isna().sum())

PM10    11
dtype: int64


In [160]:
index = asthma_df.loc[np.isnan(asthma_df['PM10']), ['PM10']].index

for ind in index:
    asthma_df.loc[ind, 'PM10'] = (asthma_df.loc[ind-1, 'PM10'] + asthma_df.loc[ind+1, 'PM10']) / 2

In [161]:
print(asthma_df[['PM10']].isna().sum())

PM10    0
dtype: int64


## AgeDiagnosis

This one cannot be replaced because we have no information about that kid

In [162]:
print(asthma_df[['AgeDiagnosis']].isna().sum())

# print the index of the missing values in the AgeDiagnosis column
index = asthma_df.loc[np.isnan(asthma_df['AgeDiagnosis']), ['AgeDiagnosis']].index
print(asthma_df.loc[index, ['SubjectNr','Age', 'AgeDiagnosis']])



AgeDiagnosis    29
dtype: int64
      SubjectNr  Age  AgeDiagnosis
1034    8364553    9           NaN
1035    8364553    9           NaN
1036    8364553    9           NaN
1037    8364553    9           NaN
1038    8364553    9           NaN
1039    8364553    9           NaN
1040    8364553    9           NaN
1041    8364553    9           NaN
1042    8364553    9           NaN
1043    8364553    9           NaN
1044    8364553    9           NaN
1045    8364553    9           NaN
1046    8364553    9           NaN
1047    8364553    9           NaN
1048    8364553    9           NaN
1049    8364553    9           NaN
1050    8364553    9           NaN
1051    8364553    9           NaN
1052    8364553    9           NaN
1053    8364553    9           NaN
1054    8364553    9           NaN
1055    8364553    9           NaN
1056    8364553    9           NaN
1057    8364553    9           NaN
1058    8364553    9           NaN
1059    8364553    9           NaN
1060    8364553    9   

## CONDITION

Same as above all the nan refer to one kid. Thus, we assumed it was controlled

In [163]:
print(asthma_df[['CONDITION']].isna().sum())

# print the index of the missing values in the AgeDiagnosis column using pd.isnull()
index = asthma_df.loc[pd.isnull(asthma_df['CONDITION']), ['CONDITION']].index
asthma_df.loc[index, ['CONDITION']] = 'Controlled asthma'
print(asthma_df.loc[index, ['SubjectNr', 'CONDITION']])



CONDITION    0
dtype: int64
Empty DataFrame
Columns: [SubjectNr, CONDITION]
Index: []


## DayCare

Replace by 0

In [164]:
print(asthma_df[['DayCare']].isna().sum())
print(asthma_df[asthma_df['DayCare'] != 0].shape[0])

# print the index of the missing values in the AgeDiagnosis column using pd.isnull()
index = asthma_df.loc[pd.isnull(asthma_df['DayCare']), ['DayCare']].index
asthma_df.loc[index, ['DayCare']] = 0
print(asthma_df.loc[index, ['SubjectNr', 'DayCare']])

DayCare    0
dtype: int64
0
Empty DataFrame
Columns: [SubjectNr, DayCare]
Index: []


## Dis_Chronic

In [165]:
# Count the number of rows that have a different value than 0
print(asthma_df[['Dis_Chronic']].isna().sum())
print(asthma_df[asthma_df['Dis_Chronic'] != 0].shape[0])

# print the index of the missing values in the AgeDiagnosis column using pd.isnull()
index = asthma_df.loc[pd.isnull(asthma_df['Dis_Chronic']), ['Dis_Chronic']].index
asthma_df.loc[index, ['Dis_Chronic']] = 0
# print(asthma_df.loc[index, ['SubjectNr', 'Dis_Chronic']])


Dis_Chronic    0
dtype: int64
0


## Hosp_tratme

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  DID NOT FINISH IT !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
Don't really know how to handle it.

In [166]:
# Count the number of rows that have a different value than 0
# print(asthma_df[['Hosp_treatme']].isna().sum())

for i in range(len(asthma_df)):
    if asthma_df.loc[i, 'Hosp_treatme'] != 0:
        print(i, asthma_df.loc[i, ['Hosp_treatme', 'SubjectNr']])

# print(asthma_df[asthma_df['Hosp_treatme'] != 0].shape[0])

# print the index of the missing values in the AgeDiagnosis column using pd.isnull()
# index = asthma_df.loc[pd.isnull(asthma_df['Hosp_treatme']), ['Hosp_treatme']].index
# asthma_df.loc[index, ['Hosp_treatme']] = 0


1852 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1852, dtype: object
1853 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1853, dtype: object
1854 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1854, dtype: object
1855 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1855, dtype: object
1856 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1856, dtype: object
1857 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1857, dtype: object
1858 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1858, dtype: object
1859 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1859, dtype: object
1860 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1860, dtype: object
1861 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1861, dtype: object
1862 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1862, dtype: object
1863 Hosp_treatme        1.0
SubjectNr       4745136
Name: 1863, dtype: object
1864 Hosp_treatme        1.0
SubjectNr       4745136


1978 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1978, dtype: object
1979 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1979, dtype: object
1980 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1980, dtype: object
1981 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1981, dtype: object
1982 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1982, dtype: object
1983 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1983, dtype: object
1984 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1984, dtype: object
1985 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1985, dtype: object
1986 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1986, dtype: object
1987 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1987, dtype: object
1988 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1988, dtype: object
1989 Hosp_treatme        1.0
SubjectNr       3762951
Name: 1989, dtype: object
1990 Hosp_treatme        1.0
SubjectNr       376295

## Gender

In [167]:
# print the index of the missing values in the AgeDiagnosis column using pd.isnull()
index = asthma_df.loc[pd.isnull(asthma_df['Gender']), ['Gender']].index
print(asthma_df.loc[index, ['SubjectNr', 'Gender']])

Empty DataFrame
Columns: [SubjectNr, Gender]
Index: []


## Hospital_Trt

In [168]:
# print the index of the missing values in the AgeDiagnosis column using pd.isnull()
index = asthma_df.loc[pd.isnull(asthma_df['Hospital_Trt']), ['Hospital_Trt']].index
print(asthma_df.loc[index, ['SubjectNr', 'Hospital_Trt']])

Empty DataFrame
Columns: [SubjectNr, Hospital_Trt]
Index: []


## PetsHome

In [169]:
# print the index of the missing values in the AgeDiagnosis column using pd.isnull()
index = asthma_df.loc[pd.isnull(asthma_df['PetsHome']), ['PetsHome']].index
print(asthma_df.loc[index, ['SubjectNr', 'PetsHome']])

Empty DataFrame
Columns: [SubjectNr, PetsHome]
Index: []


## Family_his

Still need to be done !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

In [170]:
asthma_children = extractChild(asthma_df)
countYes = 0
countNo = 0
for child in asthma_children:
    # verify the value of the first row of the child
    if child.iloc[1]['Family_his'] == "Yes":
        countYes += 1
    elif child.iloc[1]['Family_his'] == "No":
        countNo += 1

print(countYes)
print(countNo)

48
40


## SchoolYear & school_year_edit

The missing values refere to one kid. Thus, I put it at the same grade as a 7 year old kid should be (I assume that he did not fail a class).


In [171]:
# Primary school year 4

index1 = asthma_df.loc[pd.isnull(asthma_df['SchoolYear']), ['SchoolYear']].index
asthma_df.loc[index1, ['SchoolYear']] = 4
# print(asthma_df.loc[index1, ['SubjectNr', 'SchoolYear']])

index2 = asthma_df.loc[pd.isnull(asthma_df['school_year_edit']), ['school_year_edit']].index
asthma_df.loc[index2, ['school_year_edit']] = 'Primary school year 4'
# print(asthma_df.loc[index2, ['SubjectNr', 'school_year_edit']])

## Sports

In [172]:
index1 = asthma_df.loc[pd.isnull(asthma_df['Sports']), ['Sports']].index
print(asthma_df.loc[index1, ['SubjectNr', 'Sports']])

      SubjectNr Sports
1667    5578891    NaN
1668    5578891    NaN
1669    5578891    NaN
1670    5578891    NaN
1671    5578891    NaN
1672    5578891    NaN
1673    5578891    NaN
1674    5578891    NaN
1675    5578891    NaN
1676    5578891    NaN
1677    5578891    NaN
1678    5578891    NaN
1679    5578891    NaN
1680    5578891    NaN
1681    5578891    NaN
1682    5578891    NaN
1683    5578891    NaN
1684    5578891    NaN
1685    5578891    NaN
1686    5578891    NaN
1687    5578891    NaN
1688    5578891    NaN
1689    5578891    NaN
1690    5578891    NaN
1691    5578891    NaN
1692    5578891    NaN
1693    5578891    NaN
1694    5578891    NaN
1695    5578891    NaN
1793    5039256    NaN
1794    5039256    NaN
1795    5039256    NaN
1796    5039256    NaN
1797    5039256    NaN
1798    5039256    NaN
1799    5039256    NaN
1800    5039256    NaN
1801    5039256    NaN
1802    5039256    NaN
1803    5039256    NaN
1804    5039256    NaN
1805    5039256    NaN
1806    503

In [173]:
percentNan = []
for col in asthma_df.columns:
    nan_percentage = asthma_df[col].isnull().sum() / len(asthma_df[col]) * 100
    percentNan.append((nan_percentage, col))

percentNan.sort(reverse=True)

percentNan

[(92.14454045561665, 'EventDay'),
 (76.90494893951296, 'EOS_weight_tot'),
 (76.90494893951296, 'EOS_treatmentburden_tot'),
 (76.90494893951296, 'EOS_school_tot'),
 (76.90494893951296, 'EOS_respiratory_tot'),
 (76.90494893951296, 'EOS_physical_tot'),
 (76.90494893951296, 'EOS_health_tot'),
 (76.90494893951296, 'EOS_emotional_tot'),
 (76.90494893951296, 'EOS_digestion_tot'),
 (76.90494893951296, 'EOS_bodyimage_tot'),
 (75.21602513747054, 'Eosinophils'),
 (74.4304791830322, 'Symptomscore'),
 (69.7172034564022, 'Serum_IgE'),
 (69.16732128829537, 'ACD6'),
 (67.75333857030637, 'BASELINE_weight_tot'),
 (67.75333857030637, 'BASELINE_treatmentburden_tot'),
 (67.75333857030637, 'BASELINE_school_tot'),
 (67.75333857030637, 'BASELINE_respiratory_tot'),
 (67.75333857030637, 'BASELINE_physical_tot'),
 (67.75333857030637, 'BASELINE_health_tot'),
 (67.75333857030637, 'BASELINE_emotional_tot'),
 (67.75333857030637, 'BASELINE_digestion_tot'),
 (67.75333857030637, 'BASELINE_bodyimage_tot'),
 (67.71406127

## Wear information

When did the kid wear the watch. Such info is not useful to predict the asthma crisis.

In [174]:
asthma_df = asthma_df.drop(['wear24H', 'wear23', 'wear22', 'wear21', 'wear20', 'wear19','wear18','wear17','wear16H','wear16','wear15',
                                       'wear14','wear13','wear12','wear11','wear10','wear09','wear08','wear07','wear06','wear05','wear04','wear03',
                                       'wear02','wear01','wear00'], axis=1)

## BASELINE_PAQLQ_xyz

Since ALL these PAQLQ values were asked for asthma patients and the dataset's 1/3 is CF patients, it's obvious that we will have at least 33% missing data on these. <br> <br>
For the remaining 4% (bc 37% is missing):<br>
- ALL these values are asked once during the project (if it's missing, it's missing for all the 28 days) so we can't guess them by itself.<br> <br>

This is also true for the ones with ~66-68% missing values, those were filled only by CF patients. But we can't guess either of these...

### Data fixing for xxx_total and xxx_emotion

#### BASELINE_PAQLQ_total

In [175]:
# doc doesn't tell about this column in the legend
# needs to be fixed (5.913043e+09 vs 7.000000e+00)
# Is this on a ~1-10 scale or some values are really 10 times larger than others?
# And if so, is 7.000000e+00 7 or 70?

# I'll use the ~same code we used for HR

asthma_df['BASELINE_PAQLQ_total'].value_counts()

5.913043e+09    115
6.478261e+08    115
6.826087e+09     87
4.869565e+09     86
6.652174e+09     58
6.391304e+09     58
5.956522e+09     58
5.826087e+09     58
6.869565e+09     58
6.304348e+09     58
6.130435e+09     58
6.782609e+09     58
6.913043e+09     58
6.260870e+09     57
6.217391e+09     29
5.478261e+08     29
4.739130e+09     29
6.173913e+09     29
3.826087e+09     29
6.347826e+09     29
4.782609e+09     29
2.478261e+08     29
5.521739e+08     29
6.608696e+09     29
4.521739e+08     29
2.739130e+09     29
5.565217e+09     29
3.956522e+09     29
6.043478e+09     29
5.695652e+09     29
5.652174e+09     28
5.434783e+09     27
7.000000e+00     27
6.956522e+09     27
6.086957e+09     25
Name: BASELINE_PAQLQ_total, dtype: int64

In [176]:
col = "BASELINE_PAQLQ_total"

for i in range(len(asthma_df[col])):
    if (asthma_df.loc[i, col] > 10):
        # Take first 5 digits of float
        first_5_digits = str(asthma_df.loc[i, col])[:5]
        asthma_df.loc[i, col] = float(first_5_digits)/10000

In [177]:
asthma_df['BASELINE_PAQLQ_total'].value_counts()

5.9130    115
6.4782    115
6.8260     87
4.8695     86
6.6521     58
6.3913     58
5.9565     58
5.8260     58
6.8695     58
6.3043     58
6.1304     58
6.7826     58
6.9130     58
6.2608     57
6.2173     29
5.4782     29
4.7391     29
6.1739     29
3.8260     29
6.3478     29
4.7826     29
2.4782     29
5.5217     29
6.6086     29
4.5217     29
2.7391     29
5.5652     29
3.9565     29
6.0434     29
5.6956     29
5.6521     28
5.4347     27
7.0000     27
6.9565     27
6.0869     25
Name: BASELINE_PAQLQ_total, dtype: int64

#### BASELINE_PAQLQ_emotion

In [178]:
asthma_df['BASELINE_PAQLQ_emotion'].value_counts()

7.00       547
6.75       202
6.50       165
6625.00    145
6875.00     86
5.75        58
6.25        58
2125.00     58
6125.00     58
4875.00     29
5875.00     29
4.75        29
5125.00     29
5.25        29
5.50        29
6.00        29
Name: BASELINE_PAQLQ_emotion, dtype: int64

In [179]:
col = "BASELINE_PAQLQ_emotion"

for i in range(len(asthma_df[col])):
    if (asthma_df.loc[i, col] > 10):
     asthma_df.loc[i, col] = asthma_df.loc[i, col]/1000

In [180]:
asthma_df['BASELINE_PAQLQ_emotion'].value_counts()

7.000    547
6.750    202
6.500    165
6.625    145
6.875     86
5.750     58
6.250     58
2.125     58
6.125     58
4.875     29
5.875     29
4.750     29
5.125     29
5.250     29
5.500     29
6.000     29
Name: BASELINE_PAQLQ_emotion, dtype: int64

## Atopic Asthma

As before, 33% missing data is the minimum. Above that (2%) we can't guess precisely if they have it or not. <br>The number of kids with atopic asthma is 3x higher (1266 - 402). I'm not sure if it's enough to guess the 2 kid's with missing AtopicAsthma values as Yes. <br> <br>

But again, we can't do anything with the CF kids

In [181]:
asthma_df['AtopicAsthma'].value_counts()

Yes    1237
No      402
Name: AtopicAsthma, dtype: int64

In [182]:
asthma_df.loc[asthma_df['AtopicAsthma'].isnull(), 'SubjectNr'].unique() 
# first 2 are with asthma

array([1642849, 1438514, 4722486, 8533888, 8635768, 8196400, 5039256,
       4986804, 4745136, 1925533, 3132734, 3831912, 3762951, 7358340,
       3573690, 8185144, 7846328, 1701448, 8619726, 1143691, 8974471,
       8662594, 7367843, 2901469, 4213384, 6203575, 7530145, 2101629,
       6397303, 4468038, 1834023, 2211602], dtype=int64)

 ## SABA and all the others with 34.13592233009709 missing data

34.13592233009709% means above the 33% minimum (CF patients) there is 1 kid with asthma who has no data. <br>
Unfortunately (fortunately?) it's the same kid. <br> <br>

Some values has a big differenc bw Yes and No values, some not. Idk if we should guess or not...

In [183]:
asthma_df.loc[asthma_df['SABA'].isnull(), 'SubjectNr'].unique() 
# First one is with asthma, all the others with CF

asthma_df['SABA'].value_counts()

Yes    1696
Name: SABA, dtype: int64

In [184]:
asthma_df.loc[asthma_df['LABA'].isnull(), 'SubjectNr'].unique() 
asthma_df['LABA'].value_counts()


Yes    979
No     717
Name: LABA, dtype: int64

In [185]:
asthma_df.loc[asthma_df['Oral_Steroid'].isnull(), 'SubjectNr'].unique() 

asthma_df['Oral_Steroid'].value_counts()

No     1580
Yes     116
Name: Oral_Steroid, dtype: int64

In [186]:
asthma_df.loc[asthma_df['Monoclonals'].isnull(), 'SubjectNr'].unique()

asthma_df['Monoclonals'].value_counts()

No     1667
Yes      29
Name: Monoclonals, dtype: int64

In [187]:
asthma_df.loc[asthma_df['Leukotri_Mod'].isnull(), 'SubjectNr'].unique() 

asthma_df['Leukotri_Mod'].value_counts()

No     1466
Yes     230
Name: Leukotri_Mod, dtype: int64

In [188]:
asthma_df.loc[asthma_df['ICS'].isnull(), 'SubjectNr'].unique() 

asthma_df['ICS'].value_counts()

Yes    1638
No       58
Name: ICS, dtype: int64

In [189]:
asthma_df.loc[asthma_df['ExerciseSymp'].isnull(), 'SubjectNr'].unique() 

asthma_df['ExerciseSymp'].value_counts()

Yes    949
No     747
Name: ExerciseSymp, dtype: int64

## AllergicRhin

33%, everybody who was asked, answered.

In [190]:
asthma_df.loc[asthma_df['AllergicRhin'].isnull(), 'SubjectNr'].unique() 
# these are all CF patients

array([4722486, 8533888, 8635768, 8196400, 5039256, 4986804, 4745136,
       1925533, 3132734, 3831912, 3762951, 7358340, 3573690, 8185144,
       7846328, 1701448, 8619726, 1143691, 8974471, 8662594, 7367843,
       2901469, 4213384, 6203575, 7530145, 2101629, 6397303, 4468038,
       1834023, 2211602], dtype=int64)

## grade_fvc

In [191]:
percentNan = []
for col in asthma_df.columns:
    nan_percentage = asthma_df[col].isnull().sum() / len(asthma_df[col]) * 100
    percentNan.append((nan_percentage, col))

percentNan.sort(reverse=True)

percentNan

[(92.14454045561665, 'EventDay'),
 (76.90494893951296, 'EOS_weight_tot'),
 (76.90494893951296, 'EOS_treatmentburden_tot'),
 (76.90494893951296, 'EOS_school_tot'),
 (76.90494893951296, 'EOS_respiratory_tot'),
 (76.90494893951296, 'EOS_physical_tot'),
 (76.90494893951296, 'EOS_health_tot'),
 (76.90494893951296, 'EOS_emotional_tot'),
 (76.90494893951296, 'EOS_digestion_tot'),
 (76.90494893951296, 'EOS_bodyimage_tot'),
 (75.21602513747054, 'Eosinophils'),
 (74.4304791830322, 'Symptomscore'),
 (69.7172034564022, 'Serum_IgE'),
 (69.16732128829537, 'ACD6'),
 (67.75333857030637, 'BASELINE_weight_tot'),
 (67.75333857030637, 'BASELINE_treatmentburden_tot'),
 (67.75333857030637, 'BASELINE_school_tot'),
 (67.75333857030637, 'BASELINE_respiratory_tot'),
 (67.75333857030637, 'BASELINE_physical_tot'),
 (67.75333857030637, 'BASELINE_health_tot'),
 (67.75333857030637, 'BASELINE_emotional_tot'),
 (67.75333857030637, 'BASELINE_digestion_tot'),
 (67.75333857030637, 'BASELINE_bodyimage_tot'),
 (67.71406127