In [2294]:
import pandas as pd
from pandas.testing import assert_frame_equal
import numpy as np
import csv 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split

In [2295]:
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 5000)
pd.set_option('display.max_colwidth', 5000)

In [2296]:
# Read csv
asthma_df = pd.read_csv('astma.csv')
healthy_df = pd.read_csv("healthy_parsed.csv")

In [2297]:
print(healthy_df['screentime'].unique())
print(asthma_df['screentime'].unique())

[nan '2-4 hours' '0-30 min' '0.5-1 hours' '1-2 hours' '> 4 hours']
[nan 'B. 30 tot 60 minuten' 'C. 1 uur tot 2 uur' 'A. 0 tot 30 minuten'
 'D. 2 uur tot 4 uur' 'E. Meer dan 4 uur']


# Preprocessing

In [2298]:
healthy_df.head(5)

Unnamed: 0,Index,SubjectNr,DayNo,weekday,dayType,Age,sex,weight,height,BMI_SDS,ethnicity,school_year_final,sportsyesno,urbanisation,PedsQL_score_baseline,stepsTotalDaily,steps_hour_max,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23,HR05Perc,HR95Perc,HRMinSleep,HRMaxSleep,AVGHR_daily,AVGHR_sleep,AVGHR_wake,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23,wear05H,wear16H,wear24H,BODY_TEMPERATURE_DEG_C,DIASTOLIC_BLOOD_PRESSURE_MMHG,HEART_PULSE_BPM,SYSTOLIC_BLOOD_PRESSURE_MMHG,WEIGHT_KG,awakeDuration,lightSleepDuration,deepSleepDuration,wakeUpCount,sleeptime,waketime,fvc_best,fev1_best,pef_best,grade_fev1,grade_fvc,predicted_fvc_best,predicted_fev1_best,predicted_fev1_ratio_best,fev1_ratio_best,fev1_percentage,fvc1_percentage,fev1_ratio_percentage,school_yes_no,screentime,FG,FHX,FHN,TG,TN,TX,SQ,SP,DR,RH,RHX
0,1,2853189,0,Thu,holiday,2,Female,12.0,92.0,-1.8,Other / Mixed,Day care,No,Extremely urbanised,97.826087,2806.0,1992.0,,,,,,,,,,,,7.0,,,,,480.0,1992.0,156.0,51.0,61.0,39.0,,20.0,56.0,123.0,,,96.0,,97.0,,,,,,,,,,,,,,,,,,115.5,106.5,109.0,,56.0,,92.0,0,44,33,,,,,,,,,,,,,,,,,,,,,,,,,,5.6,8,4,5.2,3.4,7.5,2.0,25,1.9,9,2
1,2,2853189,1,Fri,holiday,2,Female,12.0,92.0,-1.8,Other / Mixed,Day care,No,Extremely urbanised,97.826087,2564.0,1205.0,151.0,,,,,,,,,,,,,32.0,9.0,58.0,,,,1205.0,599.0,,,,52.0,163.0,,,126.0,119.0,128.0,119.0,,,,,,,,,,,,,,,,,,,126.5,113.6,145.0,,,20,38,29,,,,,,,,,,,,,,,,,,,,,,,,Neither,,7.9,9,4,5.9,4.4,7.2,0.0,0,0.0,-1,-1
2,3,2853189,2,Sat,holiday,2,Female,12.0,92.0,-1.8,Other / Mixed,Day care,No,Extremely urbanised,97.826087,521.0,369.0,,,,,,,,,,,,369.0,78.0,,,,,,,74.0,,,,,71.0,137.0,,,103.0,,103.0,,,,,,,,,,,,107.0,,,,,,,102.5,100.333333,,,,,0,25,17,,,,,,,,,,,,,,,,,,,,,,,,Neither,,9.2,11,8,7.1,6.0,7.8,0.0,0,0.2,1,1
3,4,2853189,3,Sun,holiday,2,Female,12.0,92.0,-1.8,Other / Mixed,Day care,No,Extremely urbanised,97.826087,250.0,114.0,,,,,,,,,,,,,114.0,106.0,,,,,,,30.0,,,,151.0,156.0,,,154.0,,154.0,,,,,,,,,,,,,,153.5,,,,,,,,,,,0,19,12,38.6,,,,,,,,,,,,,,,,,,,,,,,,,4.3,8,1,6.4,4.5,7.0,0.0,0,0.0,-1,-1
4,5,2853189,4,Mon,school,2,Female,12.0,92.0,-1.8,Other / Mixed,Day care,No,Extremely urbanised,97.826087,9.0,9.0,,,,,,,,,,,,,,,,,,,,9.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,6,4,38.157143,,,,,,,,,,,,,,,,,,,,,,,Neither,,7.3,14,1,8.0,4.7,9.8,0.0,0,0.6,3,1


In [2299]:
asthma_df.head()

Unnamed: 0.1,Unnamed: 0,SubjectNr,DayNo,weekday,dayType,stepsTotalDaily,stepsTotalDetailed,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23,HR05Perc,HR95Perc,HrAvgWake,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23,awakeDuration,lightSleepDuration,deepSleepDuration,wakeUpCount,HRAvgSleep,HRMinSleep,HRMaxSleep,activity_score_parent,school_yes_no,activity_score_child,screentime,sleep_score_child,bedtimeReport,waketimeReport,fvc_best,fev1_best,pef_best,grade_fev1,grade_fvc,Technique,fev1_ratio_best,fev1_percentage,fvc1_percentage,fev1_ratio_percentage,ACD1,ACD2,ACD3,ACD4,ACD5,ACD6,ACD6_use,ACD6score,BASELINE_ACQ6_score,EOS_ACQ6_score,BASELINE_physical_tot,EOS_physical_tot,BASELINE_emotional_tot,EOS_emotional_tot,BASELINE_school_tot,EOS_school_tot,BASELINE_bodyimage_tot,EOS_bodyimage_tot,BASELINE_treatmentburden_tot,EOS_treatmentburden_tot,BASELINE_health_tot,EOS_health_tot,BASELINE_weight_tot,EOS_weight_tot,BASELINE_respiratory_tot,EOS_respiratory_tot,BASELINE_digestion_tot,EOS_digestion_tot,BASELINE_PAQLQ_total,EOS_PAQLQ_total,BASELINE_PAQLQ_activity,EOS_PAQLQ_activity,BASELINE_PAQLQ_symptoms,EOS_PAQLQ_symptoms,BASELINE_PAQLQ_emotion,EOS_PAQLQ_emotion,BASELINE_PedsQL_score,EOS_PedsQL_score,FG,FHX,FHN,TG,TN,TX,SQ,SP,DR,RH,RHX,pollenTotal,NO,NO2,Nox,O3,Ox,PM10,PM25,Symptomscore,Admission_LY,Age,AgeDiagnosis,AllergicRhin,AtopicAsthma,BirthMonth,BirthYear,BMI,BMI_SDS,CF_Asthma,DailyActiv,DayCare,Dis_Chronic,Eosinophils,ExerciseSymp,Family_his,Gender,Height,Hosp_treatme,Hospital_Trt,ICS,LABA,Leukotri_Mod,Monoclonals,Oral_Steroid,OtherClass,PancrInsuffi,PetsHome,Pseudomonas,SABA,SchoolYear,Serum_IgE,SmokeHome,Sports,Vaccination,Weight,school_year_edit,Urbanisation,urbanisation2,totalSleepDuration,wear00,wear01,wear02,wear03,wear04,wear05,wear06,wear07,wear08,wear09,wear10,wear11,wear12,wear13,wear14,wear15,wear16,wear17,wear18,wear19,wear20,wear21,wear22,wear23,wear24H,wear16H,steps15_19,week,steps_hour_max,AVGHR_daily,AVGHR_wake,AVGHR_sleep,CONDITION,EventDay
0,1,1771258,0,Tue,school,3723.0,3723.0,,,,,,,,,,63.0,137.0,,,605.0,1640.0,,82.0,451.0,245.0,135.0,160.0,164.0,41.0,,68.0,120.0,8689362000.0,,,,,,,,,,,,120.0,,,,,,8985714000.0,8342857000.0,87875.0,91.5,93375.0,7716667000.0,7866667000.0,660.0,16800.0,16440.0,1.0,75.0,61.0,93.0,,,,,,,,3232.0,2381.0,4334.0,D,B,3.0,0.736696,0.726555,0.877857,0.822316,,,,,,,,,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,6.3,8,4,14.0,85,19.7,5.4,58,0.0,0,0,6.0,32.31,48.88,98.41,3.86,27.83,23.42,13.04,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,Female,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,Yes,0.0,51.9,Secondary school year 4,1.0,Extremely urbanised,33240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,54.0,69.0,,1.0,1640.0,90.0,94.0,,Controlled asthma,
1,2,1771258,1,Wed,school,10015.0,10015.0,,,,,,,,,607.0,580.0,325.0,180.0,322.0,181.0,491.0,161.0,10.0,744.0,1146.0,4355.0,722.0,142.0,49.0,,82.0,168.0,1326242000.0,73.6,7528571000.0,70.75,92.0,76.0,7033333000.0,8233333000.0,78.0,99.0,113.0,129.8,9516667000.0,9166667000.0,,108.5,85.25,93.4,9588889000.0,1018333000.0,1423074000.0,1078154000.0,104.2,9183333000.0,93.0,180.0,22200.0,11760.0,0.0,84.0,67.0,121.0,8.0,Ja,8.0,B. 30 tot 60 minuten,9.0,15:34:00,13:04:00,3261.0,2257.0,4509.0,A,A,3.0,0.692119,0.688716,0.885734,0.772559,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.5,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,10.0,12,8,11.3,100,13.1,0.2,2,0.9,2,1,3.0,3.47,32.18,37.49,25.72,29.69,10.83,3.09,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,Female,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,Yes,0.0,51.9,Secondary school year 4,1.0,Extremely urbanised,33960.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,2061.0,1.0,4355.0,94.0,102.0,76.0,Controlled asthma,
2,3,1771258,2,Thu,school,3811.0,3811.0,,,,,,,,63.0,561.0,159.0,356.0,26.0,591.0,109.0,114.0,456.0,155.0,727.0,46.0,139.0,210.0,6.0,93.0,,71.0,122.0,9030597000.0,7916667000.0,7316667000.0,8033333000.0,78.0,7614286000.0,7466667000.0,7814286000.0,98.5,1034286000.0,1005714000.0,1096667000.0,1006667000.0,105.0,109.0,86.5,8866667000.0,102.8,78.2,85.2,94.0,100.0,89.25,96.5,82.0,300.0,19260.0,12360.0,1.0,76.0,64.0,129.0,5.0,Ja,6.0,B. 30 tot 60 minuten,9.0,15:19:00,13:09:00,3451.0,2567.0,4717.0,B,B,3.0,0.743842,0.783245,0.937255,0.830297,0.0,0.0,1.0,1.0,0.0,6.0,3.0,0.833333,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,6.0,9,4,9.9,62,13.3,8.2,89,0.0,0,0,1.0,6.8,37.86,48.29,26.94,33.27,16.22,4.94,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,Female,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,Yes,0.0,51.9,Secondary school year 4,1.0,Extremely urbanised,31620.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,1384.0,1.0,727.0,90.0,96.0,77.0,Controlled asthma,
3,4,1771258,3,Fri,school,4346.0,4346.0,,,,,,,,,515.0,673.0,377.0,490.0,192.0,322.0,180.0,523.0,61.0,359.0,174.0,95.0,148.0,196.0,21.0,20.0,66.0,128.0,8843151000.0,7716667000.0,7985714000.0,7266667000.0,67.0,80.2,76.0,73.6,79.0,9857143000.0,103.5,1064286000.0,95.5,1062857000.0,9828571000.0,100.0,8433333000.0,8683333000.0,97.2,74.0,107.0,8242857000.0,9616667000.0,9083333000.0,7828571000.0,1140.0,17041.0,19619.0,1.0,75.0,59.0,107.0,5.0,Ja,6.0,B. 30 tot 60 minuten,8.0,15:49:00,13:19:00,3419.0,2497.0,4839.0,B,B,3.0,0.730331,0.761887,0.928565,0.815214,0.0,0.0,1.0,1.0,0.0,4.0,2.0,0.666667,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,6.9,10,4,9.5,63,11.7,3.2,35,0.0,0,0,1.0,19.32,47.46,77.08,6.62,28.13,20.86,12.79,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,Female,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,Yes,0.0,51.9,Secondary school year 4,1.0,Extremely urbanised,36660.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,1117.0,1.0,673.0,88.0,93.0,75.0,Controlled asthma,
4,5,1771258,4,Sat,weekend,3270.0,3270.0,,,,,,,,,,50.0,92.0,167.0,1324.0,388.0,166.0,101.0,91.0,6.0,224.0,405.0,56.0,65.0,63.0,72.0,64.0,111.0,8184138000.0,8333333000.0,7766667000.0,7516667000.0,7628571000.0,7266667000.0,75.8,7383333000.0,68.0,7483333000.0,65.4,94.4,8571429000.0,77.25,8657143000.0,78.0,7666667000.0,90.0,8914286000.0,9228571000.0,110625.0,7742857000.0,8114286000.0,76.5,82.0,540.0,17340.0,15300.0,2.0,67.0,57.0,88.0,6.0,Nee,6.0,B. 30 tot 60 minuten,8.0,16:04:00,15:04:00,3519.0,2528.0,5.07,B,B,3.0,0.718386,0.77128,0.955636,0.801884,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.333333,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,9.3,12,6,11.2,98,12.9,0.5,5,5.5,38,10,0.0,1.29,24.07,26.05,29.31,27.51,10.35,3.31,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,Female,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,Yes,0.0,51.9,Secondary school year 4,1.0,Extremely urbanised,32640.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,422.0,1.0,1324.0,81.0,83.0,77.0,Controlled asthma,


# Issue with HR columns

In [2300]:
# In the asthma dataset there seems to be a problem with HR00 to HR23
# The heart rate goes above a million sometimes
# It seems like the first couple digits correspond with the expected heart rate.
# 2 cases: HR below 100 and HR above 100

# Loop through each column in the dataframe
for col in asthma_df.loc[:, "HR00":"HR23"]:
    for i in range(len(asthma_df[col])):
        if asthma_df.loc[i, col] > 1000:
            # Take first 2 digits of float
            first_2_digits = str(asthma_df.loc[i, col])[:2]
            temp_number = float(first_2_digits)
            # If heart rate lower than 20 it means it should be greater than 100 (assuming heart rates < 200)
            if temp_number < 20:
                val = str(asthma_df.loc[i, col])[:3] + '.' + str(asthma_df.loc[i, col])[3]
                asthma_df.loc[i, col] = float(val)
            else:
                val = str(asthma_df.loc[i, col])[:2] + '.' + str(asthma_df.loc[i, col])[2]
                asthma_df.loc[i, col] = float(val)

In [2301]:
# Verify that it works 
asthma_df.loc[:, "HR00":"HR23"].head(100)

Unnamed: 0,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23
0,,,,,,,,,,,,120.0,,,,,,89.8,83.4,87.8,91.5,93.3,77.1,78.6
1,73.6,75.2,70.75,92.0,76.0,70.3,82.3,78.0,99.0,113.0,129.8,95.1,91.6,,108.5,85.25,93.4,95.8,101.8,142.3,107.8,104.2,91.8,93.0
2,79.1,73.1,80.3,78.0,76.1,74.6,78.1,98.5,103.4,100.5,109.6,100.6,105.0,109.0,86.5,88.6,102.8,78.2,85.2,94.0,100.0,89.25,96.5,82.0
3,77.1,79.8,72.6,67.0,80.2,76.0,73.6,79.0,98.5,103.5,106.4,95.5,106.2,98.2,100.0,84.3,86.8,97.2,74.0,107.0,82.4,96.1,90.8,78.2
4,83.3,77.6,75.1,76.2,72.6,75.8,73.8,68.0,74.8,65.4,94.4,85.7,77.25,86.5,78.0,76.6,90.0,89.1,92.2,110.6,77.4,81.1,76.5,82.0
5,67.3,68.7,73.75,64.8,62.3,61.8,69.0,75.3,69.0,76.0,100.6,93.0,70.5,89.3,81.0,93.3,98.7,107.6,101.6,86.1,86.3,91.1,89.5,74.75
6,71.8,75.0,74.3,64.0,69.5,73.3,72.0,78.25,103.0,103.0,93.8,93.1,89.5,89.7,96.1,84.5,86.5,95.1,88.6,96.5,86.0,79.0,80.5,75.0
7,82.2,70.1,68.5,64.4,69.8,63.0,89.0,99.5,94.7,101.4,97.6,99.5,94.1,88.6,84.6,101.7,95.8,94.8,89.5,101.0,100.25,88.3,81.1,70.8
8,73.5,98.0,62.0,66.7,63.8,64.3,61.5,60.0,88.8,117.8,99.0,90.5,83.2,81.0,91.6,98.6,86.6,89.0,106.6,131.5,98.7,82.1,69.2,72.2
9,88.3,65.1,63.8,63.1,58.5,68.0,74.8,87.1,93.1,104.7,99.8,95.6,98.4,93.1,98.25,80.3,91.5,86.4,94.8,104.2,86.1,86.0,74.0,78.5


# Removing outliers

In [2302]:
# TODO

# Subject 2429672 has no data about gender, sports and urbanisation 
asthma_df = asthma_df[asthma_df['SubjectNr'] != 2429672]

# Alligning column names

In [2303]:
# Renaming columns (allignment)
asthma_df = asthma_df.rename(columns={'Gender': 'sex'})
asthma_df = asthma_df.rename(columns={'Weight': 'weight'})
asthma_df = asthma_df.rename(columns={'Height': 'height'})

# What grade they are in
healthy_df = healthy_df.rename(columns={'school_year_final': 'school_year'})
asthma_df = asthma_df.rename(columns={'school_year_edit': 'school_year'})

# Wheter they do a sport
asthma_df = asthma_df.rename(columns={'Sports': 'sportsyesno'})

# Note there are 2 urbanisations
asthma_df = asthma_df.rename(columns={'urbanisation2': 'urbanisation'})

# pedsql
asthma_df = asthma_df.rename(columns={'BASELINE_PedsQL_score': 'PedsQL_score_baseline'})

# Note healthy_df doesn't have all wear variables (it only has the grped vars)
# Asthma_df doesnt have wear05H lets compute it based on the other wear vars
asthma_df['wear05H'] = (asthma_df.loc[:, 'wear00':'wear05'] == 1).mean(axis=1) * 100

# No temperature in asthma_df but is in healthy_df?
# Drop the col
healthy_df = healthy_df.drop('BODY_TEMPERATURE_DEG_C', axis=1)

# No blood preassure in asthma_df but is in healthy_df?
# drop in healthy
healthy_df = healthy_df.drop('DIASTOLIC_BLOOD_PRESSURE_MMHG', axis=1)
healthy_df = healthy_df.drop('SYSTOLIC_BLOOD_PRESSURE_MMHG', axis=1)

# Note that WEIGHT_KG is a daily meassurement in healthy_df
# in asthma_df this meassurement is only taken at begin and end of the study period
# Also missing data is 87% lets drop this col
healthy_df = healthy_df.drop('WEIGHT_KG', axis=1)

# I am assuming these columns are the same (bedtimeReport and waketimeReport not in legend))
asthma_df = asthma_df.rename(columns={'bedtimeReport': 'sleeptime'})
asthma_df = asthma_df.rename(columns={'waketimeReport': 'waketime'})

# Create a new column with the hourly categories starting from 0 to 23
hour_mapping = {i: i for i in range(24)}

# Converting to datetime
healthy_df['sleeptime'] = pd.to_datetime(healthy_df['sleeptime'])
asthma_df['sleeptime'] = pd.to_datetime(asthma_df['sleeptime'])
healthy_df['waketime'] = pd.to_datetime(healthy_df['waketime'])
asthma_df['waketime'] = pd.to_datetime(asthma_df['waketime'])

# Setting hourly value
healthy_df['sleeptime'] = healthy_df['sleeptime'].dt.hour
asthma_df['sleeptime'] = asthma_df['sleeptime'].dt.hour
healthy_df['waketime'] = healthy_df['waketime'].dt.hour
asthma_df['waketime'] = asthma_df['waketime'].dt.hour

# predicted_fvc_best, predicted_fev1_best, predicted_fev1_ratio_best
# All of these not in asthma
healthy_df = healthy_df.drop('predicted_fvc_best', axis=1)
healthy_df = healthy_df.drop('predicted_fev1_best', axis=1)
healthy_df = healthy_df.drop('predicted_fev1_ratio_best', axis=1)

# Screentime, values need to be alligned 
# Alignment dictionary
alignment_dict = {
    '0': '0',
    'D. 2 uur tot 4 uur': '2-4 hours',
    'A. 0 tot 30 minuten': '0-30 min',
    'C. 1 uur tot 2 uur': '1-2 hours',
    'B. 30 tot 60 minuten': '0.5-1 hours',
    'E. Meer dan 4 uur': '> 4 hours'
}

# Rename values in the first dataframe based on the alignment dictionary
asthma_df['screentime'] = asthma_df['screentime'].replace(alignment_dict)

In [2304]:
healthy_df.head(5)

Unnamed: 0,Index,SubjectNr,DayNo,weekday,dayType,Age,sex,weight,height,BMI_SDS,ethnicity,school_year,sportsyesno,urbanisation,PedsQL_score_baseline,stepsTotalDaily,steps_hour_max,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23,HR05Perc,HR95Perc,HRMinSleep,HRMaxSleep,AVGHR_daily,AVGHR_sleep,AVGHR_wake,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23,wear05H,wear16H,wear24H,HEART_PULSE_BPM,awakeDuration,lightSleepDuration,deepSleepDuration,wakeUpCount,sleeptime,waketime,fvc_best,fev1_best,pef_best,grade_fev1,grade_fvc,fev1_ratio_best,fev1_percentage,fvc1_percentage,fev1_ratio_percentage,school_yes_no,screentime,FG,FHX,FHN,TG,TN,TX,SQ,SP,DR,RH,RHX
0,1,2853189,0,Thu,holiday,2,Female,12.0,92.0,-1.8,Other / Mixed,Day care,No,Extremely urbanised,97.826087,2806.0,1992.0,,,,,,,,,,,,7.0,,,,,480.0,1992.0,156.0,51.0,61.0,39.0,,20.0,56.0,123.0,,,96.0,,97.0,,,,,,,,,,,,,,,,,,115.5,106.5,109.0,,56.0,,92.0,0,44,33,,,,,,,,,,,,,,,,,,,5.6,8,4,5.2,3.4,7.5,2.0,25,1.9,9,2
1,2,2853189,1,Fri,holiday,2,Female,12.0,92.0,-1.8,Other / Mixed,Day care,No,Extremely urbanised,97.826087,2564.0,1205.0,151.0,,,,,,,,,,,,,32.0,9.0,58.0,,,,1205.0,599.0,,,,52.0,163.0,,,126.0,119.0,128.0,119.0,,,,,,,,,,,,,,,,,,,126.5,113.6,145.0,,,20,38,29,,,,,,,,,,,,,,,,,Neither,,7.9,9,4,5.9,4.4,7.2,0.0,0,0.0,-1,-1
2,3,2853189,2,Sat,holiday,2,Female,12.0,92.0,-1.8,Other / Mixed,Day care,No,Extremely urbanised,97.826087,521.0,369.0,,,,,,,,,,,,369.0,78.0,,,,,,,74.0,,,,,71.0,137.0,,,103.0,,103.0,,,,,,,,,,,,107.0,,,,,,,102.5,100.333333,,,,,0,25,17,,,,,,,,,,,,,,,,,Neither,,9.2,11,8,7.1,6.0,7.8,0.0,0,0.2,1,1
3,4,2853189,3,Sun,holiday,2,Female,12.0,92.0,-1.8,Other / Mixed,Day care,No,Extremely urbanised,97.826087,250.0,114.0,,,,,,,,,,,,,114.0,106.0,,,,,,,30.0,,,,151.0,156.0,,,154.0,,154.0,,,,,,,,,,,,,,153.5,,,,,,,,,,,0,19,12,,,,,,,,,,,,,,,,,,,4.3,8,1,6.4,4.5,7.0,0.0,0,0.0,-1,-1
4,5,2853189,4,Mon,school,2,Female,12.0,92.0,-1.8,Other / Mixed,Day care,No,Extremely urbanised,97.826087,9.0,9.0,,,,,,,,,,,,,,,,,,,,9.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,6,4,,,,,,,,,,,,,,,,,Neither,,7.3,14,1,8.0,4.7,9.8,0.0,0,0.6,3,1


In [2305]:
asthma_df.head(5)

Unnamed: 0.1,Unnamed: 0,SubjectNr,DayNo,weekday,dayType,stepsTotalDaily,stepsTotalDetailed,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23,HR05Perc,HR95Perc,HrAvgWake,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23,awakeDuration,lightSleepDuration,deepSleepDuration,wakeUpCount,HRAvgSleep,HRMinSleep,HRMaxSleep,activity_score_parent,school_yes_no,activity_score_child,screentime,sleep_score_child,sleeptime,waketime,fvc_best,fev1_best,pef_best,grade_fev1,grade_fvc,Technique,fev1_ratio_best,fev1_percentage,fvc1_percentage,fev1_ratio_percentage,ACD1,ACD2,ACD3,ACD4,ACD5,ACD6,ACD6_use,ACD6score,BASELINE_ACQ6_score,EOS_ACQ6_score,BASELINE_physical_tot,EOS_physical_tot,BASELINE_emotional_tot,EOS_emotional_tot,BASELINE_school_tot,EOS_school_tot,BASELINE_bodyimage_tot,EOS_bodyimage_tot,BASELINE_treatmentburden_tot,EOS_treatmentburden_tot,BASELINE_health_tot,EOS_health_tot,BASELINE_weight_tot,EOS_weight_tot,BASELINE_respiratory_tot,EOS_respiratory_tot,BASELINE_digestion_tot,EOS_digestion_tot,BASELINE_PAQLQ_total,EOS_PAQLQ_total,BASELINE_PAQLQ_activity,EOS_PAQLQ_activity,BASELINE_PAQLQ_symptoms,EOS_PAQLQ_symptoms,BASELINE_PAQLQ_emotion,EOS_PAQLQ_emotion,PedsQL_score_baseline,EOS_PedsQL_score,FG,FHX,FHN,TG,TN,TX,SQ,SP,DR,RH,RHX,pollenTotal,NO,NO2,Nox,O3,Ox,PM10,PM25,Symptomscore,Admission_LY,Age,AgeDiagnosis,AllergicRhin,AtopicAsthma,BirthMonth,BirthYear,BMI,BMI_SDS,CF_Asthma,DailyActiv,DayCare,Dis_Chronic,Eosinophils,ExerciseSymp,Family_his,sex,height,Hosp_treatme,Hospital_Trt,ICS,LABA,Leukotri_Mod,Monoclonals,Oral_Steroid,OtherClass,PancrInsuffi,PetsHome,Pseudomonas,SABA,SchoolYear,Serum_IgE,SmokeHome,sportsyesno,Vaccination,weight,school_year,Urbanisation,urbanisation,totalSleepDuration,wear00,wear01,wear02,wear03,wear04,wear05,wear06,wear07,wear08,wear09,wear10,wear11,wear12,wear13,wear14,wear15,wear16,wear17,wear18,wear19,wear20,wear21,wear22,wear23,wear24H,wear16H,steps15_19,week,steps_hour_max,AVGHR_daily,AVGHR_wake,AVGHR_sleep,CONDITION,EventDay,wear05H
0,1,1771258,0,Tue,school,3723.0,3723.0,,,,,,,,,,63.0,137.0,,,605.0,1640.0,,82.0,451.0,245.0,135.0,160.0,164.0,41.0,,68.0,120.0,8689362000.0,,,,,,,,,,,,120.0,,,,,,89.8,83.4,87.8,91.5,93.3,77.1,78.6,660.0,16800.0,16440.0,1.0,75.0,61.0,93.0,,,,,,,,3232.0,2381.0,4334.0,D,B,3.0,0.736696,0.726555,0.877857,0.822316,,,,,,,,,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,6.3,8,4,14.0,85,19.7,5.4,58,0.0,0,0,6.0,32.31,48.88,98.41,3.86,27.83,23.42,13.04,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,Female,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,Yes,0.0,51.9,Secondary school year 4,1.0,Extremely urbanised,33240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,54.0,69.0,,1.0,1640.0,90.0,94.0,,Controlled asthma,,0.0
1,2,1771258,1,Wed,school,10015.0,10015.0,,,,,,,,,607.0,580.0,325.0,180.0,322.0,181.0,491.0,161.0,10.0,744.0,1146.0,4355.0,722.0,142.0,49.0,,82.0,168.0,1326242000.0,73.6,75.2,70.75,92.0,76.0,70.3,82.3,78.0,99.0,113.0,129.8,95.1,91.6,,108.5,85.25,93.4,95.8,101.8,142.3,107.8,104.2,91.8,93.0,180.0,22200.0,11760.0,0.0,84.0,67.0,121.0,8.0,Ja,8.0,0.5-1 hours,9.0,15.0,13.0,3261.0,2257.0,4509.0,A,A,3.0,0.692119,0.688716,0.885734,0.772559,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.5,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,10.0,12,8,11.3,100,13.1,0.2,2,0.9,2,1,3.0,3.47,32.18,37.49,25.72,29.69,10.83,3.09,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,Female,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,Yes,0.0,51.9,Secondary school year 4,1.0,Extremely urbanised,33960.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,2061.0,1.0,4355.0,94.0,102.0,76.0,Controlled asthma,,100.0
2,3,1771258,2,Thu,school,3811.0,3811.0,,,,,,,,63.0,561.0,159.0,356.0,26.0,591.0,109.0,114.0,456.0,155.0,727.0,46.0,139.0,210.0,6.0,93.0,,71.0,122.0,9030597000.0,79.1,73.1,80.3,78.0,76.1,74.6,78.1,98.5,103.4,100.5,109.6,100.6,105.0,109.0,86.5,88.6,102.8,78.2,85.2,94.0,100.0,89.25,96.5,82.0,300.0,19260.0,12360.0,1.0,76.0,64.0,129.0,5.0,Ja,6.0,0.5-1 hours,9.0,15.0,13.0,3451.0,2567.0,4717.0,B,B,3.0,0.743842,0.783245,0.937255,0.830297,0.0,0.0,1.0,1.0,0.0,6.0,3.0,0.833333,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,6.0,9,4,9.9,62,13.3,8.2,89,0.0,0,0,1.0,6.8,37.86,48.29,26.94,33.27,16.22,4.94,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,Female,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,Yes,0.0,51.9,Secondary school year 4,1.0,Extremely urbanised,31620.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,1384.0,1.0,727.0,90.0,96.0,77.0,Controlled asthma,,100.0
3,4,1771258,3,Fri,school,4346.0,4346.0,,,,,,,,,515.0,673.0,377.0,490.0,192.0,322.0,180.0,523.0,61.0,359.0,174.0,95.0,148.0,196.0,21.0,20.0,66.0,128.0,8843151000.0,77.1,79.8,72.6,67.0,80.2,76.0,73.6,79.0,98.5,103.5,106.4,95.5,106.2,98.2,100.0,84.3,86.8,97.2,74.0,107.0,82.4,96.1,90.8,78.2,1140.0,17041.0,19619.0,1.0,75.0,59.0,107.0,5.0,Ja,6.0,0.5-1 hours,8.0,15.0,13.0,3419.0,2497.0,4839.0,B,B,3.0,0.730331,0.761887,0.928565,0.815214,0.0,0.0,1.0,1.0,0.0,4.0,2.0,0.666667,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,6.9,10,4,9.5,63,11.7,3.2,35,0.0,0,0,1.0,19.32,47.46,77.08,6.62,28.13,20.86,12.79,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,Female,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,Yes,0.0,51.9,Secondary school year 4,1.0,Extremely urbanised,36660.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,1117.0,1.0,673.0,88.0,93.0,75.0,Controlled asthma,,100.0
4,5,1771258,4,Sat,weekend,3270.0,3270.0,,,,,,,,,,50.0,92.0,167.0,1324.0,388.0,166.0,101.0,91.0,6.0,224.0,405.0,56.0,65.0,63.0,72.0,64.0,111.0,8184138000.0,83.3,77.6,75.1,76.2,72.6,75.8,73.8,68.0,74.8,65.4,94.4,85.7,77.25,86.5,78.0,76.6,90.0,89.1,92.2,110.6,77.4,81.1,76.5,82.0,540.0,17340.0,15300.0,2.0,67.0,57.0,88.0,6.0,Nee,6.0,0.5-1 hours,8.0,16.0,15.0,3519.0,2528.0,5.07,B,B,3.0,0.718386,0.77128,0.955636,0.801884,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.333333,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,9.3,12,6,11.2,98,12.9,0.5,5,5.5,38,10,0.0,1.29,24.07,26.05,29.31,27.51,10.35,3.31,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,Female,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,Yes,0.0,51.9,Secondary school year 4,1.0,Extremely urbanised,32640.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,422.0,1.0,1324.0,81.0,83.0,77.0,Controlled asthma,,100.0


### Technique

About technique 2 Matthijs wrote this:
- Is it possible to do your analyses with both options to see whether it impacts the performance of your models? If not, I would just use the ones marked ‘2’ anyway to include as much data as possible.


In [2306]:
asthma_df['Technique'].value_counts()

3.0    1601
2.0     152
1.0      37
Name: Technique, dtype: int64

In [2307]:
asthma_df = asthma_df.drop(asthma_df[asthma_df['Technique'] == 1].index)

# One-hot encoding

In [2308]:
# Looking at the possible values of different columns in the healthy and sick database.
print("Let's allign these values: ")
print(healthy_df['weekday'].unique())
print(asthma_df['weekday'].unique())
print()
print(healthy_df['dayType'].unique())
print(asthma_df['dayType'].unique())
print()
print(healthy_df['school_yes_no'].unique())
print(asthma_df['school_yes_no'].unique())
print()
print(healthy_df['sex'].unique())
print(asthma_df['sex'].unique())
print()
print(healthy_df['school_year'].unique())
print(asthma_df['school_year'].unique())
print()
print(healthy_df['sportsyesno'].unique())
print(asthma_df['sportsyesno'].unique())
print()
print(healthy_df['urbanisation'].unique())
print(asthma_df['urbanisation'].unique())
print()
print(healthy_df['grade_fev1'].unique())
print(asthma_df['grade_fev1'].unique())
print()
print(healthy_df['grade_fvc'].unique())
print(asthma_df['grade_fvc'].unique())
print()
print(healthy_df['fev1_ratio_best'].unique())
print(asthma_df['fev1_ratio_best'].unique())
print("I am not sure why there are only true or nans here???")
print()
print(healthy_df['screentime'].unique())
print(asthma_df['screentime'].unique()) 

Let's allign these values: 
['Thu' 'Fri' 'Sat' 'Sun' 'Mon' 'Tue' 'Wed']
['Tue' 'Wed' 'Thu' 'Fri' 'Sat' 'Sun' 'Mon']

['holiday' 'school' 'weekend']
['school' 'weekend' 'holiday']

[nan 'Neither' 'School' 'Day Care']
[nan 'Ja' 'Nee']

['Female' 'Male']
['Female' 'Male']

['Day care' 'Primary school' 'Secondary school' 'Vocational education']
['Secondary school year 4' 'Primary school year 5'
 'Secondary school year 1' 'Primary school year 7' 'Primary school year 4'
 'Primary school year 8' 'Secondary school year 3' 'Primary school year 3'
 'Secondary school year 2' 'Primary school year 6' nan]

['No' 'Yes']
['Yes' 'No' nan]

['Extremely urbanised' 'Moderately urbanised' 'Very urbanised'
 'Little urbanised']
['Extremely urbanised' 'Not extremely urbanised']

[nan 'E' 'A' 'C' 'U' 'B' 'D' 'F']
['D' 'A' 'B' 'E' 'C' 'U' nan]

[nan 'E' 'A' 'C' 'U' 'B' 'D' 'F']
['B' 'A' 'C' 'E' 'D' 'U' nan]

[nan True]
[0.73669555 0.69211898 0.74384237 ... 0.83669886 0.83624273 0.80671937]
I am not sure why th

In [2309]:
# school_yes_no
# for healthy: school, daycare or neither
# for asthma: yes or no (school)
# Decision: school and daycare are the same class
# note that nan will be also be a category in the one-hot encoding (unknown)
healthy_df['school_yes_no'] = healthy_df['school_yes_no'].replace({'Neither': 'no', 'Day Care': 'yes', 'School': 'yes'})
asthma_df['school_yes_no'] = asthma_df['school_yes_no'].replace({'Nee': 'no', 'Ja': 'yes'})

# sex
# it doesnt make sense to one hot encode nan for one subject 
# this won't improve classification, we will have to assume a gender 
# Women are more likely to classify as non-binary --> assume female
# https://www.pewresearch.org/social-trends/2022/06/28/americans-complex-views-on-gender-identity-and-transgender-issues/
asthma_df['sex'] = asthma_df['sex'].fillna('Female')

# school year
# I think it will be best to drop this its likely to different in each data set 
healthy_df = healthy_df.drop('school_year', axis=1)
asthma_df = asthma_df.drop('school_year', axis=1)

# sports_yes_no
# A couple subjects didn't fill this in, I think it is a fair assumption to made that 
# if they didn't fill it in than they didn't do sports
asthma_df['sportsyesno'] = asthma_df['sportsyesno'].fillna('No')

# Urbanization
# Decision: Extremely and very will be merged to extremely urbanized
# moderately and little urbanized will be merged to not extremely urbanized
healthy_df['urbanisation'].replace({'Very urbanised': 'Extremely urbanised',
                                    'Moderately urbanised': 'Not extremely urbanised', 
                                    'Little urbanised': 'Not extremely urbanised'}, 
                                   inplace=True)

In [2310]:
print("new classes (they have to be the same): ")
print(healthy_df['weekday'].unique())
print(asthma_df['weekday'].unique())
print()
print(healthy_df['dayType'].unique())
print(asthma_df['dayType'].unique())
print()
print(healthy_df['school_yes_no'].unique())
print(asthma_df['school_yes_no'].unique())
print()
print(healthy_df['sex'].unique())
print(asthma_df['sex'].unique())
print()
print(healthy_df['sportsyesno'].unique())
print(asthma_df['sportsyesno'].unique())
print()
print(healthy_df['urbanisation'].unique())
print(asthma_df['urbanisation'].unique())
print()
print(healthy_df['grade_fev1'].unique())
print(asthma_df['grade_fev1'].unique())
print()
print(healthy_df['grade_fvc'].unique())
print(asthma_df['grade_fvc'].unique())
print()
print(healthy_df['fev1_ratio_best'].unique())
print(asthma_df['fev1_ratio_best'].unique())
print("I am not sure why there are only true or nans here???")
print()
print(healthy_df['screentime'].unique())
print(asthma_df['screentime'].unique()) 

new classes (they have to be the same): 
['Thu' 'Fri' 'Sat' 'Sun' 'Mon' 'Tue' 'Wed']
['Tue' 'Wed' 'Thu' 'Fri' 'Sat' 'Sun' 'Mon']

['holiday' 'school' 'weekend']
['school' 'weekend' 'holiday']

[nan 'no' 'yes']
[nan 'yes' 'no']

['Female' 'Male']
['Female' 'Male']

['No' 'Yes']
['Yes' 'No']

['Extremely urbanised' 'Not extremely urbanised']
['Extremely urbanised' 'Not extremely urbanised']

[nan 'E' 'A' 'C' 'U' 'B' 'D' 'F']
['D' 'A' 'B' 'E' 'C' 'U' nan]

[nan 'E' 'A' 'C' 'U' 'B' 'D' 'F']
['B' 'A' 'C' 'E' 'D' 'U' nan]

[nan True]
[0.73669555 0.69211898 0.74384237 ... 0.83669886 0.83624273 0.80671937]
I am not sure why there are only true or nans here???

[nan '2-4 hours' '0-30 min' '0.5-1 hours' '1-2 hours' '> 4 hours']
[nan '0.5-1 hours' '1-2 hours' '0-30 min' '2-4 hours' '> 4 hours']


In [2311]:
# one-hot encoding
one_hot_asthma = pd.get_dummies(asthma_df[['weekday', 'dayType', 'school_yes_no', 'sex', 'sportsyesno', 'urbanisation',
                                          'grade_fev1', 'grade_fvc', 'screentime']])
one_hot_healthy = pd.get_dummies(healthy_df[['weekday', 'dayType', 'school_yes_no', 'sex', 'sportsyesno', 'urbanisation',
                                          'grade_fev1', 'grade_fvc', 'screentime']])

# Merging
asthma_df = pd.concat([asthma_df, one_hot_asthma], axis=1)
healthy_df = pd.concat([healthy_df, one_hot_healthy], axis=1)

# Make sure to drop old columns (they are replaced with the hot-encoded cols)
healthy_df = healthy_df.drop(columns=['weekday', 'dayType', 'school_yes_no', 'sex', 'sportsyesno', 'urbanisation',
                                          'grade_fev1', 'grade_fvc', 'screentime'])
asthma_df = asthma_df.drop(columns=['weekday', 'dayType', 'school_yes_no', 'sex', 'sportsyesno', 'urbanisation',
                                          'grade_fev1', 'grade_fvc', 'screentime'])

# Result from one-hot encoding

In [2312]:
healthy_df.head(5)

Unnamed: 0,Index,SubjectNr,DayNo,Age,weight,height,BMI_SDS,ethnicity,PedsQL_score_baseline,stepsTotalDaily,steps_hour_max,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23,HR05Perc,HR95Perc,HRMinSleep,HRMaxSleep,AVGHR_daily,AVGHR_sleep,AVGHR_wake,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23,wear05H,wear16H,wear24H,HEART_PULSE_BPM,awakeDuration,lightSleepDuration,deepSleepDuration,wakeUpCount,sleeptime,waketime,fvc_best,fev1_best,pef_best,fev1_ratio_best,fev1_percentage,fvc1_percentage,fev1_ratio_percentage,FG,FHX,FHN,TG,TN,TX,SQ,SP,DR,RH,RHX,weekday_Fri,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed,dayType_holiday,dayType_school,dayType_weekend,school_yes_no_no,school_yes_no_yes,sex_Female,sex_Male,sportsyesno_No,sportsyesno_Yes,urbanisation_Extremely urbanised,urbanisation_Not extremely urbanised,grade_fev1_A,grade_fev1_B,grade_fev1_C,grade_fev1_D,grade_fev1_E,grade_fev1_F,grade_fev1_U,grade_fvc_A,grade_fvc_B,grade_fvc_C,grade_fvc_D,grade_fvc_E,grade_fvc_F,grade_fvc_U,screentime_0-30 min,screentime_0.5-1 hours,screentime_1-2 hours,screentime_2-4 hours,screentime_> 4 hours
0,1,2853189,0,2,12.0,92.0,-1.8,Other / Mixed,97.826087,2806.0,1992.0,,,,,,,,,,,,7.0,,,,,480.0,1992.0,156.0,51.0,61.0,39.0,,20.0,56.0,123.0,,,96.0,,97.0,,,,,,,,,,,,,,,,,,115.5,106.5,109.0,,56.0,,92.0,0,44,33,,,,,,,,,,,,,,,5.6,8,4,5.2,3.4,7.5,2.0,25,1.9,9,2,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,2853189,1,2,12.0,92.0,-1.8,Other / Mixed,97.826087,2564.0,1205.0,151.0,,,,,,,,,,,,,32.0,9.0,58.0,,,,1205.0,599.0,,,,52.0,163.0,,,126.0,119.0,128.0,119.0,,,,,,,,,,,,,,,,,,,126.5,113.6,145.0,,,20,38,29,,,,,,,,,,,,,,,7.9,9,4,5.9,4.4,7.2,0.0,0,0.0,-1,-1,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,2853189,2,2,12.0,92.0,-1.8,Other / Mixed,97.826087,521.0,369.0,,,,,,,,,,,,369.0,78.0,,,,,,,74.0,,,,,71.0,137.0,,,103.0,,103.0,,,,,,,,,,,,107.0,,,,,,,102.5,100.333333,,,,,0,25,17,,,,,,,,,,,,,,,9.2,11,8,7.1,6.0,7.8,0.0,0,0.2,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,2853189,3,2,12.0,92.0,-1.8,Other / Mixed,97.826087,250.0,114.0,,,,,,,,,,,,,114.0,106.0,,,,,,,30.0,,,,151.0,156.0,,,154.0,,154.0,,,,,,,,,,,,,,153.5,,,,,,,,,,,0,19,12,,,,,,,,,,,,,,,4.3,8,1,6.4,4.5,7.0,0.0,0,0.0,-1,-1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,2853189,4,2,12.0,92.0,-1.8,Other / Mixed,97.826087,9.0,9.0,,,,,,,,,,,,,,,,,,,,9.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,6,4,,,,,,,,,,,,,,,7.3,14,1,8.0,4.7,9.8,0.0,0,0.6,3,1,0,1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Delete colums 

In [2313]:
# Just an index we dont need it
healthy_df = healthy_df.drop('Index', axis=1)

# Subjects number are not relevant for predicting
# healthy_df = healthy_df.drop('SubjectNr', axis=1) # Akos: I need the SubjectNr to fill the HR

# We only have ethnicity in healthy_df
healthy_df = healthy_df.drop('ethnicity', axis=1)

In [2314]:
healthy_df.head(5)

Unnamed: 0,SubjectNr,DayNo,Age,weight,height,BMI_SDS,PedsQL_score_baseline,stepsTotalDaily,steps_hour_max,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23,HR05Perc,HR95Perc,HRMinSleep,HRMaxSleep,AVGHR_daily,AVGHR_sleep,AVGHR_wake,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23,wear05H,wear16H,wear24H,HEART_PULSE_BPM,awakeDuration,lightSleepDuration,deepSleepDuration,wakeUpCount,sleeptime,waketime,fvc_best,fev1_best,pef_best,fev1_ratio_best,fev1_percentage,fvc1_percentage,fev1_ratio_percentage,FG,FHX,FHN,TG,TN,TX,SQ,SP,DR,RH,RHX,weekday_Fri,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed,dayType_holiday,dayType_school,dayType_weekend,school_yes_no_no,school_yes_no_yes,sex_Female,sex_Male,sportsyesno_No,sportsyesno_Yes,urbanisation_Extremely urbanised,urbanisation_Not extremely urbanised,grade_fev1_A,grade_fev1_B,grade_fev1_C,grade_fev1_D,grade_fev1_E,grade_fev1_F,grade_fev1_U,grade_fvc_A,grade_fvc_B,grade_fvc_C,grade_fvc_D,grade_fvc_E,grade_fvc_F,grade_fvc_U,screentime_0-30 min,screentime_0.5-1 hours,screentime_1-2 hours,screentime_2-4 hours,screentime_> 4 hours
0,2853189,0,2,12.0,92.0,-1.8,97.826087,2806.0,1992.0,,,,,,,,,,,,7.0,,,,,480.0,1992.0,156.0,51.0,61.0,39.0,,20.0,56.0,123.0,,,96.0,,97.0,,,,,,,,,,,,,,,,,,115.5,106.5,109.0,,56.0,,92.0,0,44,33,,,,,,,,,,,,,,,5.6,8,4,5.2,3.4,7.5,2.0,25,1.9,9,2,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2853189,1,2,12.0,92.0,-1.8,97.826087,2564.0,1205.0,151.0,,,,,,,,,,,,,32.0,9.0,58.0,,,,1205.0,599.0,,,,52.0,163.0,,,126.0,119.0,128.0,119.0,,,,,,,,,,,,,,,,,,,126.5,113.6,145.0,,,20,38,29,,,,,,,,,,,,,,,7.9,9,4,5.9,4.4,7.2,0.0,0,0.0,-1,-1,1,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2853189,2,2,12.0,92.0,-1.8,97.826087,521.0,369.0,,,,,,,,,,,,369.0,78.0,,,,,,,74.0,,,,,71.0,137.0,,,103.0,,103.0,,,,,,,,,,,,107.0,,,,,,,102.5,100.333333,,,,,0,25,17,,,,,,,,,,,,,,,9.2,11,8,7.1,6.0,7.8,0.0,0,0.2,1,1,0,0,1,0,0,0,0,1,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2853189,3,2,12.0,92.0,-1.8,97.826087,250.0,114.0,,,,,,,,,,,,,114.0,106.0,,,,,,,30.0,,,,151.0,156.0,,,154.0,,154.0,,,,,,,,,,,,,,153.5,,,,,,,,,,,0,19,12,,,,,,,,,,,,,,,4.3,8,1,6.4,4.5,7.0,0.0,0,0.0,-1,-1,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2853189,4,2,12.0,92.0,-1.8,97.826087,9.0,9.0,,,,,,,,,,,,,,,,,,,,9.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,6,4,,,,,,,,,,,,,,,7.3,14,1,8.0,4.7,9.8,0.0,0,0.6,3,1,0,1,0,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [2315]:
asthma_df.head()

Unnamed: 0.1,Unnamed: 0,SubjectNr,DayNo,stepsTotalDaily,stepsTotalDetailed,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23,HR05Perc,HR95Perc,HrAvgWake,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23,awakeDuration,lightSleepDuration,deepSleepDuration,wakeUpCount,HRAvgSleep,HRMinSleep,HRMaxSleep,activity_score_parent,activity_score_child,sleep_score_child,sleeptime,waketime,fvc_best,fev1_best,pef_best,Technique,fev1_ratio_best,fev1_percentage,fvc1_percentage,fev1_ratio_percentage,ACD1,ACD2,ACD3,ACD4,ACD5,ACD6,ACD6_use,ACD6score,BASELINE_ACQ6_score,EOS_ACQ6_score,BASELINE_physical_tot,EOS_physical_tot,BASELINE_emotional_tot,EOS_emotional_tot,BASELINE_school_tot,EOS_school_tot,BASELINE_bodyimage_tot,EOS_bodyimage_tot,BASELINE_treatmentburden_tot,EOS_treatmentburden_tot,BASELINE_health_tot,EOS_health_tot,BASELINE_weight_tot,EOS_weight_tot,BASELINE_respiratory_tot,EOS_respiratory_tot,BASELINE_digestion_tot,EOS_digestion_tot,BASELINE_PAQLQ_total,EOS_PAQLQ_total,BASELINE_PAQLQ_activity,EOS_PAQLQ_activity,BASELINE_PAQLQ_symptoms,EOS_PAQLQ_symptoms,BASELINE_PAQLQ_emotion,EOS_PAQLQ_emotion,PedsQL_score_baseline,EOS_PedsQL_score,FG,FHX,FHN,TG,TN,TX,SQ,SP,DR,RH,RHX,pollenTotal,NO,NO2,Nox,O3,Ox,PM10,PM25,Symptomscore,Admission_LY,Age,AgeDiagnosis,AllergicRhin,AtopicAsthma,BirthMonth,BirthYear,BMI,BMI_SDS,CF_Asthma,DailyActiv,DayCare,Dis_Chronic,Eosinophils,ExerciseSymp,Family_his,height,Hosp_treatme,Hospital_Trt,ICS,LABA,Leukotri_Mod,Monoclonals,Oral_Steroid,OtherClass,PancrInsuffi,PetsHome,Pseudomonas,SABA,SchoolYear,Serum_IgE,SmokeHome,Vaccination,weight,Urbanisation,totalSleepDuration,wear00,wear01,wear02,wear03,wear04,wear05,wear06,wear07,wear08,wear09,wear10,wear11,wear12,wear13,wear14,wear15,wear16,wear17,wear18,wear19,wear20,wear21,wear22,wear23,wear24H,wear16H,steps15_19,week,steps_hour_max,AVGHR_daily,AVGHR_wake,AVGHR_sleep,CONDITION,EventDay,wear05H,weekday_Fri,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed,dayType_holiday,dayType_school,dayType_weekend,school_yes_no_no,school_yes_no_yes,sex_Female,sex_Male,sportsyesno_No,sportsyesno_Yes,urbanisation_Extremely urbanised,urbanisation_Not extremely urbanised,grade_fev1_A,grade_fev1_B,grade_fev1_C,grade_fev1_D,grade_fev1_E,grade_fev1_U,grade_fvc_A,grade_fvc_B,grade_fvc_C,grade_fvc_D,grade_fvc_E,grade_fvc_U,screentime_0-30 min,screentime_0.5-1 hours,screentime_1-2 hours,screentime_2-4 hours,screentime_> 4 hours
0,1,1771258,0,3723.0,3723.0,,,,,,,,,,63.0,137.0,,,605.0,1640.0,,82.0,451.0,245.0,135.0,160.0,164.0,41.0,,68.0,120.0,8689362000.0,,,,,,,,,,,,120.0,,,,,,89.8,83.4,87.8,91.5,93.3,77.1,78.6,660.0,16800.0,16440.0,1.0,75.0,61.0,93.0,,,,,,3232.0,2381.0,4334.0,3.0,0.736696,0.726555,0.877857,0.822316,,,,,,,,,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,6.3,8,4,14.0,85,19.7,5.4,58,0.0,0,0,6.0,32.31,48.88,98.41,3.86,27.83,23.42,13.04,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,0.0,51.9,1.0,33240.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,54.0,69.0,,1.0,1640.0,90.0,94.0,,Controlled asthma,,0.0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,2,1771258,1,10015.0,10015.0,,,,,,,,,607.0,580.0,325.0,180.0,322.0,181.0,491.0,161.0,10.0,744.0,1146.0,4355.0,722.0,142.0,49.0,,82.0,168.0,1326242000.0,73.6,75.2,70.75,92.0,76.0,70.3,82.3,78.0,99.0,113.0,129.8,95.1,91.6,,108.5,85.25,93.4,95.8,101.8,142.3,107.8,104.2,91.8,93.0,180.0,22200.0,11760.0,0.0,84.0,67.0,121.0,8.0,8.0,9.0,15.0,13.0,3261.0,2257.0,4509.0,3.0,0.692119,0.688716,0.885734,0.772559,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.5,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,10.0,12,8,11.3,100,13.1,0.2,2,0.9,2,1,3.0,3.47,32.18,37.49,25.72,29.69,10.83,3.09,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,0.0,51.9,1.0,33960.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,2061.0,1.0,4355.0,94.0,102.0,76.0,Controlled asthma,,100.0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
2,3,1771258,2,3811.0,3811.0,,,,,,,,63.0,561.0,159.0,356.0,26.0,591.0,109.0,114.0,456.0,155.0,727.0,46.0,139.0,210.0,6.0,93.0,,71.0,122.0,9030597000.0,79.1,73.1,80.3,78.0,76.1,74.6,78.1,98.5,103.4,100.5,109.6,100.6,105.0,109.0,86.5,88.6,102.8,78.2,85.2,94.0,100.0,89.25,96.5,82.0,300.0,19260.0,12360.0,1.0,76.0,64.0,129.0,5.0,6.0,9.0,15.0,13.0,3451.0,2567.0,4717.0,3.0,0.743842,0.783245,0.937255,0.830297,0.0,0.0,1.0,1.0,0.0,6.0,3.0,0.833333,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,6.0,9,4,9.9,62,13.3,8.2,89,0.0,0,0,1.0,6.8,37.86,48.29,26.94,33.27,16.22,4.94,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,0.0,51.9,1.0,31620.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,1384.0,1.0,727.0,90.0,96.0,77.0,Controlled asthma,,100.0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
3,4,1771258,3,4346.0,4346.0,,,,,,,,,515.0,673.0,377.0,490.0,192.0,322.0,180.0,523.0,61.0,359.0,174.0,95.0,148.0,196.0,21.0,20.0,66.0,128.0,8843151000.0,77.1,79.8,72.6,67.0,80.2,76.0,73.6,79.0,98.5,103.5,106.4,95.5,106.2,98.2,100.0,84.3,86.8,97.2,74.0,107.0,82.4,96.1,90.8,78.2,1140.0,17041.0,19619.0,1.0,75.0,59.0,107.0,5.0,6.0,8.0,15.0,13.0,3419.0,2497.0,4839.0,3.0,0.730331,0.761887,0.928565,0.815214,0.0,0.0,1.0,1.0,0.0,4.0,2.0,0.666667,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,6.9,10,4,9.5,63,11.7,3.2,35,0.0,0,0,1.0,19.32,47.46,77.08,6.62,28.13,20.86,12.79,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,0.0,51.9,1.0,36660.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,1117.0,1.0,673.0,88.0,93.0,75.0,Controlled asthma,,100.0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
4,5,1771258,4,3270.0,3270.0,,,,,,,,,,50.0,92.0,167.0,1324.0,388.0,166.0,101.0,91.0,6.0,224.0,405.0,56.0,65.0,63.0,72.0,64.0,111.0,8184138000.0,83.3,77.6,75.1,76.2,72.6,75.8,73.8,68.0,74.8,65.4,94.4,85.7,77.25,86.5,78.0,76.6,90.0,89.1,92.2,110.6,77.4,81.1,76.5,82.0,540.0,17340.0,15300.0,2.0,67.0,57.0,88.0,6.0,6.0,8.0,16.0,15.0,3519.0,2528.0,5.07,3.0,0.718386,0.77128,0.955636,0.801884,0.0,0.0,0.0,0.0,0.0,4.0,2.0,0.333333,0.166667,2666667000.0,,,,,,,,,,,,,,,,,,,6652174000.0,5304348000.0,6.4,5.6,6.5,4.2,7.0,6.5,847826087.0,8586957000.0,9.3,12,6,11.2,98,12.9,0.5,5,5.5,38,10,0.0,1.29,24.07,26.05,29.31,27.51,10.35,3.31,,1,15,7.0,No,Yes,12,2002.0,19.5,-0.1,Controlled asthma,School,0.0,0.0,,No,No,163.2,0.0,0.0,Yes,Yes,No,No,No,No,,Yes,,Yes,4.0,1465.0,No,0.0,51.9,1.0,32640.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,100.0,100.0,422.0,1.0,1324.0,81.0,83.0,77.0,Controlled asthma,,100.0,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0


# Setting the columns equal

In [2316]:
# Drop columns in asthma_df that aren't already in healthy_df
asthma_df = asthma_df.drop(columns=asthma_df.columns.difference(healthy_df.columns))

# Sort columns 
asthma_df = asthma_df.reindex(columns=healthy_df.columns)

# Assert the columns are the same
assert all(healthy_df.columns == asthma_df.columns)

In [2317]:
asthma_df.head(30)

Unnamed: 0,SubjectNr,DayNo,Age,weight,height,BMI_SDS,PedsQL_score_baseline,stepsTotalDaily,steps_hour_max,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23,HR05Perc,HR95Perc,HRMinSleep,HRMaxSleep,AVGHR_daily,AVGHR_sleep,AVGHR_wake,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23,wear05H,wear16H,wear24H,HEART_PULSE_BPM,awakeDuration,lightSleepDuration,deepSleepDuration,wakeUpCount,sleeptime,waketime,fvc_best,fev1_best,pef_best,fev1_ratio_best,fev1_percentage,fvc1_percentage,fev1_ratio_percentage,FG,FHX,FHN,TG,TN,TX,SQ,SP,DR,RH,RHX,weekday_Fri,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed,dayType_holiday,dayType_school,dayType_weekend,school_yes_no_no,school_yes_no_yes,sex_Female,sex_Male,sportsyesno_No,sportsyesno_Yes,urbanisation_Extremely urbanised,urbanisation_Not extremely urbanised,grade_fev1_A,grade_fev1_B,grade_fev1_C,grade_fev1_D,grade_fev1_E,grade_fev1_F,grade_fev1_U,grade_fvc_A,grade_fvc_B,grade_fvc_C,grade_fvc_D,grade_fvc_E,grade_fvc_F,grade_fvc_U,screentime_0-30 min,screentime_0.5-1 hours,screentime_1-2 hours,screentime_2-4 hours,screentime_> 4 hours
0,1771258,0,15,51.9,163.2,-0.1,847826087.0,3723.0,1640.0,,,,,,,,,,63.0,137.0,,,605.0,1640.0,,82.0,451.0,245.0,135.0,160.0,164.0,41.0,,68.0,120.0,61.0,93.0,90.0,,94.0,,,,,,,,,,,,120.0,,,,,,89.8,83.4,87.8,91.5,93.3,77.1,78.6,0.0,69.0,54.0,,660.0,16800.0,16440.0,1.0,,,3232.0,2381.0,4334.0,0.736696,0.726555,0.8778575,0.822316,6.3,8,4,14.0,85,19.7,5.4,58,0.0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,,0,0,1,0,0,0,,0,0,0,0,0,0
1,1771258,1,15,51.9,163.2,-0.1,847826087.0,10015.0,4355.0,,,,,,,,,607.0,580.0,325.0,180.0,322.0,181.0,491.0,161.0,10.0,744.0,1146.0,4355.0,722.0,142.0,49.0,,82.0,168.0,67.0,121.0,94.0,76.0,102.0,73.6,75.2,70.75,92.0,76.0,70.3,82.3,78.0,99.0,113.0,129.8,95.1,91.6,,108.5,85.25,93.4,95.8,101.8,142.3,107.8,104.2,91.8,93.0,100.0,100.0,100.0,,180.0,22200.0,11760.0,0.0,15.0,13.0,3261.0,2257.0,4509.0,0.692119,0.688716,0.8857343,0.772559,10.0,12,8,11.3,100,13.1,0.2,2,0.9,2,1,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,0,,0,1,0,0,0,0,,0,0,1,0,0,0
2,1771258,2,15,51.9,163.2,-0.1,847826087.0,3811.0,727.0,,,,,,,,63.0,561.0,159.0,356.0,26.0,591.0,109.0,114.0,456.0,155.0,727.0,46.0,139.0,210.0,6.0,93.0,,71.0,122.0,64.0,129.0,90.0,77.0,96.0,79.1,73.1,80.3,78.0,76.1,74.6,78.1,98.5,103.4,100.5,109.6,100.6,105.0,109.0,86.5,88.6,102.8,78.2,85.2,94.0,100.0,89.25,96.5,82.0,100.0,100.0,100.0,,300.0,19260.0,12360.0,1.0,15.0,13.0,3451.0,2567.0,4717.0,0.743842,0.783245,0.9372555,0.830297,6.0,9,4,9.9,62,13.3,8.2,89,0.0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,,0,0,1,0,0,0,,0,0,1,0,0,0
3,1771258,3,15,51.9,163.2,-0.1,847826087.0,4346.0,673.0,,,,,,,,,515.0,673.0,377.0,490.0,192.0,322.0,180.0,523.0,61.0,359.0,174.0,95.0,148.0,196.0,21.0,20.0,66.0,128.0,59.0,107.0,88.0,75.0,93.0,77.1,79.8,72.6,67.0,80.2,76.0,73.6,79.0,98.5,103.5,106.4,95.5,106.2,98.2,100.0,84.3,86.8,97.2,74.0,107.0,82.4,96.1,90.8,78.2,100.0,100.0,100.0,,1140.0,17041.0,19619.0,1.0,15.0,13.0,3419.0,2497.0,4839.0,0.730331,0.761887,0.9285646,0.815214,6.9,10,4,9.5,63,11.7,3.2,35,0.0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,,0,0,1,0,0,0,,0,0,1,0,0,0
4,1771258,4,15,51.9,163.2,-0.1,847826087.0,3270.0,1324.0,,,,,,,,,,50.0,92.0,167.0,1324.0,388.0,166.0,101.0,91.0,6.0,224.0,405.0,56.0,65.0,63.0,72.0,64.0,111.0,57.0,88.0,81.0,77.0,83.0,83.3,77.6,75.1,76.2,72.6,75.8,73.8,68.0,74.8,65.4,94.4,85.7,77.25,86.5,78.0,76.6,90.0,89.1,92.2,110.6,77.4,81.1,76.5,82.0,100.0,100.0,100.0,,540.0,17340.0,15300.0,2.0,16.0,15.0,3519.0,2528.0,5.07,0.718386,0.77128,0.9556363,0.801884,9.3,12,6,11.2,98,12.9,0.5,5,5.5,38,10,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,,0,0,1,0,0,0,,0,0,1,0,0,0
5,1771258,5,15,51.9,163.2,-0.1,847826087.0,4697.0,2371.0,,,,,,,,,,134.0,207.0,107.0,234.0,2371.0,141.0,,599.0,145.0,133.0,146.0,59.0,97.0,324.0,,57.0,116.0,55.0,116.0,81.0,66.0,87.0,67.3,68.7,73.75,64.8,62.3,61.8,69.0,75.3,69.0,76.0,100.6,93.0,70.5,89.3,81.0,93.3,98.7,107.6,101.6,86.1,86.3,91.1,89.5,74.75,100.0,100.0,100.0,,120.0,16080.0,11820.0,0.0,16.0,14.0,3457.0,2378.0,4694.0,0.68788,0.725516,0.9387993,0.767832,8.4,11,5,11.4,96,13.9,2.1,23,1.0,19,18,0,0,0,1,0,0,0,0,0,1,1,0,1,0,0,1,1,0,1,0,0,0,0,,0,0,0,1,0,0,,0,0,0,1,0,0
6,1771258,6,15,51.9,163.2,-0.1,847826087.0,4574.0,645.0,,,,,,,,334.0,459.0,175.0,336.0,206.0,402.0,25.0,83.0,645.0,232.0,220.0,557.0,546.0,173.0,124.0,31.0,,62.0,118.0,56.0,106.0,84.0,71.0,90.0,71.8,75.0,74.3,64.0,69.5,73.3,72.0,78.25,103.0,103.0,93.8,93.1,89.5,89.7,96.1,84.5,86.5,95.1,88.6,96.5,86.0,79.0,80.5,75.0,100.0,100.0,100.0,,1440.0,14040.0,15720.0,2.0,15.0,12.0,3486.0,2509.0,4591.0,0.719736,0.765483,0.9466747,0.803391,6.2,8,5,10.4,95,11.7,0.8,9,5.8,10,2,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,0,,0,1,0,0,0,0,,0,0,1,0,0,0
7,1771258,7,15,51.9,163.2,-0.1,847826087.0,7244.0,1248.0,,,,,,,,431.0,256.0,488.0,919.0,605.0,6.0,382.0,421.0,187.0,1248.0,57.0,127.0,990.0,846.0,121.0,160.0,,61.0,120.0,54.0,129.0,87.0,70.0,95.0,82.2,70.1,68.5,64.4,69.8,63.0,89.0,99.5,94.7,101.4,97.6,99.5,94.1,88.6,84.6,101.7,95.8,94.8,89.5,101.0,100.25,88.3,81.1,70.8,100.0,100.0,100.0,,720.0,16260.0,16440.0,1.0,15.0,12.0,3435.0,2.46,4042.0,0.716157,0.750534,0.9328249,0.799397,8.3,11,6,10.5,94,12.8,6.6,74,0.4,3,3,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,1,1,0,0,0,0,1,0,,0,0,0,0,0,1,,0,0,1,0,0,0
8,1771258,8,15,51.9,163.2,-0.1,847826087.0,9796.0,4508.0,,,,,,,,,425.0,761.0,356.0,218.0,356.0,112.0,563.0,252.0,141.0,137.0,962.0,4508.0,641.0,239.0,16.0,,75.0,161.0,52.0,116.0,85.0,71.0,92.0,73.5,98.0,62.0,66.7,63.8,64.3,61.5,60.0,88.8,117.8,99.0,90.5,83.2,81.0,91.6,98.6,86.6,89.0,106.6,131.5,98.7,82.1,69.2,72.2,100.0,100.0,100.0,,120.0,17881.0,14459.0,0.0,15.0,13.0,3735.0,2.66,4859.0,0.712182,0.811415,101410900.0,0.794965,6.4,8,4,9.6,61,13.2,7.3,82,0.0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,0,,0,0,0,1,0,0,,0,0,1,0,0,0
9,1771258,9,15,51.9,163.2,-0.1,847826087.0,5447.0,1309.0,,,,,,,,209.0,679.0,122.0,432.0,,507.0,165.0,113.0,642.0,1309.0,35.0,654.0,255.0,80.0,245.0,,,59.0,113.0,55.0,103.0,85.0,68.0,92.0,88.3,65.1,63.8,63.1,58.5,68.0,74.8,87.1,93.1,104.7,99.8,95.6,98.4,93.1,98.25,80.3,91.5,86.4,94.8,104.2,86.1,86.0,74.0,78.5,100.0,100.0,100.0,,1800.0,14820.0,17100.0,2.0,15.0,12.0,3656.0,2547.0,4674.0,0.696663,0.776945,0.9926593,0.777642,5.0,7,3,9.0,52,13.7,7.3,82,0.0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,1,,0,0,0,0,0,1,,0,0,1,0,0,0


# Dealing with NaN's

In [2318]:
healthy_df.isnull().sum()

SubjectNr                                  0
DayNo                                      0
Age                                        0
weight                                   286
height                                    44
BMI_SDS                                  286
PedsQL_score_baseline                    110
stepsTotalDaily                           47
steps_hour_max                            82
steps00                                 3533
steps01                                 3692
steps02                                 3751
steps03                                 3784
steps04                                 3791
steps05                                 3713
steps06                                 2669
steps07                                 1166
steps08                                  698
steps09                                  493
steps10                                  400
steps11                                  380
steps12                                  360
steps13   

In [2319]:
asthma_df.isnull().sum()

SubjectNr                                  0
DayNo                                      0
Age                                        0
weight                                     0
height                                     0
BMI_SDS                                    0
PedsQL_score_baseline                    145
stepsTotalDaily                           12
steps_hour_max                           116
steps00                                 2305
steps01                                 2455
steps02                                 2488
steps03                                 2504
steps04                                 2504
steps05                                 2454
steps06                                 1977
steps07                                  938
steps08                                  600
steps09                                  424
steps10                                  352
steps11                                  321
steps12                                  289
steps13   

## Columns where we have nulls / NaNs

### stepsXX
With steps I think we should focus steps09-steps20, because before it and after it the number of records are dropping fast (probably bc they're sleeping) -> steps21-steps08 Nan could be 0

should check the % of the missing data of the remaining stepsXXs

### weight
If we know the height and age, we could add the avg weight a kid would have with this height and age.

### height
At 44 cases we only know the age. We could add the avg height and weight a kid would have with this age.

### BMI_SDS
It's missing only bc the weight or the weight+height is missing.

### stepsTotalDaily
If we guess the missing steps, we can count it (we have 0 here, if ALL the steps are missing)
comment: How do you want to guess the missing steps?
Ákos's comment: Yeaah, I meant that IF we can (want to) guess the steps, this variable won't be a problem haha. But I couldn't figure out any good method to do it. Maybe the avg steps of the same hour on different weeks. Like they have P.E. on Tuesday 9-11, so they'll have more steps this time of the week.

### steps_hour_max
If we guess the missing steps, we can count it (we have NaN here, if ALL the steps are missing)
comment: How do you want to guess the missing steps?

### HR05Perc and HR95Perc
If we guess the missing HRs, we can count it (we have NaN here, if ALL the HRs are missing)

### HRMinSleep and HRMaxSleep
These are quite randomly missing (I dunno how are these calculated), but we could fill it I think.
Comment: (Minimum/Max heart rate measured during sleep) --> between 11pm to 8am maybe? or perhaps better to calculate hrmin and hrmax based on the waketime and sleeptime variables!
Ákos's comment: Ahh true, I was tired. If we decide the "sleeping hours" at the first point, we can do the same here and do what you suggested.

### AVGHR_daily
If we guess the missing HRs, we can count it (we have NaN here, if ALL the HRs are missing)  

### AVGHR_sleep and AVGHR_wake
How do we know the kid is sleeping or not?
Comment: waketime and sleeptime (yes it has missing data, maybe predict the time they fell asleep based on biometrics such as heartrate? if heartrate down --> fall asleep?
Ákos's comment: Good idea!

### HR00-HR23
Daniel has good ideas about these. Avg Tuesday(e.g.), or prev. and next day, or smth else.

### awakeDuration
NA: no sleep data was registered. 579 times. Maybe the time between the kid started and stopped wearing the watch.

### sleeptime
NA: no sleep data was registered. 579 times. Maybe the time the kid stopped wearing the watch.

### waketime
NA: no sleep data was registered. 579 times. Maybe the time the kid started wearing the watch.

Comment: Good idea can be checked if accurate

### StepsXX

In [2320]:
healthy_df[['steps00', 'steps01', 'steps02', 'steps03', 'steps04', 'steps05', 'steps06', 'steps07', 'steps08', 'steps21', 'steps22', 'steps23']] = healthy_df[['steps00', 'steps01', 'steps02', 'steps03', 'steps04', 'steps05', 'steps06', 'steps07', 'steps08', 'steps21', 'steps22', 'steps23']].fillna(0)
asthma_df[['steps00', 'steps01', 'steps02', 'steps03', 'steps04', 'steps05', 'steps06', 'steps07', 'steps08', 'steps21', 'steps22', 'steps23']] = asthma_df[['steps00', 'steps01', 'steps02', 'steps03', 'steps04', 'steps05', 'steps06', 'steps07', 'steps08', 'steps21', 'steps22', 'steps23']].fillna(0)

healthy_df[['steps09', 'steps10', 'steps11', 'steps12', 'steps13', 'steps14', 'steps15', 'steps16', 'steps17', 'steps18', 'steps19', 'steps20']].isna().sum()

steps09    493
steps10    400
steps11    380
steps12    360
steps13    395
steps14    392
steps15    272
steps16    261
steps17    277
steps18    271
steps19    389
steps20    881
dtype: int64

### Weight & Height
source: www.disabled-world.com/calculators-charts/height-weight-teens.php

In [2321]:
healthy_df[['weight', 'height', 'BMI_SDS']].isna().sum()

weight     286
height      44
BMI_SDS    286
dtype: int64

In [2322]:
asthma_df[['weight', 'height', 'BMI_SDS']].isna().sum()

weight     0
height     0
BMI_SDS    0
dtype: int64

In [2323]:
f_awh = pd.DataFrame(np.array([
    [2, 12.02, 85.5],
    [3, 14.29, 94],
    [4, 15.42, 100.3],
    [5, 17.92, 107.9],
    [6, 19.96, 115.5],
    [7, 22.45, 121.1],
    [8, 25.85, 128.2],
    [9, 28.12, 133.3],
    [10, 31.98, 138.4],
    [11, 36.97, 144.0],
    [12, 41.5, 149.8],
    [13, 45.81, 156.7],
    [14, 47.63, 158.7],
    [15, 52.16, 159.7],
    [16, 53.52, 162.5]
]),
columns=['Age', 'weight', 'height'])

m_awh = pd.DataFrame(np.array([
    [2, 12.47, 86.8],
    [3, 14.06, 95.2],
    [4, 16.33, 102.3],
    [5, 18.37, 109.2],
    [6, 20.64, 115.5],
    [7, 22.9, 121.9],
    [8, 25.63, 128.0],
    [9, 28.58, 133.3],
    [10, 32.0, 138.4],
    [11, 35.6, 143.5],
    [12, 39.92, 149.1],
    [13, 45.36, 156.2],
    [14, 50.8, 163.8],
    [15, 56.02, 170.1],
    [16, 60.78, 173.4]
]),
columns=['Age', 'weight', 'height'])

In [2324]:
def fill_w_h(db, ref, column, sex_Female):
    db[column] = db.apply(
        lambda row: ref.loc[ref['Age'] == row['Age'], [column]].iloc[0][column] if np.isnan(row[column]) and row['sex_Female'] == sex_Female else row[column],
        axis=1
    )

In [2325]:
fill_w_h(healthy_df, f_awh, 'weight', 1)
fill_w_h(healthy_df, f_awh, 'height', 1)
fill_w_h(healthy_df, m_awh, 'weight', 0)
fill_w_h(healthy_df, m_awh, 'height', 0)
# We don't need for asthma_df, since we have no missing data there

### BMI_SDS

In [2326]:
# We don't need to do this for asthma_df

healthy_df['BMI'] = healthy_df['weight'] / (healthy_df['height']*healthy_df['height']/10000)

for age in range(2,17):
    mean = healthy_df.loc[healthy_df['Age'] == age, ['BMI']].describe().loc['mean'][0]
    std = healthy_df.loc[healthy_df['Age'] == age, ['BMI']].describe().loc['std'][0]
    healthy_df.loc[healthy_df['Age'] == age, ['BMI_SDS']] = np.round((healthy_df['BMI'] - mean) / std, 1)

healthy_df = healthy_df.drop(['BMI'], axis=1)

In [2327]:
healthy_df[['weight', 'height', 'BMI_SDS']].isna().sum()

weight     0
height     0
BMI_SDS    0
dtype: int64

### Getting AVG HR/hour/weekday

Now every NaN gets a number (if the avg could've been counted -> there's at least 1 record for avg) <br>
But we should drop those rows that have lots of missing data first

In [2328]:
healthy_subject_nrs = healthy_df['SubjectNr'].drop_duplicates()
asthma_subject_nrs = asthma_df['SubjectNr'].drop_duplicates()

In [2329]:
Mon_HR_AVGs = pd.DataFrame()
Tue_HR_AVGs = pd.DataFrame()
Wed_HR_AVGs = pd.DataFrame()
Thu_HR_AVGs = pd.DataFrame()
Fri_HR_AVGs = pd.DataFrame()
Sat_HR_AVGs = pd.DataFrame()
Sun_HR_AVGs = pd.DataFrame()
# no two SubjectNr are the same, so we don't need separated dataframes for asthma and healthy

def fill_avg_hr(database, subjects):
    for subject in subjects:
        Mon_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Mon'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
        Tue_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Tue'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
        Wed_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Wed'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
        Thu_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Thu'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
        Fri_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Fri'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
        Sat_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Sat'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
        Sun_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Sun'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)

fill_avg_hr(asthma_df, asthma_subject_nrs)
fill_avg_hr(healthy_df, healthy_subject_nrs)

  Mon_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Mon'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
  Tue_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Tue'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
  Wed_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Wed'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
  Thu_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Thu'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
  Fri_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Fri'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
  Sat_HR_AVGs[subject] = np.round(database.loc[(database['SubjectNr'] == subject) & (database['weekday_Sat'] == 1), 'HR00':'HR23'].describe().loc['mean'],2)
  Sun_HR_AVGs[subject] = np.round(database.loc[(database['

In [2330]:
Mon_HR_AVGs.isna().sum() # Some subject has 24 NaNs = they have 0 record on Mondays

1771258     0
1125968     0
1137963     0
8380204     0
1725729     0
4016957     0
8251413     0
4484964     0
8438039     0
3864661     0
2404542     0
4957084     1
2461821     0
1353701     6
5557553     0
7612162     0
5682271     0
1086145     0
8804324     0
3167719     0
1282713     0
4877940     0
8150231     0
4651197     0
5302478     0
2985100     0
4571581     0
1642849     0
1438514     0
2450908     8
7190055     0
3175094     0
8628998     0
2102215     0
5995703     0
4962375     0
8364553     0
2689839     0
1921897     0
4581506     0
5637993     0
2492797     0
1334205     0
4936000     0
3085968     0
4286442     0
5461205     1
2665460     0
3495507     0
8488946     0
5158754     0
8466277     0
6115426     0
5310691     0
4795179     0
3004792     0
1130656     0
8125067     0
5578891     7
4722486     0
8533888     0
8635768     1
8196400     0
5039256     0
4986804     0
4745136     0
1925533     0
3132734     0
3831912     0
3762951     0
7358340     0
357369

In [2331]:
Mon_HR_AVGs = Mon_HR_AVGs.transpose()
Tue_HR_AVGs = Tue_HR_AVGs.transpose()
Wed_HR_AVGs = Wed_HR_AVGs.transpose()
Thu_HR_AVGs = Thu_HR_AVGs.transpose()
Fri_HR_AVGs = Fri_HR_AVGs.transpose()
Sat_HR_AVGs = Sat_HR_AVGs.transpose()
Sun_HR_AVGs = Sun_HR_AVGs.transpose()

In [2332]:
asthma_df.head()

Unnamed: 0,SubjectNr,DayNo,Age,weight,height,BMI_SDS,PedsQL_score_baseline,stepsTotalDaily,steps_hour_max,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23,HR05Perc,HR95Perc,HRMinSleep,HRMaxSleep,AVGHR_daily,AVGHR_sleep,AVGHR_wake,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23,wear05H,wear16H,wear24H,HEART_PULSE_BPM,awakeDuration,lightSleepDuration,deepSleepDuration,wakeUpCount,sleeptime,waketime,fvc_best,fev1_best,pef_best,fev1_ratio_best,fev1_percentage,fvc1_percentage,fev1_ratio_percentage,FG,FHX,FHN,TG,TN,TX,SQ,SP,DR,RH,RHX,weekday_Fri,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed,dayType_holiday,dayType_school,dayType_weekend,school_yes_no_no,school_yes_no_yes,sex_Female,sex_Male,sportsyesno_No,sportsyesno_Yes,urbanisation_Extremely urbanised,urbanisation_Not extremely urbanised,grade_fev1_A,grade_fev1_B,grade_fev1_C,grade_fev1_D,grade_fev1_E,grade_fev1_F,grade_fev1_U,grade_fvc_A,grade_fvc_B,grade_fvc_C,grade_fvc_D,grade_fvc_E,grade_fvc_F,grade_fvc_U,screentime_0-30 min,screentime_0.5-1 hours,screentime_1-2 hours,screentime_2-4 hours,screentime_> 4 hours
0,1771258,0,15,51.9,163.2,-0.1,847826087.0,3723.0,1640.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,137.0,,,605.0,1640.0,,82.0,451.0,245.0,135.0,160.0,164.0,41.0,0.0,68.0,120.0,61.0,93.0,90.0,,94.0,,,,,,,,,,,,120.0,,,,,,89.8,83.4,87.8,91.5,93.3,77.1,78.6,0.0,69.0,54.0,,660.0,16800.0,16440.0,1.0,,,3232.0,2381.0,4334.0,0.736696,0.726555,0.877857,0.822316,6.3,8,4,14.0,85,19.7,5.4,58,0.0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,,0,0,1,0,0,0,,0,0,0,0,0,0
1,1771258,1,15,51.9,163.2,-0.1,847826087.0,10015.0,4355.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,607.0,580.0,325.0,180.0,322.0,181.0,491.0,161.0,10.0,744.0,1146.0,4355.0,722.0,142.0,49.0,0.0,82.0,168.0,67.0,121.0,94.0,76.0,102.0,73.6,75.2,70.75,92.0,76.0,70.3,82.3,78.0,99.0,113.0,129.8,95.1,91.6,,108.5,85.25,93.4,95.8,101.8,142.3,107.8,104.2,91.8,93.0,100.0,100.0,100.0,,180.0,22200.0,11760.0,0.0,15.0,13.0,3261.0,2257.0,4509.0,0.692119,0.688716,0.885734,0.772559,10.0,12,8,11.3,100,13.1,0.2,2,0.9,2,1,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,0,,0,1,0,0,0,0,,0,0,1,0,0,0
2,1771258,2,15,51.9,163.2,-0.1,847826087.0,3811.0,727.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,561.0,159.0,356.0,26.0,591.0,109.0,114.0,456.0,155.0,727.0,46.0,139.0,210.0,6.0,93.0,0.0,71.0,122.0,64.0,129.0,90.0,77.0,96.0,79.1,73.1,80.3,78.0,76.1,74.6,78.1,98.5,103.4,100.5,109.6,100.6,105.0,109.0,86.5,88.6,102.8,78.2,85.2,94.0,100.0,89.25,96.5,82.0,100.0,100.0,100.0,,300.0,19260.0,12360.0,1.0,15.0,13.0,3451.0,2567.0,4717.0,0.743842,0.783245,0.937255,0.830297,6.0,9,4,9.9,62,13.3,8.2,89,0.0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,,0,0,1,0,0,0,,0,0,1,0,0,0
3,1771258,3,15,51.9,163.2,-0.1,847826087.0,4346.0,673.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,515.0,673.0,377.0,490.0,192.0,322.0,180.0,523.0,61.0,359.0,174.0,95.0,148.0,196.0,21.0,20.0,66.0,128.0,59.0,107.0,88.0,75.0,93.0,77.1,79.8,72.6,67.0,80.2,76.0,73.6,79.0,98.5,103.5,106.4,95.5,106.2,98.2,100.0,84.3,86.8,97.2,74.0,107.0,82.4,96.1,90.8,78.2,100.0,100.0,100.0,,1140.0,17041.0,19619.0,1.0,15.0,13.0,3419.0,2497.0,4839.0,0.730331,0.761887,0.928565,0.815214,6.9,10,4,9.5,63,11.7,3.2,35,0.0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,,0,0,1,0,0,0,,0,0,1,0,0,0
4,1771258,4,15,51.9,163.2,-0.1,847826087.0,3270.0,1324.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,92.0,167.0,1324.0,388.0,166.0,101.0,91.0,6.0,224.0,405.0,56.0,65.0,63.0,72.0,64.0,111.0,57.0,88.0,81.0,77.0,83.0,83.3,77.6,75.1,76.2,72.6,75.8,73.8,68.0,74.8,65.4,94.4,85.7,77.25,86.5,78.0,76.6,90.0,89.1,92.2,110.6,77.4,81.1,76.5,82.0,100.0,100.0,100.0,,540.0,17340.0,15300.0,2.0,16.0,15.0,3519.0,2528.0,5.07,0.718386,0.77128,0.955636,0.801884,9.3,12,6,11.2,98,12.9,0.5,5,5.5,38,10,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,,0,0,1,0,0,0,,0,0,1,0,0,0


In [2333]:
asthma_df.loc[asthma_df['SubjectNr'] == 1771258, 'HR00':'HR23'].transpose().quantile(q=[0.05, 0.95])
# ??? the HR05Perc and HR95Perc are not right in the database... am I missing smth?

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0.05,77.625,71.035,74.825,72.75,68.69,62.675,69.845,65.015,61.575,63.205,63.0025,65.66,62.9,58.255,65.9,65.855,69.83,64.75,63.9975,61.75,63.655,63.36,66.97,71.19,65.33,63.6,65.075,74.72,75.25
0.95,110.655,128.12,108.4,106.37,94.07,101.45,102.025,101.34,116.12,103.54,101.88,102.835,101.82,106.97,99.19,112.945,107.875,111.65,99.82,115.24,109.05,101.305,118.245,107.28,118.33,100.06,104.085,106.285,114.18


In [2334]:
def fill_hrxx(db):
    for hrxx in db.loc[:, 'HR00':'HR23'].columns.to_list():
        db[hrxx] = db.apply(
            lambda row: Mon_HR_AVGs.loc[row['SubjectNr']][hrxx] if np.isnan(row[hrxx]) and row['weekday_Mon'] == 1 else 
                        Tue_HR_AVGs.loc[row['SubjectNr']][hrxx] if np.isnan(row[hrxx]) and row['weekday_Tue'] == 1 else
                        Wed_HR_AVGs.loc[row['SubjectNr']][hrxx] if np.isnan(row[hrxx]) and row['weekday_Wed'] == 1 else
                        Thu_HR_AVGs.loc[row['SubjectNr']][hrxx] if np.isnan(row[hrxx]) and row['weekday_Thu'] == 1 else
                        Fri_HR_AVGs.loc[row['SubjectNr']][hrxx] if np.isnan(row[hrxx]) and row['weekday_Fri'] == 1 else
                        Sat_HR_AVGs.loc[row['SubjectNr']][hrxx] if np.isnan(row[hrxx]) and row['weekday_Sat'] == 1 else
                        Sun_HR_AVGs.loc[row['SubjectNr']][hrxx] if np.isnan(row[hrxx]) and row['weekday_Sun'] == 1 else
                        row[hrxx],
            axis=1
        )


fill_hrxx(asthma_df)
fill_hrxx(healthy_df)

In [2335]:
asthma_df.head()


Unnamed: 0,SubjectNr,DayNo,Age,weight,height,BMI_SDS,PedsQL_score_baseline,stepsTotalDaily,steps_hour_max,steps00,steps01,steps02,steps03,steps04,steps05,steps06,steps07,steps08,steps09,steps10,steps11,steps12,steps13,steps14,steps15,steps16,steps17,steps18,steps19,steps20,steps21,steps22,steps23,HR05Perc,HR95Perc,HRMinSleep,HRMaxSleep,AVGHR_daily,AVGHR_sleep,AVGHR_wake,HR00,HR01,HR02,HR03,HR04,HR05,HR06,HR07,HR08,HR09,HR10,HR11,HR12,HR13,HR14,HR15,HR16,HR17,HR18,HR19,HR20,HR21,HR22,HR23,wear05H,wear16H,wear24H,HEART_PULSE_BPM,awakeDuration,lightSleepDuration,deepSleepDuration,wakeUpCount,sleeptime,waketime,fvc_best,fev1_best,pef_best,fev1_ratio_best,fev1_percentage,fvc1_percentage,fev1_ratio_percentage,FG,FHX,FHN,TG,TN,TX,SQ,SP,DR,RH,RHX,weekday_Fri,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed,dayType_holiday,dayType_school,dayType_weekend,school_yes_no_no,school_yes_no_yes,sex_Female,sex_Male,sportsyesno_No,sportsyesno_Yes,urbanisation_Extremely urbanised,urbanisation_Not extremely urbanised,grade_fev1_A,grade_fev1_B,grade_fev1_C,grade_fev1_D,grade_fev1_E,grade_fev1_F,grade_fev1_U,grade_fvc_A,grade_fvc_B,grade_fvc_C,grade_fvc_D,grade_fvc_E,grade_fvc_F,grade_fvc_U,screentime_0-30 min,screentime_0.5-1 hours,screentime_1-2 hours,screentime_2-4 hours,screentime_> 4 hours
0,1771258,0,15,51.9,163.2,-0.1,847826087.0,3723.0,1640.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,137.0,,,605.0,1640.0,,82.0,451.0,245.0,135.0,160.0,164.0,41.0,0.0,68.0,120.0,61.0,93.0,90.0,,94.0,73.7,71.4,71.62,65.35,75.82,69.12,74.26,102.05,105.5,96.9,97.95,120.0,93.32,91.58,89.92,95.15,97.95,89.8,83.4,87.8,91.5,93.3,77.1,78.6,0.0,69.0,54.0,,660.0,16800.0,16440.0,1.0,,,3232.0,2381.0,4334.0,0.736696,0.726555,0.877857,0.822316,6.3,8,4,14.0,85,19.7,5.4,58,0.0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,1,0,,0,0,1,0,0,0,,0,0,0,0,0,0
1,1771258,1,15,51.9,163.2,-0.1,847826087.0,10015.0,4355.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,607.0,580.0,325.0,180.0,322.0,181.0,491.0,161.0,10.0,744.0,1146.0,4355.0,722.0,142.0,49.0,0.0,82.0,168.0,67.0,121.0,94.0,76.0,102.0,73.6,75.2,70.75,92.0,76.0,70.3,82.3,78.0,99.0,113.0,129.8,95.1,91.6,86.63,108.5,85.25,93.4,95.8,101.8,142.3,107.8,104.2,91.8,93.0,100.0,100.0,100.0,,180.0,22200.0,11760.0,0.0,15.0,13.0,3261.0,2257.0,4509.0,0.692119,0.688716,0.885734,0.772559,10.0,12,8,11.3,100,13.1,0.2,2,0.9,2,1,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,1,1,0,1,0,0,0,0,,0,1,0,0,0,0,,0,0,1,0,0,0
2,1771258,2,15,51.9,163.2,-0.1,847826087.0,3811.0,727.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,63.0,561.0,159.0,356.0,26.0,591.0,109.0,114.0,456.0,155.0,727.0,46.0,139.0,210.0,6.0,93.0,0.0,71.0,122.0,64.0,129.0,90.0,77.0,96.0,79.1,73.1,80.3,78.0,76.1,74.6,78.1,98.5,103.4,100.5,109.6,100.6,105.0,109.0,86.5,88.6,102.8,78.2,85.2,94.0,100.0,89.25,96.5,82.0,100.0,100.0,100.0,,300.0,19260.0,12360.0,1.0,15.0,13.0,3451.0,2567.0,4717.0,0.743842,0.783245,0.937255,0.830297,6.0,9,4,9.9,62,13.3,8.2,89,0.0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,,0,0,1,0,0,0,,0,0,1,0,0,0
3,1771258,3,15,51.9,163.2,-0.1,847826087.0,4346.0,673.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,515.0,673.0,377.0,490.0,192.0,322.0,180.0,523.0,61.0,359.0,174.0,95.0,148.0,196.0,21.0,20.0,66.0,128.0,59.0,107.0,88.0,75.0,93.0,77.1,79.8,72.6,67.0,80.2,76.0,73.6,79.0,98.5,103.5,106.4,95.5,106.2,98.2,100.0,84.3,86.8,97.2,74.0,107.0,82.4,96.1,90.8,78.2,100.0,100.0,100.0,,1140.0,17041.0,19619.0,1.0,15.0,13.0,3419.0,2497.0,4839.0,0.730331,0.761887,0.928565,0.815214,6.9,10,4,9.5,63,11.7,3.2,35,0.0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,,0,0,1,0,0,0,,0,0,1,0,0,0
4,1771258,4,15,51.9,163.2,-0.1,847826087.0,3270.0,1324.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,92.0,167.0,1324.0,388.0,166.0,101.0,91.0,6.0,224.0,405.0,56.0,65.0,63.0,72.0,64.0,111.0,57.0,88.0,81.0,77.0,83.0,83.3,77.6,75.1,76.2,72.6,75.8,73.8,68.0,74.8,65.4,94.4,85.7,77.25,86.5,78.0,76.6,90.0,89.1,92.2,110.6,77.4,81.1,76.5,82.0,100.0,100.0,100.0,,540.0,17340.0,15300.0,2.0,16.0,15.0,3519.0,2528.0,5.07,0.718386,0.77128,0.955636,0.801884,9.3,12,6,11.2,98,12.9,0.5,5,5.5,38,10,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,1,1,0,0,1,0,0,0,,0,0,1,0,0,0,,0,0,1,0,0,0


### HR05Perc and HR95Perc

IF we find out what's up with the original values and we also delete the ones with lots of missing data

In [2336]:
def fill_hrxxperc(db, subjects):
    for subject in subjects:
        percentiles = np.round(db.loc[db['SubjectNr'] == subject, 'HR00':'HR23'].transpose().quantile(q=[0.05, 0.95]).transpose().rename(columns={0.05:'HR05Perc', 0.95:'HR95Perc'}), 0)
        db.loc[db['SubjectNr'] == subject, ['HR05Perc', 'HR95Perc']] = percentiles

fill_hrxxperc(healthy_df, healthy_subject_nrs)
fill_hrxxperc(asthma_df, asthma_subject_nrs)

### AVGHR_daily

In [2337]:
asthma_df['AVGHR_daily'].iloc[83]

nan

In [2338]:
def fill_avghr(db, subjects):
    for subject in subjects:
        avghr = np.round(db.loc[db['SubjectNr'] == subject, 'HR00':'HR23'].transpose().describe().loc['mean'], 0).rename('AVGHR_daily')
        db.loc[db['SubjectNr'] == subject, 'AVGHR_daily'] = avghr

fill_avghr(healthy_df, healthy_subject_nrs)
fill_avghr(asthma_df, asthma_subject_nrs)

In [2339]:
asthma_df['AVGHR_daily'].iloc[83]

83.0

### HRMinSleep and HRMaxSleep and AVGHR_sleep and AVGHR_wake and awakeDuration and sleeptime and waketime
They all depends on what we say sleeptime and awaketime, but I think even the given data is wrong

# Removing columns with too much missing data
This should be done earlier

In [None]:
# TODO

# Modeling

In [None]:
# TODO use eventday column to classify the excurbations

In [None]:
# Creat a y column for classification
healthy_df['y'] = 0
asthma_df['y'] = 1

# Create the feature matrix X by concatenating the two dataframes
X = pd.concat([asthma_df, healthy_df], axis=0)

# TODO fev1_ratio_best with true and nans only idk why
# For now remove it
object_columns = X.select_dtypes(include='object').columns
print(object_columns)

X = X.select_dtypes(exclude=['object'])
X_object_cols = X.select_dtypes(include=['object'])
print(X_object_cols.dtypes)

y = X['y']

# Split the data into training and testing sets using a 70-30 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the One-Class SVM model to the training data
model = OneClassSVM().fit(X_train[y_train == 1])

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Evaluate the performance of the model
accuracy = (y_pred == y_test).mean()
print("Accuracy: {:.2f}".format(accuracy))

Index(['fev1_ratio_best'], dtype='object')
Series([], dtype: object)
Accuracy: 0.22
