In [1]:
# Libraries 
import pandas as pd
from pandas.testing import assert_frame_equal
import numpy as np
import csv 
import os

import matplotlib.pyplot as plt
import seaborn as sns

#pip install fancyimpute
from fancyimpute import IterativeImputer

In [None]:
# Print display settings: every pandas display limit below is raised to 5000
# so that wide frames are printed without truncation.
for display_option in ('max_rows', 'max_columns', 'width', 'max_colwidth'):
    pd.set_option(f'display.{display_option}', 5000)

In [None]:
# The 'Data' folder sits two directory levels above the working directory
data_directory = '../..'

# Read both input CSV files from that 'Data' folder
raw_data_folder = os.path.join(data_directory, 'Data')
asthma_df = pd.read_csv(os.path.join(raw_data_folder, 'astma.csv'))
healthy_df = pd.read_csv(os.path.join(raw_data_folder, 'healthy_parsed.csv'))

# Preprocessing

In [None]:
healthy_df.head(5)

In [None]:
asthma_df.head(5)

# Issue with HR columns

In [None]:
# In the asthma dataset there seems to be a problem with HR00 to HR23:
# the heart rate goes above a million sometimes.
# It seems like the first couple of digits correspond with the expected heart rate.
# 2 cases: HR below 100 and HR above 100

def repair_heart_rate(value):
    """Recover a plausible heart rate from a corrupted reading.

    Readings that are not greater than 1000 (including NaN) and infinite
    readings are returned unchanged. For any other reading the leading digits
    are kept and a decimal point is inserted:

    * leading two digits below 20 -> the rate is assumed to be >= 100
      (heart rates are assumed to stay below 200): keep three digits plus
      one decimal, e.g. 1234567 -> 123.4
    * otherwise -> keep two digits plus one decimal, e.g. 85123 -> 85.1
    """
    # NaN fails the comparison; infinity has no digits to recover.
    if not value > 1000 or not np.isfinite(value):
        return value
    # Work on the integer part: str() of a float >= 1e16 uses scientific
    # notation ('1e+16'), which would break digit slicing. A value above 1000
    # always has at least four integer digits, so index 3 is safe.
    digits = str(int(value))
    whole_digits = 3 if int(digits[:2]) < 20 else 2
    return float(digits[:whole_digits] + '.' + digits[whole_digits])


# Repair every column in the HR00..HR23 label slice. Mapping whole columns
# avoids cell-by-cell .loc access and does not depend on the row index being
# the default 0..n-1 range.
for col in asthma_df.loc[:, "HR00":"HR23"]:
    asthma_df[col] = asthma_df[col].map(repair_heart_rate)

In [None]:
# Verify that it works 
asthma_df.loc[:, "HR00":"HR23"].head(100)

# Removing outliers

In [None]:
# Subject 2429672 has no data about gender, sports and urbanisation 
# asthma_df = asthma_df[asthma_df['SubjectNr'] != 2429672]

# Aligning column names

In [None]:
# Renaming columns (alignment) so both frames use the same labels
asthma_df = asthma_df.rename(columns={
    'Gender': 'sex',
    'Weight': 'weight',
    'Height': 'height',
    # What grade they are in
    'school_year_edit': 'school_year',
    # Whether they do a sport
    'Sports': 'sportsyesno',
    # Note there are 2 urbanisations
    # NOTE(review): if asthma_df already has a column literally named
    # 'urbanisation', this rename creates a duplicate label - confirm.
    'urbanisation2': 'urbanisation',
    # pedsql
    'BASELINE_PedsQL_score': 'PedsQL_score_baseline',
})

# What grade they are in (healthy side)
healthy_df = healthy_df.rename(columns={'school_year_final': 'school_year'})

# healthy_df doesn't have all wear variables (it only has the grouped ones),
# and asthma_df has no wear05H. Derive it as the percentage of columns in the
# 'wear00'..'wear05' label slice whose value equals 1 (per row).
early_wear_flags = asthma_df.loc[:, 'wear00':'wear05'].eq(1)
asthma_df['wear05H'] = early_wear_flags.mean(axis=1).mul(100)

# Measurements that are dropped from healthy_df in one go
healthy_only_measurements = [
    # No temperature in asthma_df, although healthy_df has it
    'BODY_TEMPERATURE_DEG_C',
    # No blood pressure in asthma_df, although healthy_df has it
    'DIASTOLIC_BLOOD_PRESSURE_MMHG',
    'SYSTOLIC_BLOOD_PRESSURE_MMHG',
    # WEIGHT_KG is a daily measurement in healthy_df, whereas in asthma_df the
    # weight is only taken at the begin and end of the study period.
    # 87% of it is missing as well, so the column is dropped.
    'WEIGHT_KG',
]
healthy_df = healthy_df.drop(columns=healthy_only_measurements)

# Assumption: bedtimeReport / waketimeReport (not in the legend) carry the same
# information as sleeptime / waketime in healthy_df
asthma_df = asthma_df.rename(columns={'bedtimeReport': 'sleeptime', 'waketimeReport': 'waketime'})

# Identity mapping of the hourly categories 0..23
# (not referenced by any other cell visible in this notebook)
hour_mapping = dict(zip(range(24), range(24)))

# Parse both time columns of both frames as datetimes and keep only the hour
for frame in (healthy_df, asthma_df):
    for time_column in ('sleeptime', 'waketime'):
        frame[time_column] = pd.to_datetime(frame[time_column]).dt.hour

# The predicted_* spirometry columns are not available in asthma_df,
# so they are removed from healthy_df
predicted_columns = ['predicted_fvc_best', 'predicted_fev1_best', 'predicted_fev1_ratio_best']
healthy_df = healthy_df.drop(columns=predicted_columns)

# Screentime: the values need to be aligned.
# Lookup from the Dutch answer labels found in asthma_df to the target labels
alignment_dict = dict([
    ('0', '0'),
    ('A. 0 tot 30 minuten', '0-30 min'),
    ('B. 30 tot 60 minuten', '0.5-1 hours'),
    ('C. 1 uur tot 2 uur', '1-2 hours'),
    ('D. 2 uur tot 4 uur', '2-4 hours'),
    ('E. Meer dan 4 uur', '> 4 hours'),
])

# Relabel the asthma screentime values using the lookup above
asthma_df = asthma_df.assign(screentime=asthma_df['screentime'].replace(alignment_dict))

In [None]:
healthy_df.head(5)

In [None]:
asthma_df.head(5)

# Delete columns (that are useless)

In [None]:
# Columns of healthy_df that carry no information for prediction
useless_columns = [
    'Index',      # just an index, not needed
    'SubjectNr',  # subject numbers are not relevant for predicting
    'ethnicity',  # ethnicity is only available in healthy_df
]
healthy_df = healthy_df.drop(columns=useless_columns)

In [None]:
healthy_df.head(5)

In [None]:
asthma_df.head(5)

# One-hot encoding

In [None]:
# Print the distinct values of each categorical column, healthy_df first and
# asthma_df second, with a blank line between consecutive columns.
columns_to_compare = ['weekday', 'dayType', 'school_yes_no', 'sex', 'school_year', 'sportsyesno',
                      'urbanisation', 'grade_fev1', 'grade_fvc', 'fev1_ratio_best', 'screentime']

print("Let's allign these values: ")
for position, column in enumerate(columns_to_compare):
    if position:
        print()
    print(healthy_df[column].unique())
    print(asthma_df[column].unique())
    if column == 'fev1_ratio_best':
        # Open question left next to the output it refers to
        print("I am not sure why there are only true or nans here???")

In [None]:
# school_yes_no
# for healthy: school, daycare or neither
# for asthma: yes or no (school)
# Decision: school and daycare are the same class
# Note: pd.get_dummies only creates a separate NaN column when dummy_na=True is
# passed. The one-hot encoding in this notebook uses the defaults, so a missing
# value ends up as 0 in every school_yes_no dummy column, not as its own category.
healthy_df['school_yes_no'] = healthy_df['school_yes_no'].replace({'Neither': 'no', 'Day Care': 'yes', 'School': 'yes'})
asthma_df['school_yes_no'] = asthma_df['school_yes_no'].replace({'Nee': 'no', 'Ja': 'yes'})

# sex
# One-hot encoding a missing value for a single subject would not improve
# classification, so a value has to be assumed for that subject.
# Assumption made here: missing -> 'Female' (rationale taken from
# https://www.pewresearch.org/social-trends/2022/06/28/americans-complex-views-on-gender-identity-and-transgender-issues/)
asthma_df['sex'] = asthma_df['sex'].fillna('Female')

# school year
# Dropped from both frames: it is likely to differ too much between the data sets
healthy_df = healthy_df.drop('school_year', axis=1)
asthma_df = asthma_df.drop('school_year', axis=1)

# sports_yes_no
# A couple of subjects didn't fill this in. Assumption: if it was not filled in,
# the subject does not do sports.
asthma_df['sportsyesno'] = asthma_df['sportsyesno'].fillna('No')

# Urbanization
# Decision: extremely and very urbanised are merged into 'Extremely urbanised';
# moderately and little urbanised are merged into 'Not extremely urbanised'.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on healthy_df['urbanisation'] is a chained in-place operation, which warns in
# recent pandas versions and leaves healthy_df unchanged under Copy-on-Write.
healthy_df['urbanisation'] = healthy_df['urbanisation'].replace({
    'Very urbanised': 'Extremely urbanised',
    'Moderately urbanised': 'Not extremely urbanised',
    'Little urbanised': 'Not extremely urbanised',
})

In [None]:
# Print the distinct values again after the alignment above, healthy_df first
# and asthma_df second, with a blank line between consecutive columns.
aligned_columns = ['weekday', 'dayType', 'school_yes_no', 'sex', 'sportsyesno',
                   'urbanisation', 'grade_fev1', 'grade_fvc', 'fev1_ratio_best', 'screentime']

print("new classes (they have to be the same): ")
for position, column in enumerate(aligned_columns):
    if position:
        print()
    print(healthy_df[column].unique())
    print(asthma_df[column].unique())
    if column == 'fev1_ratio_best':
        # Open question left next to the output it refers to
        print("I am not sure why there are only true or nans here???")

In [None]:
# one-hot encoding
# The categorical columns that are encoded, listed once for both frames
categorical_columns = ['weekday', 'dayType', 'school_yes_no', 'sex', 'sportsyesno', 'urbanisation',
                       'grade_fev1', 'grade_fvc', 'screentime']

# Each frame is encoded on its own, so the resulting dummy columns depend on
# the values present in that frame
one_hot_asthma = pd.get_dummies(asthma_df[categorical_columns])
one_hot_healthy = pd.get_dummies(healthy_df[categorical_columns])

# Append the dummy columns, then drop the originals they replace
asthma_df = pd.concat([asthma_df, one_hot_asthma], axis=1).drop(columns=categorical_columns)
healthy_df = pd.concat([healthy_df, one_hot_healthy], axis=1).drop(columns=categorical_columns)

# Result from one-hot encoding

In [None]:
healthy_df.head(5)

In [None]:
asthma_df.head(5)

# Setting the columns equal

In [None]:
# Remove every asthma_df column that healthy_df does not have
asthma_only_columns = asthma_df.columns.difference(healthy_df.columns)
asthma_df = asthma_df.drop(columns=asthma_only_columns)

# Put the columns in healthy_df's order. reindex also adds any healthy_df
# column that asthma_df lacks, filled with NaN.
asthma_df = asthma_df.reindex(columns=healthy_df.columns)

# Both frames must now expose exactly the same columns in the same order
assert (healthy_df.columns == asthma_df.columns).all()

# Booleans to numerical

In [None]:
# Cast every column to 64-bit float: the booleans produced by the one-hot
# encoding become 0.0 / 1.0 and all other data is kept as floats
healthy_df = healthy_df.astype(np.float64)
asthma_df = asthma_df.astype(np.float64)

In [None]:
healthy_df.head()

In [None]:
asthma_df.head(5)

# Dealing with NaN's

In [None]:
asthma_df.isnull().sum()

In [None]:
healthy_df.isnull().sum()

# Removing columns with too much missing data

# healthy_df

In [None]:
# Percentage of NaN per healthy_df column, as (percentage, column) pairs
# ordered from most to least missing
percentNan = sorted(
    ((healthy_df[col].isnull().sum() / len(healthy_df[col]) * 100, col) for col in healthy_df.columns),
    reverse=True,
)

# Columns with more than 40% NaN are reported and collected for removal
print("Removed cols")
columns_to_remove = []
for nan_percentage, col in percentNan:
    if nan_percentage > 40:
        columns_to_remove.append(col)
        print((nan_percentage, col))

# Remove the columns in both dataframes so their layouts stay identical
healthy_df = healthy_df.drop(columns=columns_to_remove)
asthma_df = asthma_df.drop(columns=columns_to_remove)

In [None]:
# Percentage of NaN per asthma_df column, as (percentage, column) pairs
# ordered from most to least missing
percentNan = sorted(
    ((asthma_df[col].isnull().sum() / len(asthma_df[col]) * 100, col) for col in asthma_df.columns),
    reverse=True,
)

# Columns with more than 40% NaN are reported and collected for removal
print("Removed cols")
columns_to_remove = []
for nan_percentage, col in percentNan:
    if nan_percentage > 40:
        columns_to_remove.append(col)
        print((nan_percentage, col))

# Remove the columns in both dataframes so their layouts stay identical
healthy_df = healthy_df.drop(columns=columns_to_remove)
asthma_df = asthma_df.drop(columns=columns_to_remove)

In [None]:
healthy_df.head(5)

In [None]:
asthma_df.head(5)

# Model based imputation

# healthy_df

In [None]:
# Perform MICE imputation
# The imputer is created with its default settings and is fitted on (and
# applied to) healthy_df only; asthma_df is not involved in this cell.
mice_imputer = IterativeImputer()
imputed_data = mice_imputer.fit_transform(healthy_df)

# Convert the imputed data array back to a DataFrame
# The column labels are reused. No index is passed, so the new frame gets a
# default integer index.
healthy_df = pd.DataFrame(imputed_data, columns=healthy_df.columns)

# Display the imputed DataFrame
healthy_df.head(5)

# asthma_df

In [None]:
# Perform MICE imputation
# A second, separate imputer with default settings is fitted on (and applied
# to) asthma_df only; healthy_df is not involved in this cell.
mice_imputer2 = IterativeImputer()
imputed_data2 = mice_imputer2.fit_transform(asthma_df)

# Convert the imputed data array back to a DataFrame
# The column labels are reused. No index is passed, so the new frame gets a
# default integer index.
asthma_df = pd.DataFrame(imputed_data2, columns=asthma_df.columns)

# Display the imputed DataFrame
asthma_df.head(5)

# Store dataframe as csv to test on models

In [None]:
# Relative path (two levels up) to the directory that contains the 'Data' folder
data_directory = '../..'
# Name of the subdirectory that receives the preprocessed files
subdirectory = 'Preprocessed'

# Make sure <data_directory>/Data/<subdirectory> exists before writing to it
subdirectory_path = os.path.join(data_directory, 'Data', subdirectory)
os.makedirs(subdirectory_path, exist_ok=True)

# Write each preprocessed frame to its own CSV file, without the row index
for frame, file_name in (
    (asthma_df, 'preprocessed_MICE_asthma_normal_Eventdate.csv'),
    (healthy_df, 'preprocessed_MICE_healthy_normal_Eventdate.csv'),
):
    frame.to_csv(os.path.join(subdirectory_path, file_name), index=False)