In [1]:
import numpy as np
import pandas as pd
import glob
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import LabelEncoder
import keras_tuner as kt


<h1>Preprocessing</h1>

In [2]:
csv_files = glob.glob('DOH COVID Data Drop_20240103/DOH COVID Data Drop_ 20240103 (2020-2023) - 04 Case Information_batch_*.csv')
dfs = []
columns_to_import = ['Age', 'Sex', 'DateResultRelease', 'DateRepConf', 'DateDied', 'DateRecover', 'DateRepRem', 'DateOnset', 'RemovalType']
date_columns = ['DateResultRelease', 'DateRepConf', 'DateDied', 'DateRecover', 'DateRepRem', 'DateOnset']

for filename in csv_files:
    df = pd.read_csv(filename, usecols=columns_to_import, parse_dates=date_columns)
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)

In [3]:
combined_df.isna().sum()

Age                    11898
Sex                        1
DateResultRelease     976231
DateRepConf                0
DateDied             4070302
DateRecover          3404454
RemovalType             1773
DateRepRem              1773
DateOnset            2644554
dtype: int64

In [4]:
imputerAge = SimpleImputer(strategy='median')
age = combined_df.select_dtypes([np.number])
imputerAge.fit(age)
combined_df['Age'] = imputerAge.transform(age)

combined_df['Sex'].fillna(combined_df['Sex'].mode()[0], inplace=True)

In [5]:
combined_df = pd.get_dummies(combined_df, columns=['Sex', 'RemovalType'], dtype='float32')
combined_df.head()

Unnamed: 0,Age,DateResultRelease,DateRepConf,DateDied,DateRecover,DateRepRem,DateOnset,Sex_FEMALE,Sex_MALE,RemovalType_DIED,RemovalType_RECOVERED
0,38.0,2020-01-30,2020-01-30,NaT,NaT,2020-02-07,2020-01-21,1.0,0.0,0.0,1.0
1,44.0,2020-01-30,2020-02-03,2020-02-01,NaT,2020-02-02,2020-01-18,0.0,1.0,1.0,0.0
2,60.0,2020-01-30,2020-02-05,NaT,2020-01-31,2020-02-05,2020-01-21,1.0,0.0,0.0,1.0
3,49.0,NaT,2020-03-06,NaT,NaT,2020-03-27,NaT,0.0,1.0,0.0,1.0
4,63.0,NaT,2020-03-06,2020-03-11,NaT,2020-03-12,NaT,0.0,1.0,1.0,0.0


In [6]:
combined_df['DateRemoval'] = np.select(
    [
        combined_df['DateDied'].notna(),
        combined_df['DateRecover'].notna(),
        combined_df['DateRepRem'].notna()
    ],
    [
        combined_df['DateDied'],
        combined_df['DateRecover'],
        combined_df['DateRepRem']
    ],
    default=pd.NaT
)
combined_df['DateRemoval'] = pd.to_datetime(combined_df['DateRemoval'], errors='coerce')

combined_df['DateStart'] = np.select(
    [
        combined_df['DateOnset'] < combined_df['DateResultRelease'],
        combined_df['DateOnset'] > combined_df['DateResultRelease'],
        combined_df['DateRepConf'].notna()
    ],
    [
        combined_df['DateOnset'],
        combined_df['DateResultRelease'],
        combined_df['DateRepConf']
    ],
    default=pd.NaT
)
combined_df['DateStart'] = pd.to_datetime(combined_df['DateStart'], errors='coerce')

In [7]:
combined_df_clean = combined_df[combined_df['DateStart'] <= combined_df['DateRemoval']].reset_index()
combined_df_clean = combined_df_clean.drop(['index', 'DateResultRelease', 'DateRepConf',
                                            'DateDied', 'DateRecover', 'DateRepRem', 'DateOnset'], axis=1)
combined_df_clean['DateDifference'] = combined_df_clean['DateRemoval'] - combined_df_clean['DateStart']
combined_df_clean = combined_df_clean.drop('RemovalType_DIED', axis=1)

In [8]:
combined_df_clean[['Sex_FEMALE', 'Sex_MALE', 'Age', 'RemovalType_RECOVERED']] = combined_df_clean[['Sex_FEMALE', 'Sex_MALE', 'Age', 'RemovalType_RECOVERED']].astype(int)
combined_df_clean['DateDifference'] = combined_df_clean['DateDifference'].astype('string').str.extract(r"(^\d+(?=\s))", expand=False).astype(float)

In [9]:
X_train_full, X_test, y_train_full, y_test = train_test_split(combined_df_clean[['Sex_FEMALE', 'Sex_MALE', 'Age']], combined_df_clean[['RemovalType_RECOVERED', 'DateDifference']], test_size=0.1, random_state=150)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=X_test.shape[0], random_state=150)

<h1>Hyperparameter Optimization</h1>

In [10]:
def build_model(hp):
    n_hidden = hp.Int("n_hidden", min_value=1, max_value=5)
    n_neurons = hp.Int("n_neurons", min_value=10, max_value=100)
    learning_rate = hp.Float("learning_rate", min_value=1e-4, max_value=1e-2, sampling="log")
    optimizer = hp.Choice("optimizer", values=["sgd", "adam"])
    nesterov = hp.Choice("nesterov", values=[True, False])
    if optimizer == "sgd":
        optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate, nesterov=nesterov)
    else:
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    
    norm_layer = tf.keras.layers.Normalization(input_shape=(3, ))
    model = tf.keras.Sequential()
    model.add(norm_layer)
    model.add(tf.keras.layers.Flatten())
    for _ in range(n_hidden):
        model.add(tf.keras.layers.Dense(n_neurons, activation="relu"))
    model.add(tf.keras.layers.Dense(2))
    model.compile(loss="mse", optimizer=optimizer, metrics=["accuracy"])

    return model

In [11]:
random_search_tuner = kt.RandomSearch(build_model, objective="val_accuracy", max_trials=100, overwrite=False, directory="covidModels", project_name='Trials', seed=150)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
random_search_tuner.search(X_train, y_train, epochs=500, validation_data=(X_valid, y_valid), callbacks=[early_stopping])

Reloading Tuner from covidModels\Trials\tuner0.json

Search: Running Trial #31

Value             |Best Value So Far |Hyperparameter
3                 |4                 |n_hidden
46                |81                |n_neurons
0.0085194         |0.00050704        |learning_rate
sgd               |sgd               |optimizer
1                 |0                 |nesterov




  super().__init__(**kwargs)


Epoch 1/500
[1m 40354/100634[0m [32m━━━━━━━━[0m[37m━━━━━━━━━━━━[0m [1m1:39[0m 2ms/step - accuracy: 0.0161 - loss: nan

KeyboardInterrupt: 

In [24]:
random_search_tuner.results_summary(num_trials=1)

Results summary
Results in covidModels\Trials
Showing 1 best trials
Objective(name="val_accuracy", direction="max")

Trial 004 summary
Hyperparameters:
n_hidden: 4
n_neurons: 81
learning_rate: 0.0005070370705614628
optimizer: sgd
nesterov: 0
Score: 0.9845156073570251


In [28]:
best_model = random_search_tuner.get_best_models(num_models=1)[0]
best_model.fit(X_train_full, y_train_full, epochs=10)
test_loss, test_accuracy = best_model.evaluate(X_test, y_test)

  super().__init__(**kwargs)


Epoch 1/10
[1m113213/113213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 2ms/step - accuracy: 0.9848 - loss: 517.3672
Epoch 2/10
[1m113213/113213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 2ms/step - accuracy: 0.9848 - loss: 515.1258
Epoch 3/10
[1m113213/113213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 2ms/step - accuracy: 0.9847 - loss: 520.4631
Epoch 4/10
[1m113213/113213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 2ms/step - accuracy: 0.9848 - loss: 506.6729
Epoch 5/10
[1m113213/113213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 1ms/step - accuracy: 0.9848 - loss: 521.5933
Epoch 6/10
[1m113213/113213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s[0m 1ms/step - accuracy: 0.9847 - loss: 516.4030
Epoch 7/10
[1m113213/113213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 2ms/step - accuracy: 0.9847 - loss: 518.1096
Epoch 8/10
[1m113213/113213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 1ms/s

In [31]:
y_pred = best_model.predict(X_test)

[1m12580/12580[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step


In [37]:
y_pred_df = pd.DataFrame(y_pred, columns=['RecoveryType', 'daysToRecover'])

In [40]:
y_test

Unnamed: 0,RemovalType_RECOVERED,DateDifference
3415572,1,10.0
1081703,1,10.0
1156153,1,18.0
585057,1,19.0
3735318,1,9.0
...,...,...
3736001,1,16.0
3683965,1,6.0
152141,1,19.0
1896611,1,8.0


In [None]:
test