In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

from preprocessing.user_data import User
from preprocessing.symptom_dates import get_symptom_dates

## Load COVID-19-Wearables dataset

In [2]:
# get a list of user ids
user_ids = get_symptom_dates().index.to_list()

users = []
for user_id in tqdm(user_ids):
    user = User(
        user_id,
        sampling_rule="1H",
        aggregate="mean",
        load_steps=False,
        load_sleep=False
    )
    users.append(user.to_dict())

100%|██████████| 24/24 [00:15<00:00,  1.56it/s]


In [10]:
SLIDING_WINDOW_SIZE = 20 # days
RECOVERY_PERIOD = pd.Timedelta(40, unit="D") # days
INTERVAL = 1 # days

def create_sliding_window_splits(hr, labels, symptom_date, end_cutoff: int = 30):

    # print(hr.shape)
    series_length = (hr.index[-1] - hr.index[0]).days
    # print(series_length)

    first_date = hr.index[0]
    sliced_hrs = []
    sliced_labels = []
    sliced_indices = []

    for start_idx in range(0, series_length - end_cutoff, INTERVAL):
        
        # create time period
        start_date = first_date + pd.Timedelta(start_idx, unit="D")
        end_date = start_date + pd.Timedelta(SLIDING_WINDOW_SIZE, unit="D")

        # print(start_date, end_date)
        # print(start_date, end_date)

        if end_date > symptom_date and end_date < symptom_date + RECOVERY_PERIOD:
            continue

        # create slices from sliding window
        hr_slice = hr[start_date:end_date].values
        hr_index = hr[start_date:end_date].index
        label_slice = labels[start_date:end_date].values

        # if slice contains a positive label at any time position, assign a positive labels to the whole slice
        if label_slice.any():
            label = 1
        else:
            label = 0

        sliced_hrs.append(hr_slice.squeeze())
        sliced_labels.append(label)
        sliced_indices.append(hr_index)
    
    return sliced_hrs, sliced_labels, sliced_indices


In [11]:
user = users[0]



hrs, labels, _ = create_sliding_window_splits(user["hr"], user["target"], user["symptom_date"])

print(np.array(hrs).shape)

(9, 337)


In [12]:
X = []
y = []

for user in users:
    hrs, labels, _ = create_sliding_window_splits(user["hr"], user["target"], user["symptom_date"])

    X.extend(hrs)
    y.extend(labels)
    # X.append(np.array(hrs))
    # y.append(np.array(labels))

X = np.array(X, dtype=np.int32)
y = np.array(y, dtype=np.int32)

print(X.shape, y.shape)
print(f"Positive: {np.sum(y)}, negative: {y.shape[0] - np.sum(y)}")


(817, 337) (817,)
Positive: 266, negative: 551


In [13]:
from sklearn.model_selection import train_test_split

# training and test split
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(len(X_train), len(X_test))

print(f"Train positive: {np.sum(y_train)}, negative: {y_train.shape[0] - np.sum(y_train)}")
print(f"Test positive: {np.sum(y_test)}, negative: {y_test.shape[0] - np.sum(y_test)}")

612 205
Train positive: 201, negative: 411
Test positive: 65, negative: 140


## Classification

In [14]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

### TimeSeriesForestClassifier

In [15]:
from sktime.classification.interval_based import TimeSeriesForestClassifier

classifier = TimeSeriesForestClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prc = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"acc: {acc:04f}, prec: {prc:04f}, rec: {rec:04f}, f1: {f1:04f}")

acc: 0.819512, prec: 0.937500, rec: 0.461538, f1: 0.618557


### HIVE-COTE 2.0

In [16]:
from sktime.classification.hybrid import HIVECOTEV2

hc2 = HIVECOTEV2(time_limit_in_minutes=1)
hc2.fit(X_train, y_train)
y_pred = hc2.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prc = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"acc: {acc:04f}, prec: {prc:04f}, rec: {rec:04f}, f1: {f1:04f}")

acc: 0.912195, prec: 0.943396, rec: 0.769231, f1: 0.847458


## Test set evaluation

In [20]:
# get test data
test_base_path = os.path.join(os.getcwd(), "data", "tum_test_set")
test_files = os.listdir(test_base_path)

# select classifier
test_classifier = hc2

predictions = {test_file.split(".")[0]: None for test_file in test_files}

for test_file in tqdm(test_files):
    # read data
    df = pd.read_csv(os.path.join(test_base_path, test_file))

    # set timeseries as index
    df.set_index("timestamp", inplace=True)
    df.index = pd.to_datetime(df.index)

    # resample data
    # print(df)
    test_series = df.resample(rule="1H").mean()   

    test_slices, _, test_indices = create_sliding_window_splits(test_series.heart_rate, test_series.heart_rate, symptom_date=test_series.index[-1], end_cutoff=0)

    # slice_prediction_days = df.resample(rule="1D").mean().index
    # print(slice_prediction_days)
   
    slice_predictions = []
    slice_days = []
    
    # print(test_slices.shape)

    for test_slice, test_index in zip(test_slices, test_indices):
        # predict label for slice
        # print(test_slice[np.newaxis].shape)
        test_prediction = test_classifier.predict(test_slice[np.newaxis])
        test_date = test_index[-1].date()
        
        # store predictions
        slice_predictions.append(test_prediction.item())
        slice_days.append(test_date)

    # print(slice_predictions)
    # print(slice_days)

    predictions[test_file.split(".")[0]] = {
        "dates": slice_days, 
        "predictions": slice_predictions
    }

100%|██████████| 40/40 [09:43<00:00, 14.59s/it]


In [22]:
threshold = 2
# iterate over predictions, set symptom onset dates after threshold-many consecutive positive dates

onset_predictions = {}

for case, test_dict in predictions.items():
    
    onset_date = None

    labels = str("".join([str(label) for label in test_dict["predictions"]]))
    print(labels)
    
    pos_idx = labels.find("1111")

    if pos_idx == -1:
        pos_idx = labels.find("111")
        if pos_idx == -1:
            pos_idx = labels.find("11")
            if pos_idx == -1:
                pos_idx = labels.find("1")
    
    if pos_idx != -1:
        # infection detected
        onset_date = test_dict["dates"][pos_idx]

    onset_predictions[case] = onset_date

000000000000000000000000000000000000000000000000000
001000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000
000000000000000000000010000000010000100000000000000
0000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000
000000000000000000000000010000000000000000000000000
000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000
000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000
000000000000000000000
000000000000000000000000000000000001000000000000000
000000000000000000000000000000000000000000000000000
000000000000000000000000000000000000000000000000000
00000000000000000000000000

In [30]:
from datetime import datetime
import csv
with open('predictions.csv', 'w') as csv_file:  
    writer = csv.writer(csv_file)
    for key, value in onset_predictions.items():
        if value is None:
            continue
        writer.writerow([key, value.strftime("%-d.%-m.%Y")])

In [24]:
print(onset_predictions)

{'test_38': None, 'test_36': datetime.date(2020, 1, 17), 'test_8': None, 'test_32': None, 'test_37': None, 'test_10': None, 'test_23': None, 'test_7': None, 'test_17': datetime.date(2020, 2, 6), 'test_25': None, 'test_9': None, 'test_22': datetime.date(2020, 2, 9), 'test_11': None, 'test_18': None, 'test_14': None, 'test_35': None, 'test_33': None, 'test_26': datetime.date(2020, 2, 19), 'test_12': None, 'test_3': None, 'test_15': None, 'test_16': datetime.date(2020, 2, 20), 'test_28': None, 'test_20': datetime.date(2020, 1, 17), 'test_13': None, 'test_29': None, 'test_30': datetime.date(2020, 2, 11), 'test_39': None, 'test_4': datetime.date(2020, 2, 17), 'test_2': None, 'test_34': None, 'test_24': None, 'test_5': datetime.date(2020, 2, 12), 'test_6': None, 'test_19': None, 'test_27': None, 'test_21': None, 'test_40': None, 'test_1': None, 'test_31': None}
