In [10]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from preprocessing.user_data import User
from preprocessing.symptom_dates import get_symptom_dates

## Load COVID-19-Wearables dataset

In [11]:
# get a list of user ids
user_ids = get_symptom_dates().index.to_list()

users = []
for user_id in tqdm(user_ids):
    user = User(
        user_id,
        sampling_rule="1H",
        aggregate="mean",
        load_steps=False,
        load_sleep=False
    )
    users.append(user.to_dict())

100%|██████████| 24/24 [00:15<00:00,  1.56it/s]


In [12]:
SLIDING_WINDOW_SIZE = 14 # days
RECOVERY_PERIOD = pd.Timedelta(40, unit="D") # days
INTERVAL = 1 # days

def create_sliding_window_splits(hr, labels, symptom_date):

    # print(hr.shape)
    series_length = (hr.index[-1] - hr.index[0]).days
    # print(series_length)

    first_date = hr.index[0]
    sliced_hrs = []
    sliced_labels = []

    for start_idx in range(0, series_length - 30, INTERVAL):
        
        # create time period
        start_date = first_date + pd.Timedelta(start_idx, unit="D")
        end_date = start_date + pd.Timedelta(SLIDING_WINDOW_SIZE, unit="D")

        # print(start_date, end_date)
        # print(start_date, end_date)

        if end_date > symptom_date and end_date < symptom_date + RECOVERY_PERIOD:
            continue

        # create slices from sliding window
        hr_slice = hr[start_date:end_date].values
        label_slice = labels[start_date:end_date].values

        # if slice contains a positive label at any time position, assign a positive labels to the whole slice
        if label_slice.any():
            label = 1
        else:
            label = 0

        sliced_hrs.append(hr_slice.squeeze())
        sliced_labels.append(label)
    
    return sliced_hrs, sliced_labels


In [13]:
user = users[0]



hrs, labels = create_sliding_window_splits(user["hr"], user["target"], user["symptom_date"])

print(np.array(hrs).shape)

(9, 337)


In [14]:
X = []
y = []

for user in users:
    hrs, labels = create_sliding_window_splits(user["hr"], user["target"], user["symptom_date"])

    X.extend(hrs)
    y.extend(labels)
    # X.append(np.array(hrs))
    # y.append(np.array(labels))

X = np.array(X, dtype=np.int32)
y = np.array(y, dtype=np.int32)

print(X.shape, y.shape)
print(f"Positive: {np.sum(y)}, negative: {y.shape[0] - np.sum(y)}")


(817, 337) (817,)
Positive: 266, negative: 551


In [15]:
from sklearn.model_selection import train_test_split

# training and test split
X_train, X_test, y_train, y_test = train_test_split(X, y)
print(len(X_train), len(X_test))

print(f"Train positive: {np.sum(y_train)}, negative: {y_train.shape[0] - np.sum(y_train)}")
print(f"Test positive: {np.sum(y_test)}, negative: {y_test.shape[0] - np.sum(y_test)}")

612 205
Train positive: 206, negative: 406
Test positive: 60, negative: 145


## Classification

In [16]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

### TimeSeriesForestClassifier

In [17]:
from sktime.classification.interval_based import TimeSeriesForestClassifier

classifier = TimeSeriesForestClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prc = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"acc: {acc:04f}, prec: {prc:04f}, rec: {rec:04f}, f1: {f1:04f}")

acc: 0.800000, prec: 0.771429, rec: 0.450000, f1: 0.568421


### HIVE-COTE 2.0

In [18]:
from sktime.classification.hybrid import HIVECOTEV2

hc2 = HIVECOTEV2(time_limit_in_minutes=1)
hc2.fit(X_train, y_train)
y_pred = hc2.predict(X_test)

acc = accuracy_score(y_test, y_pred)
prc = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"acc: {acc:04f}, prec: {prc:04f}, rec: {rec:04f}, f1: {f1:04f}")

acc: 0.917073, prec: 0.877193, rec: 0.833333, f1: 0.854701
