In [1]:
import numpy as np
import pandas as pd
import altair as alt
from os.path import join

from constants import DATA_DIR, COLUMNS

# Seed NumPy's global RNG so the np.random draws made by the cells below are reproducible.
np.random.seed(0)

In [2]:
# Parameters of the synthetic onset distribution; read as globals by generate_exponential_df.
n = 2500 # total number of patients (one onset sample is drawn per patient)
R = 3 # exponent of the power transform applied to the uniform draws (onset = d * U**R)
d = 30 # exclusive upper bound of the onset timestamps before Poisson noise is applied

In [3]:
# Sequential integer identifiers 0..n-1, one per synthetic patient.
patient_ids = list(range(n))

In [4]:
def generate_exponential_df():
    """Draw one synthetic symptom-onset event per patient.

    Reads the notebook-level ``n``, ``R``, ``d`` and ``patient_ids``. Each expected
    onset is ``d * U**R`` for a uniform ``U`` in [0, 1); the stored timestamp is a
    Poisson draw with that expected value as its mean.

    Returns:
        DataFrame with COLUMNS.PATIENT_ID, COLUMNS.TIMESTAMP and COLUMNS.VALUE
        (always the string "true"), sorted by timestamp with a fresh 0..n-1 index.
    """
    # NOTE(review): d * U**R places most onsets near 0 (CDF ~ (t/d)**(1/R)), so case
    # counts slow down over time rather than accelerate -- confirm this is intended.
    uniform_draws = np.random.random(n)
    scale = np.power(d, 1/R)
    expected_onsets = np.power(uniform_draws * scale, R)

    # Integer-valued Poisson noise around each expected onset.
    observed_onsets = np.random.poisson(expected_onsets)

    id_onset_pairs = np.stack((patient_ids, observed_onsets), axis=-1)

    events = pd.DataFrame(data=id_onset_pairs, columns=[COLUMNS.PATIENT_ID, COLUMNS.TIMESTAMP])
    events = events.sort_values(by=COLUMNS.TIMESTAMP, ignore_index=True)
    events[COLUMNS.VALUE] = "true"
    return events

In [5]:
def lag_exponential_df(df, lag):
    """Return a copy of ``df`` whose timestamps are pushed later by a random lag.

    Args:
        df: Event frame containing a COLUMNS.TIMESTAMP column. It is not modified.
        lag: Mean of the normal distribution (standard deviation 1) from which one
            lag per row is drawn; the absolute value is taken, so no lag is negative.

    Returns:
        A new DataFrame equal to ``df`` except for the shifted timestamp column.
    """
    df = df.copy()
    lag_arr = np.abs(np.random.normal(lag, 1, df.shape[0]))
    # Add the raw array so lags are applied by position. Wrapping it in a pd.Series
    # would align on index labels and produce NaN for frames without a 0..n-1 index.
    df[COLUMNS.TIMESTAMP] = df[COLUMNS.TIMESTAMP] + lag_arr
    return df

In [6]:
def add_qualitative_value(df):
    """Return a copy of ``df`` with a random severity label in COLUMNS.VALUE.

    Each row independently receives "mild" (probability 0.7) or "severe"
    (probability 0.3), overwriting any existing COLUMNS.VALUE content.

    Args:
        df: Event frame to label. It is not modified.

    Returns:
        A new DataFrame with the COLUMNS.VALUE column replaced by the labels.
    """
    # Copy first so the caller's frame is left untouched, as lag_exponential_df does.
    labelled = df.copy()
    labelled[COLUMNS.VALUE] = np.random.choice(["mild", "severe"], labelled.shape[0], p=[0.7, 0.3])
    return labelled

In [7]:
# Build the per-patient event chain: each stage copies the previous stage's frame and adds
# a freshly drawn non-negative lag, so a patient's timestamps never decrease along the chain.
# Every call draws from NumPy's global RNG, so the statement order affects the generated values.
onset_of_symptoms_df = generate_exponential_df()
admission_to_hospital_df = lag_exponential_df(onset_of_symptoms_df, lag=4)
# ARDS rows get a "mild"/"severe" value; the later stages are copies of this frame and keep it.
ards_df = add_qualitative_value(lag_exponential_df(admission_to_hospital_df, lag=1))
admission_to_icu_df = lag_exponential_df(ards_df, lag=1)
discharge_from_icu_df = lag_exponential_df(admission_to_icu_df, lag=2)
discharge_from_hospital_df = lag_exponential_df(discharge_from_icu_df, lag=3)

In [8]:
# Respiration-rate events are lagged directly from symptom onset, not from the admission chain.
rate_of_respiration_df = lag_exponential_df(onset_of_symptoms_df, lag=1)

In [9]:
# Map each event label to that event's rows as a list of record dicts.
# The pair order below is the dict's insertion order.
timeline_entries = {
    event_label: event_frame.to_dict("records")
    for event_label, event_frame in (
        ('Onset of COVID-19 Symptoms', onset_of_symptoms_df),
        ('Admission to the hospital', admission_to_hospital_df),
        ('Acute Respiratory Distress Syndrome', ards_df),
        ('Admission to ICU', admission_to_icu_df),
        ('Rate of respiration', rate_of_respiration_df),
        ('Discharge from ICU', discharge_from_icu_df),
        ('Discharge from hospital', discharge_from_hospital_df),
    )
}

In [10]:
def convert_timeline_entries_to_df(timeline_entries):
    """Flatten a mapping of event name -> record dicts into one long-format DataFrame.

    Args:
        timeline_entries: Dict whose keys are event names and whose values are
            iterables of dicts, each providing COLUMNS.PATIENT_ID,
            COLUMNS.TIMESTAMP and COLUMNS.VALUE. Other keys are ignored.

    Returns:
        DataFrame with columns COLUMNS.PATIENT_ID, COLUMNS.TIMESTAMP,
        COLUMNS.EVENT and COLUMNS.VALUE (in that order), one row per entry,
        ordered by the mapping's iteration order and then by entry order.
        An empty mapping yields an empty frame with those four columns.
        Column dtypes are inferred by pandas from the collected values.

    Raises:
        KeyError: If an entry lacks one of the three required keys.
    """
    columns = [COLUMNS.PATIENT_ID, COLUMNS.TIMESTAMP, COLUMNS.EVENT, COLUMNS.VALUE]
    # Collect every row first and build the frame once: DataFrame.append was removed
    # in pandas 2.0, and appending row by row copies the whole frame on each call.
    records = [
        {
            COLUMNS.EVENT: event_name,
            COLUMNS.PATIENT_ID: entry[COLUMNS.PATIENT_ID],
            COLUMNS.TIMESTAMP: entry[COLUMNS.TIMESTAMP],
            COLUMNS.VALUE: entry[COLUMNS.VALUE],
        }
        for event_name, entries in timeline_entries.items()
        for entry in entries
    ]
    return pd.DataFrame(records, columns=columns)

In [11]:
# Flatten all events into a single long-format frame and preview its first rows.
timeline_entries_df = convert_timeline_entries_to_df(timeline_entries)
timeline_entries_df.head()

Unnamed: 0,patient_id,timestamp,event,value
0,736,0,Onset of COVID-19 Symptoms,True
1,1072,0,Onset of COVID-19 Symptoms,True
2,2122,0,Onset of COVID-19 Symptoms,True
3,1070,0,Onset of COVID-19 Symptoms,True
4,545,0,Onset of COVID-19 Symptoms,True


In [12]:
# Write the synthetic events to fake_event_data.csv under DATA_DIR, without the DataFrame index.
timeline_entries_df.to_csv(join(DATA_DIR, "fake_event_data.csv"), index=False)