In [64]:
import numpy as np
import pandas as pd
import altair as alt
from os.path import join
import random
import math
import string
from numpy.random import choice
from datetime import timedelta, date

import sys
sys.path.insert(1, '../../notebooks') # We want to use constants.py in another folder

from constants import DATA_DIR

# Seed BOTH random number generators used in this notebook so that
# Restart & Run All reproduces the same files: the generation loop draws from
# the stdlib `random` module (random.choice / random.uniform) as well as from
# numpy.random (choice), and seeding one does not seed the other.
np.random.seed(0)
random.seed(0)

In [65]:
# File #1: DailyCounts-SiteID.csv
# Fields: siteid, date, new_positive_cases, patients_in_icu, new_deaths
# Notes:
# (1) One row per date (2020-03-15, 2020-03-16, etc.)
# (2) Site is a unique identifier for your institution (e.g., "BIDMC")
# (3) new_positive_cases, patients_in_icu, and new_deaths are number of distinct patients
# (4) If a patient has multiple positive test results, use the earliest date
# (5) Set patients_in_icu = -2 if you do not have ICU data
# (6) Set new_deaths = -2 if you do not have death data
# (7) Obfuscate small counts with "-1" as required by your institution
# Examples: (DailyCounts-BIDMC.csv)
# BIDMC, 2020-03-20, -1, 0, -2
# BIDMC, 2020-03-21, 6, -1, -2
# BIDMC, 2020-03-22, 14, 5, -2
# BIDMC, 2020-03-23, 26, 10, -2

In [66]:
# Three-letter identifiers of the sites to generate one CSV file for.
site_ids = [
    'FWN', 'FXL', 'FZT', 'FMT', 'FFG',
    'FOW', 'FSZ', 'FMA', 'FEQ', 'FVX',
    'FKQ', 'FBL', 'FDQ', 'FKN', 'FBD',
    'FKL', 'FUU', 'FZU', 'FZM', 'FUN',
]

In [70]:
# Reporting window for the generated data; both endpoints are inclusive.
start_date = date(year=2020, month=3, day=14)
end_date = date(year=2020, month=3, day=31)

In [72]:
# Iterate over each site, build its fake daily counts, and write one CSV per site.
#
# Each written row is: siteid, date (YYYY-MM-DD), new_positive_cases,
# patients_in_icu, new_deaths. No header row and no index are written.


def _jittered_count(num_days, slope):
    """Return a linearly growing count with uniform noise applied.

    The expected value is ``floor(num_days * slope)``; a noise term drawn
    uniformly from +/-30 % of that expected value is floored and added to it.
    """
    expected = math.floor(num_days * slope)
    return expected + math.floor(random.uniform(-expected * 0.3, expected * 0.3))


# The file name is stamped with the last day of the reporting window.
save_date = end_date.strftime("%Y-%m-%d")

# Index of the last day (inclusive), counted from start_date.
last_day_index = (end_date - start_date).days

for siteid in site_ids:
    data = []

    #### Factors:
    # https://twitter.com/FinancialTimes/status/1244520128386985984/photo/1
    #
    # How many days after start_date does this site start to discover patients?
    offset = random.choice(range(0, 10))

    # Does the number of patients start to drop at some time point?
    # (Not modelled: counts only grow.)

    # Slope of the growth before dropping (one of 0.8, 0.9, 1.0, 1.1)
    base_slope = random.choice(range(8, 12)) * 0.1

    # Whether this site obfuscates small counts (True with probability 0.3).
    # `p` must be passed by keyword: the third positional parameter of
    # numpy.random.choice is `replace`, so passing the weights positionally
    # silently ignores them and yields a uniform 50/50 draw.
    is_obfuscate = bool(choice([True, False], p=[0.3, 0.7]))
    #############

    # Per-metric growth rates; ICU and death counts grow at a fraction of the
    # positive-case rate.
    slope_new_positive = base_slope * 1
    slope_icu = base_slope * 0.25
    slope_death = base_slope * 0.1

    for num_days_so_far in range(offset, last_day_index + 1):
        cur_date = start_date + timedelta(days=num_days_so_far)
        date_str = cur_date.strftime("%Y-%m-%d")

        # All three counts are drawn before any row is skipped, so every day
        # consumes the same number of random draws.
        new_positive_cases = _jittered_count(num_days_so_far, slope_new_positive)
        patients_in_icu = _jittered_count(num_days_so_far, slope_icu)
        new_deaths = _jittered_count(num_days_so_far, slope_death)

        # Days without any positive case produce no row at all.
        if new_positive_cases <= 0:
            continue

        # Obfuscate / mark -2 for zero value
        if patients_in_icu <= 0:
            patients_in_icu = -2
        if new_deaths <= 0:
            new_deaths = -2

        # Obfuscating sites report -1 for every count below 5. This runs after
        # the -2 marking, so for those sites a -2 is overwritten with -1.
        if is_obfuscate and patients_in_icu < 5:
            patients_in_icu = -1
        if is_obfuscate and new_deaths < 5:
            new_deaths = -1

        data.append([siteid, date_str, new_positive_cases, patients_in_icu, new_deaths])

    # Make data
    df = pd.DataFrame(data)

    # Write a file (the per-site directory is expected to exist already)
    fname = f"{save_date}_DailyCounts-{siteid}.csv"
    df.to_csv(join("..", "site_level_data", siteid, fname), index = False, header=False)