In [54]:
import numpy as np
import pandas as pd
import altair as alt
from os.path import join
import random
import string
from numpy.random import choice

import sys
sys.path.insert(1, '../../notebooks') # We want to use constants.py in another folder

from constants import DATA_DIR, COLUMNS

# Seed both random number generators so that Restart & Run All reproduces the
# same output. numpy's global RNG and the standard-library `random` module are
# independent generators; np.random.seed() does not affect `random`.
np.random.seed(0)
random.seed(0)

In [29]:
# GUIDELINE

# File #4: Diagnoses-SiteID.csv
# Fields: siteid, icd_code, icd_version, num_patients
# Notes:
# (1) One row per ICD diagnosis code
# (2) All diagnoses the patients have starting seven days before the positive test 
# (3) icd_version = "9" or "10"
# (4) Obfuscate small counts with "-1" as required by your institution
# Examples: (Diagnoses-BIDMC.csv)
# BIDMC, B97.29, 10, 25
# BIDMC, J12.89, 10, 19
# BIDMC, R03.0, 10, 15
# BIDMC, U07.1, 10, 13
# BIDMC, 123, 9, -1

In [49]:
# Size of the synthetic dataset:
#   num_sites - how many fake site ids to generate (used by the optional site-id generator)
#   num_codes - how many ICD-9 codes to sample
num_sites, num_codes = 20, 9

In [65]:
# ICD-10 example codes, taken from:
# https://www.cdc.gov/nchs/data/icd/ICD-10-CM-Official-Coding-Gudance-Interim-Advice-coronavirus-feb-20-2020.pdf
ICD_10_CODES = [
    "J12.89",
    "B97.29",
    "J20.8",
    "J40",
    "J22",
    "J98.8",
    "J80",
    "Z03.818",
    "Z20.828",
]

# ICD-9 codes: sample `num_codes` distinct integers from 1..138 (without
# replacement) and format each one as a decimal string.
# Code list reference: https://en.wikipedia.org/wiki/List_of_ICD-9_codes
_icd_9_numbers = random.sample(range(1, 139), num_codes)
ICD_9_CODES = np.char.mod('%d', _icd_9_numbers)

6

In [86]:
# Fake site ids: three uppercase letters, each starting with "F".
#
# The list below is hard-coded. To generate a new list, uncomment the
# following lines instead:
# site_ids = []
# for i in range(num_sites):
#     site_ids.append("F" + "".join(random.choices(string.ascii_uppercase, k=2)))

site_ids = (
    "FWN FXL FZT FMT FFG FOW FSZ FMA FEQ FVX "
    "FKQ FBL FDQ FKN FBD FKL FUU FZU FZM FUN"
).split()

In [89]:
# Build one synthetic diagnoses table per site and write it to
# ../Diagnoses-<siteid>.csv (no header, no index), with the columns
# siteid, icd_code, icd_version, num_patients.
#
# NOTE: numpy's signature is choice(a, size=None, replace=True, p=None).
# The earlier version of this cell passed the probability list as the third
# positional argument, so it was bound to `replace` and every draw was
# uniform. All weighted draws below pass their weights explicitly.

# Relative likelihood of a site being small / medium / large. These weights
# sum to 1.1, so they are normalised before being handed to numpy (which
# requires probabilities that sum to 1).
SITE_SIZES = ["s", "m", "l"]
SITE_SIZE_WEIGHTS = [0.1, 0.6, 0.4]

# Exclusive upper bound of the per-code patient count for each site size
# (counts are drawn from 1 .. bound - 1).
MAX_PATIENTS = {"s": 12, "m": 50, "l": 98}

# Probability that a count is obfuscated, i.e. reported as -1.
P_OBFUSCATE = 0.05

# Probability that a given code is not reported at all by a site.
P_SKIP_ICD_9 = 0.5
P_SKIP_ICD_10 = 0.2


def _weighted_choice(options, weights):
    """Return one item of `options`, using `weights` as relative probabilities."""
    p = np.asarray(weights, dtype=float)
    return options[np.random.choice(len(options), p=p / p.sum())]


def _rows_for_codes(siteid, codes, icd_version, p_skip, size_of_site):
    """Return [siteid, icd_code, icd_version, num_patients] rows for one site.

    Each code is dropped with probability `p_skip`. A kept code gets
    num_patients = -1 with probability P_OBFUSCATE, otherwise a count drawn
    uniformly from 1 up to (but excluding) MAX_PATIENTS[size_of_site].
    """
    rows = []
    for code in codes:
        # Some codes may not be used by every site
        if np.random.random() < p_skip:
            continue

        if np.random.random() < P_OBFUSCATE:
            num_patients = -1
        else:
            # Drawn from numpy's RNG so that np.random.seed() makes it reproducible
            num_patients = int(np.random.randint(1, MAX_PATIENTS[size_of_site]))

        rows.append([siteid, code, icd_version, num_patients])
    return rows


dfs = []
for siteid in site_ids:

    # A factor that decides the number of patients in each site
    size_of_site = _weighted_choice(SITE_SIZES, SITE_SIZE_WEIGHTS)

    # ICD-9 rows first, then ICD-10 rows
    data = (
        _rows_for_codes(siteid, ICD_9_CODES, "9", P_SKIP_ICD_9, size_of_site)
        + _rows_for_codes(siteid, ICD_10_CODES, "10", P_SKIP_ICD_10, size_of_site)
    )

    # Make data
    df = pd.DataFrame(data)
    dfs.append(df)

    # Write a file
    df.to_csv(join("..", "Diagnoses-" + siteid + '.csv'), index=False, header=False)