## Test Data Generator

This notebook is used to create sample data for testing various aspects of the Sepsis analysis pipeline.

For details on the data model, see [Data Specification](../documentation/data_specification.md)

In [7]:
import pandas as pd
import numpy as np
import random
import datetime

In [26]:
# how many records do we want?
n = 10000

# how frequently to output status as records are being generated
n_freq = n / 10

# generate random 8 digit number for subject ID
id_length = 8
subject_ids = []
for x in range (0, n):
    subject_ids.append(''.join(["%s" % random.randint(0, 9) for num in range(0, id_length)]))
    
# setup icd9 dataset
icd9 = pd.read_csv('../../data/cci_icd9_2015.csv')

# setup icd10 dataset
icd10 = pd.read_csv('../../data/cci_icd10cm_2018_1.csv')

####
#### these next lines only need to be run once
####
#### since both single and double quoting is used, remove extra quotes and leading/trailing spaces
#### from https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha

#icd9 = icd9.apply(lambda x: x.str.strip("'") if x.dtype == "object" else x)
#icd9 = icd9.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
#icd9.columns = icd9.columns.str.strip("'")
#icd9.to_csv('../data/cci_icd9_2015.csv', index=False)

#icd10 = icd10.apply(lambda x: x.str.strip("'") if x.dtype == "object" else x)
#icd10 = icd10.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
#icd10.columns = icd10.columns.str.strip("'")
#icd10.to_csv('../data/cci_icd10cm_2018_1.csv', index=False)

In [27]:
# from https://stackoverflow.com/questions/553303/generate-a-random-date-between-two-other-dates
def random_date(start, end):
    """Generate a random datetime between `start` and `end`"""
    return start + datetime.timedelta(
        # Get a random amount of seconds between `start` and `end`
        seconds=random.randint(0, int((end - start).total_seconds())),
    )

# sample call
random_date(datetime.date(1900, 1, 1), datetime.date(2018, 1, 1))

datetime.date(1936, 12, 13)

In [28]:
start = datetime.datetime.now()
### Table 1: Hospital admission-level dataset
financialclasses = ['self-pay',
                    'insurance',
                    'medicare',
                    'medicaid',
                    'chip']

admissiontypes = ['Elective',
                  'Emergency',
                  'Newborn',
                  'Trauma Center',
                  'Urgent']

admissionsources = ['Unspecified',
                    'Clinic Referral',
                    'From Another ED/Urgent Care',
                    'From Another Hospital',
                    'From Inpat Rehab',
                    'From Skilled Nursing Facility',
                    'Inpat Psych',
                    'Long Term Acute Care "LTAC"',
                    'Non-Healthcare Facility Pt',
                    'Transfer From Same Hospital']

units = ['MCR MEDICAL UNIT',
         'MCR CARDIAC PCU UNIT',
         'MCR SURGICAL ICU UNIT',
         'AMC MED SPEC UNIT',
         'AMC MEDICAL ICU',
         'AMC M/S PROG CARE UNIT',
         'MHC MEDICAL UNIT',
         'MHC RENAL MED UNIT',
         'MHC SURG TRAUMA UNIT',
         'MHN ICU',
         'MHN MEDICAL SURGICAL 3',
         'MHN WOMENS PAVILION',
         'PVH MEDICINE UNIT'
         'PVH ICU'
         'PVH TELEMETRY']
%store units

patientclasses = ['inpatient',
                  'observation']

admittingservices = ['cardiology',
                     'oncology',
                     'surgery',
                     'maternity']

dischargedispositions = ['home',
                         'skilled care facility',
                         'hospice']

#https://s.details.loinc.org/LOINC/74693-3.html?sections=Comprehensive
races = ['American Indian or Alaska Native',
         'Asian',
         'Black or African American',
         'Native Hawaiian or Other Pacific Islander',
         'White',
         'More than one race',
         'Unknown']
#https://s.details.loinc.org/LOINC/74694-1.html?sections=Comprehensive
ethnicities = ['Hispanic or Latino', 'Not Hispanic or Latino', 'Unknown']
                    
row = 1;
records = []
for i in subject_ids:
    # make sure dates are logical
    DOB = random_date(datetime.date(1900, 1, 1), datetime.date(2018, 1, 1))
    AdmissionDateTime = random_date(datetime.datetime(2017, 6, 1), datetime.datetime(2018, 1, 1))
    # according to https://www.cdc.gov/nchs/data/series/sr_13/sr13_168.pdf average US hospital stay is 4.8 days
    # however, I don't have data on standard deviation, so guessing on that part
    # nor do I know if values are actually normally distributed
    DischargeDateTime = AdmissionDateTime + datetime.timedelta(days = np.random.normal(4.8, 1))
    records.append({'SubjectId': i,
                    'EncounterId' : row,
                    'DOB' : DOB, 
                    'Sex' : np.random.choice(['M', 'F', 'U']),
                    'Race' : np.random.choice(races),
                    'Ethnicity' : np.random.choice(ethnicities), 
                    'FirstHeightInInches' : random.randint(5, 108), 
                    'FirstWeightInOunces' : random.randint(8, 22400), 
                    'AdmittingDepartmentName' : np.random.choice(units),
                    'AdmissionDateTime': AdmissionDateTime,
                    'FinancialClass' : np.random.choice(financialclasses),
                    'AdmissionType': np.random.choice(admissiontypes), 
                    'AdmissionSource': np.random.choice(admissionsources), 
                    'PatientClass' : np.random.choice(patientclasses), 
                    'AdmittingService' : np.random.choice(admittingservices), 
                    'PrincipalProblem' : 'patient reported text', 
                    'PrimaryCodedDx' : np.random.choice(icd10['ICD-10-CM CODE']), 
                    'DischargeDateTime' : DischargeDateTime, 
                    'DischargeDisposition' : np.random.choice(dischargedispositions), 
                    'HospitalService' : np.random.choice(admittingservices)
                   })
    if row % n_freq == 0:
        print(row, 'rows created')
    row += 1
df_admission = pd.DataFrame.from_dict(records)
df_admission = df_admission[['SubjectId', 
                             'EncounterId', 
                             'DOB', 
                             'Sex', 
                             'Race', 
                             'Ethnicity', 
                             'FirstHeightInInches', 
                             'FirstWeightInOunces', 
                             'AdmittingDepartmentName', 
                             'AdmissionDateTime', 
                             'FinancialClass',
                             'AdmissionType', 
                             'AdmissionSource', 
                             'PatientClass', 
                             'AdmittingService', 
                             'PrincipalProblem', 
                             'PrimaryCodedDx', 
                             'DischargeDateTime', 
                             'DischargeDisposition', 
                             'HospitalService']]
print('Elapsed:', datetime.datetime.now() - start)    
df_admission.head()
# 2500 append: 0:13
# 2500 concat  1:10
# 2500 loc     0:01
# 100000 from_records    0:08
# 100000 from_dict.      0:08
# 100k rowlist of series 0:34

Stored 'units' (list)
1000 rows created
2000 rows created
3000 rows created
4000 rows created
5000 rows created
6000 rows created
7000 rows created
8000 rows created
9000 rows created
10000 rows created
Elapsed: 0:00:00.940638


Unnamed: 0,SubjectId,EncounterId,DOB,Sex,Race,Ethnicity,FirstHeightInInches,FirstWeightInOunces,AdmittingDepartmentName,AdmissionDateTime,FinancialClass,AdmissionType,AdmissionSource,PatientClass,AdmittingService,PrincipalProblem,PrimaryCodedDx,DischargeDateTime,DischargeDisposition,HospitalService
0,35539801,1,1944-11-20,U,White,Unknown,99,22010,AMC MEDICAL ICU,2017-11-20 22:20:03,insurance,Newborn,From Another ED/Urgent Care,inpatient,surgery,patient reported text,S00471S,2017-11-25 15:16:09.444051,home,cardiology
1,82514719,2,1989-11-10,U,More than one race,Hispanic or Latino,101,10113,MHC SURG TRAUMA UNIT,2017-06-25 19:58:02,self-pay,Urgent,From Skilled Nursing Facility,inpatient,cardiology,patient reported text,D6951,2017-07-01 15:29:20.006934,skilled care facility,surgery
2,6464258,3,1979-10-17,M,Black or African American,Unknown,69,1619,PVH MEDICINE UNITPVH ICUPVH TELEMETRY,2017-08-23 02:23:20,insurance,Elective,"Long Term Acute Care ""LTAC""",inpatient,oncology,patient reported text,V673XXD,2017-08-28 21:30:33.668310,skilled care facility,oncology
3,96903269,4,2000-03-10,M,Native Hawaiian or Other Pacific Islander,Unknown,97,20672,MCR MEDICAL UNIT,2017-08-27 20:55:40,insurance,Trauma Center,From Another Hospital,inpatient,surgery,patient reported text,Q2731,2017-09-02 12:06:25.329991,skilled care facility,oncology
4,6208782,5,1957-06-29,M,Native Hawaiian or Other Pacific Islander,Unknown,46,647,MHC MEDICAL UNIT,2017-12-21 06:59:24,chip,Emergency,"Long Term Acute Care ""LTAC""",observation,surgery,patient reported text,S22010D,2017-12-25 20:18:37.922347,home,oncology


In [12]:
df_admission.to_csv('../../data/admission.csv')

In [17]:
### Table 2: Admitting Diagnosis
records = []
for index, row in df_admission.iterrows():
    diagnoses = random.randint(1, 10)
    for _ in range(diagnoses):
        records.append({
            'SubjectId': row['SubjectId'],
            'EncounterId': row['EncounterId'],
            'AdmittingDiagnosis': np.random.choice(icd10['ICD-10-CM CODE DESCRIPTION'])
             })
    if index % n_freq == 0:
        print(index, 'rows evaluated')
df_admission_diagnoses = pd.DataFrame.from_dict(records)
df_admission_diagnoses = df_admission_diagnoses[['SubjectId',
                                                 'EncounterId',
                                                 'AdmittingDiagnosis']]
df_admission_diagnoses.head()

0 rows evaluated
100 rows evaluated
200 rows evaluated
300 rows evaluated
400 rows evaluated
500 rows evaluated
600 rows evaluated
700 rows evaluated
800 rows evaluated
900 rows evaluated


Unnamed: 0,SubjectId,EncounterId,AdmittingDiagnosis
0,5613207,1,Pain in unspecified toe(s)
1,5613207,1,Nonpurulent mastitis associated with pregnancy...
2,5613207,1,Traumatic rupture of palmar ligament of right ...
3,5613207,1,"Abrasion, left lower leg, sequela"
4,5613207,1,Supervision of pregnancy with grand multiparit...


In [None]:
df_admission_diagnoses.to_csv('../data/admission_diagnoses.csv')

In [19]:
### Table 3: All diagnoses for that hospitalization
diagnosessources = ['Billing',
                    'Encounter',
                    'Problem List',
                    'Patient History']

sepsiscodes = ["A021","A207","A227","A267","A327","A392","A393","A394","A400","A401","A403","A408","A409","A4101","A4102","A411","A412","A413","A414","A4150","A4151","A4152","A4153","A4159","A4181","A4189","A419","A427","A483","A5486","B007","B377","P360","P3610","P3619","P362","P3630","P3639","P364","P365","P368","P369","R571","R578","R6510","R6511","R6520","R6521","T8112XA"]

In [30]:
records = []
for index, row in df_admission.iterrows():
    diagnoses = random.randint(1, 10);
    for _ in range(diagnoses):
        if (np.random.binomial(1, 0.20) == 0):
            records.append({
                 'SubjectId': row['SubjectId'],
                 'EncounterId': row['EncounterId'],
                 'Source': np.random.choice(diagnosessources),
                 'StartDate': random_date(row['AdmissionDateTime'], row['DischargeDateTime']),
                 'Code': np.random.choice(icd10['ICD-10-CM CODE']),
                 'Type': 'ICD-10-CM'})
        # now do special sepsis case - 20% probability
        else:
            records.append({
                 'SubjectId': row['SubjectId'],
                 'EncounterId': row['EncounterId'],
                 'Source': 'Encounter',
                 'StartDate': random_date(row['AdmissionDateTime'], row['DischargeDateTime']),
                 'Code': np.random.choice(sepsiscodes),
                 'Type': 'ICD-10-CM'
                 })
    if index % n_freq == 0:
        print(index, 'rows evaluated')
df_diagnoses = pd.DataFrame.from_dict(records)
df_diagnoses = df_diagnoses[['SubjectId',
                             'EncounterId',
                             'Source',
                             'StartDate',
                             'Code',
                             'Type']]
df_diagnoses.head()

0 rows evaluated
1000 rows evaluated
2000 rows evaluated
3000 rows evaluated
4000 rows evaluated
5000 rows evaluated
6000 rows evaluated
7000 rows evaluated
8000 rows evaluated
9000 rows evaluated


Unnamed: 0,SubjectId,EncounterId,Source,StartDate,Code,Type
0,35539801,1,Billing,2017-11-22 10:13:26,Y36591A,ICD-10-CM
1,35539801,1,Encounter,2017-11-25 11:55:07,A394,ICD-10-CM
2,35539801,1,Problem List,2017-11-21 06:11:34,M80822G,ICD-10-CM
3,35539801,1,Patient History,2017-11-25 06:52:06,Q526,ICD-10-CM
4,35539801,1,Problem List,2017-11-21 02:44:14,Q189,ICD-10-CM


In [None]:
df_diagnoses.to_csv('../data/diagnoses.csv')

In [None]:
### Table 4: ADT events in hospital
df_adt_events = pd.DataFrame(columns = ['SubjectId',
                                        'EncounterId',
                                        'DepartmentName',
                                        'TransferInDateTime',
                                        'TransferOutDateTime'])

for index, row in df_admission.iterrows():
    df_adt_events = df_adt_events.append({
                        'SubjectId': row['SubjectId'],
                        'EncounterId': row['EncounterId'],
                        'DepartmentName': row['AdmittingDepartmentName'],
                        'TransferInDateTime': row['AdmissionDateTime'],
                        'TransferOutDateTime': row['DischargeDateTime']
                        }, ignore_index=True)
df_adt_events.head()

In [None]:
df_adt_events.to_csv('../data/adt_events.csv')

In [None]:
### Table 5: Surgical procedures – ordered & completed
df_procedures = pd.DataFrame(columns = ['SubjectId',
                                        'EncounterId',
                                        'SurgeryName',
                                        'ProcedureCompleteDateTime',
                                        'PatientInPacuDateTime',
                                        'AnesthesiaStopDateTime'])

for index, row in df_admission.iterrows():
    field_to_select = np.random.choice(['ProcedureCompleteDateTime', 'PatientInPacuDateTime', 'AnesthesiaStopDateTime'])
    df_procedures = df_procedures.append({
                        'SubjectId': row['SubjectId'],
                        'EncounterId': row['EncounterId'],
                        'SurgeryName': 'surgery name text',
                        field_to_select: random_date(row['AdmissionDateTime'], row['DischargeDateTime']),
                        }, ignore_index=True)
df_procedures.head()

In [None]:
df_procedures.to_csv('../data/procedures.csv')

In [None]:
### Table 6: flowsheet info
df_flowsheet = pd.DataFrame(columns = ['SubjectId',
                                       'EncounterId',
                                       'FlowsheetDisplayName',
                                       'FlowsheetValue',
                                       'DateTime'])


## Flowsheet items of interest (17):

flowsheetnames = ["Patient's Location After MET/RRT Call",
                  "MET/RRT Team Members Present",
                  "Duration of MET/RRT in Minutes",
                  # "EWS", # excluded since will be adding separately
                  "Does patient have risk factors, signs or symptoms of infection?",
                  "Does the patient have new mental status changes?",
                  #"Early Detection of Sepsis Score",
                  "Calculated urine output mL/kg/hour",
                  "Sepsis Actions Taken",
                  "Elevated Lactate",
                  "MAP<65 or SBP<90",
                  "Platelet Count <100,000",
                  "INR >1.5 or PTT >60",
                  "Serum Creatinine >2",
                  "Urine Output <0.5mL/kg/hr for 2 hrs",
                  "Bilirubin >2.0",
                  "New or Increasing O2 Need",
                  "Type of Emergency",
                  "Time of Team Arrival"]


for index, row in df_admission.iterrows():
    items = random.randint(1, 10);
    for _ in range(items):
        df_flowsheet = df_flowsheet.append({
                            'SubjectId': row['SubjectId'],
                            'EncounterId': row['EncounterId'],
                            'FlowsheetDisplayName': np.random.choice(flowsheetnames),
                            'FlowsheetValue': random.uniform(1.0, 100.0),
                            'DateTime': random_date(row['AdmissionDateTime'], row['DischargeDateTime'])
                            }, ignore_index=True)

# now do EWS scores
for index, row in df_admission.iterrows():
    ewsDateTime = row['AdmissionDateTime'] + datetime.timedelta(minutes = 160)
    while (ewsDateTime < row['DischargeDateTime']):
        df_flowsheet = df_flowsheet.append({
                            'SubjectId': row['SubjectId'],
                            'EncounterId': row['EncounterId'],
                            'FlowsheetDisplayName': 'EWS',
                            'FlowsheetValue': random.randint(0, 20),
                            'DateTime': ewsDateTime
                            }, ignore_index=True)
        ewsDateTime += datetime.timedelta(minutes = 160)

# now do ESPM scores
for index, row in df_admission.iterrows():
    epsmDateTime = row['AdmissionDateTime'] + datetime.timedelta(minutes = 120)
    while (epsmDateTime < row['DischargeDateTime']):
        df_flowsheet = df_flowsheet.append({
                            'SubjectId': row['SubjectId'],
                            'EncounterId': row['EncounterId'],
                            'FlowsheetDisplayName': 'Early Detection of Sepsis Score',
                            'FlowsheetValue': random.randint(0, 100),
                            'DateTime': epsmDateTime
                            }, ignore_index=True)
        epsmDateTime += datetime.timedelta(minutes = 120)

df_flowsheet.head()

In [None]:
df_flowsheet.to_csv('../data/flowsheet.csv')

In [None]:
### Table 7: Labs – microbiology; viral testing
df_labs = pd.DataFrame(columns = ['SubjectId',
                                  'EncounterId',
                                  'LabName',
                                  'LabValue',
                                  'LabUnit',
                                  'CollectionDateTime',
                                  'ResultDateTime'])

labnames = ["SPECIMEN TYPE",
            "STOOL CULTURE",
            "INFLUENZA A",
            "BACTERIAL CULTURE",
            "NEISSERIA GONORRHOEAE",
            "STREP RESULT POC",
            "LACTIC ACID",
            "CULTURE STREP SCREEN",
            "CULTURE YERSINIA",
            "SPUTCULT",
            "POSITIVE ORGANISM",
            "THROAT CX, GROUP A STREP ONLY",
            "WEST NILE VIRUS IGM, SERUM",
            "WEST NILE VIRUS IGG, SERUM",
            "ANAEROBIC CULTURE",
            "GROUP A STREP CULTURE",
            "POCT STREP CULTURE",
            "URINE CULTURE ROUTINE",
            "Streptococcus, Group A,Culture",
            "HSV CULTURE",
            "CULTURE, ANEROBIC BACTERIA W/GRAM STAIN",
            "CULTURE, AEROBIC BACTERIA",
            "BLOOD CULTURE",
            "Culture",
            "SALMONELLA/SHIGELLA CULTURE",
            "CULTURE, THROAT",
            "WOUND CULTURE AND GRAM STAIN",
            "LACTIC ACID, PLASMA",
            "CLOSTRIDIUM DIFFICILE CULTURE",
            "RAPID STREP A SCREEN",
            "SOURCE",
            "THROAT CX, GROUP A STREP ONLY",
            "VIRAL CULTURE",
            "RAPID STREPTOCOCCAL A ANTIGEN",
            "Lactate, Ven",
            "LACTATE",
            "CHLAMYDIA PNEUMONIAE PCR",
            "CULTURE",
            "AEROBIC CULTURE",
            "ANAEROBIC CULTURE",
            "URINE CULTURE, COMPREHENSIVE",
            "BLOOD CULTURE",
            "UPPER RESPIRATORY CULTURE",
            "FUNGUS CULTURE",
            "SALMONELLA/SHIGELLA CULTURE",
            "URINE CULTURE",
            "GS",
            "HSV CULTURE/TYPE",
            "CAMPYLOBACTER CULTURE",
            "BODY FLUID CULTURE",
            "ORGANISM IDENTIFICATION, YEAST",
            "RESULT 1",
            "VARICELLA (VZV) CULTURE",
            "HSV CULTURE WITHOUT TYPING",
            "Lactate Whole Blood Arterial",
            "Lactate Whole Blood Venous",
            "LACTATE WHOLE BLOOD",
            "LACTATE PLASMA",
            "CYTOMEGALOVIRUS CULTURE",
            "RESPIRATORY VIRUS TUBE CULTURE",
            "VIRAL CULTURE CSF",
            "VIRAL CULTURE",
            "VARICELLA ZOSTER VIRUS SV CULTURE",
            "VARICELLA ZOSTER VIRUS SV CULTURE FINAL",
            "INFLUENZA A STAIN",
            "INFLUENZA B STAIN",
            "ADENOVIRUS STAIN",
            "PARAINFLUENZA 1 STAIN",
            "PARAINFLUENZA 2 STAIN",
            "PARAINFLUENZA 3 STAIN",
            "RSV STAIN",
            "Herpes simplex virus culture",
            "ACTINO CULTURE",
            "Direct Gram Stain",
            "AFB Stain",
            "AEROBIC CULTURE",
            "ANAEROBIC CULTURE",
            "AFB CULTURE",
            "BF Culture, Aerobic",
            "BF Culture, Anaerobic",
            "BLOOD CULTURE",
            "CSF Culture, Aerobic",
            "CSF CULTURE, ANAEROBIC",
            "FUNGUS CULTURE",
            "Strep Screening Culture",
            "LEGIONELLA CULTURE",
            "MRSA CULTURE",
            "RESPIRATORY CULTURE",
            "QUANT RESPIRATORY CULTURE",
            "RESPIRATORY CULTURE, CF",
            "STOOL CULTURE",
            "URINE CULTURE",
            "VIBRIO CULTURE",
            "YEAST CULTURE",
            "YERSINIA CULTURE",
            "ACANTHAMOEBA CULTURE",
            "ORGANISM ID CULTURE",
            "GI PCR SUPPLEMENTAL CULTURE",
            "SALMONELLA/SHIGELLA CULTURE",
            "Mold Identification",
            "Group A Strep Culture (Non-Respiratory)",
            "BLOOD CULTURE BOTTLE GRAM STAIN",
            "Legionella Species Culture",
            "Mycoplasma hominis/Ureaplasma species Culture",
            "Acid-Fast Bacillus Culture",
            "Mycoplasma hominis/Ureaplasma species Culture Source",
            "Acid-Fast Bacillus Blood Culture",
            "POCT LACTATE",
            "POCT Lactate",
            "POCT LACTATE VENOUS",
            "LACTATE -POCT",
            "Lactate - POCT",
            "CAMPYLOBACTER",
            "SALMONELLA",
            "CRYPTOSPORIDIUM",
            "Shigella/Enteroinvasive E coli",
            "AMPLIVUE HSV 1",
            "AMPLIVUE HSV 2"]

for index, row in df_admission.iterrows():
    items = random.randint(1, 10);
    for _ in range(items):
        CollectionDateTime = random_date(row['AdmissionDateTime'], row['DischargeDateTime'])
        ResultDateTime = random_date(CollectionDateTime, row['DischargeDateTime'])
        df_labs = df_labs.append({
                            'SubjectId': row['SubjectId'],
                            'EncounterId': row['EncounterId'],
                            'LabName': np.random.choice(labnames),
                            'LabValue': random.uniform(1.0, 100.0),
                            'LabUnit': np.random.choice(['mg', 'kg', 'count', 'ml', 'l']),
                            'CollectionDateTime': CollectionDateTime,
                            'ResultDateTime': ResultDateTime
                            }, ignore_index=True)
df_labs.head()

In [None]:
df_labs.to_csv('../data/labs.csv')