In [1]:
import os
import re
import csv
import sys
import json
import time
import datetime

from src import segmentation
from src import o2sat_finder as o2f
from src import symptom_finder as sf
from src import diagnose_covid as dc
from src import covid_diagnosis_finder as cf

# create the sentence segmentor; the extraction helpers in this notebook call
# seg_obj.parse_sentences() to split long texts into sentences
seg_obj = segmentation.Segmentation()

# attempt to segment texts longer than this into sentences;
# texts at or below this length are processed as a single unit
SEG_CHECK_LEN = 100  # length in characters

Loading Spacy language model...done.


## Configure the input file path and the output directory

The input file must be in CSV (comma-separated value) format.
The output directory will be created if it does not already exist. Be sure to specify a location to which you have write access.

In [2]:
# CSV file to analyze; the first run of digits found in this name is used
# to label the output subfolder
INPUT_FILE = 'synthetic_data_20220328.csv'

# write files containing text extracted from the text columns to this directory
# (results are placed in a date-labeled subfolder of it)
OUTDIR = 'results'

#### Compute the indices of all text columns.

Compute the indices from the column names instead of hardcoding them, to protect against file format changes.

In [3]:
# Derive the label of the output folder from the input file name: the first
# run of digits in INPUT_FILE is used verbatim (e.g. '20220328').
# If the name contains no digits, fall back to the current year and month (YYYYMM).
match = re.search(r'\d+', INPUT_FILE)
if match is None:
    now = datetime.datetime.now()
    month = now.month
    year = now.year
    date = '{0:4d}{1:02d}'.format(year, month)
else:
    date = match.group()

# Names of the relevant free-text columns. Every value starts out as None;
# the values of TEXT_COLS are replaced by column indices once the header
# row of the input file has been read.
TEXT_COLS = dict.fromkeys([
    'mg_notes',
    'mv_comp_oth_sp',
    'mg_death_dx',
    'mv_sx_oth_sp',
    'mv_tx_oth_sp1',
    'mv_tx_oth_sp2',
    'mv_tx_oth_sp3',
])

# Names of the relevant date columns (insertion order matters: it fixes the
# order of the date column indices computed from the header row).
DATE_COLS = dict.fromkeys([
    'mg_decon_icuadm_dt',   # date of ICU admission
    'cv_sn_pos_spec1',      # date of positive Covid test
])

#### Load the data file, build maps of column names to column indices

In [4]:
# Read the header row of the input file and build:
#   col_names    : list of all column names, lowercased
#   col_map      : column name -> column index, for every column
#   col_tuples   : (index, name) pairs for the text columns, sorted by index
#   date_col_map : column name -> column index, for the date columns
# The indices of the text columns are also stored back into TEXT_COLS.
col_map = {}
date_col_map = {}
col_names = []
col_tuples = []

# Open with the same encoding used by the cells that read the data rows.
# With the platform default encoding, a non-UTF-8 byte anywhere in the first
# buffered chunk of the file could raise UnicodeDecodeError here.
with open(INPUT_FILE, encoding='latin-1', newline='') as csvfile:
    header_line = csvfile.readline()
    if header_line:
        # parse the header with the csv module so that quoted names are handled,
        # and convert all column names to lowercase
        col_names = [name.lower() for name in list(csv.reader([header_line]))[0]]
        print('Found {0} columns'.format(len(col_names)))
        print(col_names)

        # get indices of text cols and store in TEXT_COLS dict
        # (raises ValueError if a text column is missing from the header)
        for name in TEXT_COLS:
            TEXT_COLS[name] = col_names.index(name)

        # flatten the dict into (index, col_name) tuples, sorted by increasing index
        col_tuples = sorted(
            ((index, name) for name, index in TEXT_COLS.items()),
            key=lambda pair: pair[0]
        )

        # build col_map
        for j, col_name in enumerate(col_names):
            col_map[col_name] = j

        # build col map for date cols
        # (raises KeyError if a date column is missing from the header)
        for col_name in DATE_COLS:
            date_col_map[col_name] = col_map[col_name]

        # show the first data line, if any, for a visual check against the header
        sample_line = csvfile.readline()
        if sample_line:
            print('Sample line: ')
            print(sample_line)

# print to check values with data dictionary notes
print('Text column info:')
for col_index, col_name in col_tuples:
    print('\t{0:2}: {1}'.format(col_index, col_name))

print('Date column info:')
for col_name, col_index in date_col_map.items():
    print('\t{0:2}: {1}'.format(col_index, col_name))

# save the indices of the date cols in a list (same order as DATE_COLS)
date_col_indices = list(date_col_map.values())

Found 38 columns
['obs', 'mg_death_dx', 'mv_sx', 'mv_sx_fever', 'mv_sx_sfever', 'mv_sx_chills', 'mv_sx_rigors', 'mv_sx_myalgia', 'mv_sx_runnose', 'mv_sx_sthroat', 'mv_sx_taste', 'mv_sx_fatigue', 'mv_sx_cough', 'mv_sx_wheezing', 'mv_sx_sob', 'mv_sx_breath', 'mv_sx_chest', 'mv_sx_nauvom', 'mv_sx_head', 'mv_sx_abdom', 'mv_sx_diarrhea', 'mv_sx_oth', 'mv_sx_oth_sp', 'mv_comp_pna', 'mv_comp_ards', 'mv_comp_mv', 'mv_comp_ecmo', 'mv_comp_oth_sp', 'mv_icu', 'id', 'cv_sn_pos_spec1', 'mv_sx_dt', 'mg_decon_icuadm_dt', 'mg_notes', 'mv_tx_rem', 'mv_tx_oth_sp1', 'mv_tx_oth_sp2', 'mv_tx_oth_sp3']
Sample line: 
1,,1,0,0,0,0,0,1,0,1,1,1,0,0,1,0,1,1,0,1,1,Congestion,0,0,0,0,,0,1,24-Mar-20,23-Mar-20,.,H/O sinus tachycardia,0,,,

Text column info:
	 1: mg_death_dx
	22: mv_sx_oth_sp
	27: mv_comp_oth_sp
	33: mg_notes
	35: mv_tx_oth_sp1
	36: mv_tx_oth_sp2
	37: mv_tx_oth_sp3
Date column info:
	32: mg_decon_icuadm_dt
	30: cv_sn_pos_spec1


#### Extract unique texts from all text columns and write to column-specific files.

The imported regex code in the src folder should be based on the usage examples contained in these text files.

In [5]:
def cleanup(text):
    """
    Normalize a free-text field: lowercase it, replace commas and parentheses
    with spaces, correct a few known misspellings, and collapse every run of
    whitespace into a single space. Leading/trailing whitespace is not stripped.
    """

    # (pattern, replacement) pairs, applied in this order
    substitutions = (
        # replace some chars with a single space
        (r'[,()]', ' '),
        # correct some spelling errors
        (r'\bffor\b', 'for'),
        (r'\bplasme\b', 'plasma'),
        (r'\bsysmptoms\b', 'symptoms'),
        # collapse repeated whitespace
        (r'\s+', ' '),
    )

    result = text.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)

    return result

In [6]:
# The 'text_sets' contain the UNIQUE (cleaned-up) texts for each column.
# Reserve one set for each text column, keyed by column index.
text_sets = {}
for col_index, col_name in col_tuples:
    text_sets[col_index] = set()

with open(INPUT_FILE, encoding='latin-1', newline='') as csvfile:
    for i, line in enumerate(csvfile):
        # the header line carries no data
        if i == 0:
            continue

        parsed_rows = list(csv.reader([line]))
        line_items = parsed_rows[0]

        # skip line if unexpected number of items present, probably decode error
        if len(col_names) != len(line_items):
            print('Bad line at index {0}: '.format(i))
            print(line)
            print()
            continue

        # collect the non-empty texts of this line, column by column
        for col_index, col_name in col_tuples:
            text = line_items[col_index]
            if text:
                text = cleanup(text)
                text_sets[col_index].add(text)

# replace each set by a list sorted in decreasing order of text length
for col_index in list(text_sets):
    text_sets[col_index] = sorted(text_sets[col_index], key=len, reverse=True)

# rename, since now contain lists instead of sets
text_lists = text_sets

# create the output dir
output_dir = os.path.join(OUTDIR, date)
os.makedirs(output_dir, exist_ok=True)

# write one file per column, one text per line
for col_index, text_list in text_lists.items():
    filename = os.path.join(output_dir, 'col_{0}.txt'.format(col_index))
    with open(filename, 'w') as outfile:
        outfile.writelines('{0}\n'.format(t) for t in text_list)
    print('Wrote file "{0}"'.format(filename))

Wrote file "results/20220328/col_1.txt"
Wrote file "results/20220328/col_22.txt"
Wrote file "results/20220328/col_27.txt"
Wrote file "results/20220328/col_33.txt"
Wrote file "results/20220328/col_35.txt"
Wrote file "results/20220328/col_36.txt"
Wrote file "results/20220328/col_37.txt"


## Diagnose Covid Severity

First some supporting functions.

In [7]:
def has_discrete_symptom(col_name, col_map, line_items):
    """
    Look up the radio button variable 'col_name' in the parsed line and report
    whether it is set.

    Radio button values are either 1=Yes, 0=No, or 88=Unknown. Only the exact
    string '1' counts as True; everything else (including unknown) is False.
    """
    raw_value = line_items[col_map[col_name]]
    return raw_value == '1'

In [8]:
def discrete_value_is_zero(col_name, col_map, line_items):
    """
    Report whether the discrete variable 'col_name' in the parsed line is
    explicitly the string '0'. Any other value, including an empty field,
    gives False.
    """
    raw_value = line_items[col_map[col_name]]
    return raw_value == '0'

In [9]:
def extract_o2_info(text_list, do_segmentation=True):
    """
    Run the O2 finder on every text in 'text_list' and collect what it reports.

    Texts longer than SEG_CHECK_LEN characters are first split into sentences
    (unless 'do_segmentation' is False); each sentence is processed separately.

    Returns three parallel lists with one entry per O2 result found:
    flow rates, devices, and "needs O2" indicators.
    """
    flow_rates = []
    devices = []
    needs_flags = []

    for text in text_list:
        should_split = do_segmentation and len(text) > SEG_CHECK_LEN
        sentences = seg_obj.parse_sentences(text) if should_split else [text]

        for sentence in sentences:
            for o2 in extract_fields(sentence, o2f.run, o2f.O2Tuple):
                indicators = (o2.needs_o2, o2.needs_o2_device, o2.needs_o2_flow)
                flow_rates.append(o2.flow_rate)
                devices.append(o2.device)
                # the patient needs O2 if any of the three indicators is set
                needs_flags.append(indicators[0] or indicators[1] or indicators[2])

    return flow_rates, devices, needs_flags

In [10]:
def extract_symptoms_from_text(text, run_fn=sf.run, decode_type=sf.SymptomTuple, do_segmentation=True, ignore_common=False):
    """
    Run the symptom finding code on the given text and return a single result object.

    Parameters:
        text            : the text to analyze; may be None or empty
        run_fn          : finder function, called as run_fn(sentence, ignore_common);
                          must return a JSON string encoding a list with exactly one dict
        decode_type     : type used to decode that dict (decode_type(**dict))
        do_segmentation : if True, texts longer than SEG_CHECK_LEN characters are
                          split into sentences and each sentence is analyzed separately
        ignore_common   : passed through unchanged to run_fn

    Returns:
        None if the text is None, empty, or only whitespace. Otherwise the decoded
        result; when several sentences were analyzed, their results are combined
        with sf.merge_symptoms.
    """
    if text is None or 0 == len(text) or text.isspace():
        return None

    # try to segment text into sentences if long enough
    if do_segmentation and len(text) > SEG_CHECK_LEN:
        sentences = seg_obj.parse_sentences(text)
    else:
        sentences = [text]

    symptom_obj_list = []
    for s in sentences:
        # analyze the individual sentence (not the entire text again)
        json_result = run_fn(s, ignore_common)
        json_data = json.loads(json_result)
        obj_list = [decode_type(**d) for d in json_data]
        # the finder is expected to return exactly one result per call
        assert 1 == len(obj_list)
        symptom_obj_list.append(obj_list[0])

    obj_count = len(symptom_obj_list)
    assert obj_count > 0
    if 1 == obj_count:
        # only a single object, no merge required
        return symptom_obj_list[0]

    # merge two or more objects into a single result
    return sf.merge_symptoms(symptom_obj_list)

In [11]:
def has_symptom(symptom_key, symptom_obj_list):
    """
    Return True if any namedtuple-style object in 'symptom_obj_list' has a field
    named 'symptom_key' with a truthy value, and False otherwise.

    A field with that name must never hold None (checked with an assert).
    Objects lacking the field are ignored.
    """

    for obj in symptom_obj_list:
        fields = obj._asdict()
        if symptom_key not in fields:
            continue
        value = fields[symptom_key]
        assert value is not None
        if value:
            return True

    return False

In [12]:
def covid_caused_death(death_text):
    """
    Return True if the given text mentions Covid-19 (as "covid", "covid-19",
    "sars-cov-2", or "(novel) coronavirus", in any letter case), else False.
    This function should only operate on the 'mg_death_dx' text field.
    """

    # recognize mentions of Covid-19
    covid_pattern = r'\b(covid([- ]?19)?|sars-cov-2|(novel )?coronavirus)'

    # collapse repeated whitespace so that multi-word mentions can match
    normalized = re.sub(r'\s+', ' ', death_text)

    return re.search(covid_pattern, normalized, re.IGNORECASE) is not None

In [13]:
def extract_fields(sentence, run_fn, decode_type):
    """
    Call the finder 'run_fn' on 'sentence', parse its JSON result (a list of
    dicts), and return a list of 'decode_type' objects, one per dict.
    """

    decoded = []
    for entry in json.loads(run_fn(sentence)):
        decoded.append(decode_type(**entry))

    return decoded

In [14]:
def has_pneumonia_from_txt(text_list):
    """
    Return True as soon as the Covid diagnosis finder reports pneumonia for any
    of the given texts; return False if none does. Empty and whitespace-only
    texts are not passed to the finder.
    """

    for text in text_list:
        if text and not text.isspace():
            results = extract_fields(text, cf.run, cf.CovidDiagnosisTuple)
            # the finder is expected to return exactly one result per text
            assert 1 == len(results)
            if results[0].has_pneumonia:
                return True

    return False

In [15]:
# need these columns for each user
# Names of the free-text columns of interest, with a short description of each.
# NOTE(review): neither list in this cell appears to be referenced by the other
# cells of this notebook - confirm whether they are still needed.
# NOTE(review): 'mg_idpreg' is not among the columns printed for the synthetic
# input file - confirm it exists in the real data.
USER_TEXT_COLS = [
    'mg_idpreg',        # CDC pregnancy ID
    'mg_notes',         # abstractor notes
    'mv_comp_oth_sp',   # description of other complications
    'mg_death_dx',      # cause of death
    'mv_sx_oth_sp',     # other symptoms specified
    'mv_tx_oth_sp1',    # medication 1
    'mv_tx_oth_sp2',    # medication 2
    'mv_tx_oth_sp3',    # medication 3
    # The two entries below are notes, not list members. The numeric indices
    # look stale (the header printed for the synthetic file puts
    # mg_decon_icuadm_dt at index 32) - TODO confirm or remove.
    # 35: mg_decon_icuadm_dt: (text)     date of icu admission       
    # 37: dis_severity
]

# Names of the radio button (discrete valued) columns of interest.
USER_RADIO_COLS = [
    'mv_comp_mv',        # mechanical ventilation
    'mv_comp_ecmo',      # ECMO machine
    'mv_icu',            # admitted to ICU for Covid-19
    'mv_comp_ards',      # has ARDS
    'mv_comp_pna',       # pneumonia
    'mv_sx',             # symptoms present during course of illness
    'mv_sx_fever',       # fever
    'mv_sx_sfever',      # subjective fever, felt feverish
    'mv_sx_chills',      # chills
    'mv_sx_rigors',      # rigors
    'mv_sx_myalgia',     # muscle aches (myalgias)
    'mv_sx_runnose',     # runny nose (rhinorrhea)
    'mv_sx_sthroat',     # sore throat
    'mv_sx_taste',       # new olfactory and taste disorder
    'mv_sx_fatigue',     # fatigue
    'mv_sx_cough',       # cough
    'mv_sx_wheezing',    # wheezing
    'mv_sx_sob',         # shortness of breath (dyspnea)
    'mv_sx_breath',      # difficulty breathing
    'mv_sx_chest',       # chest pain
    'mv_sx_nauvom',      # nausea or vomiting
    'mv_sx_head',        # headache
    'mv_sx_abdom',       # abdominal pain
    'mv_sx_diarrhea',    # diarrhea
    'mv_sx_oth',         # other symptoms
    'mv_tx_rem',         # remdesivir 
]


## Main Loop

In [16]:
# Main loop: read the input file line by line, combine the radio button values
# with the information extracted from the text fields, and diagnose the severity
# of the Covid-19 infection for each patient.
#   patient_map            : patient id -> (diagnosis, patient_data)
#   corrupted_line_indices : indices of lines skipped for having an unexpected field count
patient_map = {}
corrupted_line_indices = []

start_time = time.time()
with open(INPUT_FILE, encoding='latin-1', newline='') as csvfile:
    for i, line in enumerate(csvfile):
        if 0 == i:
            # skip header line
            continue
        else:
            # each physical line is parsed on its own as one CSV record
            reader = csv.reader([line])
            line_items = list(reader)[0]
            # skip line if unexpected number of items present, probably decode error
            if len(line_items) != len(col_names):
    #             print('Bad line at index {0}: '.format(i))
    #             print(line)
    #             print()
                corrupted_line_indices.append(i)
                continue

        # 0th col is the user id
        # NOTE(review): in the header printed for the synthetic file, column 0 is
        # 'obs' and a separate 'id' column exists - confirm column 0 is the intended key
        user_id = line_items[0]

        # extract desired discrete fields; 'r' prefix means from a radio button
        r_vent        = has_discrete_symptom('mv_comp_mv',     col_map, line_items)
        r_ecmo        = has_discrete_symptom('mv_comp_ecmo',   col_map, line_items)
        r_icu         = has_discrete_symptom('mv_icu',         col_map, line_items)
        r_ards        = has_discrete_symptom('mv_comp_ards',   col_map, line_items)
        r_pna         = has_discrete_symptom('mv_comp_pna',    col_map, line_items)
        r_sx          = has_discrete_symptom('mv_sx',          col_map, line_items)
        r_fever1      = has_discrete_symptom('mv_sx_fever',    col_map, line_items)
        r_fever2      = has_discrete_symptom('mv_sx_sfever',   col_map, line_items)
        r_cough       = has_discrete_symptom('mv_sx_cough',    col_map, line_items)    
        r_sob         = has_discrete_symptom('mv_sx_sob',      col_map, line_items)
        r_breath      = has_discrete_symptom('mv_sx_breath',   col_map, line_items)
        r_rem         = has_discrete_symptom('mv_tx_rem',      col_map, line_items)
        r_chills      = has_discrete_symptom('mv_sx_chills',   col_map, line_items)
        r_rigors      = has_discrete_symptom('mv_sx_rigors',   col_map, line_items)
        r_myalgia     = has_discrete_symptom('mv_sx_myalgia',  col_map, line_items)
        r_runnose     = has_discrete_symptom('mv_sx_runnose',  col_map, line_items)
        r_sthroat     = has_discrete_symptom('mv_sx_sthroat',  col_map, line_items)
        r_smell_taste = has_discrete_symptom('mv_sx_taste',    col_map, line_items)
        r_fatigue     = has_discrete_symptom('mv_sx_fatigue',  col_map, line_items)
        r_wheezing    = has_discrete_symptom('mv_sx_wheezing', col_map, line_items)
        r_chest       = has_discrete_symptom('mv_sx_chest',    col_map, line_items)
        r_nauvom      = has_discrete_symptom('mv_sx_nauvom',   col_map, line_items)
        r_head        = has_discrete_symptom('mv_sx_head',     col_map, line_items)
        r_abdom       = has_discrete_symptom('mv_sx_abdom',    col_map, line_items)
        r_diarrhea    = has_discrete_symptom('mv_sx_diarrhea', col_map, line_items)
        r_sx_other    = has_discrete_symptom('mv_sx_oth',      col_map, line_items)

        # the symptom Boolean must be explicitly zero to qualify as asymptomatic
        r_asymptomatic = discrete_value_is_zero('mv_sx', col_map, line_items)

        # extract relevant symptoms from text fields
        # (each call returns None when the field is empty or only whitespace)

        # general notes - ignore common symptoms (nausea, vomiting, abdominal pain)
        idx = col_map['mg_notes']
        txt_notes = line_items[idx]
        symptoms_notes = extract_symptoms_from_text(txt_notes, ignore_common=True)

        # other complications - ignore common symptoms also
        idx = col_map['mv_comp_oth_sp']
        txt_other_comp = line_items[idx]
        symptoms_comp = extract_symptoms_from_text(txt_other_comp, ignore_common=True)

        # cause of death - ignore common symptoms
        idx = col_map['mg_death_dx']
        txt_death = line_items[idx]
        symptoms_death = extract_symptoms_from_text(txt_death, ignore_common=True)

        # other symptoms - also ignore common symptoms
        idx = col_map['mv_sx_oth_sp']
        txt_other_symptoms = line_items[idx]
        symptoms_other = extract_symptoms_from_text(txt_other_symptoms, ignore_common=True)

        # medication 1, 2, and 3
        idx = col_map['mv_tx_oth_sp1']
        txt_med1 = line_items[idx]
        idx = col_map['mv_tx_oth_sp2']
        txt_med2 = line_items[idx]
        idx = col_map['mv_tx_oth_sp3']
        txt_med3 = line_items[idx]

        # combine medication texts together for later output
        txt_med = ' '.join([txt_med1, txt_med2, txt_med3])
        if txt_med.isspace():
            # replace with empty string if only whitespace
            txt_med = ''
        else:
            # collapse repeated whitespace
            txt_med = re.sub(r'\s+', ' ', txt_med)

        # the medication fields are analyzed individually, without sentence segmentation
        symptoms_med1 = extract_symptoms_from_text(txt_med1, do_segmentation=False)
        symptoms_med2 = extract_symptoms_from_text(txt_med2, do_segmentation=False)
        symptoms_med3 = extract_symptoms_from_text(txt_med3, do_segmentation=False)

        # combine all symptom objects that are not None
        symptom_obj_list = []
        if symptoms_notes is not None:
            symptom_obj_list.append(symptoms_notes)
        if symptoms_comp is not None:
            symptom_obj_list.append(symptoms_comp)
        if symptoms_death is not None:
            symptom_obj_list.append(symptoms_death)
        if symptoms_other is not None:
            symptom_obj_list.append(symptoms_other)
        if symptoms_med1 is not None:
            symptom_obj_list.append(symptoms_med1)
        if symptoms_med2 is not None:
            symptom_obj_list.append(symptoms_med2)
        if symptoms_med3 is not None:
            symptom_obj_list.append(symptoms_med3)

        # check text fields for pneumonia and oxygen device info
        text_list = [txt_notes, txt_other_comp, txt_other_symptoms, txt_death]  
        has_pneumonia_txt = has_pneumonia_from_txt(text_list)

        # do not need to scan the death text for O2 devices or flow rates
        # need to scan the medication lists for Oxygen, sometimes O2 use is listed there
        # (the death text is the last entry of text_list, so the slice drops it)
        text_list = text_list[:-1]
        text_list.extend([txt_med1, txt_med2, txt_med3])
        o2_flow_rates, o2_devices, o2_needs_o2 = extract_o2_info(text_list)

        # determine final symptom set
        # a symptom is present if found in any text field OR set via its radio button
        has_pneumonia       = has_pneumonia_txt or r_pna
        has_symptoms        = r_sx
        has_other_symptoms  = r_sx_other
        has_fever           = has_symptom('has_fever',           symptom_obj_list) or r_fever1 or r_fever2
        has_dyspnea         = has_symptom('has_dyspnea',         symptom_obj_list) or r_sob or r_breath
        has_cough           = has_symptom('has_cough',           symptom_obj_list) or r_cough    
        is_intubated        = has_symptom('is_intubated',        symptom_obj_list)
        is_ventilated       = has_symptom('is_ventilated',       symptom_obj_list) or r_vent
        in_icu              = has_symptom('in_icu',              symptom_obj_list) or r_icu
        has_ards_or_rf      = has_symptom('has_ards_or_rf',      symptom_obj_list) or r_ards
        on_ecmo             = has_symptom('on_ecmo',             symptom_obj_list) or r_ecmo
        has_septic_shock    = has_symptom('has_septic_shock',    symptom_obj_list)
        has_mod             = has_symptom('has_mod',             symptom_obj_list)
        on_remdesivir       = has_symptom('on_remdesivir',       symptom_obj_list) or r_rem
        on_plasma           = has_symptom('on_plasma',           symptom_obj_list)
        on_plaquenil        = has_symptom('on_plaquenil',        symptom_obj_list)
        on_azithromycin     = has_symptom('on_azithromycin',     symptom_obj_list)
        on_other_drugs      = has_symptom('on_other_drugs',      symptom_obj_list)
        on_dexamethasone    = has_symptom('on_dexamethasone',    symptom_obj_list)    
        has_chills          = has_symptom('has_chills',          symptom_obj_list) or r_chills
        has_rigors          = has_symptom('has_rigors',          symptom_obj_list) or r_rigors
        has_myalgia         = has_symptom('has_myalgia',         symptom_obj_list) or r_myalgia
        has_runny_nose      = has_symptom('has_runny_nose',      symptom_obj_list) or r_runnose
        has_sore_throat     = has_symptom('has_sore_throat',     symptom_obj_list) or r_sthroat
        # a single radio button ('mv_sx_taste') covers both taste and smell problems
        has_prob_with_taste = has_symptom('has_prob_with_taste', symptom_obj_list) or r_smell_taste
        has_prob_with_smell = has_symptom('has_prob_with_smell', symptom_obj_list) or r_smell_taste
        has_fatigue         = has_symptom('has_fatigue',         symptom_obj_list) or r_fatigue
        has_wheezing        = has_symptom('has_wheezing',        symptom_obj_list) or r_wheezing
        has_chest_pain      = has_symptom('has_chest_pain',      symptom_obj_list) or r_chest
        # a single radio button ('mv_sx_nauvom') covers both nausea and vomiting
        has_nausea          = has_symptom('has_nausea',          symptom_obj_list) or r_nauvom
        has_vomiting        = has_symptom('has_vomiting',        symptom_obj_list) or r_nauvom
        has_headache        = has_symptom('has_headache',        symptom_obj_list) or r_head
        has_abdominal_pain  = has_symptom('has_abdominal_pain',  symptom_obj_list) or r_abdom
        has_diarrhea        = has_symptom('has_diarrhea',        symptom_obj_list) or r_diarrhea
        is_asymptomatic     = has_symptom('is_asymptomatic',     symptom_obj_list) or r_asymptomatic

        # check to see if the patient died from covid
        died_from_covid = covid_caused_death(txt_death)

        # get the two relevant date fields and convert to datetime objects if possible
        # date_col_indices follows the order of DATE_COLS:
        # [0] is 'mg_decon_icuadm_dt', [1] is 'cv_sn_pos_spec1'
        date_string_1 = line_items[date_col_indices[0]]
        date_string_2 = line_items[date_col_indices[1]]
        # check to see if both are actual dates
        # NOTE(review): the sample line printed for the synthetic file shows dates
        # such as '24-Mar-20', which this YYYY-MM-DD pattern does not match; for such
        # input both datetimes are set to None - confirm the date format of the real data
        match1 = re.search(r'\d\d\d\d\-\d\d\-\d\d', date_string_1)
        match2 = re.search(r'\d\d\d\d\-\d\d\-\d\d', date_string_2)
        if match1 and match2:
            # convert to datetime objects
            # NOTE(review): re.search accepts the pattern anywhere in the string, but
            # strptime must parse the entire string and raises ValueError otherwise
            datetime1 = datetime.datetime.strptime(date_string_1, '%Y-%m-%d')
            datetime2 = datetime.datetime.strptime(date_string_2, '%Y-%m-%d')
        else:
            # if either date is unusable, both are discarded
            datetime1 = None
            datetime2 = None

        # all data has been extracted, so fill in data object for this patient
        patient_data = dc.PatientData(

            has_pneumonia = has_pneumonia,
            has_symptoms = has_symptoms,
            has_other_symptoms = has_other_symptoms,

            # covid-relevant symptoms
            has_fever = has_fever,
            has_dyspnea = has_dyspnea,
            has_cough = has_cough,
            is_intubated = is_intubated,
            is_ventilated = is_ventilated,
            in_icu = in_icu,
            has_ards_or_rf = has_ards_or_rf,
            on_ecmo = on_ecmo,
            has_septic_shock = has_septic_shock,
            has_mod = has_mod,
            on_remdesivir = on_remdesivir,
            on_plasma = on_plasma,
            on_plaquenil = on_plaquenil,
            on_azithromycin = on_azithromycin,
            on_other_drugs = on_other_drugs,
            on_dexamethasone = on_dexamethasone,

            # other symptoms
            has_chills = has_chills,
            has_rigors = has_rigors,
            has_myalgia = has_myalgia,
            has_runny_nose = has_runny_nose,
            has_sore_throat = has_sore_throat,
            has_prob_with_taste = has_prob_with_taste,
            has_prob_with_smell = has_prob_with_smell,
            has_fatigue = has_fatigue,
            has_wheezing = has_wheezing,
            has_chest_pain = has_chest_pain,
            has_nausea = has_nausea,
            has_vomiting = has_vomiting,
            has_headache = has_headache,
            has_abdominal_pain = has_abdominal_pain,
            has_diarrhea = has_diarrhea,

            is_asymptomatic = is_asymptomatic,

            # whether died from covid or not
            died_from_covid = died_from_covid,

            # from o2sat finder
            o2_flow_rate_list = o2_flow_rates, # L/min
            o2_device_list = o2_devices,
            needs_o2_list = o2_needs_o2,

            # save all text fields (mainly for debugging)
            text_list = [
                txt_notes, txt_other_comp, txt_death, txt_other_symptoms, txt_med
            ],

            datetime1 = datetime1,
            datetime2 = datetime2
        )

        # diagnose the severity of the Covid-19 infection
        diagnosis = dc.diagnose_covid_severity(patient_data)

        # store patient info and the diagnosis as a tuple in a map keyed by patient id
        # (a repeated patient id stops the run via the assert)
        assert user_id not in patient_map
        patient_map[user_id] = (diagnosis, patient_data)

        # progress report; i is the line index in the file, so skipped
        # (corrupted) lines are included in the reported count
        if i > 0 and (0 == i % 1000):
            print('Processed {0} patients...'.format(i))
        
end_time = time.time()
elapsed_time_s = end_time - start_time
print('\nCompleted processing for file {0}.'.format(INPUT_FILE))
print('\tFound {0} patients and {1} corrupted lines in the file.'.
      format(len(patient_map), len(corrupted_line_indices)))
print('\tElapsed time: {0:.3f} seconds'.format(elapsed_time_s))
print('\tAvg. rate: {0:.3f} patients/sec'.format(len(patient_map)/elapsed_time_s))
print('\nCorrupted lines (0-based indexing): ')
print(corrupted_line_indices)


Completed processing for file synthetic_data_20220328.csv.
	Found 200 patients and 0 corrupted lines in the file.
	Elapsed time: 0.296 seconds
	Avg. rate: 675.746 patients/sec

Corrupted lines (0-based indexing): 
[]


#### Group Patient IDs into Lists According to Diagnosis

In [17]:
# patient ids in sorted order (the ids are strings read from the CSV file,
# so the ordering is that of strings)
patient_ids = sorted(patient_map)

# collect all patient ids for each diagnosis
critical_list = []
severe_list   = []
mild_list     = []
asymp_list    = []
unk_list      = []

# special handling for all patients on dexamethasone
dexa_list     = []

# diagnosis codes are compared in this order; a diagnosis matching none of
# them is filed under "unknown"
diagnosis_buckets = (
    (dc.DIAG_CRITICAL, critical_list),
    (dc.DIAG_SEVERE,   severe_list),
    (dc.DIAG_MILD,     mild_list),
    (dc.DIAG_ASYMP,    asymp_list),
)

for pid in patient_ids:

    diagnosis, patient_data = patient_map[pid]

    for code, bucket in diagnosis_buckets:
        if code == diagnosis:
            bucket.append(pid)
            break
    else:
        unk_list.append(pid)

    # dexamethasone patients are tracked in addition to their diagnosis list
    if patient_data.on_dexamethasone:
        dexa_list.append(pid)

# one summary line per diagnosis, followed by the grand total
summary_rows = (
    ('\tCritical     : {0:>9}', critical_list),
    ('\tSevere       : {0:>9}', severe_list),
    ('\tMild         : {0:>9}', mild_list),
    ('\tAsymptomatic : {0:>9}', asymp_list),
    ('\tUnknown      : {0:>9}', unk_list),
)

print('Diagnosis summary: ')
for template, bucket in summary_rows:
    print(template.format(len(bucket)))

total_patients = sum(len(bucket) for template, bucket in summary_rows)
print('\t       Total : {0:>9}'.format(total_patients))

print('\nFound {0} patients on dexamethasone.'.format(len(dexa_list)))

Diagnosis summary: 
	Critical     :         5
	Severe       :        21
	Mild         :        45
	Asymptomatic :        10
	Unknown      :       119
	       Total :       200

Found 0 patients on dexamethasone.


#### Write Debug Files

In [18]:
# create the output dir if it doesn't already exist
output_dir = os.path.join(OUTDIR, date)
os.makedirs(output_dir, exist_ok=True)

# get max-length key for aligning output
# Every entry of patient_map is a (diagnosis, patient_data) tuple holding a
# dc.PatientData record, so the field names of any one record will do.
# If no patients were found there is nothing to align; use a width of 0
# instead of failing on a missing record.
obj = next(iter(patient_map.values()), (None, None))[1]
if obj is None:
    maxlen = 0
else:
    maxlen = max(len(k) for k in obj._asdict())

# (list of patient ids, label used in the debug file name)
data = [
    (critical_list, 'critical'),
    (severe_list,   'severe'),
    (mild_list,     'mild'),
    (asymp_list,    'asymptomatic'),
    (unk_list,      'unknown'),
    (dexa_list,     'dexamethasone'),
]

# write up to this many patients per debug file
MAX_DEBUG_PATIENTS = 1000

for patient_id_list, str_diagnosis in data:
    # name the file by the diagnosis, such as 'debug_critical.txt'
    filename = os.path.join(output_dir, 'debug_{0}.txt'.format(str_diagnosis))
    with open(filename, 'w') as outfile:
        # the file is created even if the list is empty
        for i, pid in enumerate(patient_id_list[:MAX_DEBUG_PATIENTS]):
            outfile.write('[{0}]: {1}\n'.format(i, pid))
            diagnosis, patient_data = patient_map[pid]
            # one right-aligned "field : value" line per field of the record
            for field, value in patient_data._asdict().items():
                outfile.write('\t{0:>{1}} : {2}\n'.format(field, maxlen, value))
            outfile.write('\n')
    print('Wrote file "{0}"'.format(filename))

Wrote file "results/20220328/debug_critical.txt"
Wrote file "results/20220328/debug_severe.txt"
Wrote file "results/20220328/debug_mild.txt"
Wrote file "results/20220328/debug_asymptomatic.txt"
Wrote file "results/20220328/debug_unknown.txt"
Wrote file "results/20220328/debug_dexamethasone.txt"


#### Write Output File

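The output file is in CSV format with a single row per patient. Each row contains the patient ID and a text string for the diagnosis. The filename has the form "diagnoses_DATE.csv", where DATE is the run of digits taken from the input file name (for example "diagnoses_20220328.csv"), or the current year and month (YYYYMM) if the input file name contains no digits.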

In [19]:
# Write one "patient_id,diagnosis_text" row per patient, in sorted patient id order.
filename = os.path.join(output_dir, 'diagnoses_{0}.csv'.format(date))
with open(filename, 'w') as outfile:
    for pid in patient_ids:
        # patient_map values are (diagnosis, patient_data) tuples
        diagnosis, patient_data = patient_map[pid]
        # convert numeric diagnosis code to text
        row = '{0},{1}\n'.format(pid, dc.DIAGNOSIS_CODE_TO_TEXT[diagnosis])
        outfile.write(row)

print('Wrote output file "{0}"'.format(filename))

Wrote output file "results/20220328/diagnoses_20220328.csv"
