In [1]:
import numpy as np
import pandas as pd
import pickle
import os

import evaluation.descriptive_utils as u
import analyzer.dataset as ds
import analyzer.loaders.hartford.hartford as hartford

from scipy import stats

In [2]:
# Ordered model input columns. The 'index' fields in `features` below are
# positions into this list, so the order here must not change independently.
columns = [
    'Age',
    'Gender',
    'ABG: Oxygen Saturation (SaO2)',
    'Cardiac Frequency',
    'Alanine Aminotransferase (ALT)',
    'Blood Creatinine',
    'Blood Sodium',
    'Blood Urea Nitrogen (BUN)',
    'Body Temperature',
    'C-Reactive Protein (CRP)',
    'CBC: Hemoglobin',
    'CBC: Leukocytes',
    'CBC: Mean Corpuscular Volume (MCV)',
    'Aspartate Aminotransferase (AST)',
    'CBC: Platelets',
    'Cardiac dysrhythmias',
    'Chronic kidney disease',
    'Coronary atherosclerosis and other heart disease',
    'Diabetes',
    'Glycemia',
    'Potassium Blood Level',
    'Prothrombin Time (INR)',
]

# Feature specification grouped by input kind. Every entry carries the
# position ('index') of its column(s) in `columns` plus descriptive metadata.
features = {
    'numeric': [
        {'name': 'Age', 'index': 0, 'min_val': 0.0, 'max_val': 100.0, 'default': 69.0, 'explanation': 'Age of the patient. Modeled only for adults.'},
        {'name': 'ABG: Oxygen Saturation (SaO2)', 'index': 2, 'min_val': 80, 'max_val': 100.0, 'default': 94.0, 'explanation': 'Oxygen Saturation (SaO2) in %'},
        {'name': 'Cardiac Frequency', 'index': 3, 'min_val': 40.0, 'max_val': 171.0, 'default': 90.0, 'explanation': 'Number of Beats per Minute.'},
        {'name': 'Alanine Aminotransferase (ALT)', 'index': 4, 'min_val': 2.0, 'max_val': 929.0, 'default': 27.4, 'explanation': 'Alanine Aminotransferase (ALT) in U/L'},
        {'name': 'Blood Creatinine', 'index': 5, 'min_val': 0.0, 'max_val': 11.0, 'default': 0.94, 'explanation': 'Blood Creatinine in mg/dL'},
        {'name': 'Blood Sodium', 'index': 6, 'min_val': 115.0, 'max_val': 166.0, 'default': 137.3, 'explanation': 'Blood Sodium in mmol/L'},
        {'name': 'Blood Urea Nitrogen (BUN)', 'index': 7, 'min_val': 4.0, 'max_val': 174.0, 'default': 17.29, 'explanation': 'Blood Urea Nitrogen (BUN) in mg/dL'},
        # The line break inside this explanation is written as an explicit
        # '\n' escape; a raw newline inside a single-quoted literal is a
        # syntax error.
        {'name': 'Body Temperature', 'index': 8, 'min_val': 34, 'max_val': 104.0, 'default': 98.6, 'explanation': 'Body temperature measurement. \nUse the dropdown to select the unit (Fahrenheit or Celsius).'},
        {'name': 'C-Reactive Protein (CRP)', 'index': 9, 'min_val': 0.0, 'max_val': 567.0, 'default': 75.47, 'explanation': 'C-Reactive Protein (CRP) in mg/L'},
        {'name': 'CBC: Hemoglobin', 'index': 10, 'min_val': 6.0, 'max_val': 19.0, 'default': 13.9, 'explanation': 'Hemoglobin in g/dL'},
        {'name': 'CBC: Leukocytes', 'index': 11, 'min_val': 0.0, 'max_val': 36.0, 'default': 6.93, 'explanation': 'Leukocytes in 10^3/muL'},
        {'name': 'CBC: Mean Corpuscular Volume (MCV)', 'index': 12, 'min_val': 58.0, 'max_val': 116.0, 'default': 88.42, 'explanation': 'Mean Corpuscular Volume (MCV) in fL'},
        {'name': 'Aspartate Aminotransferase (AST)', 'index': 13, 'min_val': 9.0, 'max_val': 941.0, 'default': 37.0, 'explanation': 'Aspartate Aminotransferase (AST) in U/L'},
        {'name': 'CBC: Platelets', 'index': 14, 'min_val': 20.0, 'max_val': 756.0, 'default': 204.0, 'explanation': 'Platelets in 10^3/muL'},
        {'name': 'Glycemia', 'index': 19, 'min_val': 57.0, 'max_val': 620.0, 'default': 119.0, 'explanation': 'Blood Glucose in mg/dL'},
        {'name': 'Potassium Blood Level', 'index': 20, 'min_val': 2.0, 'max_val': 7.0, 'default': 4.06, 'explanation': 'Potassium Blood Level in mmol/L'},
        {'name': 'Prothrombin Time (INR)', 'index': 21, 'min_val': 0.0, 'max_val': 17.0, 'default': 1.11, 'explanation': 'Prothrombin Time Ratio (INR)'},
    ],
    'categorical': [
        {'name': 'Gender', 'index': 1, 'vals': [0.0, 1.0], 'default': 0.0, 'explanation': 'Select the gender of the patient'},
    ],
    # No symptom columns are part of this feature set, hence the empty lists.
    'checkboxes': [
        {'name': 'Symptoms', 'index': [], 'vals': [], 'explanation': ['Select the existing symptoms.']},
    ],
    # 'index' and 'vals' are parallel: vals[i] is the name of columns[index[i]].
    'multidrop': [
        {'name': 'Comorbidities', 'index': [15, 16, 17, 18], 'vals': ['Cardiac dysrhythmias', 'Chronic kidney disease', 'Coronary atherosclerosis and other heart disease', 'Diabetes'], 'explanation': ['Select the existing chronic diseases or conditions.']},
    ],
}

In [3]:
# Which calculator is being described and which feature-set variant it uses.
model_type, model_lab = "mortality", "with_lab"

# Relative location of the website checkout that holds the pickled calculators.
website_path = '../../website/'

# Disabled: reading model / features / columns / imputer from the deployed
# pickle. `features` and `columns` are hard-coded in this notebook instead.
#
# with open(website_path+'assets/risk_calculators/'+model_type+'/model_'+model_lab+'.pkl', 'rb') as file:
#         model_file = pickle.load(file)
#
# model = model_file['model']
# features = model_file['json']
# columns = model_file['columns']
# imputer= model_file['imputer']

# Derivation data: X is the feature frame, y the matching outcome.
X, y = u.get_dataset_preload(model_type, model_lab)

['discharge' 'comorbidities' 'vitals' 'lab' 'demographics']
ABG: Oxygen Saturation (SaO2): LB = 75, UB = 99 (Filter = 181)
Age: LB = 26.0, UB = 96.0 (Filter = 49)
Alanine Aminotransferase (ALT): LB = 7.0, UB = 213.0 (Filter = 50)
Aspartate Aminotransferase (AST): LB = 13.0, UB = 228.0 (Filter = 52)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_X[col][outlier_inds] = np.nan


Blood Creatinine: LB = 0.0, UB = 6.0 (Filter = 27)
Blood Sodium: LB = 126.0, UB = 154.0 (Filter = 54)
Blood Urea Nitrogen (BUN): LB = 6.0, UB = 112.0 (Filter = 47)
Body Temperature: LB = 95.0, UB = 104.0 (Filter = 8)
C-Reactive Protein (CRP): LB = 0.0, UB = 402.0 (Filter = 29)
CBC: Hemoglobin: LB = 8.0, UB = 18.0 (Filter = 33)
CBC: Leukocytes: LB = 2.0, UB = 23.0 (Filter = 42)
CBC: Mean Corpuscular Volume (MCV): LB = 65.0, UB = 103.0 (Filter = 49)
CBC: Platelets: LB = 68.0, UB = 524.0 (Filter = 55)
Cardiac Frequency: LB = 56.0, UB = 140.0 (Filter = 54)
Cardiac dysrhythmias: LB = 0.0, UB = 1.0 (Filter = 0)
Chronic kidney disease: LB = 0.0, UB = 1.0 (Filter = 0)
Coronary atherosclerosis and other heart disease: LB = 0.0, UB = 1.0 (Filter = 0)
Diabetes: LB = 0.0, UB = 1.0 (Filter = 0)
Essential hypertension: LB = 0.0, UB = 1.0 (Filter = 0)
Gender: LB = 0.0, UB = 1.0 (Filter = 0)
Glycemia: LB = 80.0, UB = 381.0 (Filter = 53)
Potassium Blood Level: LB = 2.0, UB = 6.0 (Filter = 12)
Prothromb

In [4]:
# Derivation cohort without the site label, with the outcome attached.
data = X.drop(columns=['Location']).assign(Outcome=y)

# Outcome == 1 rows are labelled 'Non-Survivor' and Outcome == 0 rows
# 'Survivor' in the comparison below.
data_a = data.query('Outcome == 1')
data_b = data.query('Outcome == 0')

summary_table = u.pairwise_compare(
    data_a,
    data_b,
    features,
    title_mapping=u.title_mapping_summary,
    row_order=u.row_order,
    filter_A='Non-Survivor',
    filter_B='Survivor',
)

summary_table.to_csv(
    '../results/summary_tables/descriptive_derivation_bysurvival.csv',
    index=False,
)

In [5]:
# Full derivation cohort with the outcome attached. Later comparison cells
# (Sevilla, Hartford) reuse `data_a` as their 'Derivation' side.
data_a = X.copy()
data_a['Outcome'] = y

val_df = pd.read_csv("../../covid19_greece/general_greek_registry.csv")

# Keep only rows whose outcome is 0 or 1. The explicit .copy() makes val_df an
# independent frame, so the column assignment below does not write into a
# slice of the frame returned by read_csv (avoids SettingWithCopyWarning).
val_df = val_df.loc[val_df['Outcome'].isin([0, 1])].copy()
if model_lab == 'without_lab':
    val_df = val_df.rename(columns={'ABG: Oxygen Saturation (SaO2)': 'SaO2'})

# Heuristic unit check: a mean below 45 is treated as Celsius and converted
# to Fahrenheit.
if val_df['Body Temperature'].mean() < 45:
    val_df['Body Temperature'] = ((val_df['Body Temperature']/5)*9)+32

# Align to the model's column order; columns absent from the registry are
# filled with NaN by reindex.
data_gr = val_df.reindex(columns=columns + ['Outcome'])

summary_table = u.pairwise_compare(data_a, data_gr, features,
                                   title_mapping=u.title_mapping_summary,
                                   row_order=u.row_order,
                                   filter_A='Derivation', filter_B='Greece')

summary_table.to_csv('../results/summary_tables/descriptive_derivation_greece.csv',
                     index=False)

In [7]:
#%% Sevilla
# Read the Sevilla cohort and align it to the model's column order plus the
# outcome; columns missing from the file become NaN.
data_sev = pd.read_csv("../../covid19_sevilla/sevilla_clean.csv").reindex(
    columns=columns + ['Outcome']
)

# `data_a` is the derivation cohort built in the Greece comparison cell.
summary_table = u.pairwise_compare(
    data_a,
    data_sev,
    features,
    title_mapping=u.title_mapping_summary,
    row_order=u.row_order,
    filter_A='Derivation',
    filter_B='Sevilla',
)

summary_table.to_csv(
    '../results/summary_tables/descriptive_derivation_sevilla.csv',
    index=False,
)

In [8]:
#%% Hartford
# NOTE(review): absolute cluster path; consider moving it to a configurable
# data directory.
df_hhc = pd.read_csv("/nfs/sloanlab003/projects/cov19_calc_proj/hartford/hhc_inpatient_main.csv")

# The with-lab feature set uses the long SaO2 column name.
if model_lab == "with_lab":
    df_hhc = df_hhc.rename(columns={'SaO2': 'ABG: Oxygen Saturation (SaO2)'})

# Model columns in model order, then the outcome appended as the last column.
data_b = df_hhc.reindex(columns=columns).assign(Outcome=df_hhc['Outcome'])

# NOTE(review): this cell passes u.title_mapping and writes index=True, while
# the other comparison cells use u.title_mapping_summary and index=False.
# Confirm whether the difference is intentional.
summary_table = u.pairwise_compare(
    data_a,
    data_b,
    features,
    title_mapping=u.title_mapping,
    row_order=u.row_order,
    filter_A='Derivation',
    filter_B='Hartford',
)

summary_table.to_csv(
    '../results/summary_tables/descriptive_derivation_hartford.csv',
    index=True,
)

In [27]:
#%% Summarize derivation by site
# Derivation cohort (site label kept) with the outcome attached.
data = X.assign(Outcome=y)

# One short descriptive table per derivation site.
describe_cremona = u.descriptive_table(
    data.query('Location == "Cremona"'), features, short_version=True)
describe_spain = u.descriptive_table(
    data.query('Location == "Spain"'), features, short_version=True)
describe_hartford_other = u.descriptive_table(
    data.query('Location == "Hartford"'), features, short_version=True)

# Side-by-side layout, rows in the shared order, row labels moved into an
# 'index' column so they can be mapped to display titles.
site_tables = [describe_cremona, describe_spain, describe_hartford_other]
full = pd.concat(site_tables, axis=1).reindex(u.row_order).reset_index()
full['index'] = full['index'].replace(u.title_mapping_summary)

# Extra row labelled 'Filter' naming the site of each column
# (one label cell followed by two columns per site).
full.loc['Filter', :] = [np.nan,
                         'Cremona', 'Cremona',
                         'Spain', 'Spain',
                         'Hartford Affiliate', 'Hartford Affiliate']
full.to_csv("../results/descriptive_all_sites_derivation.csv")

In [None]:
#%% Summarize validation by site
# Each validation cohort is described from its own frame: data_gr (Greece),
# data_sev (Sevilla) and the Hartford main-hospital file loaded into df_hhc.
describe_greece = u.descriptive_table(data_gr, features, short_version = True)
describe_sevilla = u.descriptive_table(data_sev, features, short_version = True)

# Fix: the Hartford Hospital column previously came from the derivation frame
# (Location == "Hartford"), and the concat used describe_hartford_other, so the
# table repeated derivation data under a validation label. The frame is rebuilt
# from df_hhc rather than taken from `data_b`, because `data_b` is also
# assigned by the survivor/non-survivor cell and depends on execution order.
data_hartford_main = df_hhc.reindex(columns = columns + ['Outcome'])
describe_hartford_main = u.descriptive_table(data_hartford_main, features, short_version = True)

full = pd.concat([describe_greece, describe_sevilla, describe_hartford_main], axis = 1)

# Rows in the shared order; row labels moved into an 'index' column and mapped
# to their display titles.
full = full.reindex(u.row_order)
full = full.reset_index()
full['index'] = full['index'].replace(u.title_mapping_summary)

# Extra row labelled 'Filter' naming the site of each column
# (one label cell followed by two columns per site).
full.loc['Filter',:] = [np.nan, 'Greek HC', 'Greek HC', 'Sevilla', 'Sevilla', 'Hartford Hospital', 'Hartford Hospital']
full.to_csv("../results/descriptive_all_sites_validation.csv")