In [514]:
import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import itertools
import multiprocessing
import time
from dask.distributed import Client, progress
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from dateutil.relativedelta import relativedelta
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from joblib import Parallel, delayed
from skopt.space import Real, Integer
from skopt.utils import use_named_args

pd.options.display.max_rows = 120000
num_cores = multiprocessing.cpu_count() - 1

In [58]:
conditions = pd.read_csv("train/conditions.csv")
observations = pd.read_csv("train/observations.csv")
patients = pd.read_csv("train/patients.csv")
procedures = pd.read_csv("train/procedures.csv")

In [59]:
careplans = pd.read_csv("train/careplans.csv")

In [60]:
encounters = pd.read_csv("train/encounters.csv")

In [61]:
medications = pd.read_csv("train/medications.csv")

In [270]:
supplies = pd.read_csv("train/supplies.csv")

In [272]:
devices = pd.read_csv("train/devices.csv")

In [324]:
img_studies = pd.read_csv("train/imaging_studies.csv")

In [363]:
immunizations = pd.read_csv("train/immunizations.csv")

**Medications:**

In [50]:
medications.head()

Unnamed: 0,START,STOP,PATIENT,PAYER,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,PAYER_COVERAGE,DISPENSES,TOTALCOST,REASONCODE,REASONDESCRIPTION
0,1995-08-22,1995-08-22,93fa6213-73a1-4cce-b408-9986a4145801,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,4487de56-f20a-422d-a2f6-aa15dcd1cd6d,312961,Simvastatin 20 MG Oral Tablet,21.65,0.0,1,21.65,,
1,1995-08-22,1995-08-22,93fa6213-73a1-4cce-b408-9986a4145801,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,4487de56-f20a-422d-a2f6-aa15dcd1cd6d,197361,Amlodipine 5 MG Oral Tablet,36.13,0.0,1,36.13,,
2,1995-08-22,1996-08-27,93fa6213-73a1-4cce-b408-9986a4145801,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,4487de56-f20a-422d-a2f6-aa15dcd1cd6d,312961,Simvastatin 20 MG Oral Tablet,38.59,0.0,12,463.08,,
3,1995-08-22,1996-08-27,93fa6213-73a1-4cce-b408-9986a4145801,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,4487de56-f20a-422d-a2f6-aa15dcd1cd6d,197361,Amlodipine 5 MG Oral Tablet,34.21,0.0,12,410.52,,
4,1996-08-27,1997-09-02,93fa6213-73a1-4cce-b408-9986a4145801,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,c6d44945-d551-499e-aaf7-496ffd9a85e2,312961,Simvastatin 20 MG Oral Tablet,18.91,0.0,12,226.92,,


In [7]:
medications.REASONDESCRIPTION.unique()

array([nan, 'Acute viral pharyngitis (disorder)',
       'Posttraumatic stress disorder', 'Hypertension', 'Diabetes',
       'Acute deep venous thrombosis (disorder)', 'Hypoxemia (disorder)',
       'Major depression  single episode', 'Osteoarthritis of knee',
       'Bacterial infectious disease (disorder)',
       'Septic shock (disorder)', 'Acute pulmonary embolism (disorder)',
       'Pulmonary emphysema (disorder)',
       'Non-small cell carcinoma of lung  TNM stage 1 (disorder)',
       'Hyperlipidemia', 'Malignant tumor of colon',
       'Overlapping malignant neoplasm of colon',
       'Chronic congestive heart failure (disorder)', 'Childhood asthma',
       'Acute bronchitis (disorder)', 'Osteoarthritis of hip',
       'Contact dermatitis', "Alzheimer's disease (disorder)",
       'Localized  primary osteoarthritis of the hand',
       'Secondary malignant neoplasm of colon', 'COVID-19',
       'Primary fibromyalgia syndrome',
       'Escherichia coli urinary tract infection'

In [268]:
medications.DISPENSES.min()

1

**Encounters:**

In [8]:
encounters.head()

Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
0,4487de56-f20a-422d-a2f6-aa15dcd1cd6d,1995-08-23T02:20:21Z,1995-08-23T02:35:21Z,93fa6213-73a1-4cce-b408-9986a4145801,066dc005-2020-39f1-ba27-8cf297e231d0,35509b88-c026-38e1-b3e1-cb218431a06d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,410620009,Well child visit (procedure),129.16,129.16,0.0,,
1,c6d44945-d551-499e-aaf7-496ffd9a85e2,1996-08-28T02:20:21Z,1996-08-28T02:35:21Z,93fa6213-73a1-4cce-b408-9986a4145801,066dc005-2020-39f1-ba27-8cf297e231d0,35509b88-c026-38e1-b3e1-cb218431a06d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,410620009,Well child visit (procedure),129.16,129.16,0.0,,
2,bf6651e1-8e8c-426d-ae01-136f36db137e,1997-09-03T02:20:21Z,1997-09-03T02:50:21Z,93fa6213-73a1-4cce-b408-9986a4145801,066dc005-2020-39f1-ba27-8cf297e231d0,35509b88-c026-38e1-b3e1-cb218431a06d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,410620009,Well child visit (procedure),129.16,129.16,0.0,,
3,1d731dbe-3a08-4242-aacf-9ee71e186146,1998-09-09T02:20:21Z,1998-09-09T02:35:21Z,93fa6213-73a1-4cce-b408-9986a4145801,066dc005-2020-39f1-ba27-8cf297e231d0,35509b88-c026-38e1-b3e1-cb218431a06d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,410620009,Well child visit (procedure),129.16,129.16,0.0,,
4,c6e318d5-c0a2-4317-80f5-10f187b1d1a2,1999-09-15T02:20:21Z,1999-09-15T02:50:21Z,93fa6213-73a1-4cce-b408-9986a4145801,066dc005-2020-39f1-ba27-8cf297e231d0,35509b88-c026-38e1-b3e1-cb218431a06d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,410620009,Well child visit (procedure),129.16,129.16,0.0,,


In [9]:
encounters.DESCRIPTION.unique()

array(['Well child visit (procedure)', 'Encounter for symptom',
       'Myocardial Infarction', 'Death Certification',
       'Encounter for symptom (procedure)',
       'Initial Psychiatric Interview with mental status evaluation',
       'General examination of patient (procedure)',
       'posttraumatic stress disorder',
       'Hypertension follow-up encounter', 'Encounter for problem',
       'Prenatal initial visit', 'Obstetric emergency hospital admission',
       'Postnatal visit', 'Encounter for check up (procedure)',
       'Hospital admission for isolation (procedure)', 'Prenatal visit',
       'Patient encounter procedure', 'Urgent care clinic (procedure)',
       'Follow-up encounter', 'Consultation for treatment',
       'Emergency room admission (procedure)', "Encounter for 'check-up'",
       'Admission to intensive care unit (procedure)',
       'Patient-initiated encounter', 'Emergency Room Admission',
       'Encounter Inpatient', 'Cardiac Arrest', 'Outpatient proced

In [196]:
encounters.loc[:, 'START'] = pd.to_datetime(encounters['START'])
encounters.loc[:, 'STOP'] = pd.to_datetime(encounters['STOP'])

In [197]:
encounters.loc[:, 'START'] = pd.DatetimeIndex(pd.to_datetime(encounters.START)).tz_localize(None).to_series().values
encounters.loc[:, 'STOP'] = pd.DatetimeIndex(pd.to_datetime(encounters.STOP)).tz_localize(None).to_series().values

**Observations:**

In [10]:
observations.head()

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,VALUE,UNITS,TYPE
0,2000-12-20,93fa6213-73a1-4cce-b408-9986a4145801,d0105ca5-d725-44e9-aaee-51f0e04392e4,8310-5,Body temperature,37.9,Cel,numeric
1,2001-09-25,93fa6213-73a1-4cce-b408-9986a4145801,8c82b36e-c3ac-428c-b56d-d51d5a4b8b16,8302-2,Body Height,141.0,cm,numeric
2,2001-09-25,93fa6213-73a1-4cce-b408-9986a4145801,8c82b36e-c3ac-428c-b56d-d51d5a4b8b16,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,1.0,{score},numeric
3,2001-09-25,93fa6213-73a1-4cce-b408-9986a4145801,8c82b36e-c3ac-428c-b56d-d51d5a4b8b16,29463-7,Body Weight,34.7,kg,numeric
4,2001-09-25,93fa6213-73a1-4cce-b408-9986a4145801,8c82b36e-c3ac-428c-b56d-d51d5a4b8b16,39156-5,Body Mass Index,17.4,kg/m2,numeric


In [11]:
observations.DESCRIPTION.unique()

array(['Body temperature', 'Body Height',
       'Pain severity - 0-10 verbal numeric rating [Score] - Reported',
       'Body Weight', 'Body Mass Index',
       'Body mass index (BMI) [Percentile] Per age and gender',
       'Diastolic Blood Pressure', 'Systolic Blood Pressure',
       'Heart rate', 'Respiratory rate', 'Tobacco smoking status NHIS',
       'Cause of Death [US Standard Certificate of Death]',
       'Leukocytes [#/volume] in Blood by Automated count',
       'Erythrocytes [#/volume] in Blood by Automated count',
       'Hemoglobin [Mass/volume] in Blood',
       'Hematocrit [Volume Fraction] of Blood by Automated count',
       'MCV [Entitic volume] by Automated count',
       'MCH [Entitic mass] by Automated count',
       'MCHC [Mass/volume] by Automated count',
       'Erythrocyte distribution width [Entitic volume] by Automated count',
       'Platelets [#/volume] in Blood by Automated count',
       'Platelet distribution width [Entitic volume] in Blood by Automat

**Care plans:**

In [12]:
careplans.head()

Unnamed: 0,Id,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,REASONCODE,REASONDESCRIPTION
0,efd24baf-9341-46ee-abd5-706d38c3f19b,2016-11-09,,53bbfa41-5cab-422e-a8a7-8fa3ae55ed71,9c42c408-d3e7-4ad0-a725-ece855b46ca7,718347000,Mental health care plan,36923009.0,Major depression single episode
1,b760f725-018e-4537-9154-dc748060cd85,2007-09-05,,30fb496e-b8d3-4d36-8e50-86acb3d1a223,1e737ce2-b782-40a9-b9b4-3ee880d9afca,736254008,Psychiatry care plan,47505003.0,Posttraumatic stress disorder
2,a817efab-e967-4c1d-b8d6-86ba61c960b2,2020-03-10,2020-03-10,30fb496e-b8d3-4d36-8e50-86acb3d1a223,a703222d-f88a-45ef-be97-b3636357b5bc,736376001,Infectious disease care plan (record artifact),840544004.0,Suspected COVID-19
3,0b329cab-45e7-4adc-8dcd-b152c3f0a892,1996-08-09,,afa2680f-7f73-46d9-b0cd-2cf3db49724b,76fdf7e8-da54-4932-9e42-e9c3f735af42,443402002,Lifestyle education regarding hypertension,59621000.0,Hypertension
4,f47a0b0c-2004-48a8-af77-4b030bff16bc,2001-09-07,,afa2680f-7f73-46d9-b0cd-2cf3db49724b,c55e3a9a-0c1b-4de5-9013-769ed8bb2414,698360004,Diabetes self management plan,15777000.0,Prediabetes


In [13]:
careplans.REASONDESCRIPTION.unique()

array(['Major depression  single episode',
       'Posttraumatic stress disorder', 'Suspected COVID-19',
       'Hypertension', 'Prediabetes', 'COVID-19', nan, 'Normal pregnancy',
       'Diabetes', 'Major depression disorder', 'Smokes tobacco daily',
       'Whiplash injury to neck', 'Osteoarthritis of knee',
       'Concussion with no loss of consciousness',
       'Pulmonary emphysema (disorder)',
       'Non-small cell carcinoma of lung  TNM stage 1 (disorder)',
       'Sprain of wrist', 'Fracture of forearm', 'Neoplasm of prostate',
       'Hyperlipidemia', 'Drug overdose',
       'Chronic congestive heart failure (disorder)',
       'Acute bronchitis (disorder)',
       'Localized  primary osteoarthritis of the hand',
       'Overlapping malignant neoplasm of colon', 'Osteoarthritis of hip',
       'Fracture of rib', "Alzheimer's disease (disorder)",
       'Secondary malignant neoplasm of colon', 'Fracture of clavicle',
       'Fracture subluxation of wrist',
       'Escherichia

**Patients:**

In [62]:
patients.head()

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,...,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE
0,93fa6213-73a1-4cce-b408-9986a4145801,1989-09-05,2001-12-04,999-87-8276,,,,Roland928,Wolff180,,...,Raynham Massachusetts US,1020 Satterfield Meadow,Weston,Massachusetts,Middlesex County,,42.398277,-71.289383,6520.55,0.0
1,53bbfa41-5cab-422e-a8a7-8fa3ae55ed71,1997-09-25,,999-96-1808,S99962046,X54333499X,Mr.,Jorge203,Chavarría957,,...,Juarez Chihuahua MX,235 Bahringer Trail,Dracut,Massachusetts,Middlesex County,,42.691016,-71.275475,545777.93,3282.24
2,30fb496e-b8d3-4d36-8e50-86acb3d1a223,1981-09-11,,999-89-5604,S99970660,X20079551X,Mr.,Octavio643,Schneider199,,...,Berkley Massachusetts US,1073 Stark Gardens,Westfield,Massachusetts,Hampden County,1086.0,42.174484,-72.784777,1039005.83,2796.44
3,afa2680f-7f73-46d9-b0cd-2cf3db49724b,1978-06-16,,999-21-2569,S99940488,X6135636X,Mrs.,Janeth814,Beatty507,,...,Boston Massachusetts US,128 Considine Plaza Apt 21,Wakefield,Massachusetts,Middlesex County,1880.0,42.460402,-71.113293,878157.6,8128.09
4,26e5d262-6cdc-4274-a5cd-d7fd439e35f9,1989-09-05,,999-39-4400,S99956192,X43478634X,Mr.,Adrian111,Blanda868,,...,Hamilton Massachusetts US,840 Brekke Union,Weston,Massachusetts,Middlesex County,,42.338808,-71.34151,21147.02,0.0


In [32]:
patients.columns

Index(['Id', 'BIRTHDATE', 'DEATHDATE', 'SSN', 'DRIVERS', 'PASSPORT', 'PREFIX',
       'FIRST', 'LAST', 'SUFFIX', 'MAIDEN', 'MARITAL', 'RACE', 'ETHNICITY',
       'GENDER', 'BIRTHPLACE', 'ADDRESS', 'CITY', 'STATE', 'COUNTY', 'ZIP',
       'LAT', 'LON', 'HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE'],
      dtype='object')

Drop some (probably) redundant columns:

In [17]:
patients_col_subset = patients[['Id', 'BIRTHDATE', 'DEATHDATE', 'MARITAL', 'RACE', 'ETHNICITY',
       'GENDER', 'HEALTHCARE_EXPENSES', 'HEALTHCARE_COVERAGE']]

In [18]:
patients_col_subset.head()

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,MARITAL,RACE,ETHNICITY,GENDER,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE
0,93fa6213-73a1-4cce-b408-9986a4145801,1989-09-05,2001-12-04,,white,nonhispanic,M,6520.55,0.0
1,53bbfa41-5cab-422e-a8a7-8fa3ae55ed71,1997-09-25,,,asian,hispanic,M,545777.93,3282.24
2,30fb496e-b8d3-4d36-8e50-86acb3d1a223,1981-09-11,,M,white,nonhispanic,M,1039005.83,2796.44
3,afa2680f-7f73-46d9-b0cd-2cf3db49724b,1978-06-16,,M,white,nonhispanic,F,878157.6,8128.09
4,26e5d262-6cdc-4274-a5cd-d7fd439e35f9,1989-09-05,,S,white,nonhispanic,M,21147.02,0.0


**Procedures:**

In [19]:
procedures.head()

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,REASONCODE,REASONDESCRIPTION
0,2001-09-25,93fa6213-73a1-4cce-b408-9986a4145801,8c82b36e-c3ac-428c-b56d-d51d5a4b8b16,430193006,Medication Reconciliation (procedure),750.79,,
1,2001-12-04,93fa6213-73a1-4cce-b408-9986a4145801,1123f059-daf8-4a9e-be5f-e1eee5fcdb48,40701008,Echocardiography (procedure),1127.5,22298006.0,Myocardial Infarction
2,2001-12-04,93fa6213-73a1-4cce-b408-9986a4145801,1123f059-daf8-4a9e-be5f-e1eee5fcdb48,415070008,Percutaneous coronary intervention,21860.94,22298006.0,Myocardial Infarction
3,2001-12-04,93fa6213-73a1-4cce-b408-9986a4145801,1123f059-daf8-4a9e-be5f-e1eee5fcdb48,232717009,Coronary artery bypass grafting,49880.21,22298006.0,Myocardial Infarction
4,2019-11-27,53bbfa41-5cab-422e-a8a7-8fa3ae55ed71,e3e925da-97a5-4af9-98ce-600244252bc9,430193006,Medication Reconciliation (procedure),623.16,,


In [20]:
procedures.DESCRIPTION.unique()

array(['Medication Reconciliation (procedure)',
       'Echocardiography (procedure)',
       'Percutaneous coronary intervention',
       'Coronary artery bypass grafting', 'Face mask (physical object)',
       'Intramuscular injection', 'Plain chest X-ray (procedure)',
       'Oxygen administration by mask (procedure)',
       'Placing subject in prone position (procedure)',
       'Standard pregnancy test', 'Ultrasound scan for fetal viability',
       'Evaluation of uterine fundal height',
       'Auscultation of the fetal heart', 'Blood typing  RH typing',
       'Hemoglobin / Hematocrit / Platelet count',
       'Hepatitis B Surface Antigen Measurement',
       'Human immunodeficiency virus antigen test',
       'Chlamydia antigen test', 'Gonorrhea infection test',
       'Syphilis infection test', 'Urine culture',
       'Cytopathology procedure  preparation of smear  genital source',
       'Urine screening test for diabetes', 'Hepatitis C antibody test',
       'Rubella screen

**Conditions:**

In [21]:
conditions.head()

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
0,1995-08-22,,93fa6213-73a1-4cce-b408-9986a4145801,4487de56-f20a-422d-a2f6-aa15dcd1cd6d,53741008,Coronary Heart Disease
1,2000-12-20,2000-12-27,93fa6213-73a1-4cce-b408-9986a4145801,d0105ca5-d725-44e9-aaee-51f0e04392e4,195662009,Acute viral pharyngitis (disorder)
2,2001-12-04,,93fa6213-73a1-4cce-b408-9986a4145801,1123f059-daf8-4a9e-be5f-e1eee5fcdb48,22298006,Myocardial Infarction
3,2001-12-04,,93fa6213-73a1-4cce-b408-9986a4145801,1123f059-daf8-4a9e-be5f-e1eee5fcdb48,399211009,History of myocardial infarction (situation)
4,2016-11-09,,53bbfa41-5cab-422e-a8a7-8fa3ae55ed71,fdedb884-d844-420a-9ec7-5d6cab4ffd02,370143000,Major depression disorder


In [428]:
conditions.DESCRIPTION.value_counts()

Suspected COVID-19                                                                  75427
COVID-19                                                                            73697
Fever (finding)                                                                     67219
Body mass index 30+ - obesity (finding)                                             55968
Prediabetes                                                                         55868
Anemia (disorder)                                                                   54504
Cough (finding)                                                                     51508
Hypertension                                                                        42301
Loss of taste (finding)                                                             38100
Chronic sinusitis (disorder)                                                        31672
Fatigue (finding)                                                                   28990
Miscarriag

In [23]:
conditions[conditions.PATIENT == '00019696-c3de-4e1f-8ca8-0df186aa988c']

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
940819,2011-09-12,,00019696-c3de-4e1f-8ca8-0df186aa988c,929b2e52-938f-4661-a8b5-ba9ec3b34108,59621000,Hypertension
940820,2020-03-11,2020-03-24,00019696-c3de-4e1f-8ca8-0df186aa988c,94ddafe6-8089-4a4d-b29b-ed785249c2d8,49727002,Cough (finding)
940821,2020-03-11,2020-03-24,00019696-c3de-4e1f-8ca8-0df186aa988c,94ddafe6-8089-4a4d-b29b-ed785249c2d8,386661006,Fever (finding)
940822,2020-03-11,2020-03-24,00019696-c3de-4e1f-8ca8-0df186aa988c,94ddafe6-8089-4a4d-b29b-ed785249c2d8,36955009,Loss of taste (finding)
940823,2020-03-11,2020-03-11,00019696-c3de-4e1f-8ca8-0df186aa988c,94ddafe6-8089-4a4d-b29b-ed785249c2d8,840544004,Suspected COVID-19
940824,2020-03-11,2020-03-24,00019696-c3de-4e1f-8ca8-0df186aa988c,94ddafe6-8089-4a4d-b29b-ed785249c2d8,840539006,COVID-19
940825,2020-03-11,2020-03-24,00019696-c3de-4e1f-8ca8-0df186aa988c,060cb520-c6b0-45e3-9c8d-8ce5cf336a5a,233604007,Pneumonia (disorder)
940826,2020-03-11,,00019696-c3de-4e1f-8ca8-0df186aa988c,060cb520-c6b0-45e3-9c8d-8ce5cf336a5a,389087006,Hypoxemia (disorder)
940827,2020-03-11,2020-03-24,00019696-c3de-4e1f-8ca8-0df186aa988c,060cb520-c6b0-45e3-9c8d-8ce5cf336a5a,271825005,Respiratory distress (finding)
940828,2020-03-19,2020-03-24,00019696-c3de-4e1f-8ca8-0df186aa988c,060cb520-c6b0-45e3-9c8d-8ce5cf336a5a,706870000,Acute pulmonary embolism (disorder)


In [24]:
patients[patients.Id == '00019696-c3de-4e1f-8ca8-0df186aa988c']

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,SSN,DRIVERS,PASSPORT,PREFIX,FIRST,LAST,SUFFIX,...,BIRTHPLACE,ADDRESS,CITY,STATE,COUNTY,ZIP,LAT,LON,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE
81354,00019696-c3de-4e1f-8ca8-0df186aa988c,1993-07-19,,999-39-8572,S99934612,X53078335X,Mr.,Barney639,Eichmann909,,...,Boston Massachusetts US,140 Upton Bridge Apt 54,Stoneham,Massachusetts,Middlesex County,2180.0,42.448196,-71.109552,685024.86,3952.2


In [429]:
conditions['ONGOING'] = [2 if x else 1 for x in conditions.STOP.isna()]

In [430]:
patients_conditions = pd.pivot_table(conditions, values = 'ONGOING', index = 'PATIENT', columns = 'DESCRIPTION').reset_index().fillna(0)

In [431]:
patients_conditions.head()

DESCRIPTION,PATIENT,Acquired coagulation disorder (disorder),Acute Cholecystitis,Acute allergic reaction,Acute bacterial sinusitis (disorder),Acute bronchitis (disorder),Acute deep venous thrombosis (disorder),Acute pulmonary embolism (disorder),Acute respiratory distress syndrome (disorder),Acute respiratory failure (disorder),...,Suspected COVID-19,Suspected lung cancer (situation),Tear of meniscus of knee,Third degree burn,Traumatic brain injury (disorder),Tubal pregnancy,Viral sinusitis (disorder),Vomiting symptom (finding),Wheezing (finding),Whiplash injury to neck
0,0000641f-540d-408b-ad16-93c2de94f446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0000ae6a-e3bd-4136-98e0-6d99a3ca08ad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00018c7f-b75e-4d55-a05a-ba3b3372f910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00019696-c3de-4e1f-8ca8-0df186aa988c,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00028c0a-864f-4183-90d5-c703edf779d0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Join with `patients` table:

In [432]:
patients_with_condis = patients_col_subset.merge(patients_conditions, left_on='Id', right_on='PATIENT').drop(columns=['PATIENT'])

Filter out patients who died before COVID-19 (which is probably around 31st December 2019, but some reports said it might have occured as early as August 2019 in Wuhan...applying this to the US is probably a stretch, considering average incubation period is 14 days.). The first known case in the US was 20 January 2020, so 31st December 2019 might be a good boundary:

In [433]:
patients_as_potential_covid_targets = patients_with_condis[(patients_with_condis.DEATHDATE >= '2019-12-31') | (patients_with_condis.DEATHDATE.isna())]
patients_as_potential_covid_targets.loc[:, 'BIRTHDATE'] = pd.to_datetime(patients_as_potential_covid_targets['BIRTHDATE'])
patients_as_potential_covid_targets.loc[:, 'DEATHDATE'] = pd.to_datetime(patients_as_potential_covid_targets['DEATHDATE'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [142]:
patients_as_potential_covid_targets.shape

(85616, 201)

Add another column called "AGE", which is DEATHDATE - BIRTHDATE, or the lastest date of encounters - BIRTHDATE:

In [158]:
# # Example
# relativedelta(pd.to_datetime(encounters.STOP.max()), pd.to_datetime(encounters.STOP.min()))

In [159]:
# np.isnan(patients_as_potential_covid_targets.DEATHDATE.iloc[0])

In [434]:
patients_as_potential_covid_targets.loc[:, 'BIRTHDATE'] = pd.DatetimeIndex(pd.to_datetime(patients_as_potential_covid_targets.BIRTHDATE)).tz_localize(None).to_series().values
patients_as_potential_covid_targets.loc[:, 'DEATHDATE'] = pd.DatetimeIndex(pd.to_datetime(patients_as_potential_covid_targets.DEATHDATE)).tz_localize(None).to_series().values

In [435]:
max_date = encounters.STOP.max()
def age_calc(x,y):
    if pd.isnull(y):
        temp = relativedelta(max_date, x)
    else:
        temp = relativedelta(y, x)
    return temp.years + temp.months/12
patients_as_potential_covid_targets.loc[:, 'AGE'] = patients_as_potential_covid_targets.apply(lambda x: age_calc(x['BIRTHDATE'], x['DEATHDATE']), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)


In [206]:
# temp = (np.datetime64('NOW') - patients_as_potential_covid_targets['BIRTHDATE'].astype('M8[D]')).astype('timedelta64[Y]').astype(float)
# df_age = pd.DataFrame(temp.values, index = patients_as_potential_covid_targets.Id)
# patients_as_potential_covid_targets = patients_as_potential_covid_targets.merge(df_age, left_on='Id', right_on='Id').rename(columns={0: 'AGE'})

Alive or ded?

In [436]:
patients_as_potential_covid_targets.loc[: ,'ALIVE'] = patients_as_potential_covid_targets['DEATHDATE'].apply(lambda x: 1 if pd.isnull(x) else 0)
#patients_as_potential_covid_targets.drop(columns = ['BIRTHDATE', 'DEATHDATE'])

Coverage-Expense Ratio:

In [437]:
patients_as_potential_covid_targets.loc[:, 'COVERAGE_EXPENSE_RATIO'] = patients_as_potential_covid_targets.apply(lambda x: x['HEALTHCARE_COVERAGE']/x['HEALTHCARE_EXPENSES'], axis = 1)

Coverage-Cost ratio per encounter:

In [236]:
encounters.loc[:, 'COVERAGE_COST_RATIO'] = encounters.apply(lambda x: x['PAYER_COVERAGE']/x['TOTAL_CLAIM_COST'], axis = 1) # expensive!

In [253]:
encounters.head()

Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION,COVERAGE_COST_RATIO
0,4487de56-f20a-422d-a2f6-aa15dcd1cd6d,1995-08-23 02:20:21,1995-08-23 02:35:21,93fa6213-73a1-4cce-b408-9986a4145801,066dc005-2020-39f1-ba27-8cf297e231d0,35509b88-c026-38e1-b3e1-cb218431a06d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,410620009,Well child visit (procedure),129.16,129.16,0.0,,,0.0
1,c6d44945-d551-499e-aaf7-496ffd9a85e2,1996-08-28 02:20:21,1996-08-28 02:35:21,93fa6213-73a1-4cce-b408-9986a4145801,066dc005-2020-39f1-ba27-8cf297e231d0,35509b88-c026-38e1-b3e1-cb218431a06d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,410620009,Well child visit (procedure),129.16,129.16,0.0,,,0.0
2,bf6651e1-8e8c-426d-ae01-136f36db137e,1997-09-03 02:20:21,1997-09-03 02:50:21,93fa6213-73a1-4cce-b408-9986a4145801,066dc005-2020-39f1-ba27-8cf297e231d0,35509b88-c026-38e1-b3e1-cb218431a06d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,410620009,Well child visit (procedure),129.16,129.16,0.0,,,0.0
3,1d731dbe-3a08-4242-aacf-9ee71e186146,1998-09-09 02:20:21,1998-09-09 02:35:21,93fa6213-73a1-4cce-b408-9986a4145801,066dc005-2020-39f1-ba27-8cf297e231d0,35509b88-c026-38e1-b3e1-cb218431a06d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,410620009,Well child visit (procedure),129.16,129.16,0.0,,,0.0
4,c6e318d5-c0a2-4317-80f5-10f187b1d1a2,1999-09-15 02:20:21,1999-09-15 02:50:21,93fa6213-73a1-4cce-b408-9986a4145801,066dc005-2020-39f1-ba27-8cf297e231d0,35509b88-c026-38e1-b3e1-cb218431a06d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,wellness,410620009,Well child visit (procedure),129.16,129.16,0.0,,,0.0


In [308]:
encounters[encounters.CODE==1505002].head()

Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION,COVERAGE_COST_RATIO
74,e6ad9f78-fa0e-431a-b9b6-5d127a594261,2020-02-24 03:13:21,2020-03-12 10:30:21,26e5d262-6cdc-4274-a5cd-d7fd439e35f9,53b25c4b-e95f-3dd1-8e69-d427cdd83631,eae2941d-3881-3b34-ae59-45246a327c88,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,inpatient,1505002,Hospital admission for isolation (procedure),129.16,129.16,0.0,840539006.0,COVID-19,0.0
115,d068a77c-abcf-4c5e-a8b9-e74cbdbfe099,2020-03-03 10:58:09,2020-03-23 18:45:09,90031c21-e8a4-416e-b60b-23e871ee62dc,d311e70d-86e7-3c03-b115-53892bcf7ef1,474073ef-6bd0-3637-b86b-c9a2ef9d3327,d47b3510-2895-3b70-9897-342d681c769d,inpatient,1505002,Hospital admission for isolation (procedure),129.16,129.16,49.16,840539006.0,COVID-19,0.380613
295,b75fd86e-0f14-48aa-ab7e-509556f9bacc,2020-03-05 01:17:07,2020-03-09 04:14:07,c58ae02e-235e-4db2-a9c3-6926e9c0c5fa,8881e3ee-e915-31b3-867f-8e2fbe41e887,4dd85a1f-7453-34b9-9ca0-442957065e2f,7caa7254-5050-3b5e-9eae-bd5ea30e809c,inpatient,1505002,Hospital admission for isolation (procedure),129.16,129.16,89.16,840539006.0,COVID-19,0.690307
303,8dbdfc08-2852-4fe1-80b9-d551c461752d,2020-03-04 08:00:02,2020-03-24 15:42:02,0e07d4b4-8c0a-4cd0-a439-8f3b96ab884d,05462bf2-25d4-3ecd-afdf-bb3e1140929a,925c1564-360e-3177-b84f-15eb7b2bb933,4d71f845-a6a9-3c39-b242-14d25ef86a8d,inpatient,1505002,Hospital admission for isolation (procedure),129.16,129.16,64.16,840539006.0,COVID-19,0.496748
1518,f66edba1-5067-4d7e-8532-b6934c0bffc6,2020-02-16 19:01:07,2020-03-07 19:41:07,ef272f3c-6cbd-401c-97b5-db5de666b6e5,0b78995f-8b45-34d3-969d-afcc456bb1c7,6ff38ce5-7f34-3d25-ab3b-ce5608e9319d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,inpatient,1505002,Hospital admission for isolation (procedure),129.16,129.16,0.0,840539006.0,COVID-19,0.0


In [262]:
patients_as_potential_covid_targets[(patients_as_potential_covid_targets["COVID-19"] >=0) & (patients_as_potential_covid_targets["Suspected COVID-19"] >=0)]

(73697, 204)

In [298]:
patients_as_potential_covid_targets[patients_as_potential_covid_targets["COVID-19"] == 0].shape

(68315, 204)

In [267]:
# Recovered from COVID-19, but died anyway...tragic...
patients_as_potential_covid_targets[(patients_as_potential_covid_targets["COVID-19"] == 0) & (patients_as_potential_covid_targets["ALIVE"] == 0)].shape

(186, 204)

## Immunization

In [367]:
immunizations.head()

Unnamed: 0,DATE,PATIENT,ENCOUNTER,CODE,DESCRIPTION,BASE_COST
0,2001-09-25,93fa6213-73a1-4cce-b408-9986a4145801,8c82b36e-c3ac-428c-b56d-d51d5a4b8b16,140,Influenza seasonal injectable preservative ...,140.52
1,2019-11-27,53bbfa41-5cab-422e-a8a7-8fa3ae55ed71,e3e925da-97a5-4af9-98ce-600244252bc9,140,Influenza seasonal injectable preservative ...,140.52
2,2019-11-27,53bbfa41-5cab-422e-a8a7-8fa3ae55ed71,e3e925da-97a5-4af9-98ce-600244252bc9,113,Td (adult) preservative free,140.52
3,2019-11-27,53bbfa41-5cab-422e-a8a7-8fa3ae55ed71,e3e925da-97a5-4af9-98ce-600244252bc9,43,Hep B adult,140.52
4,2019-11-27,53bbfa41-5cab-422e-a8a7-8fa3ae55ed71,e3e925da-97a5-4af9-98ce-600244252bc9,114,meningococcal MCV4P,140.52


In [368]:
immunizations.DESCRIPTION.unique()

array(['Influenza  seasonal  injectable  preservative free',
       'Td (adult) preservative free', 'Hep B  adult',
       'meningococcal MCV4P', 'Hep A  adult',
       'Pneumococcal conjugate PCV 13', 'zoster',
       'pneumococcal polysaccharide vaccine  23 valent',
       'Hep B  adolescent or pediatric', 'Hib (PRP-OMP)', 'IPV',
       'varicella', 'MMR', 'Tdap', 'HPV  quadrivalent',
       'Hep A  ped/adol  2 dose', 'DTaP'], dtype=object)

## Treatment history

In [438]:
inpatient = encounters[(encounters.REASONCODE==840539006) & (encounters.CODE == 1505002)].rename(columns = {'Id':'ENCOUNTER'})
inpatient[['START','STOP']] = inpatient[['START','STOP']].astype('datetime64[ns]')
inpatient['days_hospitalized'] = (inpatient['STOP']- inpatient['START']).astype('timedelta64[s]').astype(float)/86400
inpatient.head()

Unnamed: 0,ENCOUNTER,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION,COVERAGE_COST_RATIO,days_hospitalized
74,e6ad9f78-fa0e-431a-b9b6-5d127a594261,2020-02-24 03:13:21,2020-03-12 10:30:21,26e5d262-6cdc-4274-a5cd-d7fd439e35f9,53b25c4b-e95f-3dd1-8e69-d427cdd83631,eae2941d-3881-3b34-ae59-45246a327c88,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,inpatient,1505002,Hospital admission for isolation (procedure),129.16,129.16,0.0,840539006.0,COVID-19,0.0,17.303472
115,d068a77c-abcf-4c5e-a8b9-e74cbdbfe099,2020-03-03 10:58:09,2020-03-23 18:45:09,90031c21-e8a4-416e-b60b-23e871ee62dc,d311e70d-86e7-3c03-b115-53892bcf7ef1,474073ef-6bd0-3637-b86b-c9a2ef9d3327,d47b3510-2895-3b70-9897-342d681c769d,inpatient,1505002,Hospital admission for isolation (procedure),129.16,129.16,49.16,840539006.0,COVID-19,0.380613,20.324306
295,b75fd86e-0f14-48aa-ab7e-509556f9bacc,2020-03-05 01:17:07,2020-03-09 04:14:07,c58ae02e-235e-4db2-a9c3-6926e9c0c5fa,8881e3ee-e915-31b3-867f-8e2fbe41e887,4dd85a1f-7453-34b9-9ca0-442957065e2f,7caa7254-5050-3b5e-9eae-bd5ea30e809c,inpatient,1505002,Hospital admission for isolation (procedure),129.16,129.16,89.16,840539006.0,COVID-19,0.690307,4.122917
303,8dbdfc08-2852-4fe1-80b9-d551c461752d,2020-03-04 08:00:02,2020-03-24 15:42:02,0e07d4b4-8c0a-4cd0-a439-8f3b96ab884d,05462bf2-25d4-3ecd-afdf-bb3e1140929a,925c1564-360e-3177-b84f-15eb7b2bb933,4d71f845-a6a9-3c39-b242-14d25ef86a8d,inpatient,1505002,Hospital admission for isolation (procedure),129.16,129.16,64.16,840539006.0,COVID-19,0.496748,20.320833
1518,f66edba1-5067-4d7e-8532-b6934c0bffc6,2020-02-16 19:01:07,2020-03-07 19:41:07,ef272f3c-6cbd-401c-97b5-db5de666b6e5,0b78995f-8b45-34d3-969d-afcc456bb1c7,6ff38ce5-7f34-3d25-ab3b-ce5608e9319d,b1c428d6-4f07-31e0-90f0-68ffa6ff8c76,inpatient,1505002,Hospital admission for isolation (procedure),129.16,129.16,0.0,840539006.0,COVID-19,0.0,20.027778


In [439]:
hospitalized_patients_with_medical_record = patients_as_potential_covid_targets.merge(inpatient[["PATIENT", "days_hospitalized"]], left_on = "Id", right_on = "PATIENT").drop(columns="PATIENT")

In [440]:
hospitalized_patients_with_medical_record.head()

Unnamed: 0,Id,BIRTHDATE,DEATHDATE,MARITAL,RACE,ETHNICITY,GENDER,HEALTHCARE_EXPENSES,HEALTHCARE_COVERAGE,Acquired coagulation disorder (disorder),...,Traumatic brain injury (disorder),Tubal pregnancy,Viral sinusitis (disorder),Vomiting symptom (finding),Wheezing (finding),Whiplash injury to neck,AGE,ALIVE,COVERAGE_EXPENSE_RATIO,days_hospitalized
0,26e5d262-6cdc-4274-a5cd-d7fd439e35f9,1989-09-05,NaT,S,white,nonhispanic,M,21147.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,30.666667,1,0.0,17.303472
1,90031c21-e8a4-416e-b60b-23e871ee62dc,1983-07-24,NaT,M,white,nonhispanic,F,746790.89,12704.15,0.0,...,2.0,0.0,1.0,0.0,1.0,0.0,36.833333,1,0.017012,20.324306
2,c58ae02e-235e-4db2-a9c3-6926e9c0c5fa,1954-03-19,2020-03-15,M,white,nonhispanic,M,1423163.19,8303.87,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,65.916667,0,0.005835,4.122917
3,0e07d4b4-8c0a-4cd0-a439-8f3b96ab884d,1982-07-26,NaT,M,white,nonhispanic,M,912547.28,3083.88,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,37.833333,1,0.003379,20.320833
4,ef272f3c-6cbd-401c-97b5-db5de666b6e5,1992-09-18,NaT,M,white,nonhispanic,F,741244.72,3439.76,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,27.666667,1,0.004641,20.027778


## IMAGING STUDIES

In [325]:
img_studies.head()

Unnamed: 0,Id,DATE,PATIENT,ENCOUNTER,BODYSITE_CODE,BODYSITE_DESCRIPTION,MODALITY_CODE,MODALITY_DESCRIPTION,SOP_CODE,SOP_DESCRIPTION
0,89147b5f-e439-41d3-ad9c-010803605889,2020-03-25,614a610f-bd80-4721-a6f7-16d77b23b12e,faf5f83c-c8b5-48c2-a18c-8e0f70dfc817,40983000,Arm,DX,Digital Radiography,1.2.840.10008.5.1.4.1.1.1.1,Digital X-Ray Image Storage
1,23515e56-0f58-4c5b-a8c0-b03222c10529,2015-08-21,c5c9f28e-dd72-4c10-8c76-9b62112506bd,f76ebda5-27a0-4105-8a90-60f027608170,261179002,thoracic,US,Ultrasound,1.2.840.10008.5.1.4.1.1.3.1,Ultrasound Multiframe Image Storage
2,b4828166-40ff-4d76-aa63-a8b3586b7a92,2018-02-06,c5c9f28e-dd72-4c10-8c76-9b62112506bd,82d83cf9-61de-49b3-9dec-f0d39233c226,51185008,Thoracic structure (body structure),CR,Computed Radiography,1.2.840.10008.5.1.4.1.1.1.1,Digital X-Ray Image Storage – for Presentation
3,3bb0ef4c-345c-4906-b8e8-029d8e984118,2018-02-06,c5c9f28e-dd72-4c10-8c76-9b62112506bd,1a4580b7-909d-406f-98d0-d41d65b4cef8,261179002,Thoracic,US,Ultrasound,1.2.840.10008.5.1.4.1.1.3.1,Ultrasound Multiframe Image Storage
4,8f26666f-932a-4f9a-a524-ec2312261f44,2018-02-06,c5c9f28e-dd72-4c10-8c76-9b62112506bd,1a4580b7-909d-406f-98d0-d41d65b4cef8,51185008,Thoracic structure,DX,Digital Radiography,1.2.840.10008.5.1.4.1.1.1.1,Digital X-Ray Image Storage – for Presentation


## Training!

In [400]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [441]:
train_df = hospitalized_patients_with_medical_record.drop(columns = ["Id", "BIRTHDATE", "DEATHDATE", "HEALTHCARE_EXPENSES", "HEALTHCARE_COVERAGE", "ALIVE"])
train_df['MARITAL'] = train_df['MARITAL'].fillna(value="U")
train_df.head()

Unnamed: 0,MARITAL,RACE,ETHNICITY,GENDER,Acquired coagulation disorder (disorder),Acute Cholecystitis,Acute allergic reaction,Acute bacterial sinusitis (disorder),Acute bronchitis (disorder),Acute deep venous thrombosis (disorder),...,Third degree burn,Traumatic brain injury (disorder),Tubal pregnancy,Viral sinusitis (disorder),Vomiting symptom (finding),Wheezing (finding),Whiplash injury to neck,AGE,COVERAGE_EXPENSE_RATIO,days_hospitalized
0,S,white,nonhispanic,M,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.666667,0.0,17.303472
1,M,white,nonhispanic,F,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,1.0,0.0,1.0,0.0,36.833333,0.017012,20.324306
2,M,white,nonhispanic,M,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.916667,0.005835,4.122917
3,M,white,nonhispanic,M,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.833333,0.003379,20.320833
4,M,white,nonhispanic,F,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,27.666667,0.004641,20.027778


In [426]:
train_df.MARITAL.value_counts()

M    15425
S     3882
U     1095
Name: MARITAL, dtype: int64

In [425]:
train_df.RACE.value_counts()

white     17206
black      1698
asian      1363
native      110
other        25
Name: RACE, dtype: int64

In [442]:
train_df.ETHNICITY.value_counts()

nonhispanic    18073
hispanic        2329
Name: ETHNICITY, dtype: int64

In [443]:
train_df.GENDER.value_counts()

F    10281
M    10121
Name: GENDER, dtype: int64

In [444]:
marital_dict = {"S" : 1, "M": 2, "U" : 0}
race_dict = {"white" : 4, "black" : 3, "asian" : 2, "native" : 1, "other" : 0}
ethnic_dict = {"nonhispanic" : 1, "hispanic" : 0}
gender_dict = {"F" : 1, "M" : 0}

In [459]:
df_X = train_df.drop(columns="days_hospitalized")
df_X = df_X.replace({"MARITAL": marital_dict, "RACE": race_dict, "ETHNICITY": ethnic_dict, "GENDER": gender_dict})
# cT = ColumnTransformer([('encoder', OneHotEncoder(), ['MARITAL', 'RACE', 'ETHNICITY', 'GENDER'])], remainder='passthrough')
# cT.fit(df_X)
# df_X = cT.transform(df_X)
df_y = train_df.days_hospitalized

In [455]:
# Features:
lst_features = df_X.columns
len(lst_features)

198

In [466]:
model = DecisionTreeRegressor(random_state = 0)
rfecv = RFECV(estimator=model, min_features_to_select = 5, step=1, cv=5, n_jobs=-1, scoring='neg_root_mean_squared_error')
rfecv.fit(df_X, df_y)

RFECV(cv=5,
      estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                      max_depth=None, max_features=None,
                                      max_leaf_nodes=None,
                                      min_impurity_decrease=0.0,
                                      min_impurity_split=None,
                                      min_samples_leaf=1, min_samples_split=2,
                                      min_weight_fraction_leaf=0.0,
                                      presort='deprecated', random_state=0,
                                      splitter='best'),
      min_features_to_select=5, n_jobs=-1,
      scoring='neg_root_mean_squared_error', step=1, verbose=0)

In [453]:
feature_importance = list(zip(lst_features, rfecv.support_))
new_features = []
for key,value in enumerate(feature_importance):
    if(value[1]) == True:
        new_features.append(value[0])

['MARITAL', 'RACE', 'ETHNICITY', 'GENDER', 'Acquired coagulation disorder (disorder)', 'Acute Cholecystitis', 'Acute allergic reaction', 'Acute bacterial sinusitis (disorder)', 'Acute bronchitis (disorder)', 'Acute deep venous thrombosis (disorder)', 'Acute pulmonary embolism (disorder)', 'Acute respiratory distress syndrome (disorder)', 'Acute respiratory failure (disorder)', 'Acute viral pharyngitis (disorder)', 'Alcoholism', "Alzheimer's disease (disorder)", 'Anemia (disorder)', 'Antepartum eclampsia', 'Appendicitis', 'Asthma', 'At risk for suicide (finding)', 'Atopic dermatitis', 'Atrial Fibrillation', 'Bacterial infectious disease (disorder)', 'Bleeding from anus', 'Blighted ovum', 'Blindness due to type 2 diabetes mellitus (disorder)', 'Body mass index 30+ - obesity (finding)', 'Body mass index 40+ - severely obese (finding)', 'Brain damage - traumatic', 'COVID-19', 'Carcinoma in situ of prostate (disorder)', 'Cardiac Arrest', 'Chill (finding)', 'Cholelithiasis', 'Chronic congest

In [460]:
df_X = df_X[new_features]
df_X.head()

Unnamed: 0,MARITAL,RACE,ETHNICITY,GENDER,Acquired coagulation disorder (disorder),Acute Cholecystitis,Acute allergic reaction,Acute bacterial sinusitis (disorder),Acute bronchitis (disorder),Acute deep venous thrombosis (disorder),...,Suspected lung cancer (situation),Tear of meniscus of knee,Traumatic brain injury (disorder),Tubal pregnancy,Viral sinusitis (disorder),Vomiting symptom (finding),Wheezing (finding),Whiplash injury to neck,AGE,COVERAGE_EXPENSE_RATIO
0,1,4,1,0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.666667,0.0
1,2,4,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,1.0,0.0,1.0,0.0,36.833333,0.017012
2,2,4,1,0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65.916667,0.005835
3,2,4,1,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37.833333,0.003379
4,2,4,1,1,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,27.666667,0.004641


In [462]:
def 

In [486]:
kf = KFold(n_splits = 10, shuffle = True, random_state = 0)

In [487]:
for train_index, test_index in kf.split(df_X, df_y):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [    0     1     2 ... 20398 20400 20401] TEST: [    7    12    16 ... 20352 20378 20399]
TRAIN: [    0     1     2 ... 20399 20400 20401] TEST: [    8    44    47 ... 20369 20375 20387]
TRAIN: [    0     1     2 ... 20399 20400 20401] TEST: [    9    31    35 ... 20363 20372 20385]
TRAIN: [    0     1     2 ... 20397 20398 20399] TEST: [   20    27    32 ... 20391 20400 20401]
TRAIN: [    1     4     6 ... 20399 20400 20401] TEST: [    0     2     3 ... 20388 20394 20398]
TRAIN: [    0     1     2 ... 20399 20400 20401] TEST: [   15    23    29 ... 20358 20360 20397]
TRAIN: [    0     1     2 ... 20399 20400 20401] TEST: [    6    21    22 ... 20386 20389 20392]
TRAIN: [    0     2     3 ... 20399 20400 20401] TEST: [    1     4    11 ... 20390 20393 20395]
TRAIN: [    0     1     2 ... 20399 20400 20401] TEST: [   10    13    19 ... 20371 20384 20396]
TRAIN: [    0     1     2 ... 20399 20400 20401] TEST: [   43    52    55 ... 20347 20361 20382]


In [500]:
max_features = range(1,21)
all_kf, all_max_features =  zip(*itertools.product(kf.split(df_X, df_y), max_features))
product = list(itertools.product(kf.split(df_X, df_y), max_features))

In [508]:
def f(x):
    X_train = df_X.iloc[x[0][0]]
    y_train = df_y.iloc[x[0][0]]

    # creating test set
    X_test = df_X.iloc[x[0][1]]
    y_test = df_y.iloc[x[0][1]]

    # intantiating model
    mdl = DecisionTreeRegressor(max_features = x[1], random_state = 0)

    # fit the model
    start = time.time()
    mdl.fit(X_train, y_train)
    arr_pred = mdl.predict(X_test)
    end = time.time()

    # fit time
    dbl_fit_time = end - start

    # RMSE
    dbl_rMSE = mean_squared_error(y_test, arr_pred, squared=False)
    
    #R2
    
    dbl_r2 = r2_score(y_test, arr_pred)

    return dbl_fit_time, dbl_rMSE, dbl_r2

In [509]:
results = Parallel(n_jobs=num_cores)(delayed(f)(x) for x in product)

In [None]:
space = [Integer(1, 20, name='max_depth')
         Integer()
        ]