In [38]:
# data prep for admission rates
import os

def search_all_paths():
    """Make the directory two levels above the working directory importable.

    Resolves '../..' to an absolute path and appends it to ``sys.path`` once;
    calling the function again is a no-op when the path is already present.
    """
    import sys
    project_root = os.path.abspath(os.path.join('../..'))
    if project_root in sys.path:
        return
    sys.path.append(project_root)

import pandas as pd

search_all_paths()
from capstone_functions import *
from capstone_constants import *

In [32]:
# The only datasets available for modelling admission rates are edstays and triage, both gathered upon arrival,
# so there is no need for any additional stage-wise modelling.

In [74]:
# Load the triage and edstays CSVs from the shared DATA directory.
# look_n_load is not defined in this notebook (presumably it comes from the capstone_functions
# star-import — TODO confirm); judging by the cell output it also prints each frame's shape,
# column list, missing-value proportions and a preview of the data.
triage = look_n_load('../../DATA/triage.csv')
edstays = look_n_load('../../DATA/edstays.csv')


Shape: (447712, 11)

Columns: ['subject_id', 'stay_id', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity', 'chiefcomplaint']

Missing proportions:
 subject_id        0.000000
stay_id           0.000000
temperature       0.054935
heartrate         0.040111
resprate          0.047743
o2sat             0.048355
sbp               0.042941
dbp               0.044823
pain              0.030415
acuity            0.016368
chiefcomplaint    0.000051
dtype: float64

Data:
    subject_id   stay_id  temperature  heartrate  resprate  o2sat    sbp   dbp  \
0    10000032  32952584         97.8       87.0      14.0   97.0   71.0  43.0   
1    10000032  33258284         98.4       70.0      16.0   97.0  106.0  63.0   
2    10000032  35968195         99.4      105.0      18.0   96.0  106.0  57.0   
3    10000032  38112554         98.9       88.0      18.0   97.0  116.0  88.0   
4    10000032  39399961         98.7       77.0      16.0   98.0   96.0  50.0   

  pain  acuity

In [75]:
# create datetime variables to collect in_hour, in_date, day of week, and capacity
from datetime import datetime

# parse the timestamp columns once; every feature below is derived from them
edstays['intime'] = pd.to_datetime(edstays['intime'])
edstays['outtime'] = pd.to_datetime(edstays['outtime'])

# calendar dates of arrival and departure (python date objects), taken with the
# vectorised .dt accessor instead of a Python-level loop over every row
edstays['in_date'] = edstays['intime'].dt.date
edstays['out_date'] = edstays['outtime'].dt.date

# difference between departure date and arrival date
edstays['diff_date'] = edstays['out_date'] - edstays['in_date']

# hour of day (0-23) at arrival
edstays['in_hour'] = edstays['intime'].dt.hour

In [76]:
def work_hours(x):
    """Return 1 when hour ``x`` lies in the inclusive range 8..18, otherwise 0.

    Values that fail the comparison (for example NaN) fall through to 0.
    """
    return 1 if 8 <= x <= 18 else 0

# flag each stay by whether its arrival hour falls inside work hours
edstays['work_hours'] = list(map(work_hours, edstays['in_hour']))

In [77]:
# get day of week
# day_name() returns English weekday names by default, so the feature values do not
# depend on the locale of the machine running the notebook (strftime('%A') does),
# and missing dates become NaN instead of raising.
edstays['in_day'] = pd.to_datetime(edstays['in_date']).dt.day_name()

In [78]:
# get capacity: the number of patients admitted on each arrival date
# NOTE: a significantly improved method of measuring hospital capacity was developed on another laptop and still needs to be brought into this notebook

def get_capacity_df(df, in_date = 'in_date'):
    """Count admitted rows per value of the ``in_date`` column.

    Parameters
    ----------
    df : DataFrame with a 'disposition' column and the column named by ``in_date``.
    in_date : name of the column to group the admitted rows by.

    Returns
    -------
    DataFrame with two columns, ``in_date`` and 'capacity', one row per distinct
    value, ordered by descending count (the order produced by ``value_counts``).
    """
    admitted_rows = df.loc[df['disposition'].isin(['ADMITTED'])]
    per_day = admitted_rows[in_date].value_counts()
    # name the index and the counts explicitly rather than renaming columns by position
    return per_day.rename_axis(in_date).reset_index(name='capacity')

# per-date count of admitted stays
capacity_df = get_capacity_df(edstays)

# Drop any 'capacity' column left over from a previous run of this cell first; otherwise the
# merge would create capacity_x / capacity_y and the fillna below would fail on re-run.
edstays = pd.merge(
    edstays.drop(columns=['capacity'], errors='ignore'),
    capacity_df,
    on='in_date',
    how='left',
)

# dates with no admitted stays have no row in capacity_df, so they come back as NaN -> 0
edstays['capacity'] = edstays['capacity'].fillna(0)

# NOTE(review): the count is built from rows whose disposition is ADMITTED and joined back by
# in_date, so an admitted stay is included in its own day's capacity. If the modelling target
# is derived from disposition this leaks the outcome into the feature — confirm intended.

In [79]:
# dispositions retained for the admission-risk dataset
ADMIT_RISK_KEEP_DISP = ['HOME','ADMITTED','TRANSFER']

# edstays columns carried forward into the join with triage
ADMIT_RISK_KEEP_COLS = [
    'subject_id', 'stay_id', 'arrival_transport',
    'admitted', 'work_hours', 'in_day', 'capacity',
]

# binary target column, computed row by row with get_admission
# (get_admission is defined outside this cell; its exact rule is not visible here)
edstays['admitted'] = edstays.apply(get_admission, axis=1)

# keep only the stays whose disposition is one of the retained values
edstays = edstays.loc[edstays['disposition'].isin(ADMIT_RISK_KEEP_DISP)]

# column subset that will be joined to the triage data
edstays_subset = edstays.loc[:, ADMIT_RISK_KEEP_COLS]

In [52]:
def rescale_temp(x):
    """Divide temperature readings above 200 by 10; return anything else unchanged.

    Values that fail the ``> 200`` comparison (including NaN) are passed through as-is.
    """
    return x/10 if x > 200 else x

# run every temperature reading through rescale_temp
triage['temperature'] = list(map(rescale_temp, triage['temperature']))

def rescale_heart(x):
    """Divide heart-rate readings above 300 by 100; return anything else unchanged.

    Values that fail the ``> 300`` comparison (including NaN) are passed through as-is.

    NOTE(review): a factor of 100 turns e.g. 1050 into 10.5 — confirm that /100
    rather than /10 is the intended correction for these readings.
    """
    return x / 100 if x > 300 else x
    
# run every heart-rate reading through rescale_heart
triage['heartrate'] = list(map(rescale_heart, triage['heartrate']))

def rescale_pain(x):
    """Coerce a raw pain entry onto the 0-10 scale.

    The value is converted with ``int()`` (so floats are truncated and numeric
    strings such as '7' are accepted) and then mapped as follows:

    * greater than 100 -> 10
    * greater than 10  -> value / 10
    * below 0          -> 0
    * otherwise        -> the integer itself

    Returns None when the entry cannot be converted to an integer (free text,
    None, NaN, infinity). Only conversion errors are caught, so interrupts and
    unrelated failures are no longer silently swallowed.
    """
    try:
        score = int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported types; ValueError: text or NaN; OverflowError: +/-inf
        return None

    if score > 100:
        return 10
    if score > 10:
        return score/10
    if score < 0:
        return 0
    return score
    
# clean every pain entry with rescale_pain, then make sure the column is numeric
triage['pain'] = list(map(rescale_pain, triage['pain']))
triage['pain'] = pd.to_numeric(triage['pain'])


In [80]:
# left-join the triage measurements onto the selected edstays columns, one match per (subject_id, stay_id)
# NOTE(review): the recorded execution counts show the triage-cleaning cell (In [52]) ran before
# triage was reloaded (In [74]); re-run the cleaning cell before this one so the export holds the
# rescaled values.
edstays_triage = edstays_subset.merge(triage, how='left', on=['subject_id','stay_id'])

# write the prepared dataset; the row index is written too because index=False is not passed
edstays_triage.to_csv('../../DATA/admission_rate_edstay_triage_prep.csv')