In [1]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(0, '..') # this ensures you can leave the capstone_functions.py file in the main dir

from capstone_functions import get_admission, get_expired, look_n_load

In [5]:
print('TRIAGE')
triage = look_n_load('../DATA/triage.csv')
print('\n\nEDSTAYS')
edstays = look_n_load('../DATA/edstays.csv')

TRIAGE

Shape: (447712, 11)

Columns: ['subject_id', 'stay_id', 'temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain', 'acuity', 'chiefcomplaint']

Missing proportions:
 subject_id        0.000000
stay_id           0.000000
temperature       0.054935
heartrate         0.040111
resprate          0.047743
o2sat             0.048355
sbp               0.042941
dbp               0.044823
pain              0.030415
acuity            0.016368
chiefcomplaint    0.000051
dtype: float64

Data:
    subject_id   stay_id  temperature  heartrate  resprate  o2sat    sbp   dbp  \
0    10000032  32952584         97.8       87.0      14.0   97.0   71.0  43.0   
1    10000032  33258284         98.4       70.0      16.0   97.0  106.0  63.0   
2    10000032  35968195         99.4      105.0      18.0   96.0  106.0  57.0   
3    10000032  38112554         98.9       88.0      18.0   97.0  116.0  88.0   
4    10000032  39399961         98.7       77.0      16.0   98.0   96.0  50.0   

  pain 

In [6]:
triage.chiefcomplaint = [str(x).lower() for x in triage.chiefcomplaint] # cast to lower case to unify case differences
triage.chiefcomplaint.value_counts()

abd pain                  16601
chest pain                14619
s/p fall                   7902
dyspnea                    7758
headache                   5377
                          ...  
n/vd                          1
h/a  1 week                   1
laceration to face            1
left foot pain post op        1
humerus fx transfer           1
Name: chiefcomplaint, Length: 61228, dtype: int64

In [7]:
edstays_sub = edstays[edstays.disposition.isin(['ADMITTED', 'HOME','TRANSFER'])] # filter out dispositions we don't need
edstays_sub['admitted'] = edstays_sub.apply(get_admission,axis=1) # create admitted variable
edstays_sub.admitted.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0    254545
1    173739
Name: admitted, dtype: int64

In [8]:
# what tokens in complaints are most common in each
all_data = pd.merge(edstays_sub, triage, on=['subject_id','stay_id'], how='left')

In [9]:
all_data[['admitted']].value_counts() # general distribution of admission

admitted
0           254545
1           173739
dtype: int64

In [10]:
all_data[['admitted', 'chiefcomplaint']].value_counts() # most common complaints and admissions

admitted  chiefcomplaint                              
0         chest pain                                      9106
          abd pain                                        8461
1         abd pain                                        7266
          dyspnea                                         5406
0         s/p fall                                        4765
                                                          ... 
1         back pain, positive blood cultures, transfer       1
          back pain, pneumonia, transfer                     1
          back pain, pneumonia                               1
          back pain, pleuritic chest pain                    1
0          10 min vision change                              1
Length: 67962, dtype: int64

In [11]:
# this script gets counts of complaints separated by ', ' and looks at the proportions against admission (repeated for mortality below)

home_tokens = all_data[all_data.admitted==0]['chiefcomplaint']
home_tokens = [x.split(', ') for x in home_tokens]
home_tokens_flat = list()
for tokens in home_tokens:
    for token in tokens:
        home_tokens_flat.append(token)
home_counts = pd.DataFrame(data={'home_count':home_tokens_flat}).home_count.value_counts()

home_counts_df = pd.DataFrame(home_counts).reset_index()


admit_tokens = all_data[all_data.admitted==1]['chiefcomplaint']
admit_tokens = [x.split(', ') for x in admit_tokens]
admit_tokens_flat = list()
for tokens in admit_tokens:
    for token in tokens:
        admit_tokens_flat.append(token)
admit_counts = pd.DataFrame(data={'admit_count':admit_tokens_flat}).admit_count.value_counts()

admit_counts_df = pd.DataFrame(admit_counts).reset_index()

all_tokens = all_data['chiefcomplaint']
all_tokens = [x.split(', ') for x in all_tokens]
all_tokens_flat = list()
for tokens in all_tokens:
    for token in tokens:
        all_tokens_flat.append(token)
total_counts = pd.DataFrame(data={'total':all_tokens_flat}).total.value_counts()

total_counts_df = pd.DataFrame(total_counts).reset_index()

all_counts = pd.merge(
    pd.merge(total_counts_df, home_counts_df, on='index', how='left'),
    admit_counts_df, on = 'index', how='left')
all_counts

all_counts = all_counts.fillna(0)

all_counts['home_pp'] = all_counts['home_count']/all_counts['total']
all_counts['admit_pp'] = all_counts['admit_count']/all_counts['total']
all_counts

Unnamed: 0,index,total,home_count,admit_count,home_pp,admit_pp
0,abd pain,33675,16348.0,17327.0,0.485464,0.514536
1,chest pain,26885,16520.0,10365.0,0.614469,0.385531
2,transfer,24102,3988.0,20114.0,0.165463,0.834537
3,s/p fall,21796,12846.0,8950.0,0.589374,0.410626
4,dyspnea,19194,6228.0,12966.0,0.324476,0.675524
...,...,...,...,...,...,...
27218,laceration forehead,1,1.0,0.0,1.000000,0.000000
27219,pain med script,1,1.0,0.0,1.000000,0.000000
27220,wound eval- rectal bleeding,1,1.0,0.0,1.000000,0.000000
27221,r leg pain and weakness,1,1.0,0.0,1.000000,0.000000


In [12]:
all_counts[all_counts.total>10].sort_values(by='home_pp', ascending=False)

# for example 100% of those that came for a rabies shot were not admitted to hospital

Unnamed: 0,index,total,home_count,admit_count,home_pp,admit_pp
958,r/o std,19,19.0,0.0,1.0,0.0
999,left thumb lac,18,18.0,0.0,1.0,0.0
332,rabies vaccine,156,156.0,0.0,1.0,0.0
807,removal ___/sutures,26,26.0,0.0,1.0,0.0
809,left calf pain,25,25.0,0.0,1.0,0.0
...,...,...,...,...,...,...
1346,dissection,11,0.0,11.0,0.0,1.0
589,s/p arrest,49,0.0,49.0,0.0,1.0
616,femur fx,44,0.0,44.0,0.0,1.0
1095,status epilepticus,15,0.0,15.0,0.0,1.0


In [13]:
# create look up tables for the tokens that appear at high or low proportions in the given rate 

high_home_pp = all_counts[all_counts.home_pp > 0.8].rename(columns={'index':'complaint'})
high_home_pp.to_csv('../CONSTANTS/high_home_complaints.csv')

high_admit_pp = all_counts[all_counts.admit_pp > 0.8].rename(columns={'index':'complaint'})
high_admit_pp.to_csv('../CONSTANTS/high_admission_complaints.csv')

In [15]:
# again for mortality

expired_df = edstays[edstays.disposition.isin(['ADMITTED','TRANSFER','EXPIRED'])]

expired_df['expired'] = expired_df.apply(get_expired,axis=1) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [16]:
expired_df.expired.value_counts()

0    173739
1       395
Name: expired, dtype: int64

In [17]:
all_data = pd.merge(expired_df, triage, on=['subject_id','stay_id'], how='left')
pd.crosstab(all_data.gender, all_data.expired)

expired,0,1
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,89156,183
M,84583,212


In [19]:
home_tokens = all_data[all_data.expired==0]['chiefcomplaint']
home_tokens = [x.split(', ') for x in home_tokens]
home_tokens_flat = list()
for tokens in home_tokens:
    for token in tokens:
        home_tokens_flat.append(token)
home_counts = pd.DataFrame(data={'discharge_count':home_tokens_flat}).discharge_count.value_counts()

home_counts_df = pd.DataFrame(home_counts).reset_index()


admit_tokens = all_data[all_data.expired==1]['chiefcomplaint']
admit_tokens = [x.split(', ') for x in admit_tokens]
admit_tokens_flat = list()
for tokens in admit_tokens:
    for token in tokens:
        admit_tokens_flat.append(token)
admit_counts = pd.DataFrame(data={'expire_count':admit_tokens_flat}).expire_count.value_counts()

admit_counts_df = pd.DataFrame(admit_counts).reset_index()

all_tokens = all_data['chiefcomplaint']
all_tokens = [x.split(', ') for x in all_tokens]
all_tokens_flat = list()
for tokens in all_tokens:
    for token in tokens:
        all_tokens_flat.append(token)
total_counts = pd.DataFrame(data={'total':all_tokens_flat}).total.value_counts()

total_counts_df = pd.DataFrame(total_counts).reset_index()

all_counts = pd.merge(
    pd.merge(total_counts_df, home_counts_df, on='index', how='left'),
    admit_counts_df, on = 'index', how='left')
all_counts

all_counts = all_counts.fillna(0)

all_counts['discharge_pp'] = all_counts['discharge_count']/all_counts['total']
all_counts['expired_pp'] = all_counts['expire_count']/all_counts['total']
all_counts

Unnamed: 0,index,total,discharge_count,expire_count,discharge_pp,expired_pp
0,transfer,20144,20114.0,30.0,0.998511,0.001489
1,abd pain,17332,17327.0,5.0,0.999712,0.000288
2,dyspnea,12974,12966.0,8.0,0.999383,0.000617
3,chest pain,10374,10365.0,9.0,0.999132,0.000868
4,s/p fall,8963,8950.0,13.0,0.998550,0.001450
...,...,...,...,...,...,...
13304,port erosion,1,1.0,0.0,1.000000,0.000000
13305,neuropathy pain,1,1.0,0.0,1.000000,0.000000
13306,bilateral arm/hand numbness,1,1.0,0.0,1.000000,0.000000
13307,atv crash,1,1.0,0.0,1.000000,0.000000


In [21]:
high_mort_pp = all_counts[all_counts.expired_pp > 0.6].rename(columns={'index':'complaint'}) # adjusted for 60% due to sample size
high_mort_pp.to_csv('../CONSTANTS/high_mortality_complaints.csv')

high_disch_pp = all_counts[all_counts.discharge_pp > 0.8].rename(columns={'index':'complaint'})
high_disch_pp.to_csv('../CONSTANTS/high_discharge_complaints.csv')