In [1]:
import pandas as pd

admissions = pd.read_csv("../data/ADMISSIONS.csv")
admissions['ADMITTIME'] = pd.to_datetime(admissions['ADMITTIME'])
# CPT codes
# chartdate for timing of procedure
cpt_events = pd.read_csv('../data/CPTEVENTS.csv',
                         usecols=['SUBJECT_ID','HADM_ID','CHARTDATE','CPT_NUMBER'])
d_cpt      = pd.read_csv('../data/D_CPT.csv',
                         usecols=['SECTIONHEADER','SUBSECTIONHEADER',
                                  'MINCODEINSUBSECTION','MAXCODEINSUBSECTION'])

# map code to section
intervals = pd.IntervalIndex.from_arrays(
    left=d_cpt['MINCODEINSUBSECTION'],
    right=d_cpt['MAXCODEINSUBSECTION'],
    closed='both'
)
d_cpt = d_cpt.assign(interval=intervals)

# cpt timing
cpt_events['CHARTDATE'] = pd.to_datetime(cpt_events['CHARTDATE'])
cpt_events = cpt_events.merge(admissions, on='HADM_ID', how='left')
cpt_events['days_since_admission'] = (
    cpt_events['CHARTDATE'] - cpt_events['ADMITTIME']
).dt.days

# look up
def lookup_cpt_sections(cpt_num):
    mask = intervals.contains(cpt_num)
    matches = d_cpt[mask]
    if matches.empty:
        return pd.Series({'sections': None, 'subsections': None})
    else:
        return pd.Series({
            'sections':    ';'.join(matches['SECTIONHEADER'].astype(str)),
            'subsections': ';'.join(matches['SUBSECTIONHEADER'].astype(str)),
        })

cpt_events[['section','subsection']] = (
    cpt_events['CPT_NUMBER']
              .apply(lookup_cpt_sections)
)
agg = cpt_events.groupby('HADM_ID').agg(
    total_cpt_count    = ('CPT_NUMBER','size'),
    unique_cpt_codes   = ('CPT_NUMBER','nunique'),
    distinct_sections  = ('section','nunique'),
    distinct_subsects  = ('subsection','nunique'),
)

timing_agg = cpt_events.groupby('HADM_ID').agg(
    first_cpt_day   = ('days_since_admission','min'),   # e.g. 0 = same day
    last_cpt_day    = ('days_since_admission','max'),
    cpt_span_days   = ('days_since_admission', lambda x: x.max() - x.min()),
)
cpt_features = (
    agg
    # .join(high_risk_agg,  how='left')
    .join(timing_agg,     how='left')
    # .join(code_agg,       how='left')
    .fillna(0)
    .reset_index()
)
print(cpt_features.head())
cpt_features.to_csv("cpt_values.csv", index=False)

  cpt_events = pd.read_csv('../data/CPTEVENTS.csv',


   HADM_ID  total_cpt_count  unique_cpt_codes  distinct_sections  \
0   100001                8                 4                  1   
1   100003                8                 6                  3   
2   100006               10                 4                  1   
3   100007                5                 3                  1   
4   100009               11                 8                  3   

   distinct_subsects  first_cpt_day  last_cpt_day  cpt_span_days  
0                  2            0.0           0.0            0.0  
1                  4            0.0           0.0            0.0  
2                  1            0.0           0.0            0.0  
3                  2            0.0           0.0            0.0  
4                  4            1.0           1.0            0.0  
