# Importing required libraries

In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pickle
import random
import os
from glob import glob
import pyarrow.parquet as pq
import warnings
warnings.filterwarnings('ignore')
from dateutil.relativedelta import relativedelta
# To execute a cell line by line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Loading data - 2007 Optum DODR data

In [2]:
# Getting all latest dodr files

dodr_location = '/N/project/optum/data/parquet/dodr_81_202107/'

filenames = os.listdir(dodr_location)
for files in filenames:
    print(files)

dod_lr2021q1.parquet
dod_provider_bridge.parquet
dod_r2019q3.parquet
dod_lr2011q2.parquet
dod_r2007q2.parquet
dod_proc2021q1.parquet
dod_m2011q2.parquet
dod_m2015q4.parquet
dod_lr2008q1.parquet
dod_diag2020q2.parquet
dod_proc2008q4.parquet
dod_proc2016q2.parquet
dod_m2018q4.parquet
dod_lr2007q4.parquet
dod_c2014q3.parquet
dod_c2014q1.parquet
dod_c2017q3.parquet
dod_m2014q2.parquet
dod_diag2008q4.parquet
dod_proc2016q1.parquet
dod_lr2019q3.parquet
dod_proc2017q3.parquet
dod_lr2020q1.parquet
dod_lr2020q4.parquet
dod_lr2012q1.parquet
dod_diag2015q2.parquet
dod_proc2014q4.parquet
dod_diag2016q1.parquet
dod_r2013q2.parquet
dod_proc2017q2.parquet
dod_mbr_enroll_r.parquet
dod_lr2012q3.parquet
dod_r2010q1.parquet
dod_lr2015q4.parquet
dod_m2010q1.parquet
dod_r2018q1.parquet
dod_diag2015q3.parquet
dod_c2019q1.parquet
dod_r2011q1.parquet
dod_c2012q3.parquet
dod_lr2016q3.parquet
dod_m2016q2.parquet
dod_lr2015q2.parquet
dod_c2015q2.parquet
dod_lr2013q1.parquet
dod_r2015q3.parquet
dod_lr2017q4.parqu

# Identifying Patient IDs with Opioid in their prescriptions 

Using RX tables and NDC column

In [3]:
# Filtering all RX files

rx_files = glob(dodr_location + '*dod_r2007*')
rx_files

['/N/project/optum/data/parquet/dodr_81_202107/dod_r2007q2.parquet',
 '/N/project/optum/data/parquet/dodr_81_202107/dod_r2007q4.parquet',
 '/N/project/optum/data/parquet/dodr_81_202107/dod_r2007q3.parquet',
 '/N/project/optum/data/parquet/dodr_81_202107/dod_r2007q1.parquet']

In [4]:
# CMS NDC codes

with open('filter_list_no_star', 'rb') as f:
    op_ndc_codes = pickle.load(f)
    
len(op_ndc_codes)

41520

In [5]:
# Funtion to filter RX files for patient IDs with at least one opioid prescription using CMS NDC codes

def filterOpRxPat(files, op_ndc_codes):
    rx_df = pd.DataFrame()
    op_pat_id = []

    for i in files:
        x = pd.read_parquet(i)
        rx_ndc_codes = x['NDC'].unique()
        rx_op_ndc_codes = [s2 for s1 in op_ndc_codes for s2 in rx_ndc_codes if s1 in s2]
        op_pat_temp = x[x['NDC'].isin(rx_op_ndc_codes)]['PATID'].unique()
        temp = x[x['PATID'].isin(op_pat_temp)]
        print(i)
        print("Shape:", x.shape, temp.shape)
        print("NDC codes:", x['NDC'].nunique(), temp['NDC'].nunique())
        print("Patient count:", x['PATID'].nunique(), temp['PATID'].nunique())
        rx_df = rx_df.append(temp)
        op_pat_id.extend(op_pat_temp)
        del x, temp, op_pat_temp

    return rx_df, list(set(op_pat_id))


In [6]:
# Filtering RX files for patient IDs with at least one opioid prescription using CMS NDC codes

rx_op_df, op_pat_list = filterOpRxPat(rx_files, op_ndc_codes)
rx_op_df.shape
rx_op_df['NDC'].nunique()
rx_op_df['PATID'].nunique()

/N/project/optum/data/parquet/dodr_81_202107/dod_r2007q2.parquet
Shape: (39201711, 34) (9215094, 34)
NDC codes: 26166 19805
Patient count: 7185289 1035553
/N/project/optum/data/parquet/dodr_81_202107/dod_r2007q4.parquet
Shape: (40317373, 34) (9970291, 34)
NDC codes: 27134 20735
Patient count: 7203220 1101357
/N/project/optum/data/parquet/dodr_81_202107/dod_r2007q3.parquet
Shape: (38727543, 34) (9019324, 34)
NDC codes: 26613 20075
Patient count: 7047536 1004523
/N/project/optum/data/parquet/dodr_81_202107/dod_r2007q1.parquet
Shape: (39730026, 34) (9895122, 34)
NDC codes: 26012 19910
Patient count: 7438695 1164096


(38099831, 34)

26460

2962506

In [7]:
rx_op_df.head()

Unnamed: 0,PATID,PAT_PLANID,AHFSCLSS,AVGWHLSL,BRND_NM,CHARGE,CHK_DT,CLMID,COPAY,DAW,...,PRC_TYP,QUANTITY,RFL_NBR,SPCLT_IND,SPECCLSS,STD_COST,STD_COST_YR,STRENGTH,PRESCRIBER_PROV,PRESCRIPT_ID
0,33003282014,53016278960,81216,5.99,PENICILLIN V POTASSIUM,5.99,2007-05-03,706686128,8.99,0,...,2,15.0,0,N,W1A,5.93,2013,500MG,4777242697,106826382
1,33003282014,53016278960,280808,9.53,HYDROCODONE W/ACETAMINOPHEN,5.91,2007-05-03,708547314,10.0,0,...,1,30.0,0,N,H3A,1.89,2013,5MG-500MG,4777279937,104441636
2,33003282014,53016278960,280808,9.53,HYDROCODONE W/ACETAMINOPHEN,5.91,2007-05-03,709873159,10.0,0,...,1,30.0,0,N,H3A,1.89,2013,5MG-500MG,4771622829,104441671
3,33003282014,53016278960,280808,9.53,HYDROCODONE W/ACETAMINOPHEN,5.91,2007-05-17,712648362,10.0,0,...,1,30.0,0,N,H3A,1.89,2013,5MG-500MG,4771622829,104441729
4,33003282014,53016278960,281604,238.77,FLUOXETINE HCL,102.67,2007-05-17,713148753,25.0,0,...,1,90.0,3,N,H2S,14.82,2013,20MG,4774013864,107793026


## Indentifying index date for each patient - first date of opioid prescription

In [8]:
rx_op_df[rx_op_df['PATID'] == 33003282019].sort_values(['PATID', 'FILL_DT'], ascending=[True, True])['FILL_DT']

4    2007-01-31
8    2007-01-31
5    2007-02-05
9    2007-02-05
10   2007-02-18
11   2007-03-01
7    2007-03-19
6    2007-03-27
4    2007-11-26
5    2007-11-26
6    2007-11-26
Name: FILL_DT, dtype: datetime64[ns]

In [9]:
rx_op_df1 = rx_op_df.sort_values(['PATID', 'FILL_DT'], ascending=[True, True])

rx_op_df1 = rx_op_df1[['PATID', 'FILL_DT']]

rx_ind_df = rx_op_df1.drop_duplicates(subset=['PATID'], keep='first')
rx_ind_df.head()

Unnamed: 0,PATID,FILL_DT
0,33003282014,2007-04-10
4,33003282019,2007-01-31
16,33003282022,2007-08-16
21,33003282034,2007-01-21
37,33003282035,2007-01-08


In [10]:
rx_ind_df = rx_ind_df.rename(columns = {'FILL_DT' : 'INDEX_DT'})
rx_ind_df.head()

Unnamed: 0,PATID,INDEX_DT
0,33003282014,2007-04-10
4,33003282019,2007-01-31
16,33003282022,2007-08-16
21,33003282034,2007-01-21
37,33003282035,2007-01-08


# Filtering patient IDs with opioid prescriptions in Inpatient data

Using Confinement data and above pat IDs

In [11]:
rx_op_df['PATID'].nunique()

2962506

In [12]:
rx_ind_df['PATID'].nunique()

2962506

In [13]:
len(op_pat_list)

2962506

In [19]:
# op_pat_list  = list(op_pat_list)

In [18]:
type(op_pat_list)

list

In [20]:
# Filtering all inpatient (confinement) files

inp_files = glob(dodr_location + '*dod_c2007*')
inp_files

['/N/project/optum/data/parquet/dodr_81_202107/dod_c2007q4.parquet',
 '/N/project/optum/data/parquet/dodr_81_202107/dod_c2007q1.parquet',
 '/N/project/optum/data/parquet/dodr_81_202107/dod_c2007q3.parquet',
 '/N/project/optum/data/parquet/dodr_81_202107/dod_c2007q2.parquet']

In [21]:
# Function to filter patients with opioids in prescriptions in inpatient data

def filterOpioidPrescPatInp(files, op_presc_pat):
    
    inp_df = pd.DataFrame()
    for i in files:
        x = pd.read_parquet(i)
        temp = x[x['PATID'].isin(op_presc_pat) == True]
        print(i)
        print("Shape:", x.shape, temp.shape)
        print("Patient count:", x['PATID'].nunique(), temp['PATID'].nunique())
        inp_df = inp_df.append(temp)
        del x, temp
        
    return inp_df


In [22]:
# Filtering patients with opioid prescriptions in inpatient data

op_inp_df = filterOpioidPrescPatInp(inp_files, op_pat_list)
op_inp_df.shape

/N/project/optum/data/parquet/dodr_81_202107/dod_c2007q4.parquet
Shape: (308513, 35) (139620, 35)
Patient count: 262577 115859
/N/project/optum/data/parquet/dodr_81_202107/dod_c2007q1.parquet
Shape: (322815, 35) (144199, 35)
Patient count: 274575 119668
/N/project/optum/data/parquet/dodr_81_202107/dod_c2007q3.parquet
Shape: (298214, 35) (137779, 35)
Patient count: 256331 114957
/N/project/optum/data/parquet/dodr_81_202107/dod_c2007q2.parquet
Shape: (317027, 35) (148865, 35)
Patient count: 269858 122994


(570463, 35)

In [23]:
op_inp_df['PATID'].nunique()

407694

In [24]:
op_inp_df.head()

Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COINS,CONF_ID,COPAY,DEDUCT,DIAG1,DIAG2,...,PROV,STD_COST,STD_COST_YR,TOS_CD,ICU_IND,ICU_SURG_IND,MAJ_SURG_IND,MATERNITY_IND,NEWBORN_IND,TOS_EXT
0,33003282019,53013962277,2007-11-19,16868.94,0.0,MOLTNOKZKM4OT,250.0,0.0,78659,42789,...,4774878972,6358.05,2019,FAC_IP.ACUTE,Y,N,N,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE
2,33003282261,53056588883,2007-10-11,14436.23,1528.0,LN6RLMZN4OK4N,0.0,400.0,64511,66411,...,4774888919,6775.95,2019,FAC_IP.ACUTE,N,N,N,Y,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE
4,33003282417,53009524485,2007-12-12,27332.22,0.0,LTNRLLMO4OK4O,250.0,0.0,2182,78820,...,4774573813,11581.8,2019,FAC_IP.ACUTE,N,N,Y,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE
9,33003283085,53011734739,2007-11-27,32819.75,0.0,MZLT6RLMKM4OO,0.0,0.0,34690,29623,...,4774872509,9820.65,2019,FAC_IP.ACUTE,N,N,N,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE
10,33003283151,53010260932,2007-11-05,67109.48,1414.39,M4MK4OZ4KM4O4,0.0,0.0,41091,5180,...,4774876318,28029.15,2019,FAC_IP.ACUTE,Y,Y,Y,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE


# Filtering out patients with cancer

In [26]:
# List of procedure codes for cancer 

x = 77261
ids = [x]
for i in range(0,538):
    x+=1
    ids.append(x)
canc_proc_codes = map(str, ids)
canc_proc_codes = list(canc_proc_codes)

In [27]:
# Filter patient with cancer diagnosis from above patients

def filterCancPat(op_inp_df, canc_proc_list):
    canc_pat_list = op_inp_df[op_inp_df['PROC1'].isin(canc_proc_list) | op_inp_df['PROC2'].isin(canc_proc_list) | op_inp_df['PROC3'].isin(canc_proc_list) | op_inp_df['PROC4'].isin(canc_proc_list) | op_inp_df['PROC5'].isin(canc_proc_list)]['PATID'].unique()
    op_presc_no_cancer_df = op_inp_df[~op_inp_df['PATID'].isin(canc_pat_list)]    
    
    return op_presc_no_cancer_df

In [28]:
op_presc_no_canc_df = filterCancPat(op_inp_df, canc_proc_codes)
op_presc_no_canc_df.shape

(570463, 35)

In [29]:
len(op_pat_list)
op_inp_df['PATID'].nunique()
op_presc_no_canc_df['PATID'].nunique()

2962506

407694

407694

In [30]:
op_presc_no_canc_df.head()

Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COINS,CONF_ID,COPAY,DEDUCT,DIAG1,DIAG2,...,PROV,STD_COST,STD_COST_YR,TOS_CD,ICU_IND,ICU_SURG_IND,MAJ_SURG_IND,MATERNITY_IND,NEWBORN_IND,TOS_EXT
0,33003282019,53013962277,2007-11-19,16868.94,0.0,MOLTNOKZKM4OT,250.0,0.0,78659,42789,...,4774878972,6358.05,2019,FAC_IP.ACUTE,Y,N,N,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE
2,33003282261,53056588883,2007-10-11,14436.23,1528.0,LN6RLMZN4OK4N,0.0,400.0,64511,66411,...,4774888919,6775.95,2019,FAC_IP.ACUTE,N,N,N,Y,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE
4,33003282417,53009524485,2007-12-12,27332.22,0.0,LTNRLLMO4OK4O,250.0,0.0,2182,78820,...,4774573813,11581.8,2019,FAC_IP.ACUTE,N,N,Y,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE
9,33003283085,53011734739,2007-11-27,32819.75,0.0,MZLT6RLMKM4OO,0.0,0.0,34690,29623,...,4774872509,9820.65,2019,FAC_IP.ACUTE,N,N,N,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE
10,33003283151,53010260932,2007-11-05,67109.48,1414.39,M4MK4OZ4KM4O4,0.0,0.0,41091,5180,...,4774876318,28029.15,2019,FAC_IP.ACUTE,Y,Y,Y,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE


In [31]:
rx_ind_df

Unnamed: 0,PATID,INDEX_DT
0,33003282014,2007-04-10
4,33003282019,2007-01-31
16,33003282022,2007-08-16
21,33003282034,2007-01-21
37,33003282035,2007-01-08
...,...,...
39201669,33253093291,2007-04-26
39201673,33254367068,2007-04-13
40317332,33254368071,2007-10-11
38727529,33254539781,2007-07-20


In [61]:
rx_ind_pat_list = rx_ind_df['PATID'].unique()
len(rx_ind_pat_list)

2962506

In [62]:
# Filtering all member enrollment files for patients with at least one opioid prescription

file = dodr_location + 'dod_mbr_co_enroll_r.parquet'
mbr_temp = pq.ParquetDataset(file, use_legacy_dataset = False, filters=[('PATID','in', rx_ind_pat_list)])
mbr_enroll_df = mbr_temp.read().to_pandas()
mbr_enroll_df.shape

(3858478, 6)

In [63]:
mbr_enroll_df.head()

Unnamed: 0,PATID,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB
0,33003282014,2007-01-01,2007-12-31,F,W,1952
1,33003282014,2012-05-21,2012-12-31,F,W,1952
2,33003282019,2007-01-01,2007-12-31,F,W,1962
3,33003282022,2007-01-01,2010-05-31,F,W,1960
4,33003282022,2010-11-01,2011-12-31,F,W,1960


In [64]:
# # filter for year 2022
# year = 2007
# mask = (df['ELIGEFF'].dt.year == year) & (df['ELIGEND'].dt.year == year)
# filtered_df = df.loc[mask]

# # print filtered dataframe
# filtered_df

In [65]:
# filtered_df[filtered_df.duplicated(subset=['PATID'], keep=False)]

In [66]:
# df1 = filtered_df[filtered_df['WIN_3_MON'] == 1]
# df1[df1.duplicated(subset=['PATID'], keep=False)]

In [67]:
# df1

In [68]:
mbr_enroll_df = mbr_enroll_df.merge(rx_ind_df, on=['PATID'], how='left')
mbr_enroll_df['PATID'].nunique()
mbr_enroll_df.shape
mbr_enroll_df.head()

2959912

(3858478, 7)

Unnamed: 0,PATID,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB,INDEX_DT
0,33003282014,2007-01-01,2007-12-31,F,W,1952,2007-04-10
1,33003282014,2012-05-21,2012-12-31,F,W,1952,2007-04-10
2,33003282019,2007-01-01,2007-12-31,F,W,1962,2007-01-31
3,33003282022,2007-01-01,2010-05-31,F,W,1960,2007-08-16
4,33003282022,2010-11-01,2011-12-31,F,W,1960,2007-08-16


In [69]:
mbr_enroll_df[mbr_enroll_df.duplicated(subset=['PATID'], keep=False)]

Unnamed: 0,PATID,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB,INDEX_DT
0,33003282014,2007-01-01,2007-12-31,F,W,1952,2007-04-10
1,33003282014,2012-05-21,2012-12-31,F,W,1952,2007-04-10
3,33003282022,2007-01-01,2010-05-31,F,W,1960,2007-08-16
4,33003282022,2010-11-01,2011-12-31,F,W,1960,2007-08-16
5,33003282022,2015-01-01,2019-03-31,F,W,1960,2007-08-16
...,...,...,...,...,...,...,...
3858469,33252910463,2020-10-01,2021-03-31,F,,1964,2007-07-19
3858471,33253021571,2007-08-01,2008-07-31,M,W,1972,2007-10-25
3858472,33253021571,2011-01-01,2021-03-31,M,W,1972,2007-10-25
3858474,33254367068,2007-01-01,2015-01-31,M,W,1970,2007-04-13


In [70]:
# Checking for 3 month look back and 3 month subsequent windows

flag = []

# iterate over the dates in all three columns
for dt1, dt2, dt3 in zip(mbr_enroll_df['ELIGEFF'], mbr_enroll_df['INDEX_DT'], mbr_enroll_df['ELIGEND']):

    # calculate the difference between the dates in months
    diff1 = relativedelta(dt2, dt1).months
    diff2 = relativedelta(dt3, dt2).months
        
    # check if the dates in column 1 are within 3 months of the dates in column 2
    if dt1 > dt2:
        flag_value = 0
    elif dt3 < dt2:
        fla_value = 0
    elif 3 <= diff1 <= 12 and 3 <= diff2 <= 12:
        flag_value = 1
    else:
        flag_value = 0
        
    # append the flag value to the list
    flag.append(flag_value)


In [71]:
mbr_enroll_df.shape
len(flag)

(3858478, 7)

3858478

In [72]:
mbr_enroll_df['WIN_3_MON'] = flag

In [73]:
mbr_enroll_df.head()

Unnamed: 0,PATID,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB,INDEX_DT,WIN_3_MON
0,33003282014,2007-01-01,2007-12-31,F,W,1952,2007-04-10,1
1,33003282014,2012-05-21,2012-12-31,F,W,1952,2007-04-10,0
2,33003282019,2007-01-01,2007-12-31,F,W,1962,2007-01-31,0
3,33003282022,2007-01-01,2010-05-31,F,W,1960,2007-08-16,1
4,33003282022,2010-11-01,2011-12-31,F,W,1960,2007-08-16,0


In [74]:
mbr_enroll_df['WIN_3_MON'].value_counts()

0    2728202
1    1130276
Name: WIN_3_MON, dtype: int64

In [75]:
# from datetime import datetime

# dt1 = '2008-07-01'
# dt2 = '2007-02-23'
# dt3 = '2009-03-12'

# # convert the date strings to datetime objects
# dt1 = datetime.strptime(dt1, '%Y-%m-%d')
# dt2 = datetime.strptime(dt2, '%Y-%m-%d')
# dt3 = datetime.strptime(dt3, '%Y-%m-%d')

# # calculate the difference between the dates in months
# diff1 = relativedelta(dt2, dt1).months
# print("diff1:",diff1)
# diff2 = relativedelta(dt3, dt2).months
# print("diff2:", diff2)

# # check if the dates in column 1 are within 3 months of the dates in column 2
# if dt1 > dt2:
#     flag_value = 0
#     print(1)
# elif dt3 < dt2:                                                                                                                                                         
#     fla_value = 0
#     print(2)
# elif 3<=diff1<=4 and diff2>=3:
#     flag_value = 1
#     print(3)
# else:
#     flag_value = 0
#     print(4)
    
# print("flag_value:", flag_value)

In [76]:
mbr_enroll_df[mbr_enroll_df.duplicated(subset=['PATID'], keep=False)]

Unnamed: 0,PATID,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB,INDEX_DT,WIN_3_MON
0,33003282014,2007-01-01,2007-12-31,F,W,1952,2007-04-10,1
1,33003282014,2012-05-21,2012-12-31,F,W,1952,2007-04-10,0
3,33003282022,2007-01-01,2010-05-31,F,W,1960,2007-08-16,1
4,33003282022,2010-11-01,2011-12-31,F,W,1960,2007-08-16,0
5,33003282022,2015-01-01,2019-03-31,F,W,1960,2007-08-16,0
...,...,...,...,...,...,...,...,...
3858469,33252910463,2020-10-01,2021-03-31,F,,1964,2007-07-19,0
3858471,33253021571,2007-08-01,2008-07-31,M,W,1972,2007-10-25,0
3858472,33253021571,2011-01-01,2021-03-31,M,W,1972,2007-10-25,0
3858474,33254367068,2007-01-01,2015-01-31,M,W,1970,2007-04-13,1


In [77]:
# mbr_enroll_df[mbr_enroll_df['PATID'] == 33003282126]

In [78]:
# mbr_enroll_df.sort_values(['PATID', 'ELIGEFF'], ascending=[True, True])

In [79]:
# df = mbr_enroll_df.sort_values(['PATID', 'ELIGEFF'], ascending=[True, True])

# df.drop_duplicates(subset=['PATID'], keep='first')

In [80]:
# df[df.duplicated(subset=['PATID'], keep=False)]

In [81]:
mbr_enroll_df1 = mbr_enroll_df[mbr_enroll_df['WIN_3_MON'] == 1]
mbr_enroll_df1 = mbr_enroll_df1.drop_duplicates(subset=['PATID'], keep='first')
mbr_enroll_df1

Unnamed: 0,PATID,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB,INDEX_DT,WIN_3_MON
0,33003282014,2007-01-01,2007-12-31,F,W,1952,2007-04-10,1
3,33003282022,2007-01-01,2010-05-31,F,W,1960,2007-08-16,1
9,33003282052,2007-01-01,2014-02-28,M,W,1975,2007-06-11,1
11,33003282074,2007-01-01,2011-11-30,F,H,1978,2007-04-02,1
13,33003282075,2007-01-01,2007-09-30,F,W,1956,2007-06-18,1
...,...,...,...,...,...,...,...,...
3858470,33252979699,2007-01-01,2021-03-31,M,W,2001,2007-06-12,1
3858473,33253093291,2007-01-01,2021-03-31,M,,1997,2007-04-26,1
3858474,33254367068,2007-01-01,2015-01-31,M,W,1970,2007-04-13,1
3858476,33254368071,2007-01-01,2021-03-31,F,W,1979,2007-10-11,1


In [82]:
mbr_enroll_df1[mbr_enroll_df1.duplicated(subset=['PATID'], keep=False)]

Unnamed: 0,PATID,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB,INDEX_DT,WIN_3_MON


In [83]:
mbr_enroll_df1['PATID'].nunique()

1129735

In [84]:
op_presc_no_canc_df['PATID'].nunique()

407694

In [62]:
# op_presc_no_canc_mbr_df['PATID'].nunique()

In [85]:
# set(op_presc_no_canc_df['PATID'].unique()) - set(mbr_enroll_df1['PATID'].unique())

In [86]:
len(list(set(mbr_enroll_df1['PATID'].unique()) - set(op_presc_no_canc_df['PATID'].unique())))

972762

In [87]:
op_presc_no_canc_mbr_df = op_presc_no_canc_df.merge(mbr_enroll_df1, on=['PATID'], how='inner')
op_presc_no_canc_mbr_df.shape
op_presc_no_canc_mbr_df.head()

(211195, 42)

Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COINS,CONF_ID,COPAY,DEDUCT,DIAG1,DIAG2,...,MATERNITY_IND,NEWBORN_IND,TOS_EXT,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB,INDEX_DT,WIN_3_MON
0,33003282417,53009524485,2007-12-12,27332.22,0.0,LTNRLLMO4OK4O,250.0,0.0,2182,78820,...,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2009-06-30,F,W,1952,2007-10-25,1
1,33003283085,53011734739,2007-11-27,32819.75,0.0,MZLT6RLMKM4OO,0.0,0.0,34690,29623,...,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1
2,33003283085,53011734739,2007-01-31,57710.0,1638.24,MZLT6RLTKM4OO,0.0,0.0,34690,78650,...,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1
3,33003283085,53011734739,2007-04-04,32655.0,0.0,MZLT6RLOKM4OO,0.0,0.0,34681,78791,...,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1
4,33003283085,53011734739,2007-04-20,8680.6,475.0,MZLT6RLZKM4OO,0.0,0.0,29623,V6284,...,N,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1


In [88]:
op_presc_no_canc_mbr_df.columns

Index(['PATID', 'PAT_PLANID', 'ADMIT_DATE', 'CHARGE', 'COINS', 'CONF_ID',
       'COPAY', 'DEDUCT', 'DIAG1', 'DIAG2', 'DIAG3', 'DIAG4', 'DIAG5',
       'DISCH_DATE', 'DRG', 'DSTATUS', 'ICD_FLAG', 'IPSTATUS', 'LOS', 'POS',
       'PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5', 'PROV', 'STD_COST',
       'STD_COST_YR', 'TOS_CD', 'ICU_IND', 'ICU_SURG_IND', 'MAJ_SURG_IND',
       'MATERNITY_IND', 'NEWBORN_IND', 'TOS_EXT', 'ELIGEFF', 'ELIGEND',
       'GDR_CD', 'RACE', 'YRDOB', 'INDEX_DT', 'WIN_3_MON'],
      dtype='object')

In [89]:
# Function to aggregate MEMBER ENROLLMENT data at patient level

def mbrPreprocess(mbr_df, year):
    
    mbr_df['AGE'] = year - mbr_df['YRDOB']
    mbr_df = mbr_df[mbr_df['AGE'] > 10] # Removing all patients with age less than 10 years
    
#     mbr_df['TOTAL_MON_COV'] = round(((mbr_df.ELIGEND - mbr_df.ELIGEFF)/np.timedelta64(1, 'M')),0)
    
    for i in ['GDR_CD', 'RACE']:
        mbr_df[i] = mbr_df[i].fillna('U')
        mbr_df[i] = np.where(mbr_df[i] == '', 'U', mbr_df[i])

#     mbr_grp_df = mbr_df.groupby('PATID').agg({'TOTAL_MON_COV':'sum', 'AGE':'mean', 'GDR_CD':'last', 'RACE':'last', 'STATE':'last'})
#     mbr_grp_df.reset_index(inplace = True)
    
    return mbr_df


In [90]:
op_presc_no_canc_mbr_df1 = mbrPreprocess(op_presc_no_canc_mbr_df, 2007)
op_presc_no_canc_mbr_df1.shape
op_presc_no_canc_mbr_df1.head()

(211195, 43)

Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COINS,CONF_ID,COPAY,DEDUCT,DIAG1,DIAG2,...,NEWBORN_IND,TOS_EXT,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB,INDEX_DT,WIN_3_MON,AGE
0,33003282417,53009524485,2007-12-12,27332.22,0.0,LTNRLLMO4OK4O,250.0,0.0,2182,78820,...,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2009-06-30,F,W,1952,2007-10-25,1,55
1,33003283085,53011734739,2007-11-27,32819.75,0.0,MZLT6RLMKM4OO,0.0,0.0,34690,29623,...,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55
2,33003283085,53011734739,2007-01-31,57710.0,1638.24,MZLT6RLTKM4OO,0.0,0.0,34690,78650,...,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55
3,33003283085,53011734739,2007-04-04,32655.0,0.0,MZLT6RLOKM4OO,0.0,0.0,34681,78791,...,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55
4,33003283085,53011734739,2007-04-20,8680.6,475.0,MZLT6RLZKM4OO,0.0,0.0,29623,V6284,...,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55


In [91]:
def inpPreprocess(inp_df1):
    
    inp_df1.drop(columns = ['COINS', 'CONF_ID', 'DSTATUS', 'IPSTATUS', 'POS',
       'PROV', 'STD_COST_YR', 'TOS_CD', 'NEWBORN_IND', 'TOS_EXT'], inplace = True)
    
#     inp_df['DRG'] = np.where(inp_df['DRG'] == '', 'UNK', inp_df['DRG'])
#     inp_df1 = pd.merge(inp_df, drg_codes, on = ['DRG'], how = 'left')
#     inp_df1.rename(columns = {'DESCRIPTION': 'DRG_DESC'}, inplace=True)
    
    diag_cols = ['DIAG1', 'DIAG2', 'DIAG3', 'DIAG4', 'DIAG5']
    for i in diag_cols:
        inp_df1[i] = np.where(inp_df1[i] == '', '0000', inp_df1[i])
        
    proc_cols = ['PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5']
    for i in diag_cols:
        inp_df1[i] = np.where(inp_df1[i] == '', '0000', inp_df1[i])
        
    ind_cols = ['ICU_IND', 'ICU_SURG_IND', 'MAJ_SURG_IND','MATERNITY_IND']
    for i in ind_cols:
        inp_df1[i] = inp_df1[i].replace({'Y': 1, 'N': 0})
    
    return inp_df1

In [92]:
op_presc_no_canc_mbr_df1.head()

Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COINS,CONF_ID,COPAY,DEDUCT,DIAG1,DIAG2,...,NEWBORN_IND,TOS_EXT,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB,INDEX_DT,WIN_3_MON,AGE
0,33003282417,53009524485,2007-12-12,27332.22,0.0,LTNRLLMO4OK4O,250.0,0.0,2182,78820,...,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2009-06-30,F,W,1952,2007-10-25,1,55
1,33003283085,53011734739,2007-11-27,32819.75,0.0,MZLT6RLMKM4OO,0.0,0.0,34690,29623,...,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55
2,33003283085,53011734739,2007-01-31,57710.0,1638.24,MZLT6RLTKM4OO,0.0,0.0,34690,78650,...,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55
3,33003283085,53011734739,2007-04-04,32655.0,0.0,MZLT6RLOKM4OO,0.0,0.0,34681,78791,...,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55
4,33003283085,53011734739,2007-04-20,8680.6,475.0,MZLT6RLZKM4OO,0.0,0.0,29623,V6284,...,N,FAC_IP.ACUTE.ACUTE.ACUTE.ACUTE,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55


In [93]:
op_presc_no_canc_mbr_df2 = inpPreprocess(op_presc_no_canc_mbr_df1)
op_presc_no_canc_mbr_df2.shape
op_presc_no_canc_mbr_df2.head()

(211195, 33)

Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COPAY,DEDUCT,DIAG1,DIAG2,DIAG3,DIAG4,...,MAJ_SURG_IND,MATERNITY_IND,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB,INDEX_DT,WIN_3_MON,AGE
0,33003282417,53009524485,2007-12-12,27332.22,250.0,0.0,2182,78820,6271,6259,...,1,0,2007-01-01,2009-06-30,F,W,1952,2007-10-25,1,55
1,33003283085,53011734739,2007-11-27,32819.75,0.0,0.0,34690,29623,30780,0,...,0,0,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55
2,33003283085,53011734739,2007-01-31,57710.0,0.0,0.0,34690,78650,4739,0,...,0,0,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55
3,33003283085,53011734739,2007-04-04,32655.0,0.0,0.0,34681,78791,56400,29620,...,0,0,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55
4,33003283085,53011734739,2007-04-20,8680.6,0.0,0.0,29623,V6284,34690,0,...,0,0,2007-01-01,2008-02-29,F,W,1952,2007-04-12,1,55


In [94]:
from dateutil.relativedelta import relativedelta

flag = []

# iterate over the dates in all three columns
for dt1, dt2, dt3 in zip(op_presc_no_canc_mbr_df2['ADMIT_DATE'], op_presc_no_canc_mbr_df2['INDEX_DT'], op_presc_no_canc_mbr_df2['DISCH_DATE']):
        
    # check if the dates in column 1 are within 3 months of the dates in column 2
    if dt1 <= dt2:
        flag_value = 'PRE_3MON'
    elif dt1 >= dt2:
        flag_value = 'POST_3MON'
    else:
        flag_value = ''
        
    # append the flag value to the list
    flag.append(flag_value)


In [95]:
op_presc_no_canc_mbr_df2.shape
len(flag)

(211195, 33)

211195

In [96]:
op_presc_no_canc_mbr_df2['WIN_3_MON'] = flag
op_presc_no_canc_mbr_df2

Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COPAY,DEDUCT,DIAG1,DIAG2,DIAG3,DIAG4,...,MAJ_SURG_IND,MATERNITY_IND,ELIGEFF,ELIGEND,GDR_CD,RACE,YRDOB,INDEX_DT,WIN_3_MON,AGE
0,33003282417,53009524485,2007-12-12,27332.22,250.0,0.0,2182,78820,6271,6259,...,1,0,2007-01-01,2009-06-30,F,W,1952,2007-10-25,POST_3MON,55
1,33003283085,53011734739,2007-11-27,32819.75,0.0,0.0,34690,29623,30780,0000,...,0,0,2007-01-01,2008-02-29,F,W,1952,2007-04-12,POST_3MON,55
2,33003283085,53011734739,2007-01-31,57710.00,0.0,0.0,34690,78650,4739,0000,...,0,0,2007-01-01,2008-02-29,F,W,1952,2007-04-12,PRE_3MON,55
3,33003283085,53011734739,2007-04-04,32655.00,0.0,0.0,34681,78791,56400,29620,...,0,0,2007-01-01,2008-02-29,F,W,1952,2007-04-12,PRE_3MON,55
4,33003283085,53011734739,2007-04-20,8680.60,0.0,0.0,29623,V6284,34690,0000,...,0,0,2007-01-01,2008-02-29,F,W,1952,2007-04-12,POST_3MON,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211190,33234444633,53015919240,2007-05-29,12259.29,0.0,0.0,78659,V5414,2859,0000,...,0,0,2007-01-01,2021-03-31,F,W,1962,2007-04-14,POST_3MON,45
211191,33234444633,53015919240,2007-05-24,96604.00,0.0,0.0,82100,V140,0000,0000,...,1,0,2007-01-01,2021-03-31,F,W,1962,2007-04-14,POST_3MON,45
211192,33246466382,53069334293,2007-06-05,33934.28,0.0,0.0,73329,0000,0000,0000,...,0,0,2007-01-01,2019-10-31,F,H,1997,2007-06-06,PRE_3MON,10
211193,33247614952,53068957593,2007-06-08,24053.40,0.0,0.0,4271,5846,27651,7469,...,0,0,2007-01-01,2008-12-31,M,A,1964,2007-06-18,PRE_3MON,43


In [97]:
op_presc_no_canc_mbr_df2.columns

Index(['PATID', 'PAT_PLANID', 'ADMIT_DATE', 'CHARGE', 'COPAY', 'DEDUCT',
       'DIAG1', 'DIAG2', 'DIAG3', 'DIAG4', 'DIAG5', 'DISCH_DATE', 'DRG',
       'ICD_FLAG', 'LOS', 'PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5',
       'STD_COST', 'ICU_IND', 'ICU_SURG_IND', 'MAJ_SURG_IND', 'MATERNITY_IND',
       'ELIGEFF', 'ELIGEND', 'GDR_CD', 'RACE', 'YRDOB', 'INDEX_DT',
       'WIN_3_MON', 'AGE'],
      dtype='object')

In [98]:
op_presc_no_canc_mbr_df2.nunique()

PATID            156973
PAT_PLANID       157664
ADMIT_DATE          365
CHARGE           195346
COPAY              2919
DEDUCT            13896
DIAG1              5237
DIAG2              5513
DIAG3              5329
DIAG4              4888
DIAG5              4593
DISCH_DATE          456
DRG                 857
ICD_FLAG              1
LOS                 195
PROC1              2166
PROC2              2091
PROC3              1805
PROC4              1489
PROC5              1155
STD_COST          15947
ICU_IND               2
ICU_SURG_IND          2
MAJ_SURG_IND          2
MATERNITY_IND         2
ELIGEFF             243
ELIGEND            3124
GDR_CD                3
RACE                  5
YRDOB                91
INDEX_DT            293
WIN_3_MON             2
AGE                  91
dtype: int64

# Feature Engineering

## Idenitfying major DIAG and PROC code categories to create new columns

In [99]:
diag_cat_map = pd.read_excel('Opioid_analytics/Akhila/overdose_prediction/files/diag_code_map.xlsx')
diag_cat_map = diag_cat_map[diag_cat_map['STATUS'] == 1]
diag_cat_map.head()

Unnamed: 0,ICD_CODE,ICD_DESC,HARM_SAMPLE_PCT,NON_HARM_SAMPLE_PCT,RELATIVE_DIFF,ICD_CAT,STATUS
0,5771,CHRONIC PANCREATITIS,0.001671,0.00038,3.398981,ABDOMEN_REL_DIAG_POS,1
1,5770,ACUTE PANCREATITIS,0.0049,0.00201,1.437666,ABDOMEN_REL_DIAG_POS,1
2,5363,GASTROPARESIS,0.00102,0.000472,1.160766,ABDOMEN_REL_DIAG_POS,1
3,53550,UNS GASTRIT&GASTRODUODIT NO HEMORR,0.001095,0.00065,0.684,ABDOMEN_REL_DIAG_POS,1
4,5641,IRRITABLE BOWEL SYNDROME,0.001599,0.000998,0.601998,ABDOMEN_REL_DIAG_POS,1


In [100]:
diag_cat_map['STATUS'].value_counts()

1    104
Name: STATUS, dtype: int64

In [101]:
diag_cat_map['ICD_CODE'] = diag_cat_map['ICD_CODE'].astype(str)
diag_cat_dict = diag_cat_map.groupby('ICD_CAT')['ICD_CODE'].agg(list).to_dict()
diag_cat_dict

{'ABDOMEN_REL_DIAG_NEG': ['9974'],
 'ABDOMEN_REL_DIAG_POS': ['5771',
  '5770',
  '5363',
  '53550',
  '5641',
  '5559',
  '56211',
  'V4586'],
 'BLOOD_REL_DIAG': ['2859', '2800', '0389', '2851'],
 'HEART_REL_DIAG': ['4254', 'V4581', '41400', '4240', '4280', '42731'],
 'HX_TOB_HAZ_DIAG': ['V1582'],
 'INJURY_REL_DIAG': ['V173', 'V454'],
 'KIDNEY_REL_DIAG': ['591', '5849', '5859'],
 'LEUK_UNS_DIAG': ['28860'],
 'LIVER_REL_DIAG': ['07070', '07054'],
 'LUNG_REL_DIAG': ['49122',
  '49322',
  '49121',
  '4928',
  '1623',
  '496',
  '4660',
  '49392'],
 'MENTAL_DISORDER_DIAG': ['29284',
  '3019',
  '30183',
  '29650',
  '29633',
  'V6284',
  '29690',
  '29630',
  '29689',
  '30981',
  '29620',
  '29680',
  '30002',
  '30001',
  '31401',
  '3004',
  '30000',
  '311',
  '78052',
  '78097'],
 'OTHER_DIAG_NEG': ['V6549',
  'V1581',
  '34590',
  '78039',
  '7291',
  'V5869',
  '6823',
  '4439'],
 'OTHER_DIAG_POS': ['25000',
  '78079',
  '6170',
  '2449',
  '185',
  '78820',
  '99591',
  'V5789',
  

In [102]:
proc_cat_map = pd.read_excel('Opioid_analytics/Akhila/overdose_prediction/files/proc_code_map.xlsx')
proc_cat_map = proc_cat_map[proc_cat_map['STATUS'] == 1]
proc_cat_map.head()

Unnamed: 0,PROC_CODE,PROC_DESC,HARM_SAMPLE_PCT,NON_OP_SAMPLE_PCT,RELATIVE_DIFF,PROC_CAT,STATUS
0,391,INTRA-ABDOMINAL VENOUS SHUNT,0.000304,7e-06,39.74619,ABDOMEN_REL_PROC_POS,1
1,4719,OTHER INCIDENTAL APPENDECTOMY,0.000297,0.000215,0.384879,ABDOMEN_REL_PROC_POS,1
2,5451,LAPAROSCOPIC LYSIS PERITONEAL ADHES,0.000304,0.0005,-0.392325,ABDOMEN_REL_PROC_NEG,1
3,5459,OTHER LYSIS OF PERITONEAL ADHESIONS,0.0007,0.001585,-0.558612,ABDOMEN_REL_PROC_NEG,1
4,4576,OPEN AND OTHER SIGMOIDECTOMY,0.000523,0.000371,0.409438,ABDOMEN_REL_PROC_POS,1


In [103]:
proc_cat_map['STATUS'].value_counts()

1    68
Name: STATUS, dtype: int64

In [104]:
proc_cat_map['PROC_CODE'] = proc_cat_map['PROC_CODE'].astype(str)
proc_cat_dict = proc_cat_map.groupby('PROC_CAT')['PROC_CODE'].agg(list).to_dict()
proc_cat_dict

{'ABDOMEN_REL_PROC_NEG': ['5451', '5459', '544'],
 'ABDOMEN_REL_PROC_POS': ['391', '4719', '4576', '5361'],
 'BLOOD_REL_PROC': ['9904', '9907', '3995'],
 'BONE_REL_PROC': ['41', '7779', '7869', '7936', '8452'],
 'BRAIN_REL_PROC': ['8841', '8891'],
 'HEART_REL_PROC_NEG': ['3615', '3961', '9925'],
 'HEART_REL_PROC_POS': ['390',
  '3929',
  '66',
  '3990',
  '3607',
  '3606',
  '8853',
  '3950',
  '3722',
  '8856',
  '9910',
  '3812',
  '3812',
  '8842'],
 'LUNG_REL_PROC': ['331', '3229', '9671', '3404', '9604'],
 'OTHER_PROC_NEG': ['9915', '8154', '605', '6561'],
 'OTHER_PROC_POS': ['8848',
  '8102',
  '75',
  '9920',
  '8659',
  '8703',
  '8604',
  '8741',
  '4516',
  '8703'],
 'RECTUM_PROC': ['481'],
 'SPINE_REL_PROC': ['8106', '8162', '8051', '8451', '331', '8108', '8163'],
 'SUB_REL_PROC': ['9465', '9468', '9462'],
 'URINARY_REL_PROC': ['598', '8774', '5979', '5732']}

# Creating columns for DIAG and PROC codes with high differential

In [105]:
# Creating a new column with category name - with 1 if any code in the list is present across all 5 DIAG/PROC columns

def catMap(cat_dict, check_cols, df):
    
    for i in cat_dict.keys():
        codes = cat_dict[i]
        df[i] = df[check_cols].isin(codes).any(axis=1).astype(int)
    
    return df

In [106]:
diag_cols = ['DIAG1', 'DIAG2', 'DIAG3', 'DIAG4', 'DIAG5']

# inp_rand_df1 = catMap(diag_cat_dict, diag_cols, inp_rand_df)
inp_op_df1 = catMap(diag_cat_dict, diag_cols, op_presc_no_canc_mbr_df2)

In [107]:
proc_cols = ['PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5']

# inp_rand_df1 = catMap(proc_cat_dict, proc_cols, inp_rand_df1)
inp_op_df1 = catMap(proc_cat_dict, proc_cols, inp_op_df1)

In [108]:
inp_op_df1

Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COPAY,DEDUCT,DIAG1,DIAG2,DIAG3,DIAG4,...,BRAIN_REL_PROC,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC
0,33003282417,53009524485,2007-12-12,27332.22,250.0,0.0,2182,78820,6271,6259,...,0,0,0,0,0,0,0,0,0,0
1,33003283085,53011734739,2007-11-27,32819.75,0.0,0.0,34690,29623,30780,0000,...,0,0,0,0,0,0,0,0,0,0
2,33003283085,53011734739,2007-01-31,57710.00,0.0,0.0,34690,78650,4739,0000,...,0,0,0,0,0,0,0,0,0,0
3,33003283085,53011734739,2007-04-04,32655.00,0.0,0.0,34681,78791,56400,29620,...,0,0,0,0,0,0,0,0,0,0
4,33003283085,53011734739,2007-04-20,8680.60,0.0,0.0,29623,V6284,34690,0000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211190,33234444633,53015919240,2007-05-29,12259.29,0.0,0.0,78659,V5414,2859,0000,...,0,0,0,0,0,0,0,0,0,0
211191,33234444633,53015919240,2007-05-24,96604.00,0.0,0.0,82100,V140,0000,0000,...,0,0,0,0,0,0,0,0,0,0
211192,33246466382,53069334293,2007-06-05,33934.28,0.0,0.0,73329,0000,0000,0000,...,0,0,0,0,0,0,0,0,0,0
211193,33247614952,53068957593,2007-06-08,24053.40,0.0,0.0,4271,5846,27651,7469,...,0,0,0,0,0,0,0,0,0,0


In [109]:
inp_op_df1.columns

Index(['PATID', 'PAT_PLANID', 'ADMIT_DATE', 'CHARGE', 'COPAY', 'DEDUCT',
       'DIAG1', 'DIAG2', 'DIAG3', 'DIAG4', 'DIAG5', 'DISCH_DATE', 'DRG',
       'ICD_FLAG', 'LOS', 'PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5',
       'STD_COST', 'ICU_IND', 'ICU_SURG_IND', 'MAJ_SURG_IND', 'MATERNITY_IND',
       'ELIGEFF', 'ELIGEND', 'GDR_CD', 'RACE', 'YRDOB', 'INDEX_DT',
       'WIN_3_MON', 'AGE', 'ABDOMEN_REL_DIAG_NEG', 'ABDOMEN_REL_DIAG_POS',
       'BLOOD_REL_DIAG', 'HEART_REL_DIAG', 'HX_TOB_HAZ_DIAG',
       'INJURY_REL_DIAG', 'KIDNEY_REL_DIAG', 'LEUK_UNS_DIAG', 'LIVER_REL_DIAG',
       'LUNG_REL_DIAG', 'MENTAL_DISORDER_DIAG', 'OTHER_DIAG_NEG',
       'OTHER_DIAG_POS', 'PAIN_CONTROL_DIAG', 'SEC_MAL_NEO_LIV_DIAG',
       'SPINE_REL_DIAG', 'SUB_ABUSE_DIAG', 'SUB_WITHDRAWAL_DIAG',
       'ABDOMEN_REL_PROC_NEG', 'ABDOMEN_REL_PROC_POS', 'BLOOD_REL_PROC',
       'BONE_REL_PROC', 'BRAIN_REL_PROC', 'HEART_REL_PROC_NEG',
       'HEART_REL_PROC_POS', 'LUNG_REL_PROC', 'OTHER_PROC_NEG',
       'OTHER_PR

# Identifying opioid harm patients

In [110]:
# ICD9 codes for high opioid risk

opioid_harm_icd9_codes = ["96500","96501","96502","96509","E8500","E8501","E8502","E9350","E9351","E9352","3055","3040","3047","E9350","E9351","E9352","E9401","9701","T400X1A","T401X1A","T403X1A","T402X1A","T507X1A"]
len(opioid_harm_icd9_codes)


23

In [111]:
# ICD10 codes for high opioid risk

opioid_harm_icd10_codes = ['F11.10','F11.120','F11.121','F11.122','F11.129','F11.14','F11.150','F11.151','F11.159','F11.181','F11.182','F11.188','F11.19','F11.20','F11.220','F11.221','F11.222','F11.229','F11.23','F11.24','F11.250','F11.251','F11.259','F11.281','F11.282','F11.288','F11.29','F11.90','F11.920','F11.921','F11.922','F11.929','F11.93','F11.94','F11.950','F11.951','F11.959','F11.981','F11.982','F11.988','F11.99','T40.0X1A','T40.0X1D','T40.0X2A','T40.0X2DT40.0X3A','T40.0X3D','T40.0X4A','T40.0X4D','T40.0X5A','T40.0X5D','T40.1X1A','T40.1X1D','T40.1X2A','T40.1X2D','T40.1X3A','T40.1X3D','T40.1X4A','T40.1X4D','T40.2X1A','T40.2X1D','T40.2X2A','T40.2X2D','T40.2X3A','T40.2X3D','T40.2X4A','T40.2X4D','T40.2X5A','T40.2X5D','T40.3X1A','T40.3X1D','T40.3X2A','T40.3X2D','T40.3X3A','T40.3X3D','T40.3X4A','T40.3X4D','T40.3X5A','T40.3X5D','T40.4X1A','T40.4X1D','T40.4X2A','T40.4X2D','T40.4X3A','T40.4X3D','T40.4X4A','T40.4X4D','T40.4X5A','T40.4X5D','T40.601A','T40.601D','T40.602A','T40.602D','T40.603A','T40.603D','T40.604A','T40.604D','T40.605A','T40.605D','T40.691A','T40.691D','T40.692A','T40.692D','T40.693A','T40.693D','T40.694A','T40.694D','T40.695A','T40.695D']

opioid_harm_icd10_codes1 = []

for i in opioid_harm_icd10_codes:
    x = i.replace('.','')
    opioid_harm_icd10_codes1.append(x)
    
len(opioid_harm_icd10_codes1)


108

In [112]:
inp_op_df1['count_304'] = np.where(inp_op_df1['DIAG1'].str.startswith('304') | inp_op_df1['DIAG2'].str.startswith('304') | inp_op_df1['DIAG3'].str.startswith('304') | inp_op_df1['DIAG4'].str.startswith('304') | inp_op_df1['DIAG5'].str.startswith('304'), 1, 0)

inp_op_df1['count_305'] = np.where(inp_op_df1['DIAG1'].str.startswith('305') | inp_op_df1['DIAG2'].str.startswith('305') | inp_op_df1['DIAG3'].str.startswith('305') | inp_op_df1['DIAG4'].str.startswith('305') | inp_op_df1['DIAG5'].str.startswith('305'), 1, 0)

inp_op_df1['count_icd9'] = np.where(inp_op_df1['DIAG1'].isin(opioid_harm_icd9_codes) | inp_op_df1['DIAG2'].isin(opioid_harm_icd9_codes) | inp_op_df1['DIAG3'].isin(opioid_harm_icd9_codes) | inp_op_df1['DIAG4'].isin(opioid_harm_icd9_codes) | inp_op_df1['DIAG5'].isin(opioid_harm_icd9_codes), 1, 0)

inp_op_df1['count_icd10'] = np.where(inp_op_df1['DIAG1'].isin(opioid_harm_icd10_codes) | inp_op_df1['DIAG2'].isin(opioid_harm_icd10_codes) | inp_op_df1['DIAG3'].isin(opioid_harm_icd10_codes) | inp_op_df1['DIAG4'].isin(opioid_harm_icd10_codes) | inp_op_df1['DIAG5'].isin(opioid_harm_icd10_codes), 1, 0)

inp_op_df1 ['OPIOID_HARM'] = inp_op_df1['count_304'] + inp_op_df1['count_305'] + inp_op_df1['count_icd9'] + inp_op_df1['count_icd10']

inp_op_df1.drop(columns = ['count_304', 'count_305', 'count_icd9', 'count_icd10'], inplace = True)

inp_op_df1.head()


Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COPAY,DEDUCT,DIAG1,DIAG2,DIAG3,DIAG4,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,OPIOID_HARM
0,33003282417,53009524485,2007-12-12,27332.22,250.0,0.0,2182,78820,6271,6259,...,0,0,0,0,0,0,0,0,0,0
1,33003283085,53011734739,2007-11-27,32819.75,0.0,0.0,34690,29623,30780,0,...,0,0,0,0,0,0,0,0,0,0
2,33003283085,53011734739,2007-01-31,57710.0,0.0,0.0,34690,78650,4739,0,...,0,0,0,0,0,0,0,0,0,0
3,33003283085,53011734739,2007-04-04,32655.0,0.0,0.0,34681,78791,56400,29620,...,0,0,0,0,0,0,0,0,0,0
4,33003283085,53011734739,2007-04-20,8680.6,0.0,0.0,29623,V6284,34690,0,...,0,0,0,0,0,0,0,0,0,0


In [113]:
inp_op_df1['OPIOID_HARM'].value_counts()

0    197768
1     13154
2       267
3         6
Name: OPIOID_HARM, dtype: int64

In [114]:
inp_op_df1.columns

Index(['PATID', 'PAT_PLANID', 'ADMIT_DATE', 'CHARGE', 'COPAY', 'DEDUCT',
       'DIAG1', 'DIAG2', 'DIAG3', 'DIAG4', 'DIAG5', 'DISCH_DATE', 'DRG',
       'ICD_FLAG', 'LOS', 'PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5',
       'STD_COST', 'ICU_IND', 'ICU_SURG_IND', 'MAJ_SURG_IND', 'MATERNITY_IND',
       'ELIGEFF', 'ELIGEND', 'GDR_CD', 'RACE', 'YRDOB', 'INDEX_DT',
       'WIN_3_MON', 'AGE', 'ABDOMEN_REL_DIAG_NEG', 'ABDOMEN_REL_DIAG_POS',
       'BLOOD_REL_DIAG', 'HEART_REL_DIAG', 'HX_TOB_HAZ_DIAG',
       'INJURY_REL_DIAG', 'KIDNEY_REL_DIAG', 'LEUK_UNS_DIAG', 'LIVER_REL_DIAG',
       'LUNG_REL_DIAG', 'MENTAL_DISORDER_DIAG', 'OTHER_DIAG_NEG',
       'OTHER_DIAG_POS', 'PAIN_CONTROL_DIAG', 'SEC_MAL_NEO_LIV_DIAG',
       'SPINE_REL_DIAG', 'SUB_ABUSE_DIAG', 'SUB_WITHDRAWAL_DIAG',
       'ABDOMEN_REL_PROC_NEG', 'ABDOMEN_REL_PROC_POS', 'BLOOD_REL_PROC',
       'BONE_REL_PROC', 'BRAIN_REL_PROC', 'HEART_REL_PROC_NEG',
       'HEART_REL_PROC_POS', 'LUNG_REL_PROC', 'OTHER_PROC_NEG',
       'OTHER_PR

In [115]:
inp_op_df1[inp_op_df1.duplicated(subset=['PATID', 'PAT_PLANID', 'INDEX_DT', 'WIN_3_MON'], keep=False)]

Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COPAY,DEDUCT,DIAG1,DIAG2,DIAG3,DIAG4,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,OPIOID_HARM
1,33003283085,53011734739,2007-11-27,32819.75,0.00,0.0,34690,29623,30780,0000,...,0,0,0,0,0,0,0,0,0,0
2,33003283085,53011734739,2007-01-31,57710.00,0.00,0.0,34690,78650,4739,0000,...,0,0,0,0,0,0,0,0,0,0
3,33003283085,53011734739,2007-04-04,32655.00,0.00,0.0,34681,78791,56400,29620,...,0,0,0,0,0,0,0,0,0,0
4,33003283085,53011734739,2007-04-20,8680.60,0.00,0.0,29623,V6284,34690,0000,...,0,0,0,0,0,0,0,0,0,0
5,33003283758,53004825422,2007-10-01,11356.70,639.79,0.0,99832,42731,25000,V4581,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211172,33139810285,53057821048,2007-05-02,1860.72,0.00,0.0,V3000,0000,0000,0000,...,0,0,0,0,0,0,0,0,0,0
211184,33220600405,53013970002,2007-05-30,3610.40,150.00,0.0,4580,31400,4019,32723,...,0,0,0,0,0,0,0,0,0,0
211185,33220600405,53013970002,2007-05-15,15010.22,150.00,0.0,5409,4019,2724,30000,...,0,0,0,0,0,0,0,0,0,0
211190,33234444633,53015919240,2007-05-29,12259.29,0.00,0.0,78659,V5414,2859,0000,...,0,0,0,0,0,0,0,0,0,0


In [116]:
inp_op_df1['WIN_3_MON'].value_counts()

POST_3MON    130532
PRE_3MON      80663
Name: WIN_3_MON, dtype: int64

In [117]:
inp_op_df1.isnull().sum().any()

False

In [118]:
inp_op_df1.shape

(211195, 66)

In [119]:
target_df1 = inp_op_df1[inp_op_df1['WIN_3_MON'] == 'POST_3MON']
target_df1.shape
target_df1.head()

(130532, 66)

Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COPAY,DEDUCT,DIAG1,DIAG2,DIAG3,DIAG4,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,OPIOID_HARM
0,33003282417,53009524485,2007-12-12,27332.22,250.0,0.0,2182,78820,6271,6259,...,0,0,0,0,0,0,0,0,0,0
1,33003283085,53011734739,2007-11-27,32819.75,0.0,0.0,34690,29623,30780,0000,...,0,0,0,0,0,0,0,0,0,0
4,33003283085,53011734739,2007-04-20,8680.6,0.0,0.0,29623,V6284,34690,0000,...,0,0,0,0,0,0,0,0,0,0
5,33003283758,53004825422,2007-10-01,11356.7,639.79,0.0,99832,42731,25000,V4581,...,0,0,0,0,0,0,0,0,0,0
6,33003283758,53004825422,2007-10-25,16618.63,0.0,0.0,99832,V4581,25000,4019,...,0,0,0,0,0,0,0,0,0,0


In [120]:
target_df = target_df1[['PATID', 'PAT_PLANID', 'INDEX_DT', 'WIN_3_MON', 'OPIOID_HARM']]
target_df.shape
target_df.head()

(130532, 5)

Unnamed: 0,PATID,PAT_PLANID,INDEX_DT,WIN_3_MON,OPIOID_HARM
0,33003282417,53009524485,2007-10-25,POST_3MON,0
1,33003283085,53011734739,2007-04-12,POST_3MON,0
4,33003283085,53011734739,2007-04-12,POST_3MON,0
5,33003283758,53004825422,2007-07-05,POST_3MON,0
6,33003283758,53004825422,2007-07-05,POST_3MON,0


In [121]:
target_df[target_df.duplicated(subset=['PATID', 'PAT_PLANID', 'INDEX_DT',
       'WIN_3_MON'], keep=False)]

Unnamed: 0,PATID,PAT_PLANID,INDEX_DT,WIN_3_MON,OPIOID_HARM
1,33003283085,53011734739,2007-04-12,POST_3MON,0
4,33003283085,53011734739,2007-04-12,POST_3MON,0
5,33003283758,53004825422,2007-07-05,POST_3MON,0
6,33003283758,53004825422,2007-07-05,POST_3MON,0
7,33003283758,53004825422,2007-07-05,POST_3MON,0
...,...,...,...,...,...
211162,33081554869,53096485876,2007-04-18,POST_3MON,0
211184,33220600405,53013970002,2007-04-11,POST_3MON,0
211185,33220600405,53013970002,2007-04-11,POST_3MON,0
211190,33234444633,53015919240,2007-04-14,POST_3MON,0


In [122]:
inp_op_df1[(inp_op_df1['PATID'] == 33003283085) & (inp_op_df1['PAT_PLANID'] == 53011734739) & (inp_op_df1['INDEX_DT'] == '2007-04-12') & (inp_op_df1['WIN_3_MON'] == 'POST_3MON')]


Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COPAY,DEDUCT,DIAG1,DIAG2,DIAG3,DIAG4,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,OPIOID_HARM
1,33003283085,53011734739,2007-11-27,32819.75,0.0,0.0,34690,29623,30780,0,...,0,0,0,0,0,0,0,0,0,0
4,33003283085,53011734739,2007-04-20,8680.6,0.0,0.0,29623,V6284,34690,0,...,0,0,0,0,0,0,0,0,0,0


In [123]:
# Drop duplicate rows based on all columns
target_df = target_df.drop_duplicates()
target_df.shape
target_df.head()

(103245, 5)

Unnamed: 0,PATID,PAT_PLANID,INDEX_DT,WIN_3_MON,OPIOID_HARM
0,33003282417,53009524485,2007-10-25,POST_3MON,0
1,33003283085,53011734739,2007-04-12,POST_3MON,0
5,33003283758,53004825422,2007-07-05,POST_3MON,0
8,33003284019,53004502444,2007-10-15,POST_3MON,0
9,33003284598,53054044619,2007-08-01,POST_3MON,1


In [124]:
target_df[target_df.duplicated(subset=['PATID', 'PAT_PLANID', 'INDEX_DT',
       'WIN_3_MON'], keep=False)]

Unnamed: 0,PATID,PAT_PLANID,INDEX_DT,WIN_3_MON,OPIOID_HARM
247,33003385757,53008875787,2007-04-04,POST_3MON,1
248,33003385757,53008875787,2007-04-04,POST_3MON,0
315,33003436611,53006279044,2007-07-05,POST_3MON,0
316,33003436611,53006279044,2007-07-05,POST_3MON,1
430,33003482646,53014248960,2007-05-02,POST_3MON,0
...,...,...,...,...,...
209934,33060873270,53006377270,2007-04-14,POST_3MON,0
210534,33069616576,53055369848,2007-05-01,POST_3MON,0
210535,33069616576,53055369848,2007-05-01,POST_3MON,1
210617,33069696650,53042971064,2007-04-07,POST_3MON,0


In [125]:
target_df['OPIOID_HARM'].value_counts()

0    95704
1     7393
2      146
3        2
Name: OPIOID_HARM, dtype: int64

In [126]:
target_df['OPIOID_HARM'] = np.where(target_df['OPIOID_HARM'] >= 1, 1, target_df['OPIOID_HARM'])
target_df['OPIOID_HARM'].value_counts()

0    95704
1     7541
Name: OPIOID_HARM, dtype: int64

In [127]:
target_df.columns

Index(['PATID', 'PAT_PLANID', 'INDEX_DT', 'WIN_3_MON', 'OPIOID_HARM'], dtype='object')

In [128]:
var_df = inp_op_df1[inp_op_df1['WIN_3_MON'] == 'PRE_3MON']
var_df.shape
var_df.head()

(80663, 66)

Unnamed: 0,PATID,PAT_PLANID,ADMIT_DATE,CHARGE,COPAY,DEDUCT,DIAG1,DIAG2,DIAG3,DIAG4,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,OPIOID_HARM
2,33003283085,53011734739,2007-01-31,57710.0,0.0,0.0,34690,78650,4739,0000,...,0,0,0,0,0,0,0,0,0,0
3,33003283085,53011734739,2007-04-04,32655.0,0.0,0.0,34681,78791,56400,29620,...,0,0,0,0,0,0,0,0,0,0
10,33003285138,53049845397,2007-11-30,10123.38,0.0,0.0,486,4019,2774,78659,...,0,0,0,0,0,0,0,0,0,0
11,33003285138,53049845397,2007-11-30,17955.26,0.0,0.0,486,4019,2774,78659,...,0,0,0,0,0,0,0,0,0,0
12,33003286248,53015259103,2007-10-15,11509.0,0.0,0.0,65421,V270,65961,V252,...,0,0,0,0,0,0,0,0,0,0


In [129]:
var_df.columns

Index(['PATID', 'PAT_PLANID', 'ADMIT_DATE', 'CHARGE', 'COPAY', 'DEDUCT',
       'DIAG1', 'DIAG2', 'DIAG3', 'DIAG4', 'DIAG5', 'DISCH_DATE', 'DRG',
       'ICD_FLAG', 'LOS', 'PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5',
       'STD_COST', 'ICU_IND', 'ICU_SURG_IND', 'MAJ_SURG_IND', 'MATERNITY_IND',
       'ELIGEFF', 'ELIGEND', 'GDR_CD', 'RACE', 'YRDOB', 'INDEX_DT',
       'WIN_3_MON', 'AGE', 'ABDOMEN_REL_DIAG_NEG', 'ABDOMEN_REL_DIAG_POS',
       'BLOOD_REL_DIAG', 'HEART_REL_DIAG', 'HX_TOB_HAZ_DIAG',
       'INJURY_REL_DIAG', 'KIDNEY_REL_DIAG', 'LEUK_UNS_DIAG', 'LIVER_REL_DIAG',
       'LUNG_REL_DIAG', 'MENTAL_DISORDER_DIAG', 'OTHER_DIAG_NEG',
       'OTHER_DIAG_POS', 'PAIN_CONTROL_DIAG', 'SEC_MAL_NEO_LIV_DIAG',
       'SPINE_REL_DIAG', 'SUB_ABUSE_DIAG', 'SUB_WITHDRAWAL_DIAG',
       'ABDOMEN_REL_PROC_NEG', 'ABDOMEN_REL_PROC_POS', 'BLOOD_REL_PROC',
       'BONE_REL_PROC', 'BRAIN_REL_PROC', 'HEART_REL_PROC_NEG',
       'HEART_REL_PROC_POS', 'LUNG_REL_PROC', 'OTHER_PROC_NEG',
       'OTHER_PR

In [130]:
var_df1 = var_df.groupby(['PATID', 'PAT_PLANID', 'INDEX_DT']).agg({'CHARGE' : 'sum', 
        'COPAY': 'sum', 
        'DEDUCT': 'sum',
        'LOS': 'sum', 
        'STD_COST': 'sum', 
        'ICU_IND': 'sum', 
        'ICU_SURG_IND': 'sum', 
        'MAJ_SURG_IND': 'sum', 
        'MATERNITY_IND': 'sum',
        'GDR_CD': 'last', 
        'RACE': 'last',  
        'AGE': 'last', 
        'ABDOMEN_REL_DIAG_NEG' : 'sum', 
        'ABDOMEN_REL_DIAG_POS' : 'sum', 
        'BLOOD_REL_DIAG' : 'sum', 
        'HEART_REL_DIAG' : 'sum', 
        'HX_TOB_HAZ_DIAG' : 'sum', 
        'INJURY_REL_DIAG' : 'sum', 
        'KIDNEY_REL_DIAG' : 'sum', 
        'LEUK_UNS_DIAG' : 'sum', 
        'LIVER_REL_DIAG' : 'sum',
        'LUNG_REL_DIAG' : 'sum', 
        'MENTAL_DISORDER_DIAG' : 'sum', 
        'OTHER_DIAG_NEG' : 'sum',
       'OTHER_DIAG_POS' : 'sum', 
       'PAIN_CONTROL_DIAG' : 'sum', 
       'SEC_MAL_NEO_LIV_DIAG' : 'sum',
       'SPINE_REL_DIAG' : 'sum', 
       'SUB_ABUSE_DIAG' : 'sum', 
       'SUB_WITHDRAWAL_DIAG' : 'sum',
       'ABDOMEN_REL_PROC_NEG' : 'sum', 
       'ABDOMEN_REL_PROC_POS' : 'sum', 
       'BLOOD_REL_PROC' : 'sum',
       'BONE_REL_PROC' : 'sum', 
       'BRAIN_REL_PROC' : 'sum', 
       'HEART_REL_PROC_NEG' : 'sum',
       'HEART_REL_PROC_POS' : 'sum', 
       'LUNG_REL_PROC' : 'sum', 
       'OTHER_PROC_NEG' : 'sum',
       'OTHER_PROC_POS' : 'sum', 
       'RECTUM_PROC' : 'sum', 
       'SPINE_REL_PROC' : 'sum', 
       'SUB_REL_PROC' : 'sum',
       'URINARY_REL_PROC' : 'sum', 
       'OPIOID_HARM' : 'sum'})

var_df1.reset_index(inplace = True)
var_df1

Unnamed: 0,PATID,PAT_PLANID,INDEX_DT,CHARGE,COPAY,DEDUCT,LOS,STD_COST,ICU_IND,ICU_SURG_IND,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,OPIOID_HARM
0,33003283080,53011798434,2007-07-12,25855.00,0.0,0.0,3,28029.15,1,0,...,0,0,0,0,0,0,0,0,0,0
1,33003283085,53011734739,2007-04-12,90365.00,0.0,0.0,27,43013.86,1,0,...,0,0,0,0,0,0,0,0,0,0
2,33003283341,53042726712,2007-07-01,16024.93,0.0,0.0,3,20566.65,0,0,...,0,0,0,0,0,0,0,0,0,0
3,33003283860,53011896577,2007-09-21,3343.46,0.0,750.0,2,15690.06,0,0,...,0,0,0,0,0,0,0,0,0,0
4,33003285138,53049845397,2007-12-03,28078.64,0.0,0.0,3,13671.30,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67660,33221156226,53056167887,2007-11-21,13398.00,0.0,0.0,2,7298.33,0,0,...,0,0,0,0,0,0,0,0,0,0
67661,33222543648,53055404870,2007-05-19,10036.64,0.0,0.0,3,20566.65,0,0,...,0,0,0,0,0,0,0,0,0,0
67662,33236432156,53055832425,2007-04-06,6170.50,0.0,0.0,2,2700.00,0,0,...,0,0,0,0,0,0,0,0,0,0
67663,33246466382,53069334293,2007-06-06,33934.28,0.0,0.0,1,33932.00,0,0,...,0,0,0,0,0,0,0,0,0,0


In [131]:
var_df1.nunique()

PATID                   67445
PAT_PLANID              67665
INDEX_DT                  290
CHARGE                  65708
COPAY                    1192
DEDUCT                   7127
LOS                       215
STD_COST                12232
ICU_IND                     9
ICU_SURG_IND                6
MAJ_SURG_IND                7
MATERNITY_IND               6
GDR_CD                      3
RACE                        5
AGE                        91
ABDOMEN_REL_DIAG_NEG        4
ABDOMEN_REL_DIAG_POS        6
BLOOD_REL_DIAG              8
HEART_REL_DIAG             12
HX_TOB_HAZ_DIAG             4
INJURY_REL_DIAG             3
KIDNEY_REL_DIAG             8
LEUK_UNS_DIAG               3
LIVER_REL_DIAG              7
LUNG_REL_DIAG              10
MENTAL_DISORDER_DIAG        8
OTHER_DIAG_NEG              7
OTHER_DIAG_POS              9
PAIN_CONTROL_DIAG           4
SEC_MAL_NEO_LIV_DIAG        7
SPINE_REL_DIAG              3
SUB_ABUSE_DIAG              9
SUB_WITHDRAWAL_DIAG         5
ABDOMEN_RE

In [132]:
var_df1.rename(columns = {'OPIOID_HARM' : 'PREV_3MON_OP_HARM'}, inplace = True)
var_df1.drop(columns = 'RECTUM_PROC', inplace = True)

In [133]:
target_df.drop(columns = 'WIN_3_MON', inplace = True)

In [464]:
final_df = var_df1.merge(target_df, on=['PATID', 'PAT_PLANID', 'INDEX_DT'], how='left')
final_df.shape
final_df

(67958, 48)

Unnamed: 0,PATID,PAT_PLANID,INDEX_DT,CHARGE,COPAY,DEDUCT,LOS,STD_COST,ICU_IND,ICU_SURG_IND,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,PREV_3MON_OP_HARM,OPIOID_HARM
0,33003283080,53011798434,2007-07-12,25855.00,0.0,0.0,3,28029.15,1,0,...,0,0,0,0,0,0,0,0,0,
1,33003283085,53011734739,2007-04-12,90365.00,0.0,0.0,27,43013.86,1,0,...,0,0,0,0,0,0,0,0,0,0.0
2,33003283341,53042726712,2007-07-01,16024.93,0.0,0.0,3,20566.65,0,0,...,0,0,0,0,0,0,0,0,0,
3,33003283860,53011896577,2007-09-21,3343.46,0.0,750.0,2,15690.06,0,0,...,0,0,0,0,0,0,0,0,0,
4,33003285138,53049845397,2007-12-03,28078.64,0.0,0.0,3,13671.30,0,0,...,0,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67953,33221156226,53056167887,2007-11-21,13398.00,0.0,0.0,2,7298.33,0,0,...,0,0,0,0,0,0,0,0,0,
67954,33222543648,53055404870,2007-05-19,10036.64,0.0,0.0,3,20566.65,0,0,...,0,0,0,0,0,0,0,0,0,
67955,33236432156,53055832425,2007-04-06,6170.50,0.0,0.0,2,2700.00,0,0,...,0,0,0,0,0,0,0,0,0,
67956,33246466382,53069334293,2007-06-06,33934.28,0.0,0.0,1,33932.00,0,0,...,0,0,0,0,0,0,0,0,0,


In [465]:
final_df.isnull().sum()

PATID                       0
PAT_PLANID                  0
INDEX_DT                    0
CHARGE                      0
COPAY                       0
DEDUCT                      0
LOS                         0
STD_COST                    0
ICU_IND                     0
ICU_SURG_IND                0
MAJ_SURG_IND                0
MATERNITY_IND               0
GDR_CD                      0
RACE                        0
AGE                         0
ABDOMEN_REL_DIAG_NEG        0
ABDOMEN_REL_DIAG_POS        0
BLOOD_REL_DIAG              0
HEART_REL_DIAG              0
HX_TOB_HAZ_DIAG             0
INJURY_REL_DIAG             0
KIDNEY_REL_DIAG             0
LEUK_UNS_DIAG               0
LIVER_REL_DIAG              0
LUNG_REL_DIAG               0
MENTAL_DISORDER_DIAG        0
OTHER_DIAG_NEG              0
OTHER_DIAG_POS              0
PAIN_CONTROL_DIAG           0
SEC_MAL_NEO_LIV_DIAG        0
SPINE_REL_DIAG              0
SUB_ABUSE_DIAG              0
SUB_WITHDRAWAL_DIAG         0
ABDOMEN_RE

In [466]:
final_df['OPIOID_HARM'] = final_df['OPIOID_HARM'].fillna(0)
final_df['OPIOID_HARM'].value_counts()

0.0    67161
1.0      797
Name: OPIOID_HARM, dtype: int64

In [467]:
final_df['OPIOID_HARM'] = final_df['OPIOID_HARM'].astype(int)
final_df['OPIOID_HARM'] = final_df['OPIOID_HARM'].astype(str)
final_df['OPIOID_HARM'].value_counts()

0    67161
1      797
Name: OPIOID_HARM, dtype: int64

In [468]:
final_df.head()

Unnamed: 0,PATID,PAT_PLANID,INDEX_DT,CHARGE,COPAY,DEDUCT,LOS,STD_COST,ICU_IND,ICU_SURG_IND,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,PREV_3MON_OP_HARM,OPIOID_HARM
0,33003283080,53011798434,2007-07-12,25855.0,0.0,0.0,3,28029.15,1,0,...,0,0,0,0,0,0,0,0,0,0
1,33003283085,53011734739,2007-04-12,90365.0,0.0,0.0,27,43013.86,1,0,...,0,0,0,0,0,0,0,0,0,0
2,33003283341,53042726712,2007-07-01,16024.93,0.0,0.0,3,20566.65,0,0,...,0,0,0,0,0,0,0,0,0,0
3,33003283860,53011896577,2007-09-21,3343.46,0.0,750.0,2,15690.06,0,0,...,0,0,0,0,0,0,0,0,0,0
4,33003285138,53049845397,2007-12-03,28078.64,0.0,0.0,3,13671.3,0,0,...,0,0,0,0,0,0,0,0,0,0


In [469]:
final_df.columns

Index(['PATID', 'PAT_PLANID', 'INDEX_DT', 'CHARGE', 'COPAY', 'DEDUCT', 'LOS',
       'STD_COST', 'ICU_IND', 'ICU_SURG_IND', 'MAJ_SURG_IND', 'MATERNITY_IND',
       'GDR_CD', 'RACE', 'AGE', 'ABDOMEN_REL_DIAG_NEG', 'ABDOMEN_REL_DIAG_POS',
       'BLOOD_REL_DIAG', 'HEART_REL_DIAG', 'HX_TOB_HAZ_DIAG',
       'INJURY_REL_DIAG', 'KIDNEY_REL_DIAG', 'LEUK_UNS_DIAG', 'LIVER_REL_DIAG',
       'LUNG_REL_DIAG', 'MENTAL_DISORDER_DIAG', 'OTHER_DIAG_NEG',
       'OTHER_DIAG_POS', 'PAIN_CONTROL_DIAG', 'SEC_MAL_NEO_LIV_DIAG',
       'SPINE_REL_DIAG', 'SUB_ABUSE_DIAG', 'SUB_WITHDRAWAL_DIAG',
       'ABDOMEN_REL_PROC_NEG', 'ABDOMEN_REL_PROC_POS', 'BLOOD_REL_PROC',
       'BONE_REL_PROC', 'BRAIN_REL_PROC', 'HEART_REL_PROC_NEG',
       'HEART_REL_PROC_POS', 'LUNG_REL_PROC', 'OTHER_PROC_NEG',
       'OTHER_PROC_POS', 'SPINE_REL_PROC', 'SUB_REL_PROC', 'URINARY_REL_PROC',
       'PREV_3MON_OP_HARM', 'OPIOID_HARM'],
      dtype='object')

In [470]:
final_df.to_csv('Opioid_analytics/Akhila/overdose_prediction/data/3mon_model_final_2007.csv', index=False)