In [74]:
# Importing required libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pickle
import random
import os
from glob import glob
import pyarrow.parquet as pq
import warnings
warnings.filterwarnings('ignore')
# To execute a cell line by line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Loading Member Enrollment and Inpatient data

In [75]:
# Opioid harm datasets

mbr_op_df = pd.read_csv('Opioid_analytics/Akhila/overdose_prediction/data/mbr_op_2007.csv')
mbr_op_df.shape
mbr_op_df.head()

inp_op_df = pd.read_csv('Opioid_analytics/Akhila/overdose_prediction/data/inp_op_2007.csv')
inp_op_df.shape
inp_op_df.head()

(37644, 6)

Unnamed: 0,PATID,TOTAL_MON_COV,AGE,GDR_CD,RACE,STATE
0,33003284598,193.0,54.0,M,W,MO
1,33003285835,54.0,64.0,F,W,AZ
2,33003287387,58.0,38.0,F,W,AZ
3,33003288343,88.0,62.0,F,W,CO
4,33003289068,30.0,50.0,M,W,MN


(61179, 20)

Unnamed: 0,PATID,DIAG1,DIAG2,DIAG3,DIAG4,DIAG5,DRG,ICD_FLAG,LOS,PROC1,PROC2,PROC3,PROC4,PROC5,STD_COST,ICU_IND,ICU_SURG_IND,MAJ_SURG_IND,MATERNITY_IND,DRG_DESC
0,33003284598,78659,4019,3051,53081,496,313,9,1,0,0,0,0,0,6358.05,1,0,0,0,CHEST PAIN
1,33003290698,55321,9974,51889,49320,5601,354,9,6,5361,0,0,0,0,18954.75,0,0,1,0,HERNIA PX EXC ING & FEM W CC
2,33003292417,2920,30401,3051,311,30501,897,9,4,9468,0,0,0,0,8430.48,0,0,0,0,"ALC/DRUG ABUS/DEP, WO REHAB THER WO MCC"
3,33003292982,49122,41401,4019,412,V4582,192,9,2,0,0,0,0,0,6402.83,0,0,0,0,CHRONIC OBS PULM DISEASE WO CC/MCC
4,33003294584,5770,2721,3051,V1581,0000,440,9,2,0,0,0,0,0,8014.73,0,0,0,0,DIS OF PANCREAS EXC MALIG WO CC/MCC


In [76]:
# Non opioid harm datasets

mbr_non_op_df = pd.read_csv('Opioid_analytics/Akhila/overdose_prediction/data/mbr_non_op_2007.csv')
mbr_non_op_df.shape
mbr_non_op_df.head()

inp_non_op_df = pd.read_csv('Opioid_analytics/Akhila/overdose_prediction/data/inp_non_op_2007.csv')
inp_non_op_df.shape
inp_non_op_df.head()

(363849, 6)

Unnamed: 0,PATID,TOTAL_MON_COV,AGE,GDR_CD,RACE,STATE
0,33003282019,12.0,45.0,F,W,NE
1,33003282261,12.0,31.0,F,B,CA
2,33003282417,30.0,55.0,F,W,AZ
3,33003282532,23.0,70.0,F,W,MO
4,33003282641,78.0,39.0,F,B,NJ


(509284, 20)

Unnamed: 0,PATID,DIAG1,DIAG2,DIAG3,DIAG4,DIAG5,DRG,ICD_FLAG,LOS,PROC1,PROC2,PROC3,PROC4,PROC5,STD_COST,ICU_IND,ICU_SURG_IND,MAJ_SURG_IND,MATERNITY_IND,DRG_DESC
0,33003282019,78659,42789,340,32723,33394,313,9,2,0,0,0,0,0,6358.05,1,0,0,0,CHEST PAIN
1,33003282261,64511,66411,66541,66201,V270,775,9,2,7359,7569,7534,0,0,6775.95,0,0,0,1,VAG DEL WO COMPLICATING DX
2,33003282417,2182,78820,6271,6259,2449,743,9,2,6841,6564,39,0,0,11581.8,0,0,1,0,UTER&ADNX PX/NON-MALIGNANCY WO CC/MCC
3,33003283085,34690,29623,30780,0,0000,103,9,9,3893,0,0,0,0,9820.65,0,0,0,0,HEADACHES WO MCC
4,33003283151,41091,5180,41401,25000,53081,247,9,1,66,3607,3722,8856,8853,28029.15,1,1,1,0,PERC CVASC PX W DRG-ELUT STNT WO MCC


# Analyzing harm

In [77]:
from collections import Counter
print("Gender count in Opioid harm patients: ",Counter(mbr_op_df['GDR_CD']))
print("Gender count in Non-Opioid harm patients: ",Counter(mbr_non_op_df.GDR_CD))


Gender count in Opioid harm patients:  Counter({'M': 19124, 'F': 18518, 'U': 2})
Gender count in Non-Opioid harm patients:  Counter({'F': 244709, 'M': 119109, 'U': 31})


## Checking for ICD code incidence differential in both samples 

In [78]:
# Loading DIAG codes dictionary

diag_table = pd.read_csv('Opioid_analytics/Akhila/overdose_prediction/files/diag_icd_codes.csv')

# df2 = {'DIAG_CD': '0000', 'DIAG_DESC': 'NO DIAGNOSIS CODE'}
# diag_table = diag_table.append(df2, ignore_index = True)

diag_table.tail()


Unnamed: 0,DIAG_CD,DIAG_DESC
113415,Z992,DEPENDENCE ON RENAL DIALYSIS
113416,Z993,DEPENDENCE ON WHEELCHAIR
113417,Z998,DEPEND OTH ENABLING MACHINES&DEVICE
113418,Z9981,DEPENDENCE ON SUPPLEMENTAL OXYGEN
113419,Z9989,DEPEND OTH ENABLING MACHINES&DEVICE


In [79]:
diag_dict = dict(diag_table.values)
diag_dict

{'001': 'CHOLERA',
 '0010': 'CHOLERA DUE TO VIBRIO CHOLERAE',
 '0011': 'CHOLERA DUE VIBRIO CHOLERAE EL TOR',
 '0019': 'UNSPECIFIED CHOLERA',
 '002': 'TYPHOID AND PARATYPHOID FEVERS',
 '0020': 'TYPHOID FEVER',
 '0021': 'PARATYPHOID FEVER A',
 '0022': 'PARATYPHOID FEVER B',
 '0023': 'PARATYPHOID FEVER C',
 '0029': 'UNSPECIFIED PARATYPHOID FEVER',
 '003': 'OTHER SALMONELLA INFECTIONS',
 '0030': 'SALMONELLA GASTROENTERITIS',
 '0031': 'SALMONELLA SEPTICEMIA',
 '0032': 'LOCALIZED SALMONELLA INFECTIONS',
 '00320': 'UNSPEC LOCALIZED SALMONELLA INF',
 '00321': 'SALMONELLA MENINGITIS',
 '00322': 'SALMONELLA PNEUMONIA',
 '00323': 'SALMONELLA ARTHRITIS',
 '00324': 'SALMONELLA OSTEOMYELITIS',
 '00329': 'OTH LOCALIZED SALMONELLA INFECTIONS',
 '0038': 'OTHER SPEC SALMONELLA INFECTIONS',
 '0039': 'UNSPECIFIED SALMONELLA INFECTION',
 '004': 'SHIGELLOSIS',
 '0040': 'SHIGELLA DYSENTERIAE',
 '0041': 'SHIGELLA FLEXNERI',
 '0042': 'SHIGELLA BOYDII',
 '0043': 'SHIGELLA SONNEI',
 '0048': 'OTHER SPECIFIED SHIG

In [80]:
# Loading PROC codes dictionary

proc_table = pd.read_csv('Opioid_analytics/Akhila/overdose_prediction/files/proc_codes.csv')

# df2 = {'PROC_CD': '0', 'PROC_DESC': 'NO PROCEDURE CODE'}
# proc_table = proc_table.append(df2, ignore_index = True)

# df3 = {'PROC_CD': '00000', 'PROC_DESC': 'NO PROCEDURE CODE'}
# proc_table = proc_table.append(df3, ignore_index = True)

proc_table.tail()

Unnamed: 0,PROC_CD,PROC_DESC
112566,Z9173,VAN LIFTS WHEELCHAIR TIE DWN
112567,Z9174,ENVIR SAFETY /CONTROLDEVICES
112568,Z9175,ALLERGY CONTROL SUPPLIES
112569,Z9176,SINGLE ROOM AIR CONDITIONER
112570,ZZINT,INTEREST PAYMENT -- PACIFICARE


In [81]:
proc_dict = dict(proc_table.values)
proc_dict

{'.2540': 'AUDIO DIAGNOSTIC BATTERY',
 '00': 'PROCEDURES AND INTERVENTIONS NEC',
 '000': 'THERAPEUTIC ULTRASOUND',
 '0001': 'THERAPEUTIC US VESSELS HEAD&NECK',
 '0001A': 'ADM SARSCOV2 30MCG/0.3ML 1ST',
 '0001F': 'HEART FAILURE COMPOSITE',
 '0001M': 'INFCT DS CHRNC HCV 6 BIOCHEM ASSAY SRM ALG LVR',
 '0001T': 'ENDOVASCULAR REPAIR',
 '0001U': 'RBC DNA HEA 35 AG 11 BLD GRP',
 '0002': 'THERAPEUTIC ULTRASOUND OF HEART',
 '0002A': 'ADM SARSCOV2 30MCG/0.3ML 2ND',
 '0002F': 'TOBACCO USE SMOKING ASSESSED',
 '0002M': 'LIVER DIS 10 ASSAYS W/ASH',
 '0002T': 'AORTO-UNI-ILIAC PROSTHES',
 '0002U': 'ONC CLRCT 3 UR METAB ALG PLP',
 '0003': 'THERAPEUTIC US PERIPH VASC VESSELS',
 '0003A': 'ADM SARSCOV2 30MCG/0.3ML 3RD',
 '0003F': 'TOBACCO USE NON-SMOKING ASSESSED',
 '0003M': 'LIVER DIS 10 ASSAYS W/NASH',
 '0003T': 'CERVICOGRAPHY',
 '0003U': 'ONC OVAR 5 PRTN SER ALG SCOR',
 '0004A': 'ADM SARSCOV2 30MCG/0.3ML BST',
 '0004F': 'TOBACCO USE CESSATION INTERVEN CNSL',
 '0004M': 'SCOLIOSIS 53 SNP SALIVA SCOR',
 '

In [82]:
# Function to get all the ICD and PROC codes from the inpatient file for both samples

def getCodes(op_inp, non_op_inp, code_col):
    op_list = []
    non_op_list = []
    for i in range(1,6):
        op_list = op_list + list(op_inp[code_col + str(i)])
        non_op_list = non_op_list + list(non_op_inp[code_col + str(i)])
    
    return op_list, non_op_list

In [83]:
# Collect the ICD codes from the inpatient file

full_icd_op, full_icd_non_op = getCodes(inp_op_df, inp_non_op_df, 'DIAG')


In [84]:
counter_icd_op = Counter(full_icd_op)
counter_icd_op

counter_icd_non_op = Counter(full_icd_non_op)
counter_icd_non_op

Counter({'78659': 1480,
         '55321': 198,
         '2920': 1450,
         '49122': 361,
         '5770': 1499,
         '25092': 36,
         '7384': 216,
         '99931': 25,
         '570': 81,
         '29630': 476,
         '71697': 14,
         '65421': 50,
         '185': 305,
         '33818': 160,
         '30391': 1198,
         '71596': 221,
         '5771': 511,
         '99649': 177,
         '30781': 39,
         '5641': 489,
         '566': 136,
         '78650': 760,
         '1456': 4,
         '4358': 14,
         '34590': 423,
         '71536': 687,
         '99667': 65,
         '6256': 201,
         '72402': 386,
         '83906': 6,
         '53020': 30,
         '481': 61,
         '29644': 75,
         '8244': 96,
         '7220': 599,
         '29650': 428,
         '2189': 503,
         '59010': 126,
         '30301': 309,
         '29634': 193,
         '96500': 138,
         '57400': 369,
         '28731': 43,
         '53550': 335,
         '29633': 10

Counter({'78659': 8009,
         '64511': 6357,
         '2182': 2269,
         '34690': 4458,
         '41091': 521,
         '64421': 5227,
         '64891': 14951,
         '65201': 108,
         '99832': 1006,
         '6185': 731,
         '42821': 256,
         '1561': 76,
         '99859': 4867,
         '72210': 5956,
         '486': 18842,
         '65421': 16904,
         '65961': 8803,
         '5589': 3674,
         '81000': 260,
         '64231': 2923,
         '5533': 3432,
         '41400': 8778,
         '7802': 4538,
         'V5789': 8866,
         '5551': 363,
         '57471': 127,
         '41401': 25873,
         '99604': 163,
         '5762': 646,
         '1551': 184,
         '72402': 3649,
         '71536': 10933,
         '65821': 778,
         '515': 2001,
         '57470': 307,
         '25041': 321,
         '075': 269,
         '6259': 2724,
         '5400': 1116,
         '71596': 3579,
         '7211': 964,
         'V5429': 47,
         '1965': 166,
  

In [85]:
counter_icd_op['']

0

In [86]:
# del counter_icd_op[np.nan]
# del counter_icd_non_op[np.nan]
# del counter_icd_op['OTHER']
# del counter_icd_non_op['OTHER']

In [87]:
# Function to calculate ICD code incidence difference between top 'n' variables for two samples

def incDiffCalcICD(n, icd_codes, op_counter, non_op_counter, sample = 'op', diff_threshold = 0.3):
    
    feature_df = pd.DataFrame({'ICD_CODE':[],
                            'ICD_DESC':[],
                            'HARM_SAMPLE_PCT':[],
                            'NON_HARM_SAMPLE_PCT':[],
                            'RELATIVE_DIFF':[]})
    
    op_icd_size, non_op_icd_size = (sum(op_counter.values()), sum(non_op_counter.values()))
    
    print("Sample type to check: ", sample)
    if sample == 'op':
        common_op = op_counter.most_common(n)
        for i in range(0,n):
            print(i)
            op_sample_pct = common_op[i][1]/op_icd_size
            
            if common_op[i][0] not in icd_codes.keys():
                print(common_op[i][0], "not in DIAG code dictionary")
                continue
            
            icd_desc = icd_codes[common_op[i][0]]
            print(common_op[i], icd_desc, op_sample_pct)
            
            non_op_sample_pct = non_op_counter[str(common_op[i][0])]/non_op_icd_size
            print('control group:',non_op_counter[str(common_op[i][0])],'/',non_op_icd_size,non_op_sample_pct)
            
            if non_op_sample_pct == 0:
                print("Non opioid harm sample% is 0 for", common_op[i][0])
                continue
            
            relative_differential = (op_sample_pct/non_op_sample_pct)-1
            print('relative differential:', relative_differential)

            if abs(relative_differential) > diff_threshold:
                feature_df = feature_df.append({'ICD_CODE':common_op[i][0],
                                        'ICD_DESC':icd_desc,
                                        'HARM_SAMPLE_PCT':op_sample_pct,
                                        'NON_HARM_SAMPLE_PCT':non_op_sample_pct,
                                        'RELATIVE_DIFF':relative_differential}, ignore_index=True)
    else:
        common_non_op = non_op_counter.most_common(n)
        for i in range(0,n):
            print(i)
            non_op_sample_pct = common_non_op[i][1]/non_op_icd_size
            
            if common_non_op[i][0] not in icd_codes.keys():
                print(common_op[i][0], "not in DIAG code dictionary")
                continue
            
            icd_desc = icd_codes[common_non_op[i][0]]    
            print(common_non_op[i], icd_desc, non_op_sample_pct)

            op_sample_pct = op_counter[str(common_non_op[i][0])]/op_icd_size
            print('test group:',op_counter[str(common_non_op[i][0])],'/',op_icd_size, op_sample_pct)
            
            if op_sample_pct == 0:
                print("Opioid harm sample% is 0 for", common_op[i][0])
                continue

            relative_differential = (op_sample_pct/non_op_sample_pct) -1
            print('relative differential:', relative_differential)

            if abs(relative_differential) > diff_threshold:
                feature_df = feature_df.append({'ICD_CODE':common_non_op[i][0],
                                        'ICD_DESC':icd_desc,
                                        'HARM_SAMPLE_PCT':op_sample_pct,
                                        'non_op_SAMPLE_PCT':non_op_sample_pct,
                                        'RELATIVE_DIFF':relative_differential}, ignore_index=True)
                    
    return feature_df


In [88]:
icd_rel_diff_df = incDiffCalcICD(200, diag_dict, counter_icd_op, counter_icd_non_op, sample = 'op', diff_threshold = 0.3)


Sample type to check:  op
0
0000 not in DIAG code dictionary
1
('3051', 29533) NONDEPENDENT TOBACCO USE DISORDER 0.09654620049363344
control group: 0 / 2546420 0.0
Non opioid harm sample% is 0 for 3051
2
('4019', 11985) UNSPECIFIED ESSENTIAL HYPERTENSION 0.039180110822341
control group: 100941 / 2546420 0.0396403578357066
relative differential: -0.011610566566355063
3
('30500', 4140) NONDPND ALCOHL ABS UNS DRUNKENNESS 0.013534055803461972
control group: 0 / 2546420 0.0
Non opioid harm sample% is 0 for 30500
4
('496', 3655) CHRONIC AIRWAY OBSTRUCTION NEC 0.011948544435182007
control group: 20146 / 2546420 0.007911499281344004
relative differential: 0.5102756140492488
5
('41401', 3608) COR ATHEROSLERO NATIVE COR ART 0.011794896941761062
control group: 25873 / 2546420 0.010160539109809066
relative differential: 0.16085345612952517
6
('53081', 3452) ESOPHAGEAL REFLUX 0.011284918027427713
control group: 26461 / 2546420 0.010391451528027585
relative differential: 0.08598091392624885
7
('2724

73
('04111', 556) METH SUSCEPT STAPH INF CCE&UNS SITE 0.0018176171562137335
control group: 3955 / 2546420 0.0015531609082555116
relative differential: 0.17026970390032248
74
('07070', 540) UNS VIRAL HEPATITIS C W/O HEP COMA 0.0017653116265385182
control group: 1414 / 2546420 0.0005552893866683422
relative differential: 2.179084039639472
75
('7291', 539) UNSPECIFIED MYALGIA AND MYOSITIS 0.0017620425309338172
control group: 2610 / 2546420 0.001024968386990363
relative differential: 0.7191189048354372
76
('56400', 536) UNSPECIFIED CONSTIPATION 0.0017522352441197142
control group: 4276 / 2546420 0.0016792202386094988
relative differential: 0.04348149446476213
77
('4254', 534) OTHER PRIMARY CARDIOMYOPATHIES 0.0017456970529103125
control group: 6976 / 2546420 0.002739532363082288
relative differential: -0.36277553189910006
78
('9694', 521) POISN BENZODIAZEPINE-BASED TRANQ 0.0017031988100492
control group: 356 / 2546420 0.0001398041171527085
relative differential: 11.18275144349855
79
('5771'

130
('5712', 365) ALCOHOLIC CIRRHOSIS OF LIVER 0.0011932198957158502
control group: 1245 / 2546420 0.0004889217018402306
relative differential: 1.4405132585130564
131
('7140', 362) RHEUMATOID ARTHRITIS 0.0011834126089017474
control group: 3517 / 2546420 0.0013811547191743703
relative differential: -0.14317158499869564
132
('49122', 361) OBST CHRN BRONCHITIS W/AC BRONCHIT 0.0011801435132970464
control group: 1239 / 2546420 0.00048656545267473553
relative differential: 1.4254568564405687
133
('7806', 361) FEVER & OTH PHYSIOL DISTURBANC TEMP 0.0011801435132970464
control group: 4851 / 2546420 0.001905027450302778
relative differential: -0.3805110193506772
134
('9974', 355) DIGESTIVE SYSTEM COMPLICATION NEC 0.0011605289396688406
control group: 4463 / 2546420 0.0017526566709340956
relative differential: -0.3378458206247974
135
('43310', 351) OCCL&STENOS CAROTID ART W/O INFARCT 0.0011474525572500367
control group: 2278 / 2546420 0.0008945892664996347
relative differential: 0.2826585341670933

197
('7895', 242) ASCITES 0.0007911211363376321
control group: 1999 / 2546420 0.0007850236803041132
relative differential: 0.007767225609241368
198
('E8490', 241) PLACE OF OCCURRENCE, HOME 0.0007878520407329313
control group: 1720 / 2546420 0.0006754580941085917
relative differential: 0.16639662416462264
199
('30002', 240) GENERALIZED ANXIETY DISORDER 0.0007845829451282303
control group: 503 / 2546420 0.00019753222170733814
relative differential: 2.9719238630883265


In [89]:
icd_rel_diff_df = icd_rel_diff_df.sort_values(by = ['RELATIVE_DIFF'], ascending = False)
icd_rel_diff_df


Unnamed: 0,ICD_CODE,ICD_DESC,HARM_SAMPLE_PCT,NON_HARM_SAMPLE_PCT,RELATIVE_DIFF
10,2920,DRUG WITHDRAWAL,0.004740,0.000060,78.411257
102,29284,DRUG-INDUCED MOOD DISORDER,0.000807,0.000023,34.450847
95,V6549,OTHER SPECIFIED COUNSELING,0.000876,0.000027,31.808286
42,9694,POISN BENZODIAZEPINE-BASED TRANQ,0.001703,0.000140,11.182751
79,30301,ACUT ALCOHLIC INTOXICATION CONT,0.001010,0.000099,9.207411
...,...,...,...,...,...
52,V1582,PERS HX TOBACCO PRS HAZARDS HLTH,0.001471,0.003080,-0.522313
6,4280,CHF UNSPECIFIED,0.005306,0.011382,-0.533842
88,1977,SEC MALIGNANT NEOPLASM OF LIVER,0.000915,0.002013,-0.545199
44,5859,CHRONIC KIDNEY DISEASE UNSPECIFIED,0.001648,0.003949,-0.582741


In [90]:
icd_rel_diff_df.to_excel('Opioid_analytics/Akhila/overdose_prediction/files/diag_code_differential.xlsx')

## Checking for PROC code incidence differential in both samples 

In [91]:
# Collect the proc codes from the inpatient file

full_proc_op, full_proc_non_op = getCodes(inp_op_df, inp_non_op_df, 'PROC')


In [92]:
counter_proc_op = Counter(full_proc_op)
counter_proc_op

counter_proc_non_op = Counter(full_proc_non_op)
counter_proc_non_op

Counter({'0000000': 90427,
         '5361': 109,
         '9468': 154,
         '8108': 441,
         '9904': 634,
         '3893': 1004,
         '8191': 34,
         '741': 67,
         '605': 125,
         '9462': 424,
         '8154': 465,
         '3491': 86,
         '9915': 122,
         '8132': 17,
         '8841': 116,
         '4881': 30,
         '4041': 28,
         '8622': 198,
         '6859': 174,
         '0309': 264,
         '8051': 1148,
         '4516': 577,
         '7936': 275,
         '8102': 573,
         '6839': 36,
         '9604': 309,
         '3794': 36,
         '4131': 39,
         '5110': 40,
         '3950': 184,
         '4023': 26,
         '3220': 17,
         '539': 8,
         '9463': 59,
         '4576': 160,
         '8107': 46,
         '6849': 480,
         '0066': 688,
         '8659': 154,
         '3972': 27,
         '9461': 11,
         '4562': 86,
         '6529': 42,
         '3421': 27,
         '4579': 40,
         '00000': 279,
     

Counter({0: 807289,
         7359: 9538,
         6841: 287,
         3893: 8357,
         66: 3365,
         741: 22068,
         8622: 1489,
         7051: 364,
         5185: 778,
         5419: 202,
         8051: 5221,
         4513: 2750,
         5491: 1421,
         7569: 8392,
         537: 213,
         3950: 843,
         4525: 1720,
         5123: 3993,
         3775: 57,
         8754: 54,
         9705: 97,
         309: 1967,
         8154: 6886,
         3249: 98,
         5569: 200,
         4701: 2722,
         403: 1193,
         736: 2973,
         5503: 296,
         5459: 3272,
         8151: 3214,
         9671: 1624,
         9925: 2359,
         8102: 2264,
         8011: 32,
         8180: 281,
         4011: 675,
         6829: 844,
         4516: 4490,
         8105: 341,
         9904: 9707,
         34: 312,
         6849: 5569,
         3409: 264,
         282: 177,
         7936: 1544,
         4497: 46,
         765: 57,
         3324: 1011,
         84

In [93]:
counter_proc_op['']

0

In [94]:
# del counter_proc_op[np.nan]
# del counter_proc_non_op[np.nan]
# del counter_proc_op['OTHER']
# del counter_proc_non_op['OTHER']

In [95]:
# check for condition where most frequent code is in op sample but not in non-op sample

In [96]:
# Function to calculate PROC code incidence difference between top 'n' variables for two samples

def incDiffCalcPROC(n, proc_codes, op_counter, non_op_counter, sample = 'op', diff_threshold = 0.3):
    
    feature_df = pd.DataFrame({'PROC_CODE':[],
                            'PROC_DESC':[],
                            'HARM_SAMPLE_PCT':[],
                            'NON_OP_SAMPLE_PCT':[],
                            'RELATIVE_DIFF':[]})
    
    op_proc_size, non_op_proc_size = (sum(op_counter.values()), sum(non_op_counter.values()))
    
    print("Sample type to check: ", sample)
    if sample == 'op':
        common_op = op_counter.most_common(n)
        for i in range(0,n):
            print(i)
            op_sample_pct = common_op[i][1]/op_proc_size
            
            if str(common_op[i][0]) not in proc_codes.keys():
                print(common_op[i][0], "not in PROC code dictionary")
                continue
                
            proc_desc = proc_codes[str(common_op[i][0])]
            print(common_op[i], op_sample_pct)
            
            non_op_sample_pct = non_op_counter[str(common_op[i][0])]/non_op_proc_size
            print('control group:',non_op_counter[str(common_op[i][0])],'/',non_op_proc_size, non_op_sample_pct)
            
            if non_op_sample_pct == 0:
                print("Non opioid harm sample% is 0 for", common_op[i][0])
                continue
            
            relative_differential = (op_sample_pct/non_op_sample_pct)-1
            print('relative differential:', relative_differential)

            if abs(relative_differential) > diff_threshold:
                feature_df = feature_df.append({'PROC_CODE':common_op[i][0],
                                        'PROC_DESC':proc_desc,
                                        'HARM_SAMPLE_PCT':op_sample_pct,
                                        'NON_OP_SAMPLE_PCT':non_op_sample_pct,
                                        'RELATIVE_DIFF':relative_differential}, ignore_index=True)
    else:
        common_non_op = non_op_counter.most_common(n)
        for i in range(0,n):
            print(i)
            non_op_sample_pct = common_non_op[i][1]/non_op_proc_size
            
            if str(common_non_op[i][0]) not in proc_codes.keys():
                print(common_non_op[i][0], "not in PROC code dictionary")
                continue
                
            proc_desc = proc_codes[str(common_non_op[i][0])]
            print(common_non_op[i], non_op_sample_pct)

            op_sample_pct = op_counter[str(common_non_op[i][0])]/op_proc_size
            print('test group:', op_counter[str(common_non_op[i][0])],'/',op_proc_size, op_sample_pct)
            
            if op_sample_pct == 0:
                print("Opioid harm sample% is 0 for", common_non_op[i][0])
                continue

            relative_differential = (op_sample_pct/non_op_sample_pct) -1
            print('relative differential:', relative_differential)

            if abs(relative_differential) > diff_threshold:
                feature_df = feature_df.append({'PROC_CODE':common_non_op[i][0],
                                        'PROC_DESC':proc_desc,
                                        'HARM_SAMPLE_PCT':op_sample_pct,
                                        'NON_OP_SAMPLE_PCT':non_op_sample_pct,
                                        'RELATIVE_DIFF':relative_differential}, ignore_index=True)
                    
    return feature_df


In [97]:
proc_rel_diff_df = incDiffCalcPROC(200, proc_dict, counter_proc_op, counter_proc_non_op, sample = 'op', diff_threshold = 0.3)


Sample type to check:  op
0
0 not in PROC code dictionary
1
0000000 not in PROC code dictionary
2
(3893, 1417) 0.004632308471861259
control group: 9667 / 2546420 0.003796310113806835
relative differential: 0.22021340011554225
3
(8162, 1363) 0.004455777309207408
control group: 5333 / 2546420 0.0020943127999308833
relative differential: 1.1275605579808605
4
(3722, 1353) 0.004423086353160398
control group: 5964 / 2546420 0.0023421116705021165
relative differential: 0.8885036135839541
5
('8051', 1148) 0.0037529217541967014
control group: 4697 / 2546420 0.001844550388388404
relative differential: 1.0345997473539632
6
(8853, 1133) 0.003703885320126187
control group: 4792 / 2546420 0.0018818576668420764
relative differential: 0.9682069400825803
7
(8856, 1063) 0.00347504862779712
control group: 6333 / 2546420 0.0024870209941800646
relative differential: 0.3972735396802687
8
(8051, 1011) 0.00330505565635267
control group: 4697 / 2546420 0.001844550388388404
relative differential: 0.791794725239

92
('8753', 179) 0.0005851681132414717
control group: 1416 / 2546420 0.0005560748030568406
relative differential: 0.05231905855956809
93
(5491, 176) 0.0005753608264273688
control group: 1288 / 2546420 0.0005058081541929454
relative differential: 0.1375080090304197
94
('6859', 174) 0.000568822635217967
control group: 1384 / 2546420 0.0005435081408408668
relative differential: 0.046576108939115235
95
(8163, 174) 0.000568822635217967
control group: 897 / 2546420 0.00035225925024151555
relative differential: 0.614784096735491
96
(8753, 173) 0.000565553539613266
control group: 1416 / 2546420 0.0005560748030568406
relative differential: 0.01704579402684514
97
('598', 171) 0.0005590153484038641
control group: 2448 / 2546420 0.0009613496595219956
relative differential: -0.41850985968849364
98
('4523', 171) 0.0005590153484038641
control group: 1619 / 2546420 0.0006357945664894243
relative differential: -0.12076104787982234
99
(3606, 170) 0.0005557462527991631
control group: 833 / 2546420 0.0003

172
('5794', 92) 0.0003007567956324883
control group: 971 / 2546420 0.00038131965661595494
relative differential: -0.211273821282718
173
('7935', 91) 0.0002974877000277873
control group: 698 / 2546420 0.00027411031958592846
relative differential: 0.08528456891799174
174
('9339', 91) 0.0002974877000277873
control group: 913 / 2546420 0.00035854258134950244
relative differential: -0.17028627699369325
175
(4719, 91) 0.0002974877000277873
control group: 547 / 2546420 0.0002148113822543021
relative differential: 0.384878663811258
176
(7935, 90) 0.00029421860442308635
control group: 698 / 2546420 0.00027411031958592846
relative differential: 0.07335836486394776
177
(8106, 88) 0.0002876804132136844
control group: 465 / 2546420 0.00018260931032586925
relative differential: 0.5753874361625597
178
(605, 88) 0.0002876804132136844
control group: 1076 / 2546420 0.000422554017012119
relative differential: -0.3191866563052135
179
(9905, 88) 0.0002876804132136844
control group: 1014 / 2546420 0.000398

In [98]:
proc_rel_diff_df = proc_rel_diff_df.sort_values(by = ['RELATIVE_DIFF'], ascending = False)
proc_rel_diff_df

Unnamed: 0,PROC_CODE,PROC_DESC,HARM_SAMPLE_PCT,NON_OP_SAMPLE_PCT,RELATIVE_DIFF
62,481,PROCTOSTOMY,0.000369,3.927082e-07,939.667419
66,41,OPERATIONS BONE MARROW AND SPLEEN,0.000350,3.927082e-07,889.720476
89,75,OTHER OBSTETRIC OPERATIONS,0.000265,3.927082e-07,673.283725
14,9465,DRUG DETOXIFICATION,0.001419,2.356249e-06,601.138141
19,9465,DRUG DETOXIFICATION,0.001275,2.356249e-06,540.091878
...,...,...,...,...,...
35,5459,OTHER LYSIS OF PERITONEAL ADHESIONS,0.000700,1.584970e-03,-0.558612
60,3961,XTRACORP CIRC DURING OPN HEART SURG,0.000386,9.707747e-04,-0.602634
68,5732,OTHER CYSTOSCOPY,0.000330,8.502132e-04,-0.611652
81,3995,HEMODIALYSIS,0.000281,9.464267e-04,-0.702943


In [99]:
proc_rel_diff_df.dtypes

PROC_CODE             object
PROC_DESC             object
HARM_SAMPLE_PCT      float64
NON_OP_SAMPLE_PCT    float64
RELATIVE_DIFF        float64
dtype: object

In [100]:
proc_rel_diff_df[proc_rel_diff_df['PROC_DESC'] == '3893']

Unnamed: 0,PROC_CODE,PROC_DESC,HARM_SAMPLE_PCT,NON_OP_SAMPLE_PCT,RELATIVE_DIFF


In [101]:
proc_rel_diff_df.to_excel('Opioid_analytics/Akhila/overdose_prediction/files/proc_code_differential.xlsx')

In [102]:
# remove new borns - remove age<10, ignore related to OBSTETRIC

# Generating random sample for non-opioid harm patients

In [103]:
num_of_patients = 40000

In [104]:
rand_patient_sample =  random.sample(list(set(inp_non_op_df.PATID)), num_of_patients)
len(rand_patient_sample)

40000

In [105]:
mbr_non_op_df.head()

inp_non_op_df.head()

Unnamed: 0,PATID,TOTAL_MON_COV,AGE,GDR_CD,RACE,STATE
0,33003282019,12.0,45.0,F,W,NE
1,33003282261,12.0,31.0,F,B,CA
2,33003282417,30.0,55.0,F,W,AZ
3,33003282532,23.0,70.0,F,W,MO
4,33003282641,78.0,39.0,F,B,NJ


Unnamed: 0,PATID,DIAG1,DIAG2,DIAG3,DIAG4,DIAG5,DRG,ICD_FLAG,LOS,PROC1,PROC2,PROC3,PROC4,PROC5,STD_COST,ICU_IND,ICU_SURG_IND,MAJ_SURG_IND,MATERNITY_IND,DRG_DESC
0,33003282019,78659,42789,340,32723,33394,313,9,2,0,0,0,0,0,6358.05,1,0,0,0,CHEST PAIN
1,33003282261,64511,66411,66541,66201,V270,775,9,2,7359,7569,7534,0,0,6775.95,0,0,0,1,VAG DEL WO COMPLICATING DX
2,33003282417,2182,78820,6271,6259,2449,743,9,2,6841,6564,39,0,0,11581.8,0,0,1,0,UTER&ADNX PX/NON-MALIGNANCY WO CC/MCC
3,33003283085,34690,29623,30780,0,0000,103,9,9,3893,0,0,0,0,9820.65,0,0,0,0,HEADACHES WO MCC
4,33003283151,41091,5180,41401,25000,53081,247,9,1,66,3607,3722,8856,8853,28029.15,1,1,1,0,PERC CVASC PX W DRG-ELUT STNT WO MCC


In [106]:
# Filtering non-opiod harm member enrollment files for random patients sample

mbr_rand_df = mbr_non_op_df[mbr_non_op_df['PATID'].isin(rand_patient_sample)]
mbr_rand_df.shape
mbr_rand_df['PATID'].nunique()

(39374, 6)

39374

In [107]:
# Filtering non-opiod harm inpatient files for random patients sample

inp_rand_df = inp_non_op_df[inp_non_op_df['PATID'].isin(rand_patient_sample)]
inp_rand_df.shape
inp_rand_df['PATID'].nunique()

(54960, 20)

40000

# Feature Engineering

## Idenitfying major DIAG and PROC code categories to create new columns

In [108]:
diag_cat_map = pd.read_excel('Opioid_analytics/Akhila/overdose_prediction/files/diag_code_map.xlsx')
diag_cat_map = diag_cat_map[diag_cat_map['STATUS'] == 1]
diag_cat_map.head()

Unnamed: 0,ICD_CODE,ICD_DESC,HARM_SAMPLE_PCT,NON_HARM_SAMPLE_PCT,RELATIVE_DIFF,ICD_CAT,STATUS
0,5771,CHRONIC PANCREATITIS,0.001671,0.00038,3.398981,ABDOMEN_REL_DIAG_POS,1
1,5770,ACUTE PANCREATITIS,0.0049,0.00201,1.437666,ABDOMEN_REL_DIAG_POS,1
2,5363,GASTROPARESIS,0.00102,0.000472,1.160766,ABDOMEN_REL_DIAG_POS,1
3,53550,UNS GASTRIT&GASTRODUODIT NO HEMORR,0.001095,0.00065,0.684,ABDOMEN_REL_DIAG_POS,1
4,5641,IRRITABLE BOWEL SYNDROME,0.001599,0.000998,0.601998,ABDOMEN_REL_DIAG_POS,1


In [109]:
diag_cat_map['STATUS'].value_counts()

1    104
Name: STATUS, dtype: int64

In [110]:
diag_cat_map['ICD_CODE'] = diag_cat_map['ICD_CODE'].astype(str)
diag_cat_dict = diag_cat_map.groupby('ICD_CAT')['ICD_CODE'].agg(list).to_dict()
diag_cat_dict

{'ABDOMEN_REL_DIAG_NEG': ['9974'],
 'ABDOMEN_REL_DIAG_POS': ['5771',
  '5770',
  '5363',
  '53550',
  '5641',
  '5559',
  '56211',
  'V4586'],
 'BLOOD_REL_DIAG': ['2859', '2800', '0389', '2851'],
 'HEART_REL_DIAG': ['4254', 'V4581', '41400', '4240', '4280', '42731'],
 'HX_TOB_HAZ_DIAG': ['V1582'],
 'INJURY_REL_DIAG': ['V173', 'V454'],
 'KIDNEY_REL_DIAG': ['591', '5849', '5859'],
 'LEUK_UNS_DIAG': ['28860'],
 'LIVER_REL_DIAG': ['07070', '07054'],
 'LUNG_REL_DIAG': ['49122',
  '49322',
  '49121',
  '4928',
  '1623',
  '496',
  '4660',
  '49392'],
 'MENTAL_DISORDER_DIAG': ['29284',
  '3019',
  '30183',
  '29650',
  '29633',
  'V6284',
  '29690',
  '29630',
  '29689',
  '30981',
  '29620',
  '29680',
  '30002',
  '30001',
  '31401',
  '3004',
  '30000',
  '311',
  '78052',
  '78097'],
 'OTHER_DIAG_NEG': ['V6549',
  'V1581',
  '34590',
  '78039',
  '7291',
  'V5869',
  '6823',
  '4439'],
 'OTHER_DIAG_POS': ['25000',
  '78079',
  '6170',
  '2449',
  '185',
  '78820',
  '99591',
  'V5789',
  

In [111]:
proc_cat_map = pd.read_excel('Opioid_analytics/Akhila/overdose_prediction/files/proc_code_map.xlsx')
proc_cat_map = proc_cat_map[proc_cat_map['STATUS'] == 1]
proc_cat_map.head()

Unnamed: 0,PROC_CODE,PROC_DESC,HARM_SAMPLE_PCT,NON_OP_SAMPLE_PCT,RELATIVE_DIFF,PROC_CAT,STATUS
0,391,INTRA-ABDOMINAL VENOUS SHUNT,0.000304,7e-06,39.74619,ABDOMEN_REL_PROC_POS,1
1,4719,OTHER INCIDENTAL APPENDECTOMY,0.000297,0.000215,0.384879,ABDOMEN_REL_PROC_POS,1
2,5451,LAPAROSCOPIC LYSIS PERITONEAL ADHES,0.000304,0.0005,-0.392325,ABDOMEN_REL_PROC_NEG,1
3,5459,OTHER LYSIS OF PERITONEAL ADHESIONS,0.0007,0.001585,-0.558612,ABDOMEN_REL_PROC_NEG,1
4,4576,OPEN AND OTHER SIGMOIDECTOMY,0.000523,0.000371,0.409438,ABDOMEN_REL_PROC_POS,1


In [112]:
proc_cat_map['STATUS'].value_counts()

1    68
Name: STATUS, dtype: int64

In [113]:
proc_cat_map['PROC_CODE'] = proc_cat_map['PROC_CODE'].astype(str)
proc_cat_dict = proc_cat_map.groupby('PROC_CAT')['PROC_CODE'].agg(list).to_dict()
proc_cat_dict

{'ABDOMEN_REL_PROC_NEG': ['5451', '5459', '544'],
 'ABDOMEN_REL_PROC_POS': ['391', '4719', '4576', '5361'],
 'BLOOD_REL_PROC': ['9904', '9907', '3995'],
 'BONE_REL_PROC': ['41', '7779', '7869', '7936', '8452'],
 'BRAIN_REL_PROC': ['8841', '8891'],
 'HEART_REL_PROC_NEG': ['3615', '3961', '9925'],
 'HEART_REL_PROC_POS': ['390',
  '3929',
  '66',
  '3990',
  '3607',
  '3606',
  '8853',
  '3950',
  '3722',
  '8856',
  '9910',
  '3812',
  '3812',
  '8842'],
 'LUNG_REL_PROC': ['331', '3229', '9671', '3404', '9604'],
 'OTHER_PROC_NEG': ['9915', '8154', '605', '6561'],
 'OTHER_PROC_POS': ['8848',
  '8102',
  '75',
  '9920',
  '8659',
  '8703',
  '8604',
  '8741',
  '4516',
  '8703'],
 'RECTUM_PROC': ['481'],
 'SPINE_REL_PROC': ['8106', '8162', '8051', '8451', '331', '8108', '8163'],
 'SUB_REL_PROC': ['9465', '9468', '9462'],
 'URINARY_REL_PROC': ['598', '8774', '5979', '5732']}

# Creating columns for DIAG and PROC codes with high differential

In [114]:
# Creating a new column with category name - with 1 if any code in the list is present across all 5 DIAG/PROC columns

def catMap(cat_dict, check_cols, df):
    
    for i in cat_dict.keys():
        codes = cat_dict[i]
        df[i] = df[check_cols].isin(codes).any(axis=1).astype(int)
    
    return df

In [115]:
diag_cols = ['DIAG1', 'DIAG2', 'DIAG3', 'DIAG4', 'DIAG5']

inp_rand_df1 = catMap(diag_cat_dict, diag_cols, inp_rand_df)
inp_op_df1 = catMap(diag_cat_dict, diag_cols, inp_op_df)

In [116]:
proc_cols = ['PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5']

inp_rand_df1 = catMap(proc_cat_dict, proc_cols, inp_rand_df1)
inp_op_df1 = catMap(proc_cat_dict, proc_cols, inp_op_df1)

In [117]:
inp_rand_df1

Unnamed: 0,PATID,DIAG1,DIAG2,DIAG3,DIAG4,DIAG5,DRG,ICD_FLAG,LOS,PROC1,...,BRAIN_REL_PROC,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC
2,33003282417,2182,78820,6271,6259,2449,743,9,2,6841,...,0,0,0,0,0,0,0,0,0,0
5,33003283156,64421,66612,66111,65971,64842,765,9,4,741,...,0,0,0,0,0,0,0,0,0,0
7,33003283464,65201,65701,V270,65221,65421,765,9,3,741,...,0,0,0,0,0,0,0,0,0,0
10,33003283943,6185,6256,4019,2720,2777,748,9,1,7051,...,0,0,0,0,0,0,0,0,0,0
38,33003292589,65821,65221,V270,0000,0000,766,9,8,741,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509214,33081535048,42789,4280,5859,42781,42731,UNK,9,4,0,...,0,0,0,0,0,0,0,0,0,0
509215,33081535658,486,2761,73319,73300,4019,UNK,9,24,0,...,0,0,0,0,0,0,0,0,0,0
509220,33081546675,82009,73300,2449,2768,40290,UNK,9,5,0,...,0,0,0,0,0,0,0,0,0,0
509268,33220405526,65281,65451,V270,0000,0000,371,9,3,741,...,0,0,0,0,0,0,0,0,0,0


In [118]:
inp_op_df1.columns

Index(['PATID', 'DIAG1', 'DIAG2', 'DIAG3', 'DIAG4', 'DIAG5', 'DRG', 'ICD_FLAG',
       'LOS', 'PROC1', 'PROC2', 'PROC3', 'PROC4', 'PROC5', 'STD_COST',
       'ICU_IND', 'ICU_SURG_IND', 'MAJ_SURG_IND', 'MATERNITY_IND', 'DRG_DESC',
       'ABDOMEN_REL_DIAG_NEG', 'ABDOMEN_REL_DIAG_POS', 'BLOOD_REL_DIAG',
       'HEART_REL_DIAG', 'HX_TOB_HAZ_DIAG', 'INJURY_REL_DIAG',
       'KIDNEY_REL_DIAG', 'LEUK_UNS_DIAG', 'LIVER_REL_DIAG', 'LUNG_REL_DIAG',
       'MENTAL_DISORDER_DIAG', 'OTHER_DIAG_NEG', 'OTHER_DIAG_POS',
       'PAIN_CONTROL_DIAG', 'SEC_MAL_NEO_LIV_DIAG', 'SPINE_REL_DIAG',
       'SUB_ABUSE_DIAG', 'SUB_WITHDRAWAL_DIAG', 'ABDOMEN_REL_PROC_NEG',
       'ABDOMEN_REL_PROC_POS', 'BLOOD_REL_PROC', 'BONE_REL_PROC',
       'BRAIN_REL_PROC', 'HEART_REL_PROC_NEG', 'HEART_REL_PROC_POS',
       'LUNG_REL_PROC', 'OTHER_PROC_NEG', 'OTHER_PROC_POS', 'RECTUM_PROC',
       'SPINE_REL_PROC', 'SUB_REL_PROC', 'URINARY_REL_PROC'],
      dtype='object')

In [119]:
cgroup_names = ['PATID', 'LOS', 'STD_COST',
       'ICU_IND', 'ICU_SURG_IND', 'MAJ_SURG_IND', 'MATERNITY_IND',
       'ABDOMEN_REL_DIAG_NEG', 'ABDOMEN_REL_DIAG_POS', 'BLOOD_REL_DIAG',
       'HEART_REL_DIAG', 'HX_TOB_HAZ_DIAG', 'INJURY_REL_DIAG',
       'KIDNEY_REL_DIAG', 'LEUK_UNS_DIAG', 'LIVER_REL_DIAG', 'LUNG_REL_DIAG',
       'MENTAL_DISORDER_DIAG', 'OTHER_DIAG_NEG', 'OTHER_DIAG_POS',
       'PAIN_CONTROL_DIAG', 'SEC_MAL_NEO_LIV_DIAG', 'SPINE_REL_DIAG',
       'SUB_ABUSE_DIAG', 'SUB_WITHDRAWAL_DIAG', 'ABDOMEN_REL_PROC_NEG',
       'ABDOMEN_REL_PROC_POS', 'BLOOD_REL_PROC', 'BONE_REL_PROC',
       'BRAIN_REL_PROC', 'HEART_REL_PROC_NEG', 'HEART_REL_PROC_POS',
       'LUNG_REL_PROC', 'OTHER_PROC_NEG', 'OTHER_PROC_POS', 'RECTUM_PROC',
       'SPINE_REL_PROC', 'SUB_REL_PROC', 'URINARY_REL_PROC']

In [120]:
# cgroup_names = ['PATID','LOS', 'STD_COST',
#        'ICU_IND', 'ICU_SURG_IND', 'MAJ_SURG_IND', 'MATERNITY_IND',
#        'BLOOD_REL_DIAG', 'HEART_REL_DIAG', 'HX_TOB_HAZ_DIAG',
#        'INJURY_REL_DIAG', 'KIDNEY_REL_DIAG', 'LEUK_UNS_DIAG', 'LIVER_REL_DIAG',
#        'LUNG_REL_DIAG', 'MENTAL_DISORDER_DIAG', 'OTHER_DIAG_NEG',
#        'OTHER_DIAG_POS', 'PAIN_CONTROL_DIAG', 'SEC_MAL_NEO_LIV_DIAG',
#        'SPINE_REL_DIAG', 'SUB_ABUSE_DIAG', 'SUB_WITHDRAWAL_DIAG',
#        'ABDOMEN_REL_PROC_NEG', 'ABDOMEN_REL_PROC_POS', 'BLOOD_REL_PROC',
#        'BONE_REL_PROC', 'BRAIN_REL_PROC', 'HEART_REL_PROC_NEG',
#        'HEART_REL_PROC_POS', 'LUNG_REL_PROC', 'OTHER_PROC_NEG',
#        'OTHER_PROC_POS', 'RECTUM_PROC', 'SPINE_REL_PROC', 'SUB_REL_PROC',
#        'URINARY_REL_PROC']

In [121]:
inp_op_countergroup = inp_op_df1[cgroup_names]
inp_op_countergroup.head(2)

Unnamed: 0,PATID,LOS,STD_COST,ICU_IND,ICU_SURG_IND,MAJ_SURG_IND,MATERNITY_IND,ABDOMEN_REL_DIAG_NEG,ABDOMEN_REL_DIAG_POS,BLOOD_REL_DIAG,...,BRAIN_REL_PROC,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC
0,33003284598,1,6358.05,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33003290698,6,18954.75,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [122]:
inp_op_countergroup = inp_op_countergroup.groupby('PATID').agg(sum).reset_index()
inp_op_countergroup


Unnamed: 0,PATID,LOS,STD_COST,ICU_IND,ICU_SURG_IND,MAJ_SURG_IND,MATERNITY_IND,ABDOMEN_REL_DIAG_NEG,ABDOMEN_REL_DIAG_POS,BLOOD_REL_DIAG,...,BRAIN_REL_PROC,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC
0,33003284598,1,6358.05,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33003285835,6,9566.93,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,33003287387,2,4917.78,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,33003288343,3,28656.00,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,33003289068,5,5366.61,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37652,33141257936,2,8343.08,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37653,33160156245,3,35999.11,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37654,33162794507,5,8330.04,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37655,33220486386,1,10193.78,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [123]:
inp_rand_countergroup = inp_rand_df1[cgroup_names]
inp_rand_countergroup.head(2)

Unnamed: 0,PATID,LOS,STD_COST,ICU_IND,ICU_SURG_IND,MAJ_SURG_IND,MATERNITY_IND,ABDOMEN_REL_DIAG_NEG,ABDOMEN_REL_DIAG_POS,BLOOD_REL_DIAG,...,BRAIN_REL_PROC,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC
2,33003282417,2,11581.8,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,33003283156,4,11820.6,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [124]:
inp_rand_countergroup = inp_rand_countergroup.groupby('PATID').agg(sum).reset_index()
inp_rand_countergroup


Unnamed: 0,PATID,LOS,STD_COST,ICU_IND,ICU_SURG_IND,MAJ_SURG_IND,MATERNITY_IND,ABDOMEN_REL_DIAG_NEG,ABDOMEN_REL_DIAG_POS,BLOOD_REL_DIAG,...,BRAIN_REL_PROC,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC
0,33003282417,2,11581.80,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33003283156,4,11820.60,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,33003283191,5,0.00,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,33003283464,3,11820.60,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,33003283943,1,10537.05,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,33183000639,2,11865.12,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39996,33220405526,3,20566.65,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39997,33220513607,3,20566.65,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39998,33220518933,4,20566.65,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [125]:
mbr_rand_df.head()

Unnamed: 0,PATID,TOTAL_MON_COV,AGE,GDR_CD,RACE,STATE
2,33003282417,30.0,55.0,F,W,AZ
13,33003283156,147.0,28.0,F,W,CO
14,33003283191,8.0,40.0,M,W,CA
20,33003283464,117.0,37.0,F,W,MD
25,33003283943,60.0,57.0,F,W,UT


In [126]:
mbr_op_df.head()

Unnamed: 0,PATID,TOTAL_MON_COV,AGE,GDR_CD,RACE,STATE
0,33003284598,193.0,54.0,M,W,MO
1,33003285835,54.0,64.0,F,W,AZ
2,33003287387,58.0,38.0,F,W,AZ
3,33003288343,88.0,62.0,F,W,CO
4,33003289068,30.0,50.0,M,W,MN


In [127]:
summ_op_joined = mbr_op_df.join(inp_op_countergroup.set_index('PATID'),on='PATID')
summ_op_joined['OPIOID_HARMED'] = '1' 
summ_op_joined.shape
summ_op_joined.head()

(37644, 45)

Unnamed: 0,PATID,TOTAL_MON_COV,AGE,GDR_CD,RACE,STATE,LOS,STD_COST,ICU_IND,ICU_SURG_IND,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,OPIOID_HARMED
0,33003284598,193.0,54.0,M,W,MO,1,6358.05,1,0,...,0,0,0,0,0,0,0,0,0,1
1,33003285835,54.0,64.0,F,W,AZ,6,9566.93,0,0,...,0,0,0,0,0,0,0,0,0,1
2,33003287387,58.0,38.0,F,W,AZ,2,4917.78,0,0,...,0,0,0,0,0,0,0,0,0,1
3,33003288343,88.0,62.0,F,W,CO,3,28656.0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,33003289068,30.0,50.0,M,W,MN,5,5366.61,0,0,...,0,0,0,0,0,0,0,0,1,1


In [128]:
summ_rand_joined = mbr_rand_df.join(inp_rand_countergroup.set_index('PATID'),on='PATID')
summ_rand_joined['OPIOID_HARMED'] = '0'
summ_rand_joined.shape
summ_rand_joined.head()

(39374, 45)

Unnamed: 0,PATID,TOTAL_MON_COV,AGE,GDR_CD,RACE,STATE,LOS,STD_COST,ICU_IND,ICU_SURG_IND,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,OPIOID_HARMED
2,33003282417,30.0,55.0,F,W,AZ,2,11581.8,0,0,...,0,0,0,0,0,0,0,0,0,0
13,33003283156,147.0,28.0,F,W,CO,4,11820.6,0,0,...,0,0,0,0,0,0,0,0,0,0
14,33003283191,8.0,40.0,M,W,CA,5,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,33003283464,117.0,37.0,F,W,MD,3,11820.6,0,0,...,0,0,0,0,0,0,0,0,0,0
25,33003283943,60.0,57.0,F,W,UT,1,10537.05,0,0,...,0,0,0,0,0,0,0,0,0,0


In [129]:
for col in inp_op_df1.columns:
    inp_op_df1[col].value_counts()

33032311841    23
33017039241    23
33029159849    22
33032410617    22
33033385886    20
               ..
33025447829     1
33025439029     1
33025437212     1
33025432762     1
33222407594     1
Name: PATID, Length: 37657, dtype: int64

41401    1373
5770     1234
486      1111
78659    1055
72210     982
         ... 
74685       1
8603        1
86101       1
1714        1
41090       1
Name: DIAG1, Length: 3248, dtype: int64

3051     5080
4019     2450
30500    1888
496      1784
0000     1641
         ... 
90089       1
64232       1
4371        1
28310       1
7590        1
Name: DIAG2, Length: 3250, dtype: int64

3051     7636
0000     5366
4019     3780
30500    1034
496       995
         ... 
5760        1
88111       1
1149        1
1973        1
7576        1
Name: DIAG3, Length: 3150, dtype: int64

0000     11543
3051      8503
4019      3272
2724      1153
53081     1034
         ...  
83402        1
6271         1
70909        1
9219         1
87359        1
Name: DIAG4, Length: 2909, dtype: int64

0000     19140
3051      8305
4019      2355
53081      922
2724       835
         ...  
56941        1
62210        1
V1303        1
34489        1
94428        1
Name: DIAG5, Length: 2672, dtype: int64

UNK    7638
430    2698
523    1909
359    1101
544     994
       ... 
623       1
755       1
821       1
868       1
118       1
Name: DRG, Length: 792, dtype: int64

9    61179
Name: ICD_FLAG, dtype: int64

1      14051
2      12746
3       9982
4       6692
5       4492
       ...  
121        1
283        1
87         1
114        1
290        1
Name: LOS, Length: 113, dtype: int64

0000000    13540
0          12556
0066         655
8102         551
66           518
           ...  
6509           1
5019           1
9084           1
8236           1
4533           1
Name: PROC1, Length: 2286, dtype: int64

0000000    21698
0          19286
8051         702
8051         491
3607         436
           ...  
7967           1
8212           1
6579           1
8682           1
3787           1
Name: PROC2, Length: 2126, dtype: int64

0       49071
8162     1023
3722      666
8853      662
3893      434
        ...  
7661        1
7639        1
7857        1
4051        1
4193        1
Name: PROC3, Length: 1100, dtype: int64

0000000    28824
0          25183
8856         335
8856         311
8451         305
           ...  
2756           1
0443           1
8754           1
5422           1
8389           1
Name: PROC4, Length: 1140, dtype: int64

0          30371
0000000    26365
8853         302
8853         273
8451         139
           ...  
4611           1
5493           1
8926           1
3778           1
0351           1
Name: PROC5, Length: 807, dtype: int64

0.00         2162
3278.52      1541
6322.86      1412
4917.78      1410
8430.48      1260
             ... 
100878.26       1
477.00          1
1171.40         1
1283.93         1
3604.00         1
Name: STD_COST, Length: 6286, dtype: int64

0    46547
1    14632
Name: ICU_IND, dtype: int64

0    56116
1     5063
Name: ICU_SURG_IND, dtype: int64

0    41674
1    19505
Name: MAJ_SURG_IND, dtype: int64

0    60788
1      391
Name: MATERNITY_IND, dtype: int64

UNKNOWN DRG CODE                            7638
OBSOLETE-PSYCHOSES                          2698
OBSOLETE-ALC/DRUG ABUSE, DEPEND W/O REHA    1909
OBSOLETE-UTER&ADNEX PROC FOR NON-MALIG W    1101
PATH FX&MSSKL/CONN TISS MALIG WO CC/MCC      994
                                            ... 
OTH MYLPRLF DIS/PRLY DIF NEOPL DX W CC         1
INTRACRAN VASC PROC W PDX HEMORRHG W MCC       1
OTH MALE REP SYS DX WO CC/MCC                  1
PROSTATIC OR PX UNREL TO PDX WO CC/MCC         1
OBSOLETE-CARDIAC PACEMAKER DEVICE REPLAC       1
Name: DRG_DESC, Length: 778, dtype: int64

0    60824
1      355
Name: ABDOMEN_REL_DIAG_NEG, dtype: int64

0    56849
1     4330
Name: ABDOMEN_REL_DIAG_POS, dtype: int64

0    58397
1     2782
Name: BLOOD_REL_DIAG, dtype: int64

0    57215
1     3964
Name: HEART_REL_DIAG, dtype: int64

0    60729
1      450
Name: HX_TOB_HAZ_DIAG, dtype: int64

0    60459
1      720
Name: INJURY_REL_DIAG, dtype: int64

0    59450
1     1729
Name: KIDNEY_REL_DIAG, dtype: int64

0    60743
1      436
Name: LEUK_UNS_DIAG, dtype: int64

0    60217
1      962
Name: LIVER_REL_DIAG, dtype: int64

0    53935
1     7244
Name: LUNG_REL_DIAG, dtype: int64

0    49991
1    11188
Name: MENTAL_DISORDER_DIAG, dtype: int64

0    57506
1     3673
Name: OTHER_DIAG_NEG, dtype: int64

0    50549
1    10630
Name: OTHER_DIAG_POS, dtype: int64

0    54433
1     6746
Name: PAIN_CONTROL_DIAG, dtype: int64

0    60899
1      280
Name: SEC_MAL_NEO_LIV_DIAG, dtype: int64

0    58586
1     2593
Name: SPINE_REL_DIAG, dtype: int64

0    57397
1     3782
Name: SUB_ABUSE_DIAG, dtype: int64

0    58925
1     2254
Name: SUB_WITHDRAWAL_DIAG, dtype: int64

0    60825
1      354
Name: ABDOMEN_REL_PROC_NEG, dtype: int64

0    60868
1      311
Name: ABDOMEN_REL_PROC_POS, dtype: int64

0    60408
1      771
Name: BLOOD_REL_PROC, dtype: int64

0    60501
1      678
Name: BONE_REL_PROC, dtype: int64

0    61010
1      169
Name: BRAIN_REL_PROC, dtype: int64

0    60742
1      437
Name: HEART_REL_PROC_NEG, dtype: int64

0    58818
1     2361
Name: HEART_REL_PROC_POS, dtype: int64

0    60553
1      626
Name: LUNG_REL_PROC, dtype: int64

0    60148
1     1031
Name: OTHER_PROC_NEG, dtype: int64

0    59281
1     1898
Name: OTHER_PROC_POS, dtype: int64

0    61179
Name: RECTUM_PROC, dtype: int64

0    59446
1     1733
Name: SPINE_REL_PROC, dtype: int64

0    60170
1     1009
Name: SUB_REL_PROC, dtype: int64

0    60823
1      356
Name: URINARY_REL_PROC, dtype: int64

In [130]:
for col in inp_rand_df1.columns:
    inp_rand_df1[col].value_counts()

33028351782    24
33010948871    24
33021577666    22
33060128229    22
33033998001    20
               ..
33021943566     1
33021941950     1
33021934292     1
33021926898     1
33220518933     1
Name: PATID, Length: 40000, dtype: int64

65421    1504
71536    1092
486      1035
41401    1029
V5789     798
         ... 
V152        1
73011       1
2117        1
64641       1
5851        1
Name: DIAG1, Length: 3404, dtype: int64

0000     3043
4019     2281
V270     2115
42731     916
27651     911
         ... 
64614       1
30189       1
332         1
7631        1
2121        1
Name: DIAG2, Length: 3494, dtype: int64

0000     8158
4019     3177
V270     1795
25000    1066
4280      735
         ... 
1916        1
5731        1
75026       1
81202       1
9918        1
Name: DIAG3, Length: 3326, dtype: int64

0000     13990
4019      2920
V270      1451
25000     1067
2724       970
         ...  
48239        1
76518        1
5258         1
3010         1
7818         1
Name: DIAG4, Length: 3006, dtype: int64

0000     19610
4019      2393
V270      1091
2724      1022
25000      930
         ...  
6011         1
71600        1
6261         1
0085         1
43830        1
Name: DIAG5, Length: 2817, dtype: int64

UNK    9437
371    2399
373    2172
544    1318
359    1125
       ... 
718       1
604       1
048       1
411       1
729       1
Name: DRG, Length: 818, dtype: int64

9    54960
Name: ICD_FLAG, dtype: int64

2      12152
3      10309
1       9740
4       6315
5       3515
       ...  
307        1
424        1
118        1
180        1
119        1
Name: LOS, Length: 147, dtype: int64

0          10519
0000000     8555
741         2361
741         1854
7359         785
           ...  
3805           1
7835           1
5899           1
4541           1
2641           1
Name: PROC1, Length: 2331, dtype: int64

0          18535
0000000    14094
734          392
9904         387
734          345
           ...  
733            1
3169           1
4876           1
1819           1
9351           1
Name: PROC2, Length: 2173, dtype: int64

0000000    26504
0          15381
734          364
7309         324
9904         311
           ...  
8937           1
6602           1
3229           1
8910           1
9959           1
Name: PROC3, Length: 1658, dtype: int64

0000000    39432
0           8336
8856         301
9904         269
3893         235
           ...  
5475           1
2756           1
9319           1
7771           1
8847           1
Name: PROC4, Length: 1084, dtype: int64

0          34456
0000000    16315
8853         207
9904         140
3893         104
           ...  
4011           1
4541           1
8245           1
8683           1
7932           1
Name: PROC5, Length: 881, dtype: int64

20566.65     2402
9462.45      2152
0.00         1382
7268.48      1166
6775.95       692
             ... 
18358.29        1
75718.20        1
5040.00         1
2441.00         1
158161.00       1
Name: STD_COST, Length: 6072, dtype: int64

0    43575
1    11385
Name: ICU_IND, dtype: int64

0    50524
1     4436
Name: ICU_SURG_IND, dtype: int64

0    32818
1    22142
Name: MAJ_SURG_IND, dtype: int64

0    45541
1     9419
Name: MATERNITY_IND, dtype: int64

UNKNOWN DRG CODE                            9437
MAJ GI DIS & PERITON INF W MCC              2399
MAJ GI DIS & PERITON INF WO CC/MCC          2172
PATH FX&MSSKL/CONN TISS MALIG WO CC/MCC     1318
OBSOLETE-UTER&ADNEX PROC FOR NON-MALIG W    1125
                                            ... 
OTH EAR/NOSE/MOUTH/THROAT DX W MCC             1
FL THK BRN W SK GRFT OR INH INJ WO CC/MC       1
OTH MALE REP SYS PX EXC MALIG WO CC/MCC        1
UTER,ADNX PX/NON-OV/ADNX MALIG W MCC           1
HIV W MAJ REL COND W CC                        1
Name: DRG_DESC, Length: 803, dtype: int64

0    54529
1      431
Name: ABDOMEN_REL_DIAG_NEG, dtype: int64

0    52867
1     2093
Name: ABDOMEN_REL_DIAG_POS, dtype: int64

0    50532
1     4428
Name: BLOOD_REL_DIAG, dtype: int64

0    48220
1     6740
Name: HEART_REL_DIAG, dtype: int64

0    54133
1      827
Name: HX_TOB_HAZ_DIAG, dtype: int64

0    54653
1      307
Name: INJURY_REL_DIAG, dtype: int64

0    51883
1     3077
Name: KIDNEY_REL_DIAG, dtype: int64

0    54696
1      264
Name: LEUK_UNS_DIAG, dtype: int64

0    54689
1      271
Name: LIVER_REL_DIAG, dtype: int64

0    51090
1     3870
Name: LUNG_REL_DIAG, dtype: int64

0    51504
1     3456
Name: MENTAL_DISORDER_DIAG, dtype: int64

0    53199
1     1761
Name: OTHER_DIAG_NEG, dtype: int64

0    40579
1    14381
Name: OTHER_DIAG_POS, dtype: int64

0    51808
1     3152
Name: PAIN_CONTROL_DIAG, dtype: int64

0    54469
1      491
Name: SEC_MAL_NEO_LIV_DIAG, dtype: int64

0    53552
1     1408
Name: SPINE_REL_DIAG, dtype: int64

0    54480
1      480
Name: SUB_ABUSE_DIAG, dtype: int64

0    54821
1      139
Name: SUB_WITHDRAWAL_DIAG, dtype: int64

0    54292
1      668
Name: ABDOMEN_REL_PROC_NEG, dtype: int64

0    54740
1      220
Name: ABDOMEN_REL_PROC_POS, dtype: int64

0    53452
1     1508
Name: BLOOD_REL_PROC, dtype: int64

0    54490
1      470
Name: BONE_REL_PROC, dtype: int64

0    54830
1      130
Name: BRAIN_REL_PROC, dtype: int64

0    54308
1      652
Name: HEART_REL_PROC_NEG, dtype: int64

0    53418
1     1542
Name: HEART_REL_PROC_POS, dtype: int64

0    54446
1      514
Name: LUNG_REL_PROC, dtype: int64

0    53701
1     1259
Name: OTHER_PROC_NEG, dtype: int64

0    53783
1     1177
Name: OTHER_PROC_POS, dtype: int64

0    54959
1        1
Name: RECTUM_PROC, dtype: int64

0    53825
1     1135
Name: SPINE_REL_PROC, dtype: int64

0    54909
1       51
Name: SUB_REL_PROC, dtype: int64

0    54323
1      637
Name: URINARY_REL_PROC, dtype: int64

In [142]:
summ_feed_joined = summ_op_joined.append(summ_rand_joined).reset_index(drop = True)
summ_feed_joined.shape
summ_feed_joined

(77018, 45)

Unnamed: 0,PATID,TOTAL_MON_COV,AGE,GDR_CD,RACE,STATE,LOS,STD_COST,ICU_IND,ICU_SURG_IND,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,OPIOID_HARMED
0,33003284598,193.0,54.0,M,W,MO,1,6358.05,1,0,...,0,0,0,0,0,0,0,0,0,1
1,33003285835,54.0,64.0,F,W,AZ,6,9566.93,0,0,...,0,0,0,0,0,0,0,0,0,1
2,33003287387,58.0,38.0,F,W,AZ,2,4917.78,0,0,...,0,0,0,0,0,0,0,0,0,1
3,33003288343,88.0,62.0,F,W,CO,3,28656.00,0,0,...,0,0,0,0,0,0,0,0,0,1
4,33003289068,30.0,50.0,M,W,MN,5,5366.61,0,0,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77013,33183000639,49.0,53.0,F,U,CA,2,11865.12,0,0,...,0,0,0,0,0,0,0,0,0,0
77014,33220405526,12.0,32.0,F,W,NY,3,20566.65,0,0,...,0,0,0,0,0,0,0,0,0,0
77015,33220513607,14.0,38.0,F,W,NE,3,20566.65,0,0,...,0,0,0,0,0,0,0,0,0,0
77016,33220518933,42.0,33.0,F,A,NY,4,20566.65,0,0,...,0,0,0,0,0,0,0,0,0,0


In [144]:
summ_feed_joined.dtypes

PATID                     int64
TOTAL_MON_COV           float64
AGE                     float64
GDR_CD                   object
RACE                     object
STATE                    object
LOS                       int64
STD_COST                float64
ICU_IND                   int64
ICU_SURG_IND              int64
MAJ_SURG_IND              int64
MATERNITY_IND             int64
ABDOMEN_REL_DIAG_NEG      int64
ABDOMEN_REL_DIAG_POS      int64
BLOOD_REL_DIAG            int64
HEART_REL_DIAG            int64
HX_TOB_HAZ_DIAG           int64
INJURY_REL_DIAG           int64
KIDNEY_REL_DIAG           int64
LEUK_UNS_DIAG             int64
LIVER_REL_DIAG            int64
LUNG_REL_DIAG             int64
MENTAL_DISORDER_DIAG      int64
OTHER_DIAG_NEG            int64
OTHER_DIAG_POS            int64
PAIN_CONTROL_DIAG         int64
SEC_MAL_NEO_LIV_DIAG      int64
SPINE_REL_DIAG            int64
SUB_ABUSE_DIAG            int64
SUB_WITHDRAWAL_DIAG       int64
ABDOMEN_REL_PROC_NEG      int64
ABDOMEN_

In [145]:
summ_feed_joined['OPIOID_HARMED'] = summ_feed_joined['OPIOID_HARMED'].astype(str)

In [146]:
summ_feed_joined.columns

Index(['PATID', 'TOTAL_MON_COV', 'AGE', 'GDR_CD', 'RACE', 'STATE', 'LOS',
       'STD_COST', 'ICU_IND', 'ICU_SURG_IND', 'MAJ_SURG_IND', 'MATERNITY_IND',
       'ABDOMEN_REL_DIAG_NEG', 'ABDOMEN_REL_DIAG_POS', 'BLOOD_REL_DIAG',
       'HEART_REL_DIAG', 'HX_TOB_HAZ_DIAG', 'INJURY_REL_DIAG',
       'KIDNEY_REL_DIAG', 'LEUK_UNS_DIAG', 'LIVER_REL_DIAG', 'LUNG_REL_DIAG',
       'MENTAL_DISORDER_DIAG', 'OTHER_DIAG_NEG', 'OTHER_DIAG_POS',
       'PAIN_CONTROL_DIAG', 'SEC_MAL_NEO_LIV_DIAG', 'SPINE_REL_DIAG',
       'SUB_ABUSE_DIAG', 'SUB_WITHDRAWAL_DIAG', 'ABDOMEN_REL_PROC_NEG',
       'ABDOMEN_REL_PROC_POS', 'BLOOD_REL_PROC', 'BONE_REL_PROC',
       'BRAIN_REL_PROC', 'HEART_REL_PROC_NEG', 'HEART_REL_PROC_POS',
       'LUNG_REL_PROC', 'OTHER_PROC_NEG', 'OTHER_PROC_POS', 'RECTUM_PROC',
       'SPINE_REL_PROC', 'SUB_REL_PROC', 'URINARY_REL_PROC', 'OPIOID_HARMED'],
      dtype='object')

In [147]:
summ_feed_joined.head()

Unnamed: 0,PATID,TOTAL_MON_COV,AGE,GDR_CD,RACE,STATE,LOS,STD_COST,ICU_IND,ICU_SURG_IND,...,HEART_REL_PROC_NEG,HEART_REL_PROC_POS,LUNG_REL_PROC,OTHER_PROC_NEG,OTHER_PROC_POS,RECTUM_PROC,SPINE_REL_PROC,SUB_REL_PROC,URINARY_REL_PROC,OPIOID_HARMED
0,33003284598,193.0,54.0,M,W,MO,1,6358.05,1,0,...,0,0,0,0,0,0,0,0,0,1
1,33003285835,54.0,64.0,F,W,AZ,6,9566.93,0,0,...,0,0,0,0,0,0,0,0,0,1
2,33003287387,58.0,38.0,F,W,AZ,2,4917.78,0,0,...,0,0,0,0,0,0,0,0,0,1
3,33003288343,88.0,62.0,F,W,CO,3,28656.0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,33003289068,30.0,50.0,M,W,MN,5,5366.61,0,0,...,0,0,0,0,0,0,0,0,1,1


In [148]:
summ_feed_joined.to_csv('Opioid_analytics/Akhila/overdose_prediction/data/model_final_2007.csv', index=False)