In [2]:
# import optum.utils
import pandas as pd
import numpy as np
import gzip
from io import StringIO
import pyarrow as pa
import pyarrow.parquet as pq

# Let pandas render up to 500 columns and 500 rows before truncating output.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

# Echo the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Loading Data

## Test RX file

In [3]:
# Parquet file read in the "Test RX file" section.
# NOTE(review): presumably the 2021 Q3 pharmacy-claims extract — confirm.
file_name = (
    '/N/project/optum/data/parquet/ses_81_202201/'
    'ses_r2021q3.parquet'
)

In [None]:
# Open the file through pyarrow's ParquetDataset, then pull it fully into pandas.
rx_temp = pq.ParquetDataset(file_name, use_legacy_dataset=False)
rx_df = (
    rx_temp
    .read()
    .to_pandas()
)

# Quick look at the size and the first rows of the loaded frame.
rx_df.shape
rx_df.head()

In [None]:
# How often each NDC value occurs in the RX frame.
rx_df.loc[:, 'NDC'].value_counts()

In [None]:
# First two RX rows carrying one specific NDC value.
rx_df.loc[rx_df['NDC'].eq('59267100001')].head(2)

## Loading NDC Lookup table 

In [None]:
# Load the NDC lookup table into pandas.
# NOTE(review): this is the only read in view that uses the fastparquet engine;
# the other parquet reads go through pyarrow — confirm this is intentional.
lu_ndc_df = pd.read_parquet(
    '/N/project/optum/data/parquet/ses_81_202201/lu_ndc.parquet',
    engine='fastparquet',
)

# Size and first rows of the lookup.
lu_ndc_df.shape
lu_ndc_df.head()

In [None]:
# Lookup rows for the same NDC value inspected in the RX frame.
lu_ndc_df.loc[lu_ndc_df['NDC'].eq('59267100001')].head()

## Loading Diagnosis Lookup table

In [None]:
# Load the diagnosis lookup table into pandas via the pyarrow engine.
lu_diag_df = pd.read_parquet(
    '/N/project/optum/data/parquet/ses_81_202201/lu_diagnosis.parquet',
    engine='pyarrow',
)

# Size and first rows of the lookup.
lu_diag_df.shape
lu_diag_df.head()

In [None]:
# Spot check: last rows of the lookup for version flag '9' and code '25002'.
lu_diag_df.loc[
    lu_diag_df['ICD_VER_CD'].eq('9') & lu_diag_df['DIAG_CD'].eq('25002')
].tail()

In [None]:
# Which ICD version flags occur in the lookup, and how often.
lu_diag_df.loc[:, 'ICD_VER_CD'].value_counts()

In [None]:
# ICD-9 diagnosis codes treated as opioid-related in this notebook,
# written without the decimal point.
opioid_icd9_codes = [
    '30400', '30401', '30402', '30403',
    '30470', '30471', '30472', '30473',
    '30550', '30551', '30552', '30553',
    '96500', '96501', '96502', '96509',
    '9701',
    'E8500', 'E8501', 'E8502',
    'E9350', 'E9351', 'E9352',
    'E9401',
]
len(opioid_icd9_codes)

In [None]:
# ICD-10 diagnosis codes treated as opioid-related in this notebook,
# written without the decimal point.
opioid_icd10_codes = [
    # F11* codes
    'F1110', 'F11120', 'F11121', 'F11122', 'F11129', 'F1114',
    'F11150', 'F11151', 'F11159', 'F11181', 'F11182', 'F11188', 'F1119',
    'F1120', 'F1121', 'F11220', 'F11221', 'F11222', 'F11229', 'F1123', 'F1124',
    'F11250', 'F11251', 'F11259', 'F11281', 'F11282', 'F11288', 'F1129',
    'F1190', 'F11920', 'F11921', 'F11922', 'F11929', 'F1193', 'F1194',
    'F11950', 'F11951', 'F11959', 'F11981', 'F11982', 'F11988', 'F1199',
    # T40* codes whose 6th character is 1-4
    'T400X1A', 'T400X1D', 'T400X2A', 'T400X2D', 'T400X3A', 'T400X3D', 'T400X4A', 'T400X4D',
    'T401X1A', 'T401X1D', 'T401X2A', 'T401X2D', 'T401X3A', 'T401X3D', 'T401X4A', 'T401X4D',
    'T402X1A', 'T402X1D', 'T402X2A', 'T402X2D', 'T402X3A', 'T402X3D', 'T402X4A', 'T402X4D',
    'T403X1A', 'T403X1D', 'T403X2A', 'T403X2D', 'T403X3A', 'T403X3D', 'T403X4A', 'T403X4D',
    'T404X1A', 'T404X1D', 'T404X2A', 'T404X2D', 'T404X3A', 'T404X3D', 'T404X4A', 'T404X4D',
    'T40601A', 'T40601D', 'T40602A', 'T40602D', 'T40603A', 'T40603D', 'T40604A', 'T40604D',
    'T40691A', 'T40691D', 'T40692A', 'T40692D', 'T40693A', 'T40693D', 'T40694A', 'T40694D',
    # T40* codes whose 6th character is 5
    'T400X5A', 'T400X5D', 'T402X5A', 'T402X5D', 'T403X5A', 'T403X5D', 'T404X5A', 'T404X5D',
    'T40605A', 'T40605D', 'T40695A', 'T40695D',
    # Z code
    'Z79891',
]
len(opioid_icd10_codes)

In [None]:
# First two rows of the diagnosis lookup, as a reminder of its columns.
lu_diag_df.iloc[:2]

In [None]:
# Filtering for ICD-9-CM diagnosis codes

# Keep lookup rows whose version flag is '9' and whose code is in the opioid ICD-9 list.
lu_diag_icd9_opioids = lu_diag_df.loc[
    (lu_diag_df['ICD_VER_CD'] == '9')
    & (lu_diag_df['DIAG_CD'].isin(opioid_icd9_codes))
]
lu_diag_icd9_opioids.shape
lu_diag_icd9_opioids['DIAG_CD'].nunique()
lu_diag_icd9_opioids.head()

# Rebind to a de-duplicated frame instead of calling drop_duplicates(inplace=True)
# on a filtered slice of lu_diag_df, which pandas reports with SettingWithCopyWarning.
lu_diag_icd9_opioids = lu_diag_icd9_opioids.drop_duplicates()
lu_diag_icd9_opioids.shape

In [None]:
# Filtering for ICD-10-CM diagnosis codes

# Keep lookup rows whose version flag is '10' and whose code is in the opioid ICD-10 list.
lu_diag_icd10_opioids = lu_diag_df.loc[
    (lu_diag_df['ICD_VER_CD'] == '10')
    & (lu_diag_df['DIAG_CD'].isin(opioid_icd10_codes))
]
lu_diag_icd10_opioids.shape
lu_diag_icd10_opioids['DIAG_CD'].nunique()
lu_diag_icd10_opioids.head()

# Rebind to a de-duplicated frame instead of calling drop_duplicates(inplace=True)
# on a filtered slice of lu_diag_df, which pandas reports with SettingWithCopyWarning.
lu_diag_icd10_opioids = lu_diag_icd10_opioids.drop_duplicates()
lu_diag_icd10_opioids.shape

In [None]:
# Concatenating ICD-9 and ICD-10 dfs

# Stack the two filtered lookups; ignore_index=True renumbers the rows 0..n-1.
# The variable name is spelled "opiods" (sic); it is kept because the CSV export
# cell refers to it under this name.
lu_diag_icd_opiods = pd.concat(
    [lu_diag_icd9_opioids, lu_diag_icd10_opioids],
    ignore_index=True,
)
lu_diag_icd_opiods.shape
lu_diag_icd_opiods.head(2)

In [None]:
# Write the combined opioid lookup to a CSV in the current working directory,
# without the row index.
lu_diag_icd_opiods.to_csv(
    path_or_buf='lu_diag_icd_opiods.csv',
    index=False,
)

## Test Medical Diagnosis table

In [None]:
# Parquet file read in the "Test Medical Diagnosis table" section.
# NOTE(review): presumably the 2021 Q3 diagnosis extract — confirm.
file_name1 = (
    '/N/project/optum/data/parquet/ses_81_202201/'
    'ses_diag2021q3.parquet'
)

In [None]:
# Open the file through pyarrow's ParquetDataset, then pull it fully into pandas.
diag_temp = pq.ParquetDataset(file_name1, use_legacy_dataset=False)
diag_df = (
    diag_temp
    .read()
    .to_pandas()
)

# Quick look at the size and the first rows of the loaded frame.
diag_df.shape
diag_df.head()

In [None]:
# Which ICD_FLAG values occur in the diagnosis frame, and how often.
diag_df.loc[:, 'ICD_FLAG'].value_counts()

In [None]:
# How often each DIAG value occurs in the diagnosis frame.
diag_df.loc[:, 'DIAG'].value_counts()

In [None]:
# Rows of the diagnosis frame whose DIAG value is in the opioid ICD-10 list.
# The subset is built once and reused, so the isin() scan over the full frame
# is not repeated for the second expression.
diag_icd10_opioid_rows = diag_df[diag_df['DIAG'].isin(opioid_icd10_codes)]
diag_icd10_opioid_rows.shape
diag_icd10_opioid_rows['DIAG'].unique()

In [None]:
# Shape of the diagnosis rows whose DIAG value is in the opioid ICD-9 list.
# NOTE(review): unlike the lookup filter, this does not also restrict on ICD_FLAG — confirm that is intended.
diag_df.loc[diag_df['DIAG'].isin(opioid_icd9_codes)].shape

In [None]:
# Distinct DIAG values among rows flagged ICD_FLAG == '9'.
diag_df.loc[diag_df['ICD_FLAG'] == '9', 'DIAG'].unique()