# Importing required libraries

In [3]:
import pandas as pd
import numpy as np
# import seaborn as sns
import pyarrow as pa
import pyarrow.parquet as pq

pd.set_option('display.max_columns', 500)
# To execute a cell line by line
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Extracting Patient IDs for different Division - Race combinations

## Loading SES Table

In [4]:
ses_df = pd.read_parquet('/N/project/optum/data/parquet/ses_81_202107/ses_ses.parquet', engine='pyarrow')
ses_df.shape
ses_df.head(2)

(60503043, 10)

Unnamed: 0,PATID,D_EDUCATION_LEVEL_CODE,D_FED_POVERTY_STATUS_CODE,D_HOME_OWNERSHIP_CODE,D_HOUSEHOLD_INCOME_RANGE_CODE,D_NETWORTH_RANGE_CODE,D_OCCUPATION_TYPE_CODE,D_RACE_CODE,NUM_ADULTS,NUM_CHILD
0,802666110000096,B,A,1,2,1,U,W,2,3
1,802666110000117,C,A,2,1,1,U,W,1,0


In [5]:
ses_df.groupby('D_RACE_CODE').agg({'PATID':'nunique'})

Unnamed: 0_level_0,PATID
D_RACE_CODE,Unnamed: 1_level_1
,1692852
A,3157847
B,6698555
H,8015670
W,40938119


In [6]:
#Replacing race codes with their definitions

ses_df.loc[ses_df["D_RACE_CODE"] == "A", "D_RACE_CODE"] = "Asian"
ses_df.loc[ses_df["D_RACE_CODE"] == "B", "D_RACE_CODE"] = "Black"
ses_df.loc[ses_df["D_RACE_CODE"] == "H", "D_RACE_CODE"] = "Hispanic"
ses_df.loc[ses_df["D_RACE_CODE"] == "W", "D_RACE_CODE"] = "White"
ses_df.loc[ses_df["D_RACE_CODE"] == "", "D_RACE_CODE"] = "Unknown"

In [7]:
ses_df.groupby('D_RACE_CODE').agg({'PATID':'nunique'})

Unnamed: 0_level_0,PATID
D_RACE_CODE,Unnamed: 1_level_1
Asian,3157847
Black,6698555
Hispanic,8015670
Unknown,1692852
White,40938119


In [8]:
ses_pat_race = ses_df[['PATID', 'D_RACE_CODE']]
ses_pat_race.shape
ses_pat_race.drop_duplicates(inplace = True)
ses_pat_race.shape

(60503043, 2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ses_pat_race.drop_duplicates(inplace = True)


(60503043, 2)

## Loading Member Enrollment data

In [9]:
mbr_enroll_df = pd.read_parquet('/N/project/optum/data/parquet/ses_81_202201/ses_mbr_enroll.parquet', engine='pyarrow')
mbr_enroll_df.shape
mbr_enroll_df.head(2)

(135536313, 14)

Unnamed: 0,PATID,PAT_PLANID,ASO,BUS,CDHP,DIVISION,ELIGEFF,ELIGEND,GDR_CD,GROUP_NBR,HEALTH_EXCH,LIS_DUAL,PRODUCT,YRDOB
0,802666110000096,208426656918632,N,COM,3,EAST NORTH CENTRAL,2019-09-01,2021-08-31,M,LLLU1OPCP,0,,POS,2011
1,802666110000117,208426147489515,N,COM,2,EAST NORTH CENTRAL,2019-02-01,2019-07-19,F,LLLHS5U66,0,,EPO,1993


In [10]:
mbr_pat_div = mbr_enroll_df[['PATID', 'DIVISION']]
mbr_pat_div.shape
mbr_pat_div.drop_duplicates(inplace = True)
mbr_pat_div.shape

(135536313, 2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mbr_pat_div.drop_duplicates(inplace = True)


(75446404, 2)

In [11]:
mbr_ses_pat = pd.merge(mbr_pat_div, ses_pat_race, how = 'left', on = 'PATID')
mbr_ses_pat.shape
mbr_ses_pat.head(2)
mbr_ses_pat.drop_duplicates(inplace = True)
mbr_ses_pat.shape

(75446404, 3)

Unnamed: 0,PATID,DIVISION,D_RACE_CODE
0,802666110000096,EAST NORTH CENTRAL,White
1,802666110000117,EAST NORTH CENTRAL,White


(75446404, 3)

In [16]:
mbr_ses_pat['D_RACE_CODE'].fillna('Unknown', inplace = True)

## Extracting patients IDs for division and race code combinations

In [18]:
div_race_dict = {}

for div in mbr_ses_pat['DIVISION'].unique():
    for race in mbr_ses_pat['D_RACE_CODE'].unique():
        list_name = div + "_" + race + "_" + 'ids'
        print(list_name)
        div_race_dict[list_name] = list(mbr_ses_pat[(mbr_ses_pat['D_RACE_CODE'] == race) & (mbr_ses_pat['DIVISION'] == div)]['PATID'].unique())


EAST NORTH CENTRAL_White_ids
EAST NORTH CENTRAL_Hispanic_ids
EAST NORTH CENTRAL_Unknown_ids
EAST NORTH CENTRAL_Asian_ids
EAST NORTH CENTRAL_Black_ids
EAST SOUTH CENTRAL_White_ids
EAST SOUTH CENTRAL_Hispanic_ids
EAST SOUTH CENTRAL_Unknown_ids
EAST SOUTH CENTRAL_Asian_ids
EAST SOUTH CENTRAL_Black_ids
MOUNTAIN_White_ids
MOUNTAIN_Hispanic_ids
MOUNTAIN_Unknown_ids
MOUNTAIN_Asian_ids
MOUNTAIN_Black_ids
SOUTH ATLANTIC_White_ids
SOUTH ATLANTIC_Hispanic_ids
SOUTH ATLANTIC_Unknown_ids
SOUTH ATLANTIC_Asian_ids
SOUTH ATLANTIC_Black_ids
PACIFIC_White_ids
PACIFIC_Hispanic_ids
PACIFIC_Unknown_ids
PACIFIC_Asian_ids
PACIFIC_Black_ids
WEST NORTH CENTRAL_White_ids
WEST NORTH CENTRAL_Hispanic_ids
WEST NORTH CENTRAL_Unknown_ids
WEST NORTH CENTRAL_Asian_ids
WEST NORTH CENTRAL_Black_ids
WEST SOUTH CENTRAL_White_ids
WEST SOUTH CENTRAL_Hispanic_ids
WEST SOUTH CENTRAL_Unknown_ids
WEST SOUTH CENTRAL_Asian_ids
WEST SOUTH CENTRAL_Black_ids
MIDDLE ATLANTIC_White_ids
MIDDLE ATLANTIC_Hispanic_ids
MIDDLE ATLANTIC_Unkn

In [19]:
len(div_race_dict['EAST NORTH CENTRAL_White_ids'])

6902425

In [20]:
# Keys with shortest length lists in dictionary
# Using len() + loop + items()
min_val = min([len(div_race_dict[ele]) for ele in div_race_dict])
res = []
for ele in div_race_dict:
    if len(div_race_dict[ele]) == min_val:
        res.append(ele)
          
# printing result 
print("The required keys are : " + str(res)) 

The required keys are : ['EAST SOUTH CENTRAL_Asian_ids']


In [22]:
import pickle

In [23]:
with open('div_race_dict.pickle', 'wb') as handle:
    pickle.dump(div_race_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
# with open('div_race_dict.pickle', 'rb') as handle:
#     b = pickle.load(handle)

In [27]:
# b