In [247]:

import pandas as pd
import numpy as np
import regex as re
from pathlib import Path
import glob


## Load Data

In [118]:
def file_paths_and_names(year):
    """Find the CSV files of one survey year inside the ``data`` directory.

    Files are matched with the glob pattern ``*_{year}.csv``. The short name
    of each file is the first run of ASCII letters in its stem.

    Returns
    -------
    tuple(list, list)
        ``(file_names, file_paths)`` in matching order; the paths are
        ``pathlib.Path`` objects.
    """
    matches = list(Path('data').glob(f"*_{year}.csv"))
    file_names = []
    for match in matches:
        # First alphabetic token of the stem is used as the lookup key.
        file_names.append(re.findall(r'[a-zA-Z]+', match.stem)[0])
    return file_names, list(matches)
# Two-digit survey years handled by this notebook.
years = ['19', '20', '21', '22', '23']
# Per year: mapping of short file name -> path of the matching CSV file.
files_by_year = {}
for year in years:
    names, paths = file_paths_and_names(year)
    files_by_year[year] = dict(zip(names, paths))


In [51]:
def load_eventlink_file(file,year):
    """Read the event-link CSV, keeping only the person, condition and event ids.

    ``year`` is accepted for signature parity with the other loaders but is
    not used: the selected column names carry no year suffix.
    """
    link_columns = ['DUPERSID', 'CONDIDX', 'EVNTIDX']
    return pd.read_csv(file, usecols=link_columns)

In [4]:
def load_prescription_file(file,year):
    """Read the prescription CSV of one survey year.

    Keeps the id columns (DUPERSID, RXRECIDX, LINKIDX), TC1S1_1, DIABEQUIP,
    the five year-suffixed ``RX..{year}X`` payment columns (XP, SF, MR, MD, PV)
    and the ``PERWT{year}F`` weight column.

    ``year`` is the two-digit year string embedded in the column names.
    """
    payment_cols = [''.join(('RX', code, year, 'X')) for code in ('XP', 'SF', 'MR', 'MD', 'PV')]
    weight_col = ''.join(('PERWT', year, 'F'))
    wanted = ['DUPERSID', 'RXRECIDX', 'LINKIDX', 'TC1S1_1', 'DIABEQUIP', *payment_cols, weight_col]
    return pd.read_csv(file, usecols=wanted)


In [5]:
def load_conditions_file(file,year):
    """Read the conditions CSV of one survey year.

    Keeps DUPERSID, CONDIDX, ICD10CDX and the ``PERWT{year}F`` weight column,
    where ``year`` is the two-digit year string.
    """
    weight_col = ''.join(('PERWT', year, 'F'))
    wanted = ['DUPERSID', 'CONDIDX', 'ICD10CDX', weight_col]
    return pd.read_csv(file, usecols=wanted)

In [6]:
def load_inpatient_file(file,year):
    """Read the inpatient-stay CSV of one survey year.

    Keeps DUPERSID, EVNTIDX, the total column ``IPXP{year}X``, the doctor (D)
    and facility (F) variants of the SF, MR, MD and PV payment columns, and
    the ``PERWT{year}F`` weight column. ``year`` is the two-digit year string.
    """
    total_col = ''.join(('IPXP', year, 'X'))
    split_cols = [
        ''.join(('IP', side, payer, year, 'X'))
        for payer in ('SF', 'MR', 'MD', 'PV')
        for side in ('D', 'F')
    ]
    weight_col = ''.join(('PERWT', year, 'F'))
    wanted = ['DUPERSID', 'EVNTIDX', total_col, *split_cols, weight_col]
    return pd.read_csv(file, usecols=wanted)

In [7]:
def load_outpatient_file(file,year):
    """Read the outpatient-visit CSV of one survey year.

    Keeps DUPERSID, EVNTIDX, the total column ``OPXP{year}X``, the doctor (D)
    and facility (F) variants of the SF, MR, MD and PV payment columns, and
    the ``PERWT{year}F`` weight column. ``year`` is the two-digit year string.
    """
    total_col = ''.join(('OPXP', year, 'X'))
    split_cols = [
        ''.join(('OP', side, payer, year, 'X'))
        for payer in ('SF', 'MR', 'MD', 'PV')
        for side in ('D', 'F')
    ]
    weight_col = ''.join(('PERWT', year, 'F'))
    wanted = ['DUPERSID', 'EVNTIDX', total_col, *split_cols, weight_col]
    return pd.read_csv(file, usecols=wanted)


In [8]:
def load_er_file(file,year):
    """Read the emergency-room CSV of one survey year.

    Keeps DUPERSID, EVNTIDX, the total column ``ERXP{year}X``, the doctor (D)
    and facility (F) variants of the SF, MR, MD and PV payment columns, and
    the ``PERWT{year}F`` weight column. ``year`` is the two-digit year string.
    """
    total_col = ''.join(('ERXP', year, 'X'))
    split_cols = [
        ''.join(('ER', side, payer, year, 'X'))
        for payer in ('SF', 'MR', 'MD', 'PV')
        for side in ('D', 'F')
    ]
    weight_col = ''.join(('PERWT', year, 'F'))
    wanted = ['DUPERSID', 'EVNTIDX', total_col, *split_cols, weight_col]
    return pd.read_csv(file, usecols=wanted)

In [9]:
def load_office_file(file,year):
    """Read the office-based visit CSV of one survey year.

    Keeps DUPERSID, EVNTIDX, the five ``OB..{year}X`` payment columns
    (XP, SF, MR, MD, PV) and the ``PERWT{year}F`` weight column.
    ``year`` is the two-digit year string.
    """
    payment_cols = [''.join(('OB', code, year, 'X')) for code in ('XP', 'SF', 'MR', 'MD', 'PV')]
    weight_col = ''.join(('PERWT', year, 'F'))
    wanted = ['DUPERSID', 'EVNTIDX', *payment_cols, weight_col]
    return pd.read_csv(file, usecols=wanted)

In [10]:
def load_home_file(file,year):
    """Read the home-health CSV of one survey year.

    Keeps DUPERSID, EVNTIDX, the five ``HH..{year}X`` payment columns
    (XP, SF, MR, MD, PV) and the ``PERWT{year}F`` weight column.
    ``year`` is the two-digit year string.
    """
    payment_cols = [''.join(('HH', code, year, 'X')) for code in ('XP', 'SF', 'MR', 'MD', 'PV')]
    weight_col = ''.join(('PERWT', year, 'F'))
    wanted = ['DUPERSID', 'EVNTIDX', *payment_cols, weight_col]
    return pd.read_csv(file, usecols=wanted)

In [178]:
def load_fyc(file,year):
    """Read the ``fyc`` CSV of one survey year.

    Keeps only DUPERSID and the ``TOTEXP{year}`` column, where ``year`` is
    the two-digit year string.
    """
    total_col = ''.join(('TOTEXP', year))
    return pd.read_csv(file, usecols=['DUPERSID', total_col])

In [11]:
def rename_cols(df):
    """Return a copy of ``df`` whose column names have every digit removed.

    This makes year-suffixed names identical across survey years
    (e.g. ``RXXP19X`` -> ``RXXPX``). Note that *all* digits are stripped,
    so ``ICD10CDX`` becomes ``ICDCDX``. The input frame is not modified.
    """
    digitless = {col: re.sub('[0-9]', '', col) for col in df.columns}
    return df.rename(columns=digitless)

In [83]:
# Condition index and person weight for diabetes
def condidx_diabetes(df):
    """Select the diabetes condition rows of a (digit-stripped) conditions frame.

    Rows whose ``ICDCDX`` equals ``'E11'`` are kept, restricted to the
    ``CONDIDX`` and ``PERWTF`` columns, and ``CONDIDX`` is cast to ``str`` so
    it can be used as a merge key.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ICDCDX``, ``CONDIDX`` and ``PERWTF``.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame; ``df`` itself is left unchanged.
    """
    is_diabetes = df['ICDCDX'] == 'E11'
    # .loc + .copy() gives an independent frame. Assigning to the result of
    # chained indexing (df[mask][[cols]]) triggers SettingWithCopyWarning.
    diab_df = df.loc[is_diabetes, ['CONDIDX', 'PERWTF']].copy()
    diab_df['CONDIDX'] = diab_df['CONDIDX'].astype(str)
    return diab_df


In [175]:
def load_all_files(year):
    """Load every input table of one survey year.

    Each file is looked up in ``files_by_year[year]`` by its short name and
    read with its dedicated loader. All tables except the event-link table
    are passed through ``rename_cols``.

    Returns
    -------
    list of pandas.DataFrame
        In the order: cond, out, prescription, inpatient, home, office, er,
        eventlink, fyc.
    """
    sources = files_by_year[year]
    # (file key, loader, strip digits from column names?) in load order.
    plan = [
        ('conditions', load_conditions_file, True),
        ('outpatient', load_outpatient_file, True),
        ('prescriptions', load_prescription_file, True),
        ('inpatient', load_inpatient_file, True),
        ('home', load_home_file, True),
        ('office', load_office_file, True),
        ('er', load_er_file, True),
        ('eventlink', load_eventlink_file, False),
        ('fyc', load_fyc, True),
    ]
    frames = []
    for key, loader, strip_digits in plan:
        frame = loader(sources[key], year)
        frames.append(rename_cols(frame) if strip_digits else frame)
    return frames
      

In [248]:
# Names under which the frames returned by load_all_files are stored (same order).
df_names = ['cond','out','prescription','inpatient','home','office','er','eventlink','fyc']
# Per year: mapping of frame name -> loaded DataFrame.
dfs_by_year = {
    year: dict(zip(df_names, load_all_files(year)))
    for year in years
}


## Conditions

In [249]:
def get_diabetes_conditions(year):
    """Return the diabetes condition ids and weights for one survey year.

    Applies ``condidx_diabetes`` to the cached conditions frame of ``year``.
    """
    conditions = dfs_by_year[year]['cond']
    return condidx_diabetes(conditions)
# Per year: diabetes condition ids with their person weights.
cond_diab = {year: get_diabetes_conditions(year) for year in years}

## Process all events

In [250]:
def create_event_diabetes(event,year):
    """Link one event table of a survey year to the diabetes conditions.

    The event-link table is inner-joined to ``cond_diab[year]`` on
    ``CONDIDX``. The result is then inner-joined to the event table on
    ``EVNTIDX``, ``DUPERSID`` and ``PERWTF``. Id columns are cast to ``str``
    before joining.

    Parameters
    ----------
    event : str
        Key of the event frame in ``dfs_by_year[year]`` (e.g. ``'office'``).
    year : str
        Two-digit survey year key.

    Returns
    -------
    pandas.DataFrame
        One row per (link row, event row) match.

    Notes
    -----
    The cached frames in ``dfs_by_year`` are no longer modified; the casts
    are applied to copies, so the result does not depend on which cells ran
    before.

    NOTE(review): if one event is linked to several diabetes condition rows
    of the same person, it appears once per condition and is counted
    repeatedly by downstream sums. Confirm whether de-duplication on
    ``EVNTIDX`` is needed.
    """
    link_src = dfs_by_year[year]['eventlink']
    # assign() returns a new frame, leaving the shared cached table untouched.
    evntlnk = link_src.assign(
        EVNTIDX=link_src['EVNTIDX'].astype(str),
        CONDIDX=link_src['CONDIDX'].astype(str),
    )
    event_src = dfs_by_year[year][event]
    df = event_src.assign(EVNTIDX=event_src['EVNTIDX'].astype(str))

    # Use the event-link table to keep only events tied to a diabetes condition.
    evn_diabetes = pd.merge(left=evntlnk,right=cond_diab[year],on=['CONDIDX'])
    diabetes = pd.merge(left=evn_diabetes,right=df,on=['EVNTIDX','DUPERSID','PERWTF'])
    return diabetes


## Prescriptions

In [252]:
# Links the prescription dataset to diabetes conditions
def create_prescription_diabetes(year):
    """Link the prescription table of a survey year to the diabetes conditions.

    The event-link table's ``EVNTIDX`` is exposed as a string ``LINKIDX``
    column. The link table is inner-joined to ``cond_diab[year]`` on
    ``CONDIDX``, and the result is inner-joined to the prescription table on
    ``LINKIDX``, ``DUPERSID`` and ``PERWTF``.

    Parameters
    ----------
    year : str
        Two-digit survey year key.

    Returns
    -------
    pandas.DataFrame
        One row per (link row, prescription row) match; it still carries the
        link table's ``EVNTIDX`` column.

    Notes
    -----
    The cached frames in ``dfs_by_year`` are no longer modified. Previously
    the ``LINKIDX`` column was added in place to the shared event-link table,
    so it leaked into every later event merge depending on cell order.

    NOTE(review): if one prescription is linked to several diabetes condition
    rows of the same person, it appears once per condition and is counted
    repeatedly by downstream sums. Confirm whether de-duplication is needed.
    """
    cond_df = cond_diab[year]
    link_src = dfs_by_year[year]['eventlink']
    # assign() returns a new frame, leaving the shared cached table untouched.
    evntlnk = link_src.assign(
        LINKIDX=link_src['EVNTIDX'].astype(str),
        CONDIDX=link_src['CONDIDX'].astype(str),
    )
    rx_src = dfs_by_year[year]['prescription']
    p_df = rx_src.assign(LINKIDX=rx_src['LINKIDX'].astype(str))

    # Use the event-link table to keep only prescriptions tied to a diabetes condition.
    evn_diabetes = pd.merge(left=evntlnk,right=cond_df,on=['CONDIDX'])
    p_diabetes = pd.merge(left=evn_diabetes,right=p_df,on=['LINKIDX','DUPERSID','PERWTF'])
    return p_diabetes


In [254]:
# Collect the diabetes-linked prescription rows of every survey year into one frame.
prescription_dfs = []
for year in years:
    # The link table's EVNTIDX is not kept in this output.
    p_df = create_prescription_diabetes(year).drop(columns=['EVNTIDX'])
    p_df['year'] = '20' + year
    prescription_dfs.append(p_df)
unweighted_prescription_years = pd.concat(prescription_dfs, ignore_index=True)
# unweighted_prescription_years.to_csv('data/processed_data/unweighted_diabetes_prescriptions.csv',index=False)

In [265]:
def event_diabetes_person_sum(event,year):
    """Sum the diabetes-linked payments of one event type per person and weight them.

    The linked rows come from ``create_prescription_diabetes`` when ``event``
    is ``'prescription'``, otherwise from ``create_event_diabetes``. Payment
    columns are those of the cached event frame whose name contains one of
    ``XPX``, ``SFX``, ``MRX``, ``MDX`` or ``PVX``. They are coerced to numeric
    (unparseable values become NaN), summed per ``DUPERSID``, and the sums are
    multiplied by the person's ``PERWTF``.

    Returns
    -------
    tuple
        ``(sum_payments, payment_columns)``: the per-person frame (DUPERSID,
        weighted payment columns, PERWTF) and the pandas Index of payment
        column names.
    """
    if event != 'prescription':
        linked = create_event_diabetes(event, year)
    else:
        linked = create_prescription_diabetes(year)

    suffix_pattern = '|'.join(['XPX', 'SFX', 'MRX', 'MDX', 'PVX'])
    source_columns = dfs_by_year[year][event].columns
    payment_columns = source_columns[source_columns.str.contains(suffix_pattern)]

    linked[payment_columns] = linked[payment_columns].apply(pd.to_numeric, errors='coerce')
    person_totals = linked.groupby('DUPERSID', as_index=False)[payment_columns].sum()

    # Attach each person's weight, then scale the summed payments by it.
    person_weights = linked[['DUPERSID', 'PERWTF']].drop_duplicates()
    person_totals = pd.merge(left=person_totals, right=person_weights, on='DUPERSID')
    person_totals[payment_columns] = person_totals[payment_columns].multiply(
        person_totals['PERWTF'], axis='index'
    )
    return person_totals, payment_columns


In [271]:
# Build the per-person weighted payment sums for every event type and year.
# One wide CSV is written per event type, plus one long CSV covering all of them.
events = ['out','inpatient','home','office','er','prescription']
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first write.
Path('data/processed_data').mkdir(parents=True, exist_ok=True)
long_df = []
for e in events:
    event_dfs = []

    for year in years:
        event_df,payment_columns = event_diabetes_person_sum(e,year)
        event_df['year'] = '20'+year
        event_dfs.append(event_df)
        # Long format: one row per (person, year, payment column).
        l_df = pd.melt(event_df,id_vars=['DUPERSID','year'],value_vars=payment_columns)
        long_df.append(l_df)
    file_name = f"weighted_{e}_sums"
    weighted_sums = pd.concat(event_dfs,ignore_index=True)
    weighted_sums.to_csv(f'data/processed_data/{file_name}.csv',index=False)
all_events = pd.concat(long_df,ignore_index=True)
all_events.to_csv('data/processed_data/all_events.csv',index=False)
    

In [272]:
# Display the combined long-format table (DUPERSID, year, variable, value)
# built by concatenating the melted per-event, per-year frames.
all_events

Unnamed: 0,DUPERSID,year,variable,value
0,2320180101,2019,OPXPX,3.059676e+06
1,2320379101,2019,OPXPX,2.804109e+06
2,2320434101,2019,OPXPX,9.850112e+06
3,2320574101,2019,OPXPX,1.506772e+06
4,2320654102,2019,OPXPX,2.608390e+05
...,...,...,...,...
112318,2819747101,2023,RXXPX,8.434618e+05
112319,2819759102,2023,RXXPX,4.487864e+05
112320,2819767102,2023,RXXPX,3.640327e+08
112321,2819781101,2023,RXXPX,3.127760e+06


In [204]:
# Stack the fyc tables of all survey years and export them as one CSV.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
Path('data/processed_data').mkdir(parents=True, exist_ok=True)
fyc_dfs = []
for year in years:
    f_df = dfs_by_year[year]['fyc']
    # Tag every row with its four-digit year (this also adds the column to the cached frame).
    f_df['year'] = '20'+year
    fyc_dfs.append(f_df)
all_fycs = pd.concat(fyc_dfs,ignore_index=True)
all_fycs.to_csv('data/processed_data/all_fycs.csv',index=False)

In [256]:
# Spot-check for 2022: per-person sums of the diabetes-linked prescription
# payments, multiplied by the person weight.
p_d = create_prescription_diabetes('22')
col_sub = ['XPX','SFX','MRX','MDX','PVX']
all_cols = p_d.columns
payment_columns = all_cols[all_cols.str.contains('|'.join(col_sub))]
p_d[payment_columns] = p_d[payment_columns].apply(pd.to_numeric, errors='coerce')
# One weight row per person, attached to the per-person payment totals.
person_weights = p_d[['DUPERSID','PERWTF']].drop_duplicates()
sum_payments = (
    p_d.groupby('DUPERSID', as_index=False)[payment_columns].sum()
    .merge(person_weights, on='DUPERSID')
)
sum_payments[payment_columns] = sum_payments[payment_columns].multiply(
    sum_payments['PERWTF'], axis='index'
)
sum_payments

Unnamed: 0,DUPERSID,RXSFX,RXMRX,RXMDX,RXPVX,RXXPX,PERWTF
0,2460002101,0.000000e+00,6.759062e+06,5.728309e+03,0.000000e+00,6.764790e+06,5728.309495
1,2460050101,1.244453e+06,4.153702e+06,0.000000e+00,0.000000e+00,5.398155e+06,6579.184245
2,2460103101,2.989085e+05,1.161731e+06,0.000000e+00,0.000000e+00,1.460639e+06,1379.301845
3,2460108101,0.000000e+00,0.000000e+00,1.885516e+07,0.000000e+00,1.885516e+07,11477.800780
4,2460115101,1.142052e+05,5.847785e+06,0.000000e+00,0.000000e+00,5.961990e+06,14134.302200
...,...,...,...,...,...,...,...
2141,2799595101,3.468582e+05,9.221031e+05,0.000000e+00,1.144863e+06,2.413824e+06,7707.959071
2142,2799632101,1.849308e+06,2.025177e+07,0.000000e+00,2.475455e+07,4.685563e+07,14225.445400
2143,2799636102,6.173443e+05,0.000000e+00,0.000000e+00,1.599914e+05,7.773358e+05,7136.103504
2144,2799645102,1.801140e+05,0.000000e+00,0.000000e+00,3.658241e+05,5.459380e+05,20726.576630


In [None]:
# `all_event` is not defined anywhere in this notebook, so the original line
# raised NameError. The target file name identifies the intended frame as the
# unweighted diabetes prescription rows collected across all years.
unweighted_prescription_years.to_csv('data/processed_data/unweighted_diabetes_prescriptions.csv',index=False)

In [None]:
# Spot-check: per-person weighted office-visit payment sums for 2023.
e_d, payment_columns = event_diabetes_person_sum(event='office', year='23')
e_d

Unnamed: 0,DUPERSID,OBSFX,OBMRX,OBMDX,OBPVX,OBXPX,PERWTF
0,2790002101,3.499328e+05,0.000000e+00,0.0000,1.943177e+06,2.293110e+06,11664.426820
1,2790012103,0.000000e+00,8.438870e+05,0.0000,2.387735e+06,3.231622e+06,23987.691660
2,2790019101,0.000000e+00,1.311894e+06,122606.9291,0.000000e+00,1.434501e+06,12260.692910
3,2790043103,2.837702e+07,0.000000e+00,0.0000,0.000000e+00,2.837702e+07,39688.137450
4,2790051101,0.000000e+00,3.630554e+06,0.0000,0.000000e+00,3.630554e+06,9364.337844
...,...,...,...,...,...,...,...
1110,2819747101,0.000000e+00,0.000000e+00,0.0000,5.825089e+06,5.825089e+06,10594.922930
1111,2819759102,0.000000e+00,3.461402e+06,0.0000,0.000000e+00,3.461402e+06,12228.511460
1112,2819767102,0.000000e+00,1.188150e+06,0.0000,2.031132e+07,2.149947e+07,14477.271490
1113,2819781101,0.000000e+00,5.966134e+05,0.0000,0.000000e+00,5.966134e+05,7023.938785
