In [None]:
import psycopg2
import pandas as pd
import numpy as np
import regex as re
from pathlib import Path
import glob


## Load Data

In [180]:
def file_paths_and_names(year, data_dir='data'):
    """Find the CSV files for one year and derive a short name for each.

    Parameters
    ----------
    year : str
        Year suffix used in the file names; ``'19'`` matches
        ``conditions_19.csv``, ``er_19.csv`` and so on.
    data_dir : str or Path, default 'data'
        Directory searched (non-recursively) for ``*_{year}.csv``.

    Returns
    -------
    tuple
        ``(file_names, file_paths)`` as two parallel lists. Each name is the
        first run of ASCII letters in the file stem (``'er_19'`` -> ``'er'``)
        and each path is a ``Path``. Both lists are empty when nothing matches.

    Raises
    ------
    IndexError
        If a matching file's stem contains no ASCII letters.
    """
    # Sorted so the result does not depend on the order in which the
    # operating system happens to list the directory.
    file_paths = sorted(Path(data_dir).glob(f"*_{year}.csv"))
    file_names = [re.findall(r'[a-zA-Z]+', path.stem)[0] for path in file_paths]
    return file_names, file_paths
# Year suffixes '19' .. '23', as they appear in the CSV file names.
years = [str(suffix) for suffix in range(19, 24)]
# files_by_year[year][name] -> path of that year's file for the named table.
files_by_year = {
    suffix: dict(zip(*file_paths_and_names(suffix)))
    for suffix in years
}
files_by_year

{'19': {'conditions': WindowsPath('data/conditions_19.csv'),
  'er': WindowsPath('data/er_19.csv'),
  'fyc': WindowsPath('data/fyc_19.csv'),
  'home': WindowsPath('data/home_19.csv'),
  'inpatient': WindowsPath('data/inpatient_19.csv'),
  'office': WindowsPath('data/office_19.csv'),
  'outpatient': WindowsPath('data/outpatient_19.csv'),
  'prescriptions': WindowsPath('data/prescriptions_19.csv')},
 '20': {'conditions': WindowsPath('data/conditions_20.csv'),
  'er': WindowsPath('data/er_20.csv'),
  'fyc': WindowsPath('data/fyc_20.csv'),
  'home': WindowsPath('data/home_20.csv'),
  'inpatient': WindowsPath('data/inpatient_20.csv'),
  'office': WindowsPath('data/office_20.csv'),
  'outpatient': WindowsPath('data/outpatient_20.csv'),
  'prescriptions': WindowsPath('data/prescriptions_20.csv')},
 '21': {'conditions': WindowsPath('data/conditions_21.csv'),
  'er': WindowsPath('data/er_21.csv'),
  'fyc': WindowsPath('data/fyc_21.csv'),
  'home': WindowsPath('data/home_21.csv'),
  'inpatient':

In [None]:
def load_prescription_file(file,year):
    """Read the prescription CSV, keeping only the columns this notebook uses.

    Parameters
    ----------
    file : path or file-like
        Passed straight to ``pd.read_csv``.
    year : str
        Year suffix embedded in the year-specific column names
        (e.g. ``'19'`` -> ``RXXP19X``, ``PERWT19F``).

    Returns
    -------
    pandas.DataFrame
        The identifier columns, the five year-specific payment columns and
        the year-specific person-weight column.
    """
    id_cols = ['DUPERSID', 'RXRECIDX', 'LINKIDX', 'TC1S1_1', 'DIABEQUIP']
    # Stems, in order: total expenditure, out-of-pocket, Medicare, Medicaid, private.
    payment_cols = [stem + year + 'X' for stem in ('RXXP', 'RXSF', 'RXMR', 'RXMD', 'RXPV')]
    weight_col = 'PERWT' + year + 'F'
    return pd.read_csv(file, usecols=id_cols + payment_cols + [weight_col])


In [171]:
def load_conditions_file(file,year):
    """Read the conditions CSV, keeping only the columns this notebook uses.

    ``file`` is passed to ``pd.read_csv``; ``year`` is the suffix used in the
    person-weight column name (``'19'`` -> ``PERWT19F``). Returns a DataFrame
    with ``DUPERSID``, ``CONDIDX``, ``ICD10CDX`` and that weight column.
    """
    wanted = ['DUPERSID', 'CONDIDX', 'ICD10CDX']
    wanted.append('PERWT' + year + 'F')
    return pd.read_csv(file, usecols=wanted)

In [188]:
def load_inpatient_file(file,year):
    """Read the inpatient CSV, keeping only the columns this notebook uses.

    Parameters
    ----------
    file : path or file-like
        Passed straight to ``pd.read_csv``.
    year : str
        Year suffix embedded in the year-specific column names
        (e.g. ``'19'`` -> ``IPXP19X``, ``PERWT19F``).

    Returns
    -------
    pandas.DataFrame
        ``DUPERSID``, ``EVNTIDX``, the nine year-specific ``IP*`` payment
        columns and the year-specific person-weight column.
    """
    prefix = 'IP'
    # XP: total expenditure. Then, for doctor (D) and facility (F) charges:
    # SF out-of-pocket, MR Medicare, MD Medicaid, PV private.
    stems = ['XP', 'DSF', 'FSF', 'DMR', 'FMR', 'DMD', 'FMD', 'DPV', 'FPV']
    columns = ['DUPERSID', 'EVNTIDX']
    columns.extend(prefix + stem + year + 'X' for stem in stems)
    columns.append('PERWT' + year + 'F')
    return pd.read_csv(file, usecols=columns)

In [38]:
def load_outpatient_file(file,year):
    """Read the outpatient CSV, keeping only the columns this notebook uses.

    Parameters
    ----------
    file : path or file-like
        Passed straight to ``pd.read_csv``.
    year : str
        Year suffix embedded in the year-specific column names
        (e.g. ``'19'`` -> ``OPXP19X``, ``PERWT19F``).

    Returns
    -------
    pandas.DataFrame
        ``DUPERSID``, ``EVNTIDX``, the nine year-specific ``OP*`` payment
        columns and the year-specific person-weight column.
    """
    prefix = 'OP'
    # XP: total expenditure. Then, for doctor (D) and facility (F) charges:
    # SF out-of-pocket, MR Medicare, MD Medicaid, PV private.
    stems = ['XP', 'DSF', 'FSF', 'DMR', 'FMR', 'DMD', 'FMD', 'DPV', 'FPV']
    columns = ['DUPERSID', 'EVNTIDX']
    columns.extend(prefix + stem + year + 'X' for stem in stems)
    columns.append('PERWT' + year + 'F')
    return pd.read_csv(file, usecols=columns)


In [182]:
def load_er_file(file,year):
    """Read the emergency-room CSV, keeping only the columns this notebook uses.

    Parameters
    ----------
    file : path or file-like
        Passed straight to ``pd.read_csv``.
    year : str
        Year suffix embedded in the year-specific column names
        (e.g. ``'19'`` -> ``ERXP19X``, ``PERWT19F``).

    Returns
    -------
    pandas.DataFrame
        ``DUPERSID``, ``EVNTIDX``, the nine year-specific ``ER*`` payment
        columns and the year-specific person-weight column.
    """
    prefix = 'ER'
    # XP: total expenditure. Then, for doctor (D) and facility (F) charges:
    # SF out-of-pocket, MR Medicare, MD Medicaid, PV private.
    stems = ['XP', 'DSF', 'FSF', 'DMR', 'FMR', 'DMD', 'FMD', 'DPV', 'FPV']
    columns = ['DUPERSID', 'EVNTIDX']
    columns.extend(prefix + stem + year + 'X' for stem in stems)
    columns.append('PERWT' + year + 'F')
    return pd.read_csv(file, usecols=columns)

In [184]:
def load_office_file(file,year):
    """Read the office-visit CSV, keeping only the columns this notebook uses.

    ``file`` is passed to ``pd.read_csv``; ``year`` is the suffix embedded in
    the year-specific column names (``'19'`` -> ``OBXP19X``, ``PERWT19F``).
    Returns a DataFrame with ``DUPERSID``, ``EVNTIDX``, the five ``OB*``
    payment columns and the person-weight column.
    """
    # Stems, in order: total expenditure, out-of-pocket, Medicare, Medicaid, private.
    payment_cols = [stem + year + 'X' for stem in ('OBXP', 'OBSF', 'OBMR', 'OBMD', 'OBPV')]
    weight_col = 'PERWT' + year + 'F'
    selected = ['DUPERSID', 'EVNTIDX', *payment_cols, weight_col]
    return pd.read_csv(file, usecols=selected)

In [185]:
def load_home_file(file,year):
    """Read the home-health CSV, keeping only the columns this notebook uses.

    ``file`` is passed to ``pd.read_csv``; ``year`` is the suffix embedded in
    the year-specific column names (``'19'`` -> ``HHXP19X``, ``PERWT19F``).
    Returns a DataFrame with ``DUPERSID``, ``EVNTIDX``, the five ``HH*``
    payment columns and the person-weight column.
    """
    # Stems, in order: total expenditure, out-of-pocket, Medicare, Medicaid, private.
    payment_cols = [stem + year + 'X' for stem in ('HHXP', 'HHSF', 'HHMR', 'HHMD', 'HHPV')]
    weight_col = 'PERWT' + year + 'F'
    selected = ['DUPERSID', 'EVNTIDX', *payment_cols, weight_col]
    return pd.read_csv(file, usecols=selected)

In [106]:
def rename_cols(df):
    """Return a copy of ``df`` whose column names have every digit removed.

    This makes the year-specific names uniform across years
    (``PERWT19F`` -> ``PERWTF``). Digits that are not a year are removed as
    well, so ``ICD10CDX`` becomes ``ICDCDX``. The input frame is not modified.
    """
    digit_free = {name: re.sub('[0-9]', '', name) for name in df.columns}
    return df.rename(columns=digit_free)

In [None]:
# Condition index and person weight for the diabetes condition rows
def condidx_diabetes(df, icd_code='E11'):
    """Select the condition rows carrying a given ICD code.

    Parameters
    ----------
    df : pandas.DataFrame
        Conditions table after ``rename_cols``; must contain the columns
        ``ICDCDX``, ``CONDIDX`` and ``PERWTF``.
    icd_code : str, default 'E11'
        Value of ``ICDCDX`` to keep (exact match). The default is the code
        this notebook uses for diabetes.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``CONDIDX`` (converted to ``str``) and
        ``PERWTF`` for the matching rows, keeping their original index.
        ``df`` itself is not modified.
    """
    is_match = df['ICDCDX'] == icd_code
    # .loc plus an explicit copy: the column assignment below must write to an
    # independent frame, not to a slice of ``df``.
    diab_df = df.loc[is_match, ['CONDIDX', 'PERWTF']].copy()
    diab_df['CONDIDX'] = diab_df['CONDIDX'].astype(str)
    return diab_df


## Outpatient

In [None]:
def create_outpatient_diabetes(out_df,cond_df):
    """Keep the outpatient events that link to one of the given conditions.

    Parameters
    ----------
    out_df : pandas.DataFrame
        Outpatient events; must contain ``EVNTIDX``, ``DUPERSID`` and
        ``PERWTF``. It is not modified.
    cond_df : pandas.DataFrame
        Conditions to match, with a string ``CONDIDX`` column and ``PERWTF``.

    Returns
    -------
    pandas.DataFrame
        Inner join of ``cond_df`` with the events on ``CONDIDX`` and
        ``PERWTF``; ``DUPERSID`` and ``EVNTIDX`` are converted to ``str``.
    """
    # Derive the condition key from the event key on a copy, so the caller's
    # frame does not silently gain a CONDIDX column.
    # NOTE(review): this assumes the first 13 characters of EVNTIDX equal the
    # CONDIDX of the condition the event belongs to - confirm against the
    # data documentation.
    events = out_df.assign(CONDIDX=out_df['EVNTIDX'].astype(str).str[:13])
    # Outpatient events associated with the conditions in cond_df.
    outpatient_diabetes = pd.merge(left=cond_df, right=events, on=['CONDIDX', 'PERWTF'])
    id_cols = ['DUPERSID', 'EVNTIDX']
    outpatient_diabetes[id_cols] = outpatient_diabetes[id_cols].astype(str)
    return outpatient_diabetes

In [86]:
def outpatient_diab_person_sum(out_df):
    """Sum the outpatient payment columns per person and scale by person weight.

    Parameters
    ----------
    out_df : pandas.DataFrame
        Event-level rows with ``DUPERSID``, ``PERWTF`` and the nine ``OP*``
        payment columns listed below.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``(DUPERSID, PERWTF)`` pair, holding the
        per-person payment sums multiplied by ``PERWTF``, plus ``DUPERSID``
        and ``PERWTF`` themselves.
    """
    payment_columns = [
        'OPXPX',
        'OPFSFX', 'OPFMRX', 'OPFMDX', 'OPFPVX',
        'OPDSFX', 'OPDMRX', 'OPDMDX', 'OPDPVX',
    ]
    how_to_aggregate = {column: 'sum' for column in payment_columns}
    per_person = out_df.groupby('DUPERSID').agg(how_to_aggregate)
    # Attach each person's weight; the grouped frame carries DUPERSID as its
    # index level, which merge matches against the column of the same name.
    person_weights = out_df[['DUPERSID', 'PERWTF']].drop_duplicates()
    per_person = pd.merge(left=per_person, right=person_weights, on='DUPERSID')
    weighted = per_person[payment_columns].multiply(per_person['PERWTF'], axis='index')
    per_person[payment_columns] = weighted
    return per_person

## Prescriptions

In [194]:
def load_all_files(year):
    """Load every table for ``year`` and strip the digits from its column names.

    ``year`` must be a key of ``files_by_year``. Returns a list of DataFrames
    in this fixed order: conditions, outpatient, prescriptions, inpatient,
    home, office, er.
    """
    year_files = files_by_year[year]
    # (key in files_by_year[year], loader) pairs, in the order of the returned list.
    loaders = (
        ('conditions', load_conditions_file),
        ('outpatient', load_outpatient_file),
        ('prescriptions', load_prescription_file),
        ('inpatient', load_inpatient_file),
        ('home', load_home_file),
        ('office', load_office_file),
        ('er', load_er_file),
    )
    return [rename_cols(loader(year_files[key], year)) for key, loader in loaders]
      

In [198]:
# Names for the frames returned by load_all_files, in the order it returns them.
df_names = ['cond','out','prescription','inpatient','home','office','er']
# dfs_by_year[year][name] -> DataFrame for that year and table.
dfs_by_year = {}
for year in years:
    dfs_by_year[year] = {name: frame for name, frame in zip(df_names, load_all_files(year))}


In [205]:
def get_diabetes_conditions(year):
    """Return the diabetes condition rows (CONDIDX and PERWTF) for ``year``.

    Reads the already-loaded conditions table from ``dfs_by_year``.
    """
    conditions = dfs_by_year[year]['cond']
    return condidx_diabetes(conditions)


# cond_diab[year] -> diabetes condition rows for that year.
cond_diab = dict()
for year in years:
    cond_diab[year] = get_diabetes_conditions(year)

In [206]:
def weighted_outpatient(year):
    """Return the weighted per-person outpatient payment sums for ``year``.

    Links that year's outpatient events to its diabetes conditions, then
    sums the payment columns per person and scales them by person weight.
    """
    diabetes_events = create_outpatient_diabetes(dfs_by_year[year]['out'], cond_diab[year])
    return outpatient_diab_person_sum(diabetes_events)
# Weighted per-person outpatient sums for every year, stacked into one table
# with a four-digit 'year' column, then written to disk.
outpatient_dfs = []
for year in years:
    df = weighted_outpatient(year)
    df['year'] = '20'+year
    outpatient_dfs.append(df)
weighted_outpatient_years = pd.concat(outpatient_dfs,ignore_index=True)
# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails on the last line.
output_path = Path('data/processed_data/weighted_outpatient_sums.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
weighted_outpatient_years.to_csv(output_path,index=False)