In [None]:
import psycopg2
import pandas as pd
import numpy as np
import regex as re
from pathlib import Path
import glob


In [157]:
def file_paths_and_names(year, data_dir='data'):
    """Find the CSV files for one year suffix and derive a short name for each.

    Parameters
    ----------
    year : str
        Suffix used in the file names; ``'19'`` matches ``*_19.csv``.
    data_dir : str or Path, default 'data'
        Directory that is searched (non-recursively).

    Returns
    -------
    tuple[list[str], list[Path]]
        ``(file_names, file_paths)`` as parallel lists, ordered by path.
        Each name is the first run of ASCII letters in the file stem, so
        ``conditions_19.csv`` yields ``'conditions'``.
    """
    file_names, file_paths = [], []
    # Path.glob yields entries in arbitrary order; sort so results are reproducible.
    for path in sorted(Path(data_dir).glob(f"*_{year}.csv")):
        letters = re.search(r'[a-zA-Z]+', path.stem)
        if letters is None:
            # A stem without letters (e.g. "_19.csv") has no usable name;
            # skip it instead of failing with an IndexError.
            continue
        file_names.append(letters.group(0))
        file_paths.append(path)
    return file_names, file_paths
# Year suffixes to process, exactly as they appear in the CSV file names.
years = ['19', '20', '21', '22', '23']
# Maps each year suffix to {short file name: path} for the files found for it.
files_by_year = dict(
    (year, dict(zip(*file_paths_and_names(year))))
    for year in years
)
files_by_year

{'19': {'conditions': WindowsPath('data/conditions_19.csv'),
  'outpatient': WindowsPath('data/outpatient_19.csv')},
 '20': {'conditions': WindowsPath('data/conditions_20.csv'),
  'outpatient': WindowsPath('data/outpatient_20.csv')},
 '21': {'conditions': WindowsPath('data/conditions_21.csv'),
  'outpatient': WindowsPath('data/outpatient_21.csv')},
 '22': {'conditions': WindowsPath('data/conditions_22.csv'),
  'outpatient': WindowsPath('data/outpatient_22.csv')},
 '23': {'conditions': WindowsPath('data/conditions_23.csv'),
  'fyc': WindowsPath('data/fyc_23.csv'),
  'outpatient': WindowsPath('data/outpatient_23.csv'),
  'prescriptions': WindowsPath('data/prescriptions_23.csv')}}

## Load Data

In [36]:
def load_prescription_file(file,year):
    """Read only the prescription columns this notebook uses from a CSV.

    ``year`` is the year string spliced into the two year-specific column
    names (``RXXP<year>X`` and ``RXSF<year>X``). Returns the DataFrame
    produced by ``pd.read_csv`` restricted to those columns.
    """
    fixed_cols = ['DUPERSID','RXRECIDX','LINKIDX','TC1S1_1','DIABEQUIP']
    yearly_cols = [prefix + year + 'X' for prefix in ('RXXP', 'RXSF')]
    return pd.read_csv(file, usecols=fixed_cols + yearly_cols)


In [171]:
def load_conditions_file(file,year):
    """Read only the condition columns this notebook uses from a CSV.

    ``year`` is the year string spliced into the weight column name
    (``PERWT<year>F``). Returns the DataFrame produced by ``pd.read_csv``
    restricted to DUPERSID, CONDIDX, ICD10CDX and that weight column.
    """
    wanted = ['DUPERSID', 'CONDIDX', 'ICD10CDX']
    wanted.append('PERWT' + year + 'F')
    return pd.read_csv(file, usecols=wanted)

In [38]:
def load_outpatient_file(file,year):
    """Read only the outpatient columns this notebook uses from a CSV.

    ``year`` is the year string spliced into the year-specific column names:
    nine payment columns of the form ``<stem><year>X`` and the weight column
    ``PERWT<year>F``. DUPERSID and EVNTIDX are read as well. Returns the
    DataFrame produced by ``pd.read_csv`` restricted to those columns.
    """
    # Stems of the year-specific payment columns. The grouping labels below
    # follow the names the previous version of this function gave them.
    payment_stems = (
        'OPXP',                    # total expense
        'OPDSF', 'OPFSF',          # out of pocket: doctor, facility
        'OPDMR', 'OPFMR',          # Medicare: doctor, facility
        'OPDMD', 'OPFMD',          # Medicaid: doctor, facility
        'OPDPV', 'OPFPV',          # private: doctor, facility
    )
    columns = ['DUPERSID', 'EVNTIDX']
    columns += [stem + year + 'X' for stem in payment_stems]
    columns.append('PERWT' + year + 'F')
    return pd.read_csv(file, usecols=columns)


In [106]:
def rename_cols(df):
    """Return ``df`` renamed so that every column label has its digits 0-9 removed.

    For example ``PERWT19F`` becomes ``PERWTF`` and ``ICD10CDX`` becomes
    ``ICDCDX``. The input frame itself is not renamed.
    """
    def strip_digits(label):
        return re.sub('[0-9]', '', label)

    mapping = {label: strip_digits(label) for label in df.columns}
    return df.rename(columns=mapping)

In [None]:
# Condition index and person weight for diabetes
def condidx_diabetes(df):
    """Select the rows coded 'E11' and return their condition id and weight.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the digit-stripped columns ``ICDCDX``, ``CONDIDX`` and
        ``PERWTF``.

    Returns
    -------
    pandas.DataFrame
        Columns ``CONDIDX`` (converted to ``str``) and ``PERWTF`` for the rows
        where ``ICDCDX == 'E11'``. The original row index is kept and ``df``
        is left unmodified.
    """
    is_diabetes = df['ICDCDX'] == 'E11'
    # One .loc selection plus an explicit copy: assigning a column on the
    # result of chained indexing (df[mask][cols]) raises SettingWithCopyWarning.
    diab_df = df.loc[is_diabetes, ['CONDIDX', 'PERWTF']].copy()
    diab_df['CONDIDX'] = diab_df['CONDIDX'].astype(str)
    return diab_df


## Outpatient

In [None]:
def create_outpatient_diabetes(out_df,cond_df):
    """Keep the outpatient events whose derived condition id appears in ``cond_df``.

    Parameters
    ----------
    out_df : pandas.DataFrame
        Outpatient events with at least ``EVNTIDX``, ``DUPERSID`` and
        ``PERWTF``. It is not modified.
    cond_df : pandas.DataFrame
        Conditions of interest with ``CONDIDX`` (as ``str``) and ``PERWTF``.

    Returns
    -------
    pandas.DataFrame
        Inner merge of ``cond_df`` with the events on ``CONDIDX`` and
        ``PERWTF``; ``DUPERSID`` and ``EVNTIDX`` are converted to ``str``.
    """
    # Find the condition associated with each event.
    # NOTE(review): assumes the first 13 characters of EVNTIDX are the
    # matching CONDIDX - TODO confirm against the data documentation.
    # assign() builds a new frame, so the caller's out_df does not silently
    # gain a CONDIDX column.
    events = out_df.assign(CONDIDX=out_df['EVNTIDX'].astype(str).str[:13])
    # Outpatient events associated with diabetes.
    outpatient_diabetes = pd.merge(left=cond_df, right=events, on=['CONDIDX', 'PERWTF'])
    outpatient_diabetes[['DUPERSID', 'EVNTIDX']] = outpatient_diabetes[['DUPERSID', 'EVNTIDX']].astype(str)
    return outpatient_diabetes

In [86]:
def outpatient_diab_person_sum(out_df):
    """Total each payment column per DUPERSID and scale the totals by PERWTF.

    ``out_df`` needs ``DUPERSID``, ``PERWTF`` and the nine payment columns
    listed below. The result has one row per distinct (DUPERSID, PERWTF) pair,
    with every payment total multiplied by that row's ``PERWTF``; ``PERWTF``
    itself is kept as a column.
    """
    payment_columns = ['OPXPX','OPFSFX','OPFMRX','OPFMDX','OPFPVX','OPDSFX','OPDMRX','OPDMDX','OPDPVX']
    # Every payment column is aggregated the same way.
    how_to_aggregate = dict.fromkeys(payment_columns, 'sum')
    per_person_totals = out_df.groupby('DUPERSID').agg(how_to_aggregate)
    # Attach each person's weight to their totals.
    person_weights = out_df[['DUPERSID','PERWTF']].drop_duplicates()
    weighted = pd.merge(left=per_person_totals, right=person_weights, on='DUPERSID')
    weighted[payment_columns] = weighted[payment_columns].multiply(weighted['PERWTF'], axis='index')
    return weighted

## Prescriptions

In [174]:
def new_csvs(year):
    """Build the per-person, weight-scaled outpatient payment totals for one year.

    Looks up the year's 'conditions' and 'outpatient' files in
    ``files_by_year``, loads them, strips digits from the column names,
    narrows the conditions to the diabetes rows, matches outpatient events to
    them and returns the per-person sums. Despite the name, nothing is
    written to disk here.
    """
    year_files = files_by_year[year]
    # Load the files and drop the year digits from the column names.
    conditions = rename_cols(load_conditions_file(year_files['conditions'], year))
    outpatient = rename_cols(load_outpatient_file(year_files['outpatient'], year))
    # TODO: prescriptions are not wired in yet; the disabled draft was
    # rename_cols(load_prescription_file(files_by_year[year]['prescriptions'],'23'))
    # Unique diabetes conditions.
    diabetes_conditions = condidx_diabetes(conditions)
    # Outpatient events linked to those conditions, then summed per person.
    diabetes_events = create_outpatient_diabetes(outpatient, diabetes_conditions)
    return outpatient_diab_person_sum(diabetes_events)
# Run the pipeline for year '19'; as the cell's last expression, the returned frame is displayed.
new_csvs('19')

Unnamed: 0,DUPERSID,OPXPX,OPFSFX,OPFMRX,OPFMDX,OPFPVX,OPDSFX,OPDMRX,OPDMDX,OPDPVX,PERWTF
0,2460124101,7.069813e+05,51816.278550,6.551650e+05,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,10363.255710
1,2460129101,8.561469e+06,615138.281548,0.000000e+00,7.874175e+06,0.000000e+00,0.000000,0.000000e+00,72155.745743,0.000000e+00,8439.268508
2,2460362101,5.601390e+06,775300.391500,4.146306e+06,0.000000e+00,0.000000e+00,0.000000,6.797834e+05,0.000000,0.000000e+00,15506.007830
3,2460431102,3.375604e+06,0.000000,2.490709e+06,0.000000e+00,6.355369e+05,0.000000,2.493581e+05,0.000000,0.000000e+00,8703.599978
4,2460467102,9.740147e+05,0.000000,9.740147e+05,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,5450.557677
...,...,...,...,...,...,...,...,...,...,...,...
83,2469004101,1.454763e+07,884110.682250,0.000000e+00,0.000000e+00,1.212517e+07,0.000000,0.000000e+00,0.000000,1.538353e+06,11788.142430
84,2469203101,7.014672e+06,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,0.000000e+00,0.000000,0.000000e+00,6245.812449
85,2469216102,2.515458e+06,424736.262300,0.000000e+00,0.000000e+00,0.000000e+00,0.000000,1.558782e+06,0.000000,0.000000e+00,8494.725246
86,2469341101,1.452231e+07,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,126147.505037,3.278723e+06,0.000000,3.385403e+06,11124.118610
