In [286]:
import pandas as pd
from datetime import datetime
from tqdm import tqdm
tqdm.pandas()


In [287]:
# Identifiers that are stamped onto every output row further down.
ccn = '030103'
url = "https://mcorgstatic.blob.core.windows.net/cms-price/860800150_mayo-clinic-arizona_standardcharges.csv"

# Local input file (read for its header lines and then parsed with pandas)
# and the destination of the final CSV export.
file = 'F:\\_Bounty\\860800150_mayo-clinic-arizona_standardcharges.csv'
# file = '860800150_mayo_head.csv'  # alternative input, presumably a truncated sample -- confirm
output_file = 'F:\\_Bounty\\transparency-in-pricing\\az_out.csv'

In [288]:
def payer_category(payer_category):
    """Return the short category label for a charge-column name.

    Args:
        payer_category: One of the five charge-column names listed below.

    Returns:
        The matching short label ('payer', 'gross', 'cash', 'min' or 'max').

    Raises:
        KeyError: If the name is not one of the five known columns.
    """
    label_pairs = (
        ('payer_specific_negotiated_rate', 'payer'),
        ('standard_gross_charge', 'gross'),
        ('discounted_cash_price', 'cash'),
        ('minimum_negotiated', 'min'),
        ('maximum_negotiated', 'max'),
    )
    return dict(label_pairs)[payer_category]

def set_value_based_on_code_type(row):
    """Copy ``row['code']`` into the field named by ``row['code_type']``.

    Rows whose code type is "hcpcs_cpt" are returned untouched. For any other
    code type the row is modified in place and the same object is returned.

    Args:
        row: A mapping-like record with 'code_type' and 'code' entries.

    Returns:
        The same ``row`` object that was passed in.
    """
    target_field = row['code_type']
    if target_field == "hcpcs_cpt":
        return row
    row[target_field] = row['code']
    return row

In [289]:
# Read the hospital name and the "last updated" date from the file's header
# lines (0-based lines 1 and 2); nothing past line 2 is needed here.
with open(file, 'r') as f:
    for i, line in enumerate(f):
        if i == 1:
            # Text after the last '-' and before the first ','. Strip the
            # quotes AND the surrounding whitespace, the same way the date is
            # handled below; otherwise the name keeps a leading space.
            hospital_name = line.split('-')[-1].split(',')[0].strip('"').strip()
        elif i == 2:
            # Text after the last ':' and before the first ','.
            updated_date = line.split(':')[-1].split(',')[0].strip('"').strip()
            # Both values are captured; stop without reading another line.
            break

# The date is written as YYYY/MM/DD in the file; strptime raises ValueError otherwise.
updated_date = datetime.strptime(updated_date, '%Y/%m/%d')

print(hospital_name, updated_date)
            

 Arizona 2023-03-19 00:00:00


In [290]:
# Parse every column as a generic object, then discard the first 7 parsed rows,
# which are not tabular data.
df = pd.read_csv(file, dtype=object).iloc[7:]

In [291]:
# Source column name -> output column name.
# 'payer_specific_negotiated_rate' is deliberately left unrenamed: the melt
# step refers to it by that name.
column_renames = {
    'code_desc': 'procedure_desc',
    'hcpcs': 'hcpcs_cpt',
    'setting': 'patient_class',
    'contract_name': 'payer_name',
}
df = df.rename(columns=column_renames)

In [292]:
# Columns that are not carried into the output.
unused_columns = [
    'dose_type',
    'applicable_percent',
    'count_of_compared_rates',
    'footnote',
    'product_name',
    'implied_quantity',
    'hcpcs_dose',
]
df = df.drop(columns=unused_columns)

In [293]:
# Identifier columns: everything up to and including 'modifier', plus the
# payer / rate descriptors.
cols = df.columns.tolist()
id_vars = cols[:cols.index('modifier') + 1] + ['payer_name', 'rate_method', 'rate_desc']
print(id_vars)

# Unpivot the five charge columns into (payer_desc, rate) pairs, one row per charge.
payer_cat = [
    'payer_specific_negotiated_rate',
    'standard_gross_charge',
    'discounted_cash_price',
    'minimum_negotiated',
    'maximum_negotiated',
]
df = pd.melt(
    df,
    id_vars=id_vars,
    value_vars=payer_cat,
    var_name='payer_desc',
    value_name='rate',
)

['procedure_desc', 'code', 'rev_code', 'rev_desc', 'hcpcs_cpt', 'code_type', 'billing_class', 'patient_class', 'billed_quantity', 'modifier', 'payer_name', 'rate_method', 'rate_desc']


In [294]:
# Lower-case the categorical text columns, one column at a time.
lowercase_columns = ['code_type', 'billing_class', 'patient_class', 'rate_method']
df_lowers = df[lowercase_columns]
for column_name in lowercase_columns:
    df[column_name] = df_lowers[column_name].str.lower()

In [295]:
# This would all be faster if done before the melt.
df["code_type"] = df["code_type"].apply(lambda x: "hcpcs_cpt" if x == "hcpcs" else x )
# Explode codes into their columns and set payer cat
df["payer_category"] = df["payer_desc"].apply(lambda x: payer_category(x))
# df = df.progress_apply(set_value_based_on_code_type, axis=1) # This takes like 30 minutes, faster to just loop repeatedly
df["cdm"] = df["code"].progress_apply(lambda x: x if pd.notna(x) and "_" in x else pd.NA)
df["apr-drg"] = df["code"].progress_apply(lambda x: x if pd.notna(x) and "-" in x else pd.NA)
df["ms-drg"] = df["code"].progress_apply(lambda x: x if pd.notna(x) and len(x) == 3 else pd.NA)
df["ndc"] = df["code"].progress_apply(lambda x: x if pd.notna(x) and len(x) == 11 else pd.NA)
# hcpcs_cpt is already a column so no need to do it

100%|██████████| 6877375/6877375 [00:08<00:00, 822701.82it/s]
100%|██████████| 6877375/6877375 [00:08<00:00, 842970.75it/s]
100%|██████████| 6877375/6877375 [00:08<00:00, 825367.52it/s]
100%|██████████| 6877375/6877375 [00:08<00:00, 837631.86it/s]


In [296]:
# The file name is the last backslash-separated component of the input path;
# its leading underscore-delimited token becomes the TIN, formatted as XX-XXXXXXX.
source_filename = file.split("\\")[-1]
tin_digits = source_filename.split("_")[0]
tin = tin_digits[:2] + "-" + tin_digits[2:]

# Constant provenance columns, identical on every row.
provenance_columns = {
    "file_last_updated": updated_date,
    "filename": source_filename,
    "url": url,
    "hospital_ccn": ccn,
    "hospital_tin": tin,
}
for provenance_column, provenance_value in provenance_columns.items():
    df[provenance_column] = provenance_value

In [300]:
# Enforce `na` null convention
non_null_rows = ['modifier', 'rev_code', 'ndc']
df[non_null_rows] = df[non_null_rows].fillna('na')
df['billed_quantity'] = df['billed_quantity'].fillna(-1)
df = df.dropna(subset=['rate'])

In [298]:
# Cleaning and schema conformance
df["code"] = df["code"].str.lstrip("'")
df["rate"] = df["rate"].apply(lambda x: x.replace('$', '').replace(',', ''))
df["rev_desc"] = df["rev_desc"].str[:100]
df['code_type'] = df['code_type'].where(df['code_type'] != 'ndc', None)


In [301]:
# Write the final frame to disk without the row index.
df.to_csv(path_or_buf=output_file, index=False)