Data from:

Physician & Other Supplier Payments - Detailed Data https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/Physician-and-Other-Supplier2017

Hospital Outpatient - Detailed Data https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/Outpatient

APC to CPT/HCPCS crosswalk - Addendum B – January 2020 https://www.cms.gov/Medicare/Medicare-Fee-for-Service-Payment/HospitalOutpatientPPS/Addendum-A-and-Addendum-B-Updates

Zip Code to CBSA https://www.huduser.gov/portal/datasets/usps_crosswalk.html![image.png](attachment:image.png)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline

In [3]:
pd.options.display.max_columns = 25

pd.options.display.max_rows = 25

In [4]:
# Reading in the CBSA to ZIP code crosswalk

cbsa_to_zip = pd.read_excel('../data/CBSA_ZIP_032020.xlsx', 
                            usecols = ['CBSA', 'ZIP'],
                            dtypes={'ZIP':'str'})

In [5]:
#Reading in HCPCS to APC crosswalk with Short Descriptor column

hcpcs_to_apc = pd.read_excel('../data/Addendum_B/2020_january_web_addendum_b.12312019.xlsx', 
                             header = 2, usecols = ['HCPCS Code', 'Short Descriptor', 'APC '])

In [6]:
#Reading in the Physician & Other Supplier Payments while discarding some columns

phys_other_payments = pd.read_csv('../data/Medicare_Provider_Util_Payment_PUF_CY2017/Medicare_Provider_Util_Payment_PUF_CY2017.txt', 
    sep='\t', 
    skiprows = [1], 
    usecols = lambda column : column not in ['average_submitted_chrg_amt', 
                                             'average_Medicare_payment_amt', 
                                             'average_Medicare_standard_amt', 
                                             'hcpcs_drug_indicator', 
                                             'medicare_participation_indicator'], 
    dtype = {'nppes_provider_zip':'str'},
    low_memory=False)

In [8]:
# Reading in the Hospital Outpatient while discarding some columns

column_exclude_list = ['Outlier\nComprehensive\nAPC\nServices', 'Average\nMedicare\nOutlier\nAmount', 'Average\nEstimated\nTotal\nSubmitted\nCharges', 'Average\nMedicare\nPayment\nAmount']

hosp_payments = pd.read_excel('../data/MUP_OHP_R19_P04_V10_D17_APC_Provider/MUP_OHP_R19_P04_V10_D17_APC_Provider.xlsx', 
                              header = 5, 
                              usecols = lambda column : column not in column_exclude_list,
                              dtype = {'Provider\nZip Code' : 'str'})

In [9]:
hcpcs_to_apc.columns = ['hcpcs', 'descriptor', 'apc']
cbsa_to_zip.columns = ['cbsa', 'zip']
phys_other_payments.columns = ['npi', 
                               'last_org_name', 
                               'first_name', 
                               'mi', 
                               'creds', 
                               'gender', 
                               'entity', 
                               'street1', 
                               'street2' , 
                               'city', 
                               'zip',
                               'state', 
                               'country', 
                               'provider_type', 
                               'service_loc', 
                               'hcpcs', 
                               'description', 
                               'services_cnt', 
                               'benefic_cnt', 
                               'benefic_d_cnt', 
                               'avg_medi_allowed_amt']
hosp_payments.columns = ['provider_id', 
                         'provider_name', 
                         'street', 
                         'city', 
                         'state', 
                         'zip', 
                         'region', 
                         'apc', 
                         'description', 
                         'benefic_cnt', 
                         'services_cnt', 
                         'avg_medi_allowed_amt']

In [10]:
# Dropping non US

phys_other_payments = phys_other_payments[phys_other_payments.country == 'US']

In [11]:
# Removing trailing zip digits

phys_other_payments['zip'] = phys_other_payments['zip'].str[:5]

In [13]:
# Putting leading zero back on zips

cbsa_to_zip['zip'] = cbsa_to_zip['zip'].apply(lambda x: '{0:0>5}'.format(x))

In [14]:
# Merging CBSA to Zip in physcian billing data

phys_cbsa = pd.merge(phys_other_payments, cbsa_to_zip, how='left', on = ['zip'])

del phys_other_payments

In [25]:
phys_cbsa = phys_cbsa.dropna(subset = ['cbsa'])

In [26]:
phys_cbsa['cbsa'] = phys_cbsa['cbsa'].astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [30]:
# CBSA 99999 is an 'other' grouping. Discarding

phys_cbsa = phys_cbsa[phys_cbsa['cbsa'] != 99999]

In [31]:
list = []

for cbsa in phys_cbsa.cbsa.unique():
    subset = phys_cbsa[phys_cbsa['cbsa'] == cbsa]
    sum_visits = subset.benefic_d_cnt.sum()
    list.append([cbsa, sum_visits])
    
cbsa_sum_visits = pd.DataFrame(list)
cbsa_sum_visits.columns = ['cbsa', 'sum_visits']

del list

In [33]:
cbsa_sum_visits.sort_values('sum_visits')

Unnamed: 0,cbsa,sum_visits
937,27580,2799
930,29500,2841
850,11380,6672
310,17640,6805
912,37770,7998
...,...,...
47,19100,31308599
7,37980,33593820
1,16980,44757502
9,31080,52879314
