Data from:

Physician & Other Supplier Payments - Detailed Data https://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/Medicare-Provider-Charge-Data/Physician-and-Other-Supplier2017

Zip Code to CBSA https://www.huduser.gov/portal/datasets/usps_crosswalk.html

![image.png](attachment:image.png)

Population data https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-metro-and-micro-statistical-areas.html#par_textimage

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [130]:
# Show up to 25 columns and 100 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 100)

In [4]:
# Reading in the CBSA to ZIP code crosswalk.
# Only the CBSA and ZIP columns are loaded. ZIP is requested as a string
# via `dtype` (the keyword read_excel actually accepts; `dtypes` is not a
# valid parameter and is rejected by current pandas).

cbsa_to_zip = pd.read_excel('../data/CBSA_ZIP_032020.xlsx',
                            usecols=['CBSA', 'ZIP'],
                            dtype={'ZIP': 'str'})

In [7]:
# Load the Physician & Other Supplier Payments file (tab-separated).
# The second line of the file (row index 1) is skipped, the columns listed
# in `unused_payment_cols` are discarded, and provider zip codes are kept
# as strings.
unused_payment_cols = (
    'average_submitted_chrg_amt',
    'average_Medicare_payment_amt',
    'average_Medicare_standard_amt',
    'hcpcs_drug_indicator',
    'medicare_participation_indicator',
)

phys_other_payments = pd.read_csv(
    '../data/Medicare_Provider_Util_Payment_PUF_CY2017/Medicare_Provider_Util_Payment_PUF_CY2017.txt',
    sep='\t',
    skiprows=[1],
    usecols=lambda name: name not in unused_payment_cols,
    dtype={'nppes_provider_zip': 'str'},
    low_memory=False,
)

In [97]:
# Load the population estimates, keeping only the four columns used below.
# The path uses forward slashes like the other data paths in this notebook:
# the previous '..\data\csa-...' form relied on invalid string escapes
# ('\d', '\c') and only resolved on Windows.
cbsa_population = pd.read_csv('../data/csa-est2019-alldata.csv', engine='python',
                              usecols=['CBSA', 'LSAD', 'NAME', 'POPESTIMATE2017'])


In [9]:
# Replace the source column headers with short snake_case names.
# These are positional assignments: each list must have exactly as many
# entries as the frame has columns, in the frame's current column order.
cbsa_to_zip.columns = ['cbsa', 'zip']

phys_other_payments.columns = [
    'npi', 'last_org_name', 'first_name', 'mi', 'creds', 'gender', 'entity',
    'street1', 'street2', 'city', 'zip', 'state', 'country',
    'provider_type', 'service_loc',
    'hcpcs', 'description',
    'services_cnt', 'benefic_cnt', 'benefic_d_cnt', 'avg_medi_allowed_amt',
]

# NOTE(review): read_csv returns `usecols` columns in file order, not in the
# order they were listed — confirm the file order really matches
# cbsa / lsad / name / pop2017. Later cells split 'lsad' on ',' to get a
# state, which suggests that column may actually hold the area name.
cbsa_population.columns = ['cbsa', 'lsad', 'name', 'pop2017']

In [98]:
# Drop county-level rows so they are not added on top of the area-level
# rows when population is summed per CBSA.
# The column labels were assigned by position, and later cells parse the
# 'lsad' column as "<region>, <state>" text, so the 'County or equivalent'
# marker may actually sit under 'name' (NOTE(review): confirm the CSV column
# order). Checking both columns removes county rows either way; if the
# labels are correct this is identical to filtering on 'lsad' alone.
is_county_row = cbsa_population[['lsad', 'name']].eq('County or equivalent').any(axis=1)
cbsa_population = cbsa_population[~is_county_row]

In [99]:
# Keep only the rows that have a CBSA code.
has_cbsa_code = cbsa_population['cbsa'].notna()
cbsa_population = cbsa_population[has_cbsa_code]

In [101]:
# Store CBSA codes as integers (rows without a code were removed above).
cbsa_population['cbsa'] = cbsa_population['cbsa'].astype(int)

In [102]:
# Total 2017 population per CBSA; the result is indexed by 'cbsa' and has
# a single 'pop2017' column.
cbsa_pop_sum = cbsa_population.groupby('cbsa').agg({'pop2017': 'sum'})

In [27]:
# Keep only the rows whose country is 'US'.
is_us = phys_other_payments['country'] == 'US'
phys_other_payments = phys_other_payments[is_us]

In [28]:
# Removing trailing zip digits: keep only the first five characters.
# `assign` builds a new frame instead of writing a column into the filtered
# slice, which is what triggered pandas' SettingWithCopyWarning here.

phys_other_payments = phys_other_payments.assign(
    zip=phys_other_payments['zip'].str[:5]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [29]:
# Left-pad every crosswalk zip with '0' to a width of five characters.
pad_zip = '{0:0>5}'.format
cbsa_to_zip['zip'] = cbsa_to_zip['zip'].map(pad_zip)

In [32]:
# Keep the rows whose service_loc is 'O' (presumably office-based services,
# going by the variable name — verify against the data dictionary).
office_rows = phys_other_payments['service_loc'] == 'O'
phys_office = phys_other_payments[office_rows]

In [36]:
# Restrict the office rows to providers whose state is 'TN'.
in_tn = phys_office['state'] == 'TN'
phys_office_tn = phys_office[in_tn]

In [40]:
# Attach a CBSA code to each physician billing row by zip code.
# Left join: billing rows whose zip is not in the crosswalk are kept with a
# missing 'cbsa'.
# NOTE(review): if a zip appears more than once in the crosswalk, its
# billing rows are repeated once per match — confirm this is intended.
phys_tn_cbsa = phys_office_tn.merge(cbsa_to_zip, how='left', on='zip')

In [42]:
# Discard billing rows that did not match any CBSA in the crosswalk.
matched_cbsa = phys_tn_cbsa['cbsa'].notna()
phys_tn_cbsa = phys_tn_cbsa[matched_cbsa]

In [44]:
# Store CBSA codes as integers now that unmatched rows are gone.
phys_tn_cbsa['cbsa'] = phys_tn_cbsa['cbsa'].astype(int)

In [46]:
# CBSA 99999 is an 'other' grouping, so those rows are discarded.
is_other_group = phys_tn_cbsa['cbsa'] == 99999
phys_tn_cbsa = phys_tn_cbsa[~is_other_group]

In [76]:
# Sum benefic_d_cnt for every (provider_type, cbsa) pair and return the
# grouping keys as ordinary columns.
provider_cbsa_sum_visits_tn = (
    phys_tn_cbsa
    .groupby(['provider_type', 'cbsa'])[['benefic_d_cnt']]
    .sum()
    .reset_index()
)

In [55]:
# Same aggregation as the provider-first table, but grouped (and therefore
# ordered) by cbsa first and provider_type second.
cbsa_provider_sum_visits_tn = (
    phys_tn_cbsa
    .groupby(['cbsa', 'provider_type'])[['benefic_d_cnt']]
    .sum()
    .reset_index()
)

In [95]:
# Add the CBSA population total to each provider/CBSA row (left join, so
# rows with no population match are kept).
provider_cbsa_sum_visits_tn = provider_cbsa_sum_visits_tn.merge(
    cbsa_pop_sum, how='left', on='cbsa'
)

In [96]:
# Add the CBSA population total to each CBSA/provider row (left join, so
# rows with no population match are kept).
cbsa_provider_sum_visits_tn = cbsa_provider_sum_visits_tn.merge(
    cbsa_pop_sum, how='left', on='cbsa'
)

In [112]:
# The text after the first comma in 'lsad' becomes the state.
lsad_parts = cbsa_population['lsad'].str.split(',')
cbsa_population['state'] = lsad_parts.str.get(1)

In [114]:
# The text before the first comma in 'lsad' becomes the region.
lsad_parts = cbsa_population['lsad'].str.split(',')
cbsa_population['region'] = lsad_parts.str.get(0)

In [116]:
# Trim surrounding whitespace left over from splitting on ','.
for label_col in ('state', 'region'):
    cbsa_population[label_col] = cbsa_population[label_col].str.strip()

In [123]:
# Lookup table of CBSA code -> state and region labels.
cbsa_region_state = cbsa_population[['cbsa', 'state', 'region']]

In [132]:
# Keep only the first row seen for each CBSA code.
repeated_cbsa = cbsa_region_state.duplicated(subset=['cbsa'])
cbsa_region_state = cbsa_region_state[~repeated_cbsa]

In [134]:
# Attach state/region labels to the provider-first table (left join on cbsa).
provider_cbsa_merge = provider_cbsa_sum_visits_tn.merge(
    cbsa_region_state, how='left', on='cbsa'
)

In [138]:
# Attach state/region labels to the CBSA-first table (left join on cbsa).
cbsa_provider_merge = cbsa_provider_sum_visits_tn.merge(
    cbsa_region_state, how='left', on='cbsa'
)

In [142]:
# benefic_d_cnt per 1,000 residents of the CBSA (2017 population).
cbsa_provider_merge['cnt_per_1kcapita'] = (
    cbsa_provider_merge['benefic_d_cnt']
    .div(cbsa_provider_merge['pop2017'])
    .mul(1000)
)

In [146]:
# benefic_d_cnt per 1,000 residents of the CBSA (2017 population).
provider_cbsa_merge['cnt_per_1kcapita'] = (
    provider_cbsa_merge['benefic_d_cnt']
    .div(provider_cbsa_merge['pop2017'])
    .mul(1000)
)

In [147]:
# Write the CBSA-first table (index included) one directory above the notebook.
cbsa_provider_merge.to_csv(path_or_buf='../cbsa_provider.csv')

In [148]:
# Write the provider-first table (index included) one directory above the notebook.
provider_cbsa_merge.to_csv(path_or_buf='../provider_cbsa.csv')