In [1]:
import pandas as pd
import pickle

In [2]:
%%time

# Stream the CY2017 Physician/Other Supplier PUF in 50,000-row chunks and keep
# only the rows whose entity type is 'I' (presumably individual providers, going
# by the variable names), so the unfiltered file is never held in memory at once.
individual_provider_rows = []
for chunk in pd.read_csv('../data/Medicare_Provider_Utilization_and_Payment_Data__Physician_and_Other_Supplier_PUF_CY2017.csv',
                         # Read code-like columns as strings. Without an explicit
                         # dtype pandas infers the type while parsing, so a stretch
                         # of all-numeric HCPCS codes could come back as integers:
                         # leading zeros (e.g. '00100') would be lost and the string
                         # handling of the codes later in the notebook would fail.
                         dtype={'Zip Code of the Provider': object,
                                'HCPCS Code': object},
                         chunksize=50000):
    individual_provider_rows.append(chunk[chunk['Entity Type of the Provider'] == 'I'])

# Combine the filtered chunks into a single frame with a fresh 0..n-1 index.
individual_provider_payment_df = pd.concat(individual_provider_rows, ignore_index=True)

Wall time: 2min 59s


In [3]:
# Add a 'Total Medicare Revenue' column: the average Medicare payment per service
# multiplied by the number of services on the same row.
individual_provider_payment_df['Total Medicare Revenue'] = (
    individual_provider_payment_df['Average Medicare Payment Amount']
    .mul(individual_provider_payment_df['Number of Services'])
)

In [4]:
# Bucket every HCPCS code into a coarser group and store it as a new column:
#   * codes ending in "T" all map to the single bucket "0000T";
#   * any other code keeps its first three characters followed by "00".
#
# This uses pandas' vectorized string methods instead of a Python-level loop over
# every row. A missing code produces a missing group rather than raising
# (na=False makes the "ends with T" test False for missing values).
hcpcs_codes = individual_provider_payment_df['HCPCS Code']

# HCPCS_Grouped_Code is now a Series aligned with the frame's index (it used to
# be a plain list); it holds the same values in the same row order.
HCPCS_Grouped_Code = (hcpcs_codes.str[:3] + "00").mask(
    hcpcs_codes.str.endswith("T", na=False), "0000T"
)

individual_provider_payment_df['HCPCS Grouped Code'] = HCPCS_Grouped_Code

In [5]:
# Sanity check: (rows, columns) of the filtered frame after the derived columns were added.
individual_provider_payment_df.shape

(9416125, 28)

In [6]:
# List every column currently in the frame (used to decide what to drop next).
individual_provider_payment_df.columns

Index(['National Provider Identifier',
       'Last Name/Organization Name of the Provider',
       'First Name of the Provider', 'Middle Initial of the Provider',
       'Credentials of the Provider', 'Gender of the Provider',
       'Entity Type of the Provider', 'Street Address 1 of the Provider',
       'Street Address 2 of the Provider', 'City of the Provider',
       'Zip Code of the Provider', 'State Code of the Provider',
       'Country Code of the Provider', 'Provider Type',
       'Medicare Participation Indicator', 'Place of Service', 'HCPCS Code',
       'HCPCS Description', 'HCPCS Drug Indicator', 'Number of Services',
       'Number of Medicare Beneficiaries',
       'Number of Distinct Medicare Beneficiary/Per Day Services',
       'Average Medicare Allowed Amount', 'Average Submitted Charge Amount',
       'Average Medicare Payment Amount',
       'Average Medicare Standardized Amount', 'Total Medicare Revenue',
       'HCPCS Grouped Code'],
      dtype='object')

In [7]:
# Build the slimmed-down frame by dropping the provider name/address fields and
# the indicator / amount columns that are not carried forward.
individual_provider_payment_df_clean = individual_provider_payment_df.drop(
    labels=[
        'Last Name/Organization Name of the Provider',
        'First Name of the Provider',
        'Middle Initial of the Provider',
        'Entity Type of the Provider',
        'Street Address 1 of the Provider',
        'Street Address 2 of the Provider',
        'City of the Provider',
        'Zip Code of the Provider',
        'State Code of the Provider',
        'Country Code of the Provider',
        'Medicare Participation Indicator',
        'HCPCS Drug Indicator',
        'Average Medicare Allowed Amount',
        'Average Submitted Charge Amount',
        'Average Medicare Standardized Amount',
    ],
    axis='columns',
)

# Columns that remain after the drop:
#   'National Provider Identifier'
#   'Credentials of the Provider'
#   'Gender of the Provider'
#   'Provider Type'
#   'Place of Service'
#   'HCPCS Code'
#   'HCPCS Description'
#   'HCPCS Grouped Code'
#   'Total Medicare Revenue'
#   'Average Medicare Payment Amount'
#   'Number of Services'
#   'Number of Medicare Beneficiaries'
#   'Number of Distinct Medicare Beneficiary/Per Day Services'


# new code to add ratios and calculations

In [8]:
# Sum 'Number of Services' at three granularities. as_index=False keeps the
# grouping keys as ordinary columns in each result.

# Total services per provider type.
provider_type_sum_of_services = (
    individual_provider_payment_df_clean
    .groupby(by='Provider Type', as_index=False)['Number of Services']
    .sum()
)

# Services per (provider type, individual HCPCS code).
provider_type_services = (
    individual_provider_payment_df_clean
    .groupby(by=['Provider Type', 'HCPCS Code'], as_index=False)['Number of Services']
    .sum()
)

# Services per (provider type, grouped HCPCS code).
provider_type_grouped_services = (
    individual_provider_payment_df_clean
    .groupby(by=['Provider Type', 'HCPCS Grouped Code'], as_index=False)['Number of Services']
    .sum()
)

In [9]:
# Give each aggregate its own copy whose summed column has a distinct name, so
# the three counts can sit side by side in one frame without colliding.

provider_type_sum_of_services_df = provider_type_sum_of_services.rename(
    columns={"Number of Services": "total_count"}
)

provider_type_services_df = provider_type_services.rename(
    columns={"Number of Services": "service_count"}
)

provider_type_grouped_services_df = provider_type_grouped_services.rename(
    columns={"Number of Services": "grouped_service_count"}
)


In [19]:
# Attach the three aggregate counts to every row of the cleaned frame. Each step
# is a left join, so every original row is kept:
#   1. service_count          matched on provider type + HCPCS code
#   2. total_count            matched on provider type
#   3. grouped_service_count  matched on provider type + grouped HCPCS code
individual_provider_payment_df_clean_merged = (
    individual_provider_payment_df_clean
    .merge(provider_type_services_df, how="left", on=['Provider Type', 'HCPCS Code'])
    .merge(provider_type_sum_of_services_df, how="left", on='Provider Type')
    .merge(provider_type_grouped_services_df, how="left", on=['Provider Type', 'HCPCS Grouped Code'])
)

# Ratio columns are currently disabled:
# individual_provider_payment_df_clean_merged['service_ratio'] = individual_provider_payment_df_clean_merged.service_count/individual_provider_payment_df_clean_merged.total_count
# individual_provider_payment_df_clean_merged['grouped_service_ratio'] = individual_provider_payment_df_clean_merged.grouped_service_count/individual_provider_payment_df_clean_merged.total_count

# Print the merged frame's column names and dtypes.
individual_provider_payment_df_clean_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9416125 entries, 0 to 9416124
Data columns (total 16 columns):
 #   Column                                                    Dtype  
---  ------                                                    -----  
 0   National Provider Identifier                              int64  
 1   Credentials of the Provider                               object 
 2   Gender of the Provider                                    object 
 3   Provider Type                                             object 
 4   Place of Service                                          object 
 5   HCPCS Code                                                object 
 6   HCPCS Description                                         object 
 7   Number of Services                                        float64
 8   Number of Medicare Beneficiaries                          int64  
 9   Number of Distinct Medicare Beneficiary/Per Day Services  int64  
 10  Average Medicare Payment Amoun

In [15]:
# Preview the first five rows of the merged frame when ordered by HCPCS code.
# NOTE(review): the saved output under this cell shows service_ratio and
# grouped_service_ratio columns, but the lines that create them are commented out
# in the merge cell — the output looks stale; re-run the notebook to refresh it.
individual_provider_payment_df_clean_merged.sort_values(by = 'HCPCS Code').head()

Unnamed: 0,National Provider Identifier,Credentials of the Provider,Gender of the Provider,Provider Type,Place of Service,HCPCS Code,HCPCS Description,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Payment Amount,Total Medicare Revenue,HCPCS Grouped Code,service_count,total_count,grouped_service_count,service_ratio,grouped_service_ratio
6681070,1700955184,M. D.,F,Pathology,O,0008M,Onc breast risk score,92.0,90,91,3363.19587,309414.020003,0,198.0,29564489.8,198.0,7e-06,7e-06
63553,1003871815,MD,M,Pathology,O,0008M,Onc breast risk score,36.0,36,36,3224.514444,116082.519998,0,198.0,29564489.8,198.0,7e-06,7e-06
7144940,1750608915,M.D.,F,Pathology,O,0008M,Onc breast risk score,70.0,70,70,3374.49,236214.3,0,198.0,29564489.8,198.0,7e-06,7e-06
9097847,1962567180,MD,F,Anesthesiology,F,00100,Anesthesia for procedure on salivary gland wit...,11.0,11,11,260.864545,2869.51,100,65.0,14325119.1,1184708.0,5e-06,0.082701
7687405,1811937972,MD,M,Anesthesiology,F,00100,Anesthesia for procedure on salivary gland wit...,12.0,12,12,105.016667,1260.2,100,65.0,14325119.1,1184708.0,5e-06,0.082701


In [20]:
# Write the merged frame to CSV; index=False leaves the row index out of the file.
individual_provider_payment_df_clean_merged.to_csv("../data/final_medicare_provider_file.csv", index = False)


In [7]:
# optional pickle

# individual_provider_payment_df_clean.to_pickle("../data/individual_provider_payment.pkl")