In [1]:
import pandas as pd
import pickle
import pprint
import random

In [2]:
%%time

# Load the previously pickled provider-payment frame (path is relative to the notebook's directory).
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
individual_provider_payment_df_clean = pd.read_pickle("../data/individual_provider_payment.pkl")

Wall time: 16.6 s


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
individual_provider_payment_df_clean.describe()

Unnamed: 0,National Provider Identifier,Number of Services,Number of Medicare Beneficiaries,Number of Distinct Medicare Beneficiary/Per Day Services,Average Medicare Allowed Amount,Average Medicare Payment Amount
count,9416125.0,9416125.0,9416125.0,9416125.0,9416125.0,9416125.0
mean,1499785000.0,198.3183,70.84164,115.1297,97.35926,73.92625
std,287728900.0,2351.712,144.2648,278.3331,218.6003,170.5221
min,1003000000.0,2.4,11.0,11.0,6.03538e-05,0.0
25%,1255302000.0,20.0,17.0,20.0,25.01,19.6705
50%,1497969000.0,43.0,32.0,40.0,66.39,47.45194
75%,1740681000.0,116.0,73.0,104.0,113.14,84.56263
max,1993000000.0,3301134.0,95327.0,175870.0,39864.17,31253.51


In [4]:
# Row count, column names and dtypes of the loaded frame.
individual_provider_payment_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9416125 entries, 0 to 9416124
Data columns (total 21 columns):
 #   Column                                                    Dtype  
---  ------                                                    -----  
 0   National Provider Identifier                              int64  
 1   Last Name/Organization Name of the Provider               object 
 2   First Name of the Provider                                object 
 3   Credentials of the Provider                               object 
 4   Entity Type of the Provider                               object 
 5   Street Address 1 of the Provider                          object 
 6   City of the Provider                                      object 
 7   Zip Code of the Provider                                  object 
 8   State Code of the Provider                                object 
 9   Country Code of the Provider                              object 
 10  Provider Type                 

In [5]:
# Number of rows per distinct 'Provider Type' value, most frequent first.
individual_provider_payment_df_clean['Provider Type'].value_counts()

Diagnostic Radiology                   1241400
Internal Medicine                      1118171
Family Practice                         969268
Nurse Practitioner                      560219
Cardiology                              445088
                                        ...   
Ambulance Service Provider                  42
Unknown Supplier/Provider Specialty         17
All Other Suppliers                         11
Medical Toxicology                           2
Slide Preparation Facility                   2
Name: Provider Type, Length: 88, dtype: int64

In [6]:
# Number of rows per distinct 'HCPCS Description' value, most frequent first.
individual_provider_payment_df_clean['HCPCS Description'].value_counts()

Established patient office or other outpatient visit, typically 15 minutes    457835
Established patient office or other outpatient, visit typically 25 minutes    428262
New patient office or other outpatient visit, typically 45 minutes            179569
New patient office or other outpatient visit, typically 30 minutes            178057
Subsequent hospital inpatient care, typically 25 minutes per day              175335
                                                                               ...  
Drainage of abscess or blood accumulation at forearm and/or wrist                  1
Repair of windpipe cartilage by insertion of splint or graft                       1
Removal of nerve of foot muscle                                                    1
Bone graft of pelvic bone with microvascular connection                            1
Removal of bladder, urinary ducts (ureters)                                        1
Name: HCPCS Description, Length: 5171, dtype: int64

In [7]:
# Number of rows per distinct 'HCPCS Code' value, most frequent first.
# NOTE(review): the recorded outputs show 5702 distinct codes but only 5171 distinct
# descriptions, so descriptions do not map one-to-one onto codes.
individual_provider_payment_df_clean['HCPCS Code'].value_counts()

99213    457835
99214    428262
99204    179569
99203    178057
99232    175335
          ...  
67120         1
80190         1
44314         1
96420         1
86622         1
Name: HCPCS Code, Length: 5702, dtype: int64

In [60]:
# Sanity check before dropping rows: list, in sorted order, every HCPCS code that the
# 992xx / 993xx pattern matches, to confirm the drop will remove what is expected.
matches_992_993 = individual_provider_payment_df_clean['HCPCS Code'].str.contains('992..|993..', regex=True)
individual_provider_payment_df_clean.loc[matches_992_993, 'HCPCS Code'].sort_values()

4075581    99201
4887937    99201
529176     99201
1642193    99201
7929017    99201
           ...  
8736400    99359
8554568    99359
9286250    99359
5783575    99359
8609130    99359
Name: HCPCS Code, Length: 2957859, dtype: object

In [3]:
# Remove patient office visits: keep only the rows whose HCPCS code does NOT match
# the 992xx / 993xx pattern.
is_992_993_code = individual_provider_payment_df_clean['HCPCS Code'].str.contains('992..|993..', regex=True)
individual_provider_payment_df_clean_no_office = individual_provider_payment_df_clean.loc[~is_992_993_code]

In [4]:
# Renumber the remaining rows from 0 after the filter; drop=True discards the old index
# instead of keeping it as a column.
individual_provider_payment_df_clean_no_office = individual_provider_payment_df_clean_no_office.reset_index(drop=True)

In [5]:
# Add a 'Total Revenue' column: average Medicare allowed amount multiplied by the
# number of services, computed row by row.
individual_provider_payment_df_clean_no_office['Total Revenue'] = (
    individual_provider_payment_df_clean_no_office['Average Medicare Allowed Amount']
    .mul(individual_provider_payment_df_clean_no_office['Number of Services'])
)

In [14]:
# Creating groupings for codes.
#
# Rule, applied to every 'HCPCS Code' value:
#   * codes ending in "T" are kept unchanged;
#   * every other code is reduced to its first three characters followed by "00"
#     (e.g. "36415" -> "36400").
#
# Uses pandas' vectorized string methods instead of a Python-level loop over every
# row. The result is converted back to a plain list, one entry per row in row order,
# so code that consumes HCPCS_Grouped_Code sees the same type as before.
_hcpcs_codes = individual_provider_payment_df_clean_no_office['HCPCS Code']

HCPCS_Grouped_Code = _hcpcs_codes.where(
    _hcpcs_codes.str.endswith("T"),   # True  -> keep the original code
    _hcpcs_codes.str[:3] + "00",      # False -> first three characters + "00"
).tolist()
        
     

In [17]:
# Attach the grouped codes as a new column; the list is assigned positionally, one entry per row.
individual_provider_payment_df_clean_no_office['HCPCS Grouped Code'] = HCPCS_Grouped_Code 

In [18]:
# Persist the frame without the 992xx / 993xx rows as CSV; the row index is not written.
individual_provider_payment_df_clean_no_office.to_csv(
    path_or_buf="../data/provider_payments_exclude_99200_and_99300.csv",
    index=False,
)

In [118]:
def _sum_services_by(group_keys):
    """Return the summed 'Number of Services' for each combination of `group_keys`.

    `group_keys` is a column name or a list of column names. The grouping keys
    are kept as ordinary columns of the result (as_index=False).
    """
    return (
        individual_provider_payment_df_clean_no_office
        .groupby(group_keys, as_index=False)['Number of Services']
        .sum()
    )


# Total services per provider type, per (provider type, code) and per (provider type, grouped code).
provider_type_sum_of_services = _sum_services_by('Provider Type')

provider_type_services = _sum_services_by(['Provider Type', 'HCPCS Code'])

provider_type_grouped_services = _sum_services_by(['Provider Type', 'HCPCS Grouped Code'])


In [124]:
# Print the first rows of each aggregate: per code, per provider type, per grouped code.
for preview_frame in (
    provider_type_services,
    provider_type_sum_of_services,
    provider_type_grouped_services,
):
    print(preview_frame.head())

        Provider Type HCPCS Code  Number of Services
0  Addiction Medicine      36415               595.0
1  Addiction Medicine      36556                12.0
2  Addiction Medicine      36620                13.0
3  Addiction Medicine      70100               590.0
4  Addiction Medicine      70310               590.0
                                      Provider Type  Number of Services
0                                Addiction Medicine             38282.0
1  Advanced Heart Failure and Transplant Cardiology              4219.0
2                               All Other Suppliers              1109.0
3                               Allergy/ Immunology          15512077.9
4                        Ambulance Service Provider            233194.9
        Provider Type HCPCS Grouped Code  Number of Services
0  Addiction Medicine              36400               595.0
1  Addiction Medicine              36500                12.0
2  Addiction Medicine              36600                13.0
3  Add

In [126]:
# Build the three lookup tables used for the merges: same rows as the aggregates,
# with 'Number of Services' renamed to a name specific to each aggregation level.

# Total services per provider type.
provider_type_sum_of_services_df = (
    provider_type_sum_of_services[["Provider Type", "Number of Services"]]
    .rename(columns={"Number of Services": "total_count"})
)

print(provider_type_sum_of_services_df.head())

# Services per (provider type, HCPCS code).
provider_type_services_df = (
    provider_type_services[["Provider Type", "HCPCS Code", "Number of Services"]]
    .rename(columns={"Number of Services": "service_count"})
)

# Services per (provider type, grouped HCPCS code).
provider_type_grouped_services_df = (
    provider_type_grouped_services[["Provider Type", "HCPCS Grouped Code", "Number of Services"]]
    .rename(columns={"Number of Services": "grouped_service_count"})
)

print(provider_type_grouped_services_df.head())

                                      Provider Type  total_count
0                                Addiction Medicine      38282.0
1  Advanced Heart Failure and Transplant Cardiology       4219.0
2                               All Other Suppliers       1109.0
3                               Allergy/ Immunology   15512077.9
4                        Ambulance Service Provider     233194.9
        Provider Type HCPCS Grouped Code  grouped_service_count
0  Addiction Medicine              36400                  595.0
1  Addiction Medicine              36500                   12.0
2  Addiction Medicine              36600                   13.0
3  Addiction Medicine              70100                  590.0
4  Addiction Medicine              70300                 1081.0


In [55]:
# Inspect the index of the per-provider-type totals.
# NOTE(review): the recorded output lists provider-type names as index labels, whereas the
# groupby that builds this frame uses as_index=False (which leaves 'Provider Type' as a column).
# The output looks stale (cell counter In [55] vs In [118] for the groupby) — re-run to confirm.
provider_type_sum_of_services.index

Index(['Addiction Medicine',
       'Advanced Heart Failure and Transplant Cardiology',
       'All Other Suppliers', 'Allergy/ Immunology',
       'Ambulance Service Provider', 'Anesthesiology',
       'Anesthesiology Assistant', 'Audiologist', 'Cardiac Surgery',
       'Cardiology', 'Certified Clinical Nurse Specialist',
       'Certified Nurse Midwife',
       'Certified Registered Nurse Anesthetist (CRNA)', 'Chiropractic',
       'Clinic or Group Practice', 'Clinical Cardiac Electrophysiology',
       'Clinical Laboratory', 'Colorectal Surgery (Proctology)',
       'Critical Care (Intensivists)', 'Dentist', 'Dermatology',
       'Diagnostic Radiology', 'Emergency Medicine', 'Endocrinology',
       'Family Practice', 'Gastroenterology', 'General Practice',
       'General Surgery', 'Geriatric Medicine', 'Geriatric Psychiatry',
       'Gynecological Oncology', 'Hand Surgery', 'Hematology',
       'Hematology-Oncology', 'Hospice and Palliative Care', 'Hospitalist',
       'Independent

In [127]:
# Attach the three aggregate counts to every row, then derive the two ratios.
#
# Built as one chained expression so the result name is bound exactly once: if any
# step fails, no half-merged frame is left behind under the final name.
#
# validate="many_to_one" makes pandas raise a MergeError if a lookup table ever holds
# duplicate keys, instead of silently multiplying rows of the left frame.
individual_provider_payment_df_clean_no_office_merged = (
    individual_provider_payment_df_clean_no_office
    # service_count: services per (provider type, HCPCS code)
    .merge(
        provider_type_services_df,
        how="left",
        on=['Provider Type', 'HCPCS Code'],
        validate="many_to_one",
    )
    # total_count: services per provider type
    .merge(
        provider_type_sum_of_services_df,
        how="left",
        on='Provider Type',
        validate="many_to_one",
    )
    # grouped_service_count: services per (provider type, grouped HCPCS code)
    .merge(
        provider_type_grouped_services_df,
        how="left",
        on=['Provider Type', 'HCPCS Grouped Code'],
        validate="many_to_one",
    )
    # Share of the provider type's total services taken by the code / the code group.
    .assign(
        service_ratio=lambda merged: merged['service_count'] / merged['total_count'],
        grouped_service_ratio=lambda merged: merged['grouped_service_count'] / merged['total_count'],
    )
)

individual_provider_payment_df_clean_no_office_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6458266 entries, 0 to 6458265
Data columns (total 28 columns):
 #   Column                                                    Dtype  
---  ------                                                    -----  
 0   National Provider Identifier                              int64  
 1   Last Name/Organization Name of the Provider               object 
 2   First Name of the Provider                                object 
 3   Credentials of the Provider                               object 
 4   Entity Type of the Provider                               object 
 5   Street Address 1 of the Provider                          object 
 6   City of the Provider                                      object 
 7   Zip Code of the Provider                                  object 
 8   State Code of the Provider                                object 
 9   Country Code of the Provider                              object 
 10  Provider Type                 

In [97]:
# Show the merged-in service_count column: for each row, the summed 'Number of Services'
# of that row's ('Provider Type', 'HCPCS Code') pair.
individual_provider_payment_df_clean_no_office_merged.service_count

0            759792.0
1          11238971.8
2            872574.0
3           1020068.1
4            116120.0
              ...    
6458261      528132.0
6458262      566218.0
6458263      566218.0
6458264      858304.0
6458265      858304.0
Name: service_count, Length: 6458266, dtype: float64

In [104]:
# Allow up to 100 rows in rendered output, then show each row's provider type
# next to the total_count merged in for it.
pd.set_option("display.max_rows", 100)

individual_provider_payment_df_clean_no_office_merged.loc[:, ["Provider Type", "total_count"]]

Unnamed: 0,Provider Type,total_count
0,Pathology,29516564.8
1,Pathology,29516564.8
2,Pathology,29516564.8
3,Pathology,29516564.8
4,Pathology,29516564.8
...,...,...
6458261,Otolaryngology,8748779.9
6458262,Otolaryngology,8748779.9
6458263,Otolaryngology,8748779.9
6458264,Otolaryngology,8748779.9


In [128]:
# Persist the merged frame (992xx / 993xx rows removed; service counts, total count and
# ratios attached) as CSV; the row index is not written.
individual_provider_payment_df_clean_no_office_merged.to_csv(
    path_or_buf="../data/final_data_file.csv",
    index=False,
)

In [68]:
# (rows, columns) of the merged frame.
# NOTE(review): the recorded (6458266, 26) disagrees with the 28 columns reported by .info()
# in the merge cell; this output appears to predate the two ratio columns — re-run to confirm.
individual_provider_payment_df_clean_no_office_merged.shape

(6458266, 26)

In [18]:
# NOTE(review): provider_no_office is not assigned in any cell visible here; it presumably comes
# from a deleted or out-of-order cell, in which case this cell fails on a fresh kernel. The
# recorded output looks like per (Provider Type, HCPCS Code, HCPCS Description) aggregates —
# TODO restore its definition or remove this cell.
provider_no_office

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Average Medicare Allowed Amount,Average Medicare Payment Amount,National Provider Identifier,Number of Distinct Medicare Beneficiary/Per Day Services,Number of Medicare Beneficiaries,Number of Services,Total Revenue
Provider Type,HCPCS Code,HCPCS Description,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Addiction Medicine,36415,Insertion of needle into vein for collection of blood sample,3.000000,2.933529,1.443171e+09,85.000000,52.285714,85.000000,255.000000
Addiction Medicine,36556,"Insertion of central venous catheter for infusion, patient 5 years or older",112.510000,88.210000,1.770876e+09,11.000000,11.000000,12.000000,1350.120000
Addiction Medicine,36620,"Insertion of arterial catheter for blood sampling or infusion, accessed through the skin",47.990000,37.620000,1.770876e+09,13.000000,12.000000,13.000000,623.870000
Addiction Medicine,70100,"X-ray of mandible, less than 4 views",12.624219,9.532713,1.306066e+09,293.500000,255.000000,295.000000,4606.980000
Addiction Medicine,70310,"X-ray of teeth, less than full mouth",12.237704,9.257998,1.306066e+09,293.500000,255.000000,295.000000,4654.485000
...,...,...,...,...,...,...,...,...,...
Vascular Surgery,Q4131,"Epifix or epicord, per square centimeter",161.975351,126.948602,1.543785e+09,237.500000,27.000000,5138.000000,830971.424989
Vascular Surgery,Q4148,"Neox 1k, per square centimeter",250.330000,196.258910,1.275593e+09,27.000000,11.000000,211.000000,52819.630000
Vascular Surgery,Q9965,"Low osmolar contrast material, 100-199 mg/ml iodine concentration, per ml",0.834526,0.654269,1.609892e+09,111.000000,70.000000,5786.000000,4828.570000
Vascular Surgery,Q9966,"Low osmolar contrast material, 200-299 mg/ml iodine concentration, per ml",0.246482,0.192448,1.531503e+09,93.891892,65.135135,5398.270270,1291.882703


In [31]:
# Pool of names that the random pick in the next cell draws from (and removes from).
names = ['Amanda', 'Ness', 'Cat', 'Ben', 'Jacob', 'Sophia']


In [32]:
# Pick one name at random, remove it from the pool, and show it.
# Each run shrinks `names` by one entry.
picked_position = random.randrange(0, len(names))
generator = names.pop(picked_position)
print(generator)

Amanda
