In [None]:
import pandas as pd
import numpy as np


def read_chunks(file_path, size):
    """
    Load a large CSV piece by piece and stitch the pieces into one dataframe.

    file_path is handed straight to pd.read_csv and size is used as its
    chunksize (rows per chunk). A progress line is printed for every chunk
    that is read. Returns the concatenation of all chunks.
    """
    print(f"Reading {file_path} in chunks...")
    pieces = []
    reader = pd.read_csv(file_path, chunksize=size, low_memory=False)
    # Number the chunks from 1 so the progress output is human friendly.
    for piece_number, piece in enumerate(reader, start=1):
        print(f"Reading chunk # {piece_number}")
        pieces.append(piece)
    dataset = pd.concat(pieces)
    print('Read complete.')
    return dataset

In [3]:
# Download URLs for the CMS Open Payments CSV files used in this notebook.

# Research Payments File Path
# NOTE: the 2020 URL previously pointed at the program-year 2019 file
# (PGYR19_P012023 / ...PGYR2019...), so "2020" research data was really 2019.
# It now follows the same PGYR20 / PGYR2020 pattern as the other 2020 paths.
research_2020_path = "https://download.cms.gov/openpayments/PGYR20_P012023/OP_DTL_RSRCH_PGYR2020_P01202023.csv"
research_2021_path = "https://download.cms.gov/openpayments/PGYR21_P012023/OP_DTL_RSRCH_PGYR2021_P01202023.csv"
# General Payments File Path
payments_2020_path = "https://download.cms.gov/openpayments/PGYR20_P012023/OP_DTL_GNRL_PGYR2020_P01202023.csv"
payments_2021_path = "https://download.cms.gov/openpayments/PGYR21_P012023/OP_DTL_GNRL_PGYR2021_P01202023.csv"
# Ownership Payments File Path
ownership_2020_path = "https://download.cms.gov/openpayments/PGYR20_P012023/OP_DTL_OWNRSHP_PGYR2020_P01202023.csv"
ownership_2021_path = "https://download.cms.gov/openpayments/PGYR21_P012023/OP_DTL_OWNRSHP_PGYR2021_P01202023.csv"
# Physician Profile File Path
profile_path = "https://download.cms.gov/openpayments/PHPRFL_P012023/OP_CVRD_RCPNT_PRFL_SPLMTL_P01202023.csv"
# Provider Mapping Path
provider_mapping_path = "https://download.cms.gov/openpayments/SMRY_P01202023/PBLCTN_PRVDR_PRFL_MAPPING_P01202023_01042023.csv"

In [None]:
# Read Research Data for 2020 and 2021
research_2020_df = read_chunks(research_2020_path, 50000)
research_2021_df = read_chunks(research_2021_path, 50000)

# Concat research df across both years. Each yearly frame carries its own
# 0..n row labels, so ignore_index=True is used to give the combined frame a
# fresh, unique index instead of duplicated labels.
concat_research_df = pd.concat([research_2020_df, research_2021_df], ignore_index=True)

# Columns retained for analysis; everything else in the raw file is discarded.
research_keep_col_list = [
    'Change_Type',
    'ClinicalTrials_Gov_Identifier',
    'Context_of_Research',
    'Covered_Recipient_First_Name',
    'Covered_Recipient_Last_Name',
    'Covered_Recipient_License_State_code1',
    'Covered_Recipient_License_State_code2',
    'Covered_Recipient_License_State_code3',
    'Covered_Recipient_License_State_code4',
    'Covered_Recipient_License_State_code5',
    'Covered_Recipient_Middle_Name',
    'Covered_Recipient_Name_Suffix',
    'Covered_Recipient_NPI',
    'Covered_Recipient_Primary_Type_1',
    'Covered_Recipient_Primary_Type_2',
    'Covered_Recipient_Primary_Type_3',
    'Covered_Recipient_Primary_Type_4',
    'Covered_Recipient_Primary_Type_5',
    'Covered_Recipient_Primary_Type_6',
    'Covered_Recipient_Profile_ID',
    'Covered_Recipient_Type',
    'Date_of_Payment',
    'Dispute_Status_for_Publication',
    'Expenditure_Category1',
    'Expenditure_Category2',
    'Expenditure_Category3',
    'Expenditure_Category4',
    'Expenditure_Category5',
    'Expenditure_Category6',
    'Form_of_Payment_or_Transfer_of_Value',
    'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_2',
    'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_3',
    'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_4',
    'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5',
    'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_1',
    'Name_of_Study',
    'Payment_Publication_Date',
    'Product_Category_or_Therapeutic_Area_1',
    'Product_Category_or_Therapeutic_Area_2',
    'Product_Category_or_Therapeutic_Area_3',
    'Product_Category_or_Therapeutic_Area_4',
    'Product_Category_or_Therapeutic_Area_5',
    'Program_Year',
    'Recipient_City',
    'Recipient_Country',
    'Recipient_Postal_Code',
    'Recipient_Primary_Business_Street_Address_Line2',
    'Recipient_Primary_Business_Street_Address_Line1',
    'Recipient_Province',
    'Recipient_State',
    'Recipient_Zip_Code',
    'Record_ID',
    'Teaching_Hospital_CCN',
    'Teaching_Hospital_ID',
    'Teaching_Hospital_Name',
    'Total_Amount_of_Payment_USDollars',
]
cleaned_research_df = concat_research_df[research_keep_col_list]

# Release the full-width combined frame; only the trimmed frame is kept.
# After this line concat_research_df is no longer defined.
del concat_research_df

# Preview must be the last expression of the cell, otherwise the notebook
# evaluates it and silently discards the result.
cleaned_research_df.head()

In [None]:
# Read Payments Data for 2020 and 2021
payments_2020_df = read_chunks(payments_2020_path, 10000)
payments_2021_df = read_chunks(payments_2021_path, 10000)

# Columns removed from both yearly general-payments frames.
drop_payment_cols = ['Covered_Recipient_Middle_Name','Covered_Recipient_Name_Suffix','Recipient_Province',
                     'Recipient_Postal_Code','Contextual_Information','Covered_or_Noncovered_Indicator_1',
                     'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_1','Product_Category_or_Therapeutic_Area_1',
                     'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_1','Associated_Drug_or_Biological_NDC_1',
                     'Associated_Device_or_Medical_Supply_PDI_1','Covered_or_Noncovered_Indicator_2',
                     'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_2','Product_Category_or_Therapeutic_Area_2',
                     'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_2','Associated_Drug_or_Biological_NDC_2',
                     'Associated_Device_or_Medical_Supply_PDI_2','Covered_or_Noncovered_Indicator_3',
                     'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_3','Product_Category_or_Therapeutic_Area_3',
                     'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_3','Associated_Drug_or_Biological_NDC_3',
                     'Associated_Device_or_Medical_Supply_PDI_3','Covered_or_Noncovered_Indicator_4',
                     'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_4','Product_Category_or_Therapeutic_Area_4',
                     'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_4','Associated_Drug_or_Biological_NDC_4',
                     'Associated_Device_or_Medical_Supply_PDI_4','Covered_or_Noncovered_Indicator_5',
                     'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5','Product_Category_or_Therapeutic_Area_5',
                     'Name_of_Drug_or_Biological_or_Device_or_Medical_Supply_5','Associated_Drug_or_Biological_NDC_5',
                     'Associated_Device_or_Medical_Supply_PDI_5','Covered_Recipient_Specialty_2','Covered_Recipient_Specialty_3','Covered_Recipient_Specialty_4',
                     'Covered_Recipient_Specialty_5','Covered_Recipient_Specialty_6','Covered_Recipient_Primary_Type_2',
                     'Covered_Recipient_Primary_Type_3','Covered_Recipient_Primary_Type_4',
                     'Covered_Recipient_Primary_Type_5', 'Covered_Recipient_Primary_Type_6']

# Drop the unused columns from each year's frame. The two years are NOT
# concatenated in this cell; they stay as separate per-year dataframes.
# Rebinding the name (instead of inplace=True) keeps the step a plain
# "new frame from old frame" assignment.
payments_2020_df = payments_2020_df.drop(columns=drop_payment_cols)
payments_2021_df = payments_2021_df.drop(columns=drop_payment_cols)

Reading https://download.cms.gov/openpayments/PGYR20_P012023/OP_DTL_GNRL_PGYR2020_P01202023.csv in chunks...


In [None]:
# Read Ownership Data for 2020 and 2021
ownership_2020_df = read_chunks(ownership_2020_path, 50000)
ownership_2021_df = read_chunks(ownership_2021_path, 50000)
# Concat ownership df across both years. ignore_index=True renumbers the rows
# so the combined frame does not carry the same 0..n labels twice (once per
# yearly file).
concat_ownership_df = pd.concat([ownership_2020_df, ownership_2021_df], ignore_index=True)

In [None]:
# Physician Profile Supplement: a single file, so nothing to concatenate.
profile_df = read_chunks(file_path=profile_path, size=50000)

# Provider profile ID mapping table: maps secondary/duplicate provider profile
# IDs to the primary ID in the Open Payments system (a small number of
# providers have more than one ID in the payments records).
# Also a single file, so nothing to concatenate.
provider_mapping_df = read_chunks(file_path=provider_mapping_path, size=50000)

In [None]:
# High-level summary of every research column. For each column the name and
# dtype are printed; for columns that are not float64 (strings, ints, ...)
# the distinct values and the min/max are printed as well.
#
# The summary runs on cleaned_research_df: concat_research_df is deleted as
# soon as cleaned_research_df has been built, so referencing it here raised a
# NameError on a top-to-bottom run.
for col in cleaned_research_df.columns:
    col_type = cleaned_research_df[col].dtype
    print("")
    print(f"'{col}'")
    print(f"Column is {col_type} type.")
    if col_type != 'float64':
        print(cleaned_research_df[col].unique())
        try:
            print(cleaned_research_df[col].min())
            print(cleaned_research_df[col].max())
        except Exception:
            # Best-effort: min/max can fail when a column's values cannot be
            # ordered against each other. Catching Exception (rather than a
            # bare except) still lets KeyboardInterrupt stop the cell.
            print('incomplete min/max summary')