In [1]:
import pandas as pd
import numpy as np
import requests
import json
import boto3
import io
from io import BytesIO, StringIO
from dags.common.credentials import secrets as s
# import pyarrow as pa
# import pyarrow.parquet as pq
# import s3fs

def read_chunks(file_path, size):
    """
    Load a large CSV piece by piece and return it as a single dataframe.

    file_path: location handed to ``pd.read_csv`` (callers in this notebook pass URLs).
    size: number of rows per chunk (``chunksize``).
    A progress line is printed for every chunk that is read.
    """
    print(f"Reading {file_path} in chunks...")
    reader = pd.read_csv(file_path, chunksize=size, low_memory=False)
    pieces = []
    # Chunk numbering in the progress output starts at 1.
    for position, piece in enumerate(reader, start=1):
        print(f"Reading chunk # {position}")
        pieces.append(piece)
    combined = pd.concat(pieces)
    print('Read complete.')
    return combined
    
def upload_dataframe_to_s3_csv(dataframe, bucket_name, s3_key):
    """
    Serialise ``dataframe`` to CSV text (index omitted) and store it in S3.

    The CSV text is sent as the object body to ``bucket_name`` under ``s3_key``
    through the module-level ``s3`` client.
    """
    # With no target given, to_csv returns the CSV content as a string.
    csv_text = dataframe.to_csv(index=False)
    s3.put_object(Bucket=bucket_name, Key=s3_key, Body=csv_text)
    
def read_chunks_bytes(s3_obj, size):
    """
    Parse the CSV held in an S3 object chunk by chunk and return one dataframe.

    s3_obj: mapping whose 'Body' entry exposes ``read()`` returning the raw CSV bytes.
    size: number of rows per chunk (``chunksize``).
    A progress line is printed for every chunk that is parsed.
    """
    print("Reading S3 Object in chunks...")
    # The whole body is pulled into memory first; chunking only applies to parsing.
    payload = io.BytesIO(s3_obj['Body'].read())
    pieces = []
    for position, piece in enumerate(
        pd.read_csv(payload, chunksize=size, low_memory=False), start=1
    ):
        print(f"Reading byte chunk # {position}")
        pieces.append(piece)
    combined = pd.concat(pieces)
    print('Read complete.')
    return combined

# Shared S3 client used by upload_dataframe_to_s3_csv; keys and region come from the secrets module.
s3 = boto3.client(
    's3',
    aws_access_key_id=s.s3_aws_access_key_id,
    aws_secret_access_key=s.s3_secret_access_key,
    region_name=s.aws_region,
)

In [8]:
# Airflow Job 1

# Research Payments File Path
# NOTE(review): the 2020 URL points at the P063023 publication while the 2021 URL
# points at P012023 — confirm that mixing publications is intended.
research_2020_path = "https://download.cms.gov/openpayments/PGYR20_P063023/OP_DTL_RSRCH_PGYR2020_P06302023.csv"
research_2021_path ="https://download.cms.gov/openpayments/PGYR21_P012023/OP_DTL_RSRCH_PGYR2021_P01202023.csv"
# Columns retained from a research payments dataframe before it is uploaded;
# every other column in the source file is discarded by the selection below.
research_keep_col_list = ['Change_Type',
'ClinicalTrials_Gov_Identifier',
'Context_of_Research','Covered_Recipient_First_Name',
'Covered_Recipient_Last_Name',
'Covered_Recipient_License_State_code1',
'Covered_Recipient_License_State_code2',
'Covered_Recipient_License_State_code3',
'Covered_Recipient_License_State_code4',
'Covered_Recipient_License_State_code5',
'Covered_Recipient_Middle_Name',
'Covered_Recipient_Name_Suffix',
'Covered_Recipient_NPI',
'Covered_Recipient_Primary_Type_1',
'Covered_Recipient_Primary_Type_2',
'Covered_Recipient_Primary_Type_3',
'Covered_Recipient_Primary_Type_4',
'Covered_Recipient_Primary_Type_5',
'Covered_Recipient_Primary_Type_6',
'Covered_Recipient_Profile_ID',
'Covered_Recipient_Type',
'Date_of_Payment',
'Dispute_Status_for_Publication',
'Expenditure_Category1',
'Expenditure_Category2',
'Expenditure_Category3',
'Expenditure_Category4',
'Expenditure_Category5',
'Expenditure_Category6',
'Form_of_Payment_or_Transfer_of_Value',
'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_2',
'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_3',
'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_4',
'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_5',
'Indicate_Drug_or_Biological_or_Device_or_Medical_Supply_1',
'Name_of_Study',
'Payment_Publication_Date',
'Product_Category_or_Therapeutic_Area_1',
'Product_Category_or_Therapeutic_Area_2',
'Product_Category_or_Therapeutic_Area_3',
'Product_Category_or_Therapeutic_Area_4',
'Product_Category_or_Therapeutic_Area_5',
'Program_Year',
'Recipient_City',
'Recipient_Country',
'Recipient_Postal_Code',
'Recipient_Primary_Business_Street_Address_Line2',
'Recipient_Primary_Business_Street_Address_Line1',
'Recipient_Province',
'Recipient_State',
'Recipient_Zip_Code',
'Record_ID',
'Teaching_Hospital_CCN',
'Teaching_Hospital_ID',
'Teaching_Hospital_Name',
'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_ID',
'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Name',
'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_State',
'Applicable_Manufacturer_or_Applicable_GPO_Making_Payment_Country',
'Total_Amount_of_Payment_USDollars',
                         'Noncovered_Recipient_Entity_Name']

# Upload the trimmed research payments data to S3 as CSV.

# 2020 is currently disabled: only the target key is computed; nothing is read or uploaded.
# research_2020_df = read_chunks(research_2020_path, 50000)[research_keep_col_list]
# upload_dataframe_to_s3_csv(research_2020_df, s.bucket_name, research_2020_file_name)
research_2020_file_name = f"{s.source_s3_key}2020_research_payments.csv"

# 2021: read in 50,000-row chunks, keep only the selected columns, then upload.
research_2021_file_name = f"{s.source_s3_key}2021_research_payments.csv"
research_2021_df = read_chunks(research_2021_path, 50000)[research_keep_col_list]
upload_dataframe_to_s3_csv(research_2021_df, s.bucket_name, research_2021_file_name)

Reading https://download.cms.gov/openpayments/PGYR21_P012023/OP_DTL_RSRCH_PGYR2021_P01202023.csv in chunks...
Reading chunk # 1
Reading chunk # 2
Reading chunk # 3
Reading chunk # 4
Reading chunk # 5
Reading chunk # 6
Reading chunk # 7
Reading chunk # 8
Reading chunk # 9
Reading chunk # 10
Reading chunk # 11
Reading chunk # 12
Reading chunk # 13
Reading chunk # 14
Read complete.


In [4]:
# NOTE(review): no active code in the cells visible here assigns research_2020_df
# (the 2020 load in the "Airflow Job 1" cell is commented out), so this display
# relies on leftover kernel state — confirm before relying on Restart & Run All.
research_2020_df

Unnamed: 0,Change_Type,Covered_Recipient_Type,Noncovered_Recipient_Entity_Name,Teaching_Hospital_CCN,Teaching_Hospital_ID,Teaching_Hospital_Name,Covered_Recipient_Profile_ID,Covered_Recipient_NPI,Covered_Recipient_First_Name,Covered_Recipient_Middle_Name,...,Preclinical_Research_Indicator,Delay_in_Publication_Indicator,Name_of_Study,Dispute_Status_for_Publication,Record_ID,Program_Year,Payment_Publication_Date,ClinicalTrials_Gov_Identifier,Research_Information_Link,Context_of_Research
0,UNCHANGED,Covered Recipient Teaching Hospital,,330154.0,8722.0,Memorial Hospital For Cancer And All,,,,,...,No,No,A PHASE 3 MULTINATIONAL RANDOMIZED OPENLABEL P...,No,742907651,2020,06/30/2023,,,Study Drug
1,UNCHANGED,Covered Recipient Teaching Hospital,,450647.0,9534.0,Medical City Dallas,,,,,...,No,No,An Open-Label Assessment of the Effects of Var...,No,744310283,2020,06/30/2023,,,Payments or transfers of value related to rese...
2,UNCHANGED,Covered Recipient Teaching Hospital,,100006.0,9470.0,Orlando Health,,,,,...,No,No,AMPLATZER Duct Occluder II Additional Sizes (A...,No,751740703,2020,06/30/2023,NCT03055858,,ADO II Additional Sizes IDE
3,CHANGED,Covered Recipient Physician,,,,,303222.0,1.679551e+09,BRADLEY,S,...,No,No,Phoenix DW Registration,No,733472573,2020,06/30/2023,,,
4,UNCHANGED,Covered Recipient Teaching Hospital,,220110.0,8641.0,BRIGHAM AND WOMENS HOSPITAL,,,,,...,No,No,A retrospective analysis of pre-existing and a...,No,731648447,2020,06/30/2023,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669343,UNCHANGED,Non-covered Recipient Entity,UPMC HILLMAN CANCER CENTERS,,,,,,,,...,No,No,Adjuvant Therapy with Pembrolizumab versus Pla...,No,740742239,2020,06/30/2023,,,Study drug provided to a clinical study site.
669344,UNCHANGED,Non-covered Recipient Entity,UCLA HEMATOLOGY AND ONCOLOGY,,,,,,,,...,No,No,"A Phase 3 Multicenter, Randomized, Double-blin...",No,740808647,2020,06/30/2023,,,Equipment or non-drug supplies provided to a c...
669345,UNCHANGED,Non-covered Recipient Entity,"Advanced Clinical Research,Inc",,,,,,,,...,No,No,MEQ00066 Study to Assess the Safety and Immuno...,No,744033735,2020,06/30/2023,NCT04142242,,
669346,UNCHANGED,Non-covered Recipient Entity,ADVNCD MEDCL RSRCH PC,,,,,,,,...,No,No,BI 730357 Psoriasis Long Term Extension,No,751188659,2020,06/30/2023,,,


In [11]:
# Airflow Job 2

# General Payments File Path
payments_2020_path = "https://download.cms.gov/openpayments/PGYR20_P012023/OP_DTL_GNRL_PGYR2020_P01202023.csv"
payments_2021_path = "https://download.cms.gov/openpayments/PGYR21_P012023/OP_DTL_GNRL_PGYR2021_P01202023.csv"

# NOTE(review): drop_payment_cols is not assigned in any cell visible in this notebook;
# it has to be defined (the columns to remove from the general payments files) before
# this cell can run on a fresh kernel.

# Target keys and bucket come from the secrets module, as in the research payments job,
# instead of the undefined bare names s3_key / bucket_name.
payments_2020_file_name = f"{s.source_s3_key}2020_general_payments.csv"
payments_2021_file_name = f"{s.source_s3_key}2021_general_payments.csv"

# Handle one year end-to-end before starting the next, so a failure in the upload step
# surfaces after one download instead of after both large files have been read.
# drop() returns a new frame, which keeps the cell safe to re-run.
payments_2020_df = read_chunks(payments_2020_path, 10000).drop(columns=drop_payment_cols)
upload_dataframe_to_s3_csv(payments_2020_df, s.bucket_name, payments_2020_file_name)

payments_2021_df = read_chunks(payments_2021_path, 10000).drop(columns=drop_payment_cols)
upload_dataframe_to_s3_csv(payments_2021_df, s.bucket_name, payments_2021_file_name)

Reading https://download.cms.gov/openpayments/PGYR20_P012023/OP_DTL_GNRL_PGYR2020_P01202023.csv in chunks...
Reading chunk # 1
Reading chunk # 2
Reading chunk # 3
Reading chunk # 4
Reading chunk # 5
Reading chunk # 6
Reading chunk # 7
Reading chunk # 8
Reading chunk # 9
Reading chunk # 10
Reading chunk # 11
Reading chunk # 12
Reading chunk # 13
Reading chunk # 14
Reading chunk # 15
Reading chunk # 16
Reading chunk # 17
Reading chunk # 18
Reading chunk # 19
Reading chunk # 20
Reading chunk # 21
Reading chunk # 22
Reading chunk # 23
Reading chunk # 24
Reading chunk # 25
Reading chunk # 26
Reading chunk # 27
Reading chunk # 28
Reading chunk # 29
Reading chunk # 30
Reading chunk # 31
Reading chunk # 32
Reading chunk # 33
Reading chunk # 34
Reading chunk # 35
Reading chunk # 36
Reading chunk # 37
Reading chunk # 38
Reading chunk # 39
Reading chunk # 40
Reading chunk # 41
Reading chunk # 42
Reading chunk # 43
Reading chunk # 44
Reading chunk # 45
Reading chunk # 46
Reading chunk # 47
Reading

Reading chunk # 411
Reading chunk # 412
Reading chunk # 413
Reading chunk # 414
Reading chunk # 415
Reading chunk # 416
Reading chunk # 417
Reading chunk # 418
Reading chunk # 419
Reading chunk # 420
Reading chunk # 421
Reading chunk # 422
Reading chunk # 423
Reading chunk # 424
Reading chunk # 425
Reading chunk # 426
Reading chunk # 427
Reading chunk # 428
Reading chunk # 429
Reading chunk # 430
Reading chunk # 431
Reading chunk # 432
Reading chunk # 433
Reading chunk # 434
Reading chunk # 435
Reading chunk # 436
Reading chunk # 437
Reading chunk # 438
Reading chunk # 439
Reading chunk # 440
Reading chunk # 441
Reading chunk # 442
Reading chunk # 443
Reading chunk # 444
Reading chunk # 445
Reading chunk # 446
Reading chunk # 447
Reading chunk # 448
Reading chunk # 449
Reading chunk # 450
Reading chunk # 451
Reading chunk # 452
Reading chunk # 453
Reading chunk # 454
Reading chunk # 455
Reading chunk # 456
Reading chunk # 457
Reading chunk # 458
Reading chunk # 459
Reading chunk # 460


Reading chunk # 237
Reading chunk # 238
Reading chunk # 239
Reading chunk # 240
Reading chunk # 241
Reading chunk # 242
Reading chunk # 243
Reading chunk # 244
Reading chunk # 245
Reading chunk # 246
Reading chunk # 247
Reading chunk # 248
Reading chunk # 249
Reading chunk # 250
Reading chunk # 251
Reading chunk # 252
Reading chunk # 253
Reading chunk # 254
Reading chunk # 255
Reading chunk # 256
Reading chunk # 257
Reading chunk # 258
Reading chunk # 259
Reading chunk # 260
Reading chunk # 261
Reading chunk # 262
Reading chunk # 263
Reading chunk # 264
Reading chunk # 265
Reading chunk # 266
Reading chunk # 267
Reading chunk # 268
Reading chunk # 269
Reading chunk # 270
Reading chunk # 271
Reading chunk # 272
Reading chunk # 273
Reading chunk # 274
Reading chunk # 275
Reading chunk # 276
Reading chunk # 277
Reading chunk # 278
Reading chunk # 279
Reading chunk # 280
Reading chunk # 281
Reading chunk # 282
Reading chunk # 283
Reading chunk # 284
Reading chunk # 285
Reading chunk # 286


Reading chunk # 647
Reading chunk # 648
Reading chunk # 649
Reading chunk # 650
Reading chunk # 651
Reading chunk # 652
Reading chunk # 653
Reading chunk # 654
Reading chunk # 655
Reading chunk # 656
Reading chunk # 657
Reading chunk # 658
Reading chunk # 659
Reading chunk # 660
Reading chunk # 661
Reading chunk # 662
Reading chunk # 663
Reading chunk # 664
Reading chunk # 665
Reading chunk # 666
Reading chunk # 667
Reading chunk # 668
Reading chunk # 669
Reading chunk # 670
Reading chunk # 671
Reading chunk # 672
Reading chunk # 673
Reading chunk # 674
Reading chunk # 675
Reading chunk # 676
Reading chunk # 677
Reading chunk # 678
Reading chunk # 679
Reading chunk # 680
Reading chunk # 681
Reading chunk # 682
Reading chunk # 683
Reading chunk # 684
Reading chunk # 685
Reading chunk # 686
Reading chunk # 687
Reading chunk # 688
Reading chunk # 689
Reading chunk # 690
Reading chunk # 691
Reading chunk # 692
Reading chunk # 693
Reading chunk # 694
Reading chunk # 695
Reading chunk # 696


Reading chunk # 1055
Reading chunk # 1056
Reading chunk # 1057
Reading chunk # 1058
Reading chunk # 1059
Reading chunk # 1060
Reading chunk # 1061
Reading chunk # 1062
Reading chunk # 1063
Reading chunk # 1064
Reading chunk # 1065
Reading chunk # 1066
Reading chunk # 1067
Reading chunk # 1068
Reading chunk # 1069
Reading chunk # 1070
Reading chunk # 1071
Reading chunk # 1072
Reading chunk # 1073
Reading chunk # 1074
Reading chunk # 1075
Reading chunk # 1076
Reading chunk # 1077
Reading chunk # 1078
Reading chunk # 1079
Reading chunk # 1080
Reading chunk # 1081
Reading chunk # 1082
Reading chunk # 1083
Reading chunk # 1084
Reading chunk # 1085
Reading chunk # 1086
Reading chunk # 1087
Reading chunk # 1088
Reading chunk # 1089
Reading chunk # 1090
Reading chunk # 1091
Reading chunk # 1092
Reading chunk # 1093
Reading chunk # 1094
Reading chunk # 1095
Reading chunk # 1096
Reading chunk # 1097
Reading chunk # 1098
Reading chunk # 1099
Reading chunk # 1100
Reading chunk # 1101
Reading chunk

NameError: name 'upload_dataframe_to_s3' is not defined

In [13]:
# Airflow Job 3-done

# Ownership Payments File Path
# These two paths were previously not assigned anywhere in the notebook; the URLs are the
# ones printed by read_chunks in this cell's saved output.
ownership_2020_path = "https://download.cms.gov/openpayments/PGYR20_P012023/OP_DTL_OWNRSHP_PGYR2020_P01202023.csv"
ownership_2021_path = "https://download.cms.gov/openpayments/PGYR21_P012023/OP_DTL_OWNRSHP_PGYR2021_P01202023.csv"

# Read Ownership Data for 2020 and 2021 and upload each file unchanged.
# Bucket and key prefix come from the secrets module, as in the research payments job,
# instead of the undefined bare names s3_key / bucket_name.
ownership_2020_df = read_chunks(ownership_2020_path, 50000)
ownership_2020_file_name = f"{s.source_s3_key}2020_ownership_payments.csv"
upload_dataframe_to_s3_csv(ownership_2020_df, s.bucket_name, ownership_2020_file_name)

ownership_2021_df = read_chunks(ownership_2021_path, 50000)
ownership_2021_file_name = f"{s.source_s3_key}2021_ownership_payments.csv"
upload_dataframe_to_s3_csv(ownership_2021_df, s.bucket_name, ownership_2021_file_name)

Reading https://download.cms.gov/openpayments/PGYR20_P012023/OP_DTL_OWNRSHP_PGYR2020_P01202023.csv in chunks...
Reading chunk # 1
Read complete.
Reading https://download.cms.gov/openpayments/PGYR21_P012023/OP_DTL_OWNRSHP_PGYR2021_P01202023.csv in chunks...
Reading chunk # 1
Read complete.


In [None]:
# Provider Mapping Path
# provider_mapping_path = "https://download.cms.gov/openpayments/SMRY_P01202023/PBLCTN_PRVDR_PRFL_MAPPING_P01202023_01042023.csv"
# # Provider profile ID mapping table - Maps secondary/duplicate provider profile IDs to the primary ID in the Open Payments system. For multiple reasons, a small number of providers have more than one ID in the payments records
# # no concatenation
# provider_mapping_df = read_chunks(provider_mapping_path, 50000)

In [15]:
# Airflow Job 4-done

# MIPS Ratings
# The closing quote was missing on this literal, which made the whole cell a SyntaxError.
mips_data_path = 'https://data.cms.gov/provider-data/sites/default/files/resources/a0f235e13d54670824f07977299e80e3_1676693125/ec_score_file.csv'

# MIPS Data
# The column names below carry a leading space exactly as in the original cell;
# presumably that matches the header of the source file — verify against the CSV.
# drop() returns a new frame, which keeps the cell safe to re-run.
mips_df = pd.read_csv(mips_data_path).drop(
    columns=[' Org_PAC_ID', ' facility_ccn', ' facility_lbn', ' Cost_category_score']
)

# Bucket and key prefix come from the secrets module, as in the research payments job,
# instead of the undefined bare names s3_key / bucket_name.
mips_file_name = f"{s.source_s3_key}mips_score.csv"
upload_dataframe_to_s3_csv(mips_df, s.bucket_name, mips_file_name)

In [20]:
# Airflow Job 5-done

# Hospital Owner API URL
hospital_url = "https://data.cms.gov/data-api/v1/dataset/029c119f-f79c-49be-9100-344d31d10344/data"

# hospital owner data
# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops an HTTP error body from being parsed and uploaded as data.
hospital_response = requests.get(hospital_url, timeout=60)
hospital_response.raise_for_status()
hospital_df = pd.DataFrame(hospital_response.json())

# Bucket and key prefix come from the secrets module, as in the research payments job,
# instead of the undefined bare names s3_key / bucket_name.
hospital_file_name = f"{s.source_s3_key}hospital_owner.csv"
upload_dataframe_to_s3_csv(hospital_df, s.bucket_name, hospital_file_name)

In [25]:
# Airflow Job 6 -done

# Physician Profile File Path
profile_path = "https://download.cms.gov/openpayments/PHPRFL_P012023/OP_CVRD_RCPNT_PRFL_SPLMTL_P01202023.csv"

# Read the Physician Profile Supplement (a single file, so nothing to concatenate across
# years) and upload it unchanged.
profile_df = read_chunks(profile_path, 50000)

# Bucket and key prefix come from the secrets module, as in the research payments job,
# instead of the undefined bare names s3_key / bucket_name.
profile_file_name = f"{s.source_s3_key}physician_profile.csv"
upload_dataframe_to_s3_csv(profile_df, s.bucket_name, profile_file_name)

In [53]:
# Try the general payments API: send one SQL-style datastore query and display the
# response object to check that the endpoint answers.
api_url = 'https://openpaymentsdata.cms.gov/api/1/datastore/sql?query=[SELECT * FROM bd4a5569-f475-5de2-b9e1-62f9f42e3d4a][WHERE Date_of_Payment = "08/21/2020"][LIMIT 500 OFFSET 500]&show_db_columns=false'
# Without a timeout, requests waits indefinitely if the server never answers.
general_response = requests.get(api_url, timeout=60)
general_response

<Response [200]>

In [None]:
# # high level summary for all string columns
# for col in concat_research_df.columns:
#     col_type = concat_research_df[col].dtype
#     print("")
#     print(f"'{col}'")
#     print(f"Column is {col_type} type.")
#     if col_type != 'float64':
#         print(concat_research_df[col].unique())
#         try:
#             print(concat_research_df[col].min())
#             print(concat_research_df[col].max())
#         except:
#             print('incomplete min/max summary')