In [97]:
import json
from io import StringIO
from typing import Optional, Tuple, MutableSet, List

import pandas
import requests
from bs4 import BeautifulSoup
from numpy import nan

In [19]:
def _datastore_id_to_query_key(id: str) -> str:
    """Resolve a metastore dataset id to the identifier of its first distribution.

    Parameters
    ----------
    id:
        Dataset identifier understood by the Open Payments metastore
        endpoint (for example ``'yjhd-k7tx'``).

    Returns
    -------
    str
        The ``identifier`` field of ``distribution[0]`` in the metastore
        record. Other cells use this value as the resource key in datastore
        query URLs.

    Raises
    ------
    requests.HTTPError
        If the metastore answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    url = f"https://openpaymentsdata.cms.gov/api/1/metastore/schemas/dataset/items/{id}?show-reference-ids=false"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=60)
    # Surface HTTP errors directly instead of failing later with an opaque
    # KeyError / JSON decoding error on an error payload.
    response.raise_for_status()
    return response.json()["distribution"][0]["identifier"]

In [20]:
# Dataset ids of the research-payment tables and their datastore query keys.
research_master_ids = ('nvfc-jcr4', '94mj-bpz5', '29v2-guh5')
research_keys = list(map(_datastore_id_to_query_key, research_master_ids))

In [21]:
# Dataset ids of the general-payment tables and their datastore query keys.
general_master_ids = ('ud7t-2ipu', 'qsys-b88w', 'txng-a8vj')
general_keys = list(map(_datastore_id_to_query_key, general_master_ids))

In [22]:
# Company names to look up, taken from the 'Company' column of the CSV.
target_companies: List[str] = (
    pandas.read_csv("compareToCompanies.csv")['Company'].tolist()
)

In [23]:
# Datastore key for dataset 'yjhd-k7tx'. _company_name_to_id queries this
# resource to match company names against entity names.
profile_info_key = _datastore_id_to_query_key('yjhd-k7tx')
profile_info_key

'c218c372-76e4-5603-9981-67c324402722'

In [28]:
def _company_name_to_id(name: str) -> List[str]:
    """Look up the entity id of a company by (partial) name.

    The stripped full name is tried first. If that does not produce exactly
    one match and the name consists of several words, the first word alone
    is tried as a fallback.

    Parameters
    ----------
    name:
        Company name; surrounding whitespace is ignored.

    Returns
    -------
    List[str]
        A single-element list holding the ``entity_id`` of the unique match,
        or an empty list when no attempt produced exactly one result.

    Raises
    ------
    requests.HTTPError
        If the datastore answers with a 4xx/5xx status.
    """
    name = name.strip()
    words = name.split()

    targets = [name]
    if len(words) > 1:
        targets.append(words[0])

    url = f"https://openpaymentsdata.cms.gov/api/1/datastore/query/{profile_info_key}"
    for target_name in targets:
        payload = {
            "keys": "true",
            # Only the first 10 matches are fetched; that is enough to tell
            # "exactly one" apart from "several".
            "limit": "10",
            "offset": "0",
            "conditions[0][resource]": "t",
            "conditions[0][property]": "entity_type",
            "conditions[0][value]": "c",
            "conditions[0][operator]": "=",
            "conditions[1][groupOperator]": "or",
            "conditions[1][conditions][0][resource]": "t",
            "conditions[1][conditions][0][property]": "entity_name",
            # NOTE(review): inside an f-string "%%" is two literal percent
            # signs, so the pattern sent is "%%name%%". Kept as-is; confirm
            # whether a single "%" wildcard was intended.
            "conditions[1][conditions][0][value]": f"%%{target_name}%%",
            "conditions[1][conditions][0][operator]": "like",
            "sorts[0][property]": "entity_name",
            "sorts[0][order]": "asc"
        }
        response = requests.get(url, params=payload, timeout=60)
        # Fail with a clear HTTP error rather than a KeyError on "results".
        response.raise_for_status()

        entity_ids = [row["entity_id"] for row in response.json()["results"]]
        # Ambiguous (several) or missing (zero) matches are rejected so a
        # company is never attributed to the wrong entity.
        if len(entity_ids) == 1:
            return entity_ids

    return []
    

In [29]:
# Map every target company name to the list of entity ids found for it.
company_names_to_ids = dict(
    zip(target_companies, map(_company_name_to_id, target_companies))
)

In [34]:
# Keep only the companies that resolved to exactly one entity id and unwrap
# the single-element list. Values that are already plain strings are kept
# unchanged so that re-running this cell does not empty the mapping.
company_names_to_ids = {
    company: ids if isinstance(ids, str) else ids[0]
    for company, ids in company_names_to_ids.items()
    if isinstance(ids, str) or len(ids) == 1
}

In [55]:
# Distinct entity ids of all companies that were resolved.
company_ids = {*company_names_to_ids.values()}
company_ids

{'100000000102',
 '100000000157',
 '100000000278',
 '100000010482',
 '100000010574',
 '100000010605',
 '100000010997',
 '100000011034',
 '100000016250',
 '100000076361',
 '100000131389',
 '100000136519',
 '100000181754',
 '100000216803',
 '100000226814'}

In [105]:
# Download every general-payment row made by one of the target companies.
search_col = "applicable_manufacturer_or_applicable_gpo_making_payment_id"
page_size = 1_000_000
general_pages = []  # one DataFrame per downloaded page, concatenated once below

for key in general_keys:
    # Sorted so the row order of the result is reproducible between runs
    # (iteration order of a set of strings is not).
    for company_id in sorted(company_ids):
        offset = 0
        while True:
            query = f'[SELECT * FROM {key}][WHERE {search_col} = "{company_id}"][LIMIT {page_size} OFFSET {offset}];'
            url = f'https://openpaymentsdata.cms.gov/api/1/datastore/sql?query={query}'

            r = requests.get(url)  # fetched via requests so HTTP errors can be checked
            r.raise_for_status()

            # Wrap the payload in StringIO: passing literal JSON text to
            # read_json is deprecated in recent pandas releases.
            tmp: pandas.DataFrame = pandas.read_json(StringIO(r.text))

            # No rows means the previous page was the last one. Checking
            # `.empty` also stops on a page that has columns but no rows,
            # which would otherwise loop forever.
            if tmp.empty:
                break

            general_pages.append(tmp)
            # Advance by the number of rows actually received, so nothing is
            # skipped in case the server caps LIMIT below page_size.
            offset += len(tmp)

# Concatenate once instead of growing a frame inside the loop
# (DataFrame.append was removed in pandas 2.0 and copied everything per call).
df = pandas.concat(general_pages) if general_pages else pandas.DataFrame()

del search_col, page_size, general_pages

In [106]:
# Treat empty physician ids as missing and drop those rows. The column is
# assigned back explicitly: an in-place replace on a column selection is not
# guaranteed to reach the parent frame (it does not under Copy-on-Write).
df['physician_profile_id'] = df['physician_profile_id'].replace('', nan)
df = df.dropna(subset=['physician_profile_id'])

In [107]:
# Renumber rows 0..n-1 after the filtering above; the old index is discarded.
df = df.reset_index(drop=True)


In [111]:
# Download every research-payment row made by one of the target companies and
# unpivot the five principal-investigator column groups into one row each.
search_col = "applicable_manufacturer_or_applicable_gpo_making_payment_id"
page_size = 1_000_000
investigator_frames = []  # one frame per (page, investigator slot)

for key in research_keys:
    # Sorted so the row order of the result is reproducible between runs.
    for company_id in sorted(company_ids):
        offset = 0
        while True:
            query = f'[SELECT * FROM {key}][WHERE {search_col} = "{company_id}"][LIMIT {page_size} OFFSET {offset}];'
            url = f'https://openpaymentsdata.cms.gov/api/1/datastore/sql?query={query}'

            r = requests.get(url)  # fetched via requests so HTTP errors can be checked
            r.raise_for_status()

            # Wrap the payload in StringIO: passing literal JSON text to
            # read_json is deprecated in recent pandas releases.
            tmp: pandas.DataFrame = pandas.read_json(StringIO(r.text))

            # No rows means the previous page was the last one. Checking
            # `.empty` also stops on a page that has columns but no rows.
            if tmp.empty:
                break

            # The former `tmp[tmp.index.notnull()]` filter was dropped: right
            # after reset_index the index is a fresh 0..n-1 range and can
            # never contain nulls.
            tmp = tmp.reset_index(drop=True)

            for i in range(5):  # get all 5 principal investigators into a master dataframe
                investigator_key = f'principal_investigator_{i+1}'

                # Columns that belong to the *other* investigator slots.
                columns_to_drop = [
                    col for col in tmp.columns
                    if col.startswith('principal_investigator')
                    and not col.startswith(investigator_key)
                ]

                # Strip the slot number so all slots share the same column names.
                columns_to_rename = {
                    col: col.replace(investigator_key, 'principal_investigator')
                    for col in tmp.columns if col.startswith(investigator_key)
                }

                investigator_frames.append(
                    tmp.drop(columns=columns_to_drop).rename(columns=columns_to_rename)
                )

            # Advance by the number of rows actually received, so nothing is
            # skipped in case the server caps LIMIT below page_size.
            offset += len(tmp)

# Concatenate once instead of growing a frame inside the loop
# (DataFrame.append was removed in pandas 2.0 and copied everything per call).
df_research = (
    pandas.concat(investigator_frames) if investigator_frames else pandas.DataFrame()
)

del search_col, page_size, investigator_frames

    
    



In [112]:
# Treat empty investigator ids as missing and drop those rows. The column is
# assigned back explicitly: an in-place replace on a column selection is not
# guaranteed to reach the parent frame (it does not under Copy-on-Write).
df_research['principal_investigator_profile_id'] = (
    df_research['principal_investigator_profile_id'].replace('', nan)
)
df_research = df_research.dropna(subset=['principal_investigator_profile_id'])

In [113]:
# Renumber rows 0..n-1 after the filtering above; the old index is discarded.
df_research = df_research.reset_index(drop=True)

In [114]:
# Display the research frame for a visual sanity check.
df_research

Unnamed: 0,change_type,covered_recipient_type,noncovered_recipient_entity_name,teaching_hospital_ccn,teaching_hospital_id,teaching_hospital_name,physician_profile_id,physician_first_name,physician_middle_name,physician_last_name,...,preclinical_research_indicator,delay_in_publication_indicator,name_of_study,dispute_status_for_publication,record_id,program_year,payment_publication_date,clinicaltrials_gov_identifier,research_information_link,context_of_research
0,UNCHANGED,Covered Recipient Teaching Hospital,,340030,6801,DUKE UNIVERSITY HOSPITAL,,,,,...,No,No,"A Prospective, Single-Center, Randomized, Doub...",No,608351697,2018,06/30/2021,,,
1,UNCHANGED,Non-covered Recipient Entity,Apex Eye Clinical Research LLC,,,,,,,,...,No,No,"A Prospective, Multicenter, Randomized, Parall...",No,608348859,2018,06/30/2021,,,
2,UNCHANGED,Non-covered Recipient Entity,Oculus Research Inc.,,,,,,,,...,No,No,"A Prospective, Multicenter, Randomized, Parall...",No,608349661,2018,06/30/2021,,,
3,UNCHANGED,Non-covered Recipient Entity,"Rochester Ophthalmological Group, PC",,,,,,,,...,No,No,"A Prospective, Multicenter, Randomized, Parall...",No,608351113,2018,06/30/2021,,,
4,UNCHANGED,Non-covered Recipient Entity,Tekwani Vision Center,,,,,,,,...,No,No,"A Prospective, Multicenter, Randomized, Parall...",No,608352557,2018,06/30/2021,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3864,NEW,Covered Recipient Teaching Hospital,,230046,9198,UNIV OF MI HOSPITALS & HLTH CTRS,,,,,...,No,No,"A multi-centered, randomized, double-blind, pl...",No,721242077,2020,06/30/2021,,,
3865,NEW,Covered Recipient Teaching Hospital,,490009,9389,UNIVERSITY OF VIRGINIA MEDICAL CENTE,,,,,...,No,No,"A multi-centered, randomized, double-blind, pl...",No,721242343,2020,06/30/2021,,,
3866,NEW,Covered Recipient Teaching Hospital,,110010,9427,EMORY UNIVERSITY HOSPITAL,,,,,...,No,No,"A 12-week, Randomized, Double-blind, Placebo-c...",No,721242599,2020,06/30/2021,,,
3867,NEW,Covered Recipient Teaching Hospital,,250001,9578,UNIVERSITY OF MISSISSIPPI MEDICAL,,,,,...,No,No,"A Two-part, Multicenter, Dose-titration Study ...",No,721242855,2020,06/30/2021,,,


In [115]:
# Columns kept in the final output, in output order.
target_columns = [
    # who received the payment
    "physician_profile_id",
    "physician_first_name",
    "physician_middle_name",
    "physician_last_name",
    "physician_name_suffix",
    # where the recipient is located
    "recipient_primary_business_street_address_line1",
    "recipient_primary_business_street_address_line2",
    "recipient_city",
    "recipient_state",
    "recipient_zip_code",
    "recipient_country",
    "recipient_postal_code",
    # recipient classification
    "physician_primary_type",
    "specialty_code",
    # the payment itself
    "total_amount_of_payment_usdollars",
    "date_of_payment",
    "number_of_payments_included_in_total_amount",
    "form_of_payment_or_transfer_of_value",
    "nature_of_payment_or_transfer_of_value",
    # record bookkeeping
    "record_id",
    "program_year",
    "payment_publication_date",
    # who made the payment
    "applicable_manufacturer_or_applicable_gpo_making_payment_name",
    "applicable_manufacturer_or_applicable_gpo_making_payment_id",
]

In [116]:
def _research_is_a_target_column(col: str) -> bool:
    """Tell whether a research-frame column should survive into the output.

    * Columns starting with ``physician`` or ``recipient`` are always rejected.
    * ``principal_investigator...`` columns are accepted when swapping that
      prefix for ``physician`` or ``recipient`` yields a name in
      ``target_columns``.
    * Every other column is accepted only if it is itself in ``target_columns``.
    """
    if col.startswith(('physician', 'recipient')):
        return False

    if col.startswith('principal_investigator'):
        return any(
            col.replace('principal_investigator', prefix) in target_columns
            for prefix in ('physician', 'recipient')
        )

    return col in target_columns

def _research_rename_func(col: str) -> str:
    """Translate a research-frame column name to its ``target_columns`` name.

    Names already in ``target_columns`` are returned as they are. Otherwise
    the ``principal_investigator`` part is swapped for ``physician`` and then
    ``recipient``, and the first variant found in ``target_columns`` is
    returned.

    If nothing matches, the name is returned unchanged. Previously the
    function fell off the end and returned ``None``, which would have renamed
    the column to ``None``.
    """
    if col in target_columns:
        return col

    for s in ['physician', 'recipient']:
        if (replaced := col.replace('principal_investigator', s)) in target_columns:
            return replaced

    return col
    

In [117]:
# Keep only the columns that map onto target_columns, then give them their
# target names.
df_research = (
    df_research
    .loc[:, [col for col in df_research.columns if _research_is_a_target_column(col)]]
    .rename(columns=_research_rename_func)
)

In [118]:
# Flag every row of this frame as a research payment.
df_research = df_research.assign(is_research=True)

In [119]:
# Renumber rows 0..n-1. drop=True discards the old index instead of inserting
# it as an extra 'index' column: that column was never used (the final column
# selection removes it) and made this cell behave differently on every re-run.
df = df.reset_index(drop=True)

In [120]:
# Specialty lookup table; lines starting with '#' in the file are skipped.
specialty_codes = pandas.read_csv(
    '../../data/util/specialty_codes.csv',
    index_col=0,
    comment='#',
)

In [121]:
# Index by the three-level specialty hierarchy so codes can be looked up by tuple.
specialty_codes = specialty_codes.set_index(
    ['Grouping', 'Classification', 'Specialization']
)

In [122]:
def match_specialty_string_to_code(string: Optional[str]) -> Optional[str]:
    """Look up the ``Code`` entry of ``specialty_codes`` for a specialty string.

    The string is split on ``'|'`` and padded with ``None`` up to three parts;
    the resulting tuple is used as the row key into ``specialty_codes``
    (presumably Grouping / Classification / Specialization, matching the
    index of that table; verify against the cell that builds it).

    Returns ``None`` for a null input or when the key is not in the table.
    """
    if pandas.isnull(string):
        return None

    parts = string.split('|')
    # Pad short keys; a non-positive multiplier leaves the list untouched.
    parts.extend([None] * (3 - len(parts)))

    try:
        code = specialty_codes.at[tuple(parts), 'Code']
    except KeyError:
        return None
    return code

In [123]:
# Translate each physician's specialty string into its specialty code.
df['specialty_code'] = df['physician_specialty'].map(match_specialty_string_to_code)

In [124]:
# Flag every row of the general-payments frame as non-research.
df = df.assign(is_research=False)

In [125]:
# Stack general and research payments into one frame with a fresh 0..n-1 index.
# pandas.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index=True is the equivalent of the former .reset_index(drop=True).
df = pandas.concat(
    [df.loc[:, target_columns + ['is_research']], df_research],
    ignore_index=True,
)


In [126]:
# Persist the combined frame. With to_csv's default index=True the row index
# is written as an unnamed first column.
df.to_csv('../../data/processed/open_payments.csv')