In [None]:
import pandas
import requests
import json
from numpy import nan
from typing import Optional, Tuple, MutableSet, List
from bs4 import BeautifulSoup
from io import StringIO

In [None]:
def _datastore_id_to_query_key(id: str) -> str:
    """Look up the datastore query key for an Open Payments dataset.

    Fetches the dataset's metastore record and returns the ``identifier`` of
    its first distribution.

    Args:
        id: Dataset identifier as it appears in the metastore URL.

    Returns:
        The ``identifier`` field of ``distribution[0]`` in the response.

    Raises:
        requests.HTTPError: If the metastore request returns an error status.
        KeyError: If the response lacks ``distribution`` or ``identifier``.
        IndexError: If the ``distribution`` list is empty.
    """
    url = f"https://openpaymentsdata.cms.gov/api/1/metastore/schemas/dataset/items/{id}?show-reference-ids=false"
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of trying to parse an error body,
    # the same way the download cells of this notebook treat their responses.
    response.raise_for_status()
    return response.json()["distribution"][0]["identifier"]

In [None]:
# Master dataset ids for the research-payment data, and the datastore query
# key that each of them resolves to.
research_master_ids = ('nvfc-jcr4', '94mj-bpz5', '29v2-guh5')
research_keys = list(map(_datastore_id_to_query_key, research_master_ids))

In [None]:
# Master dataset ids for the general-payment data, and the datastore query
# key that each of them resolves to.
general_master_ids = ('ud7t-2ipu', 'qsys-b88w', 'txng-a8vj')
general_keys = list(map(_datastore_id_to_query_key, general_master_ids))

In [None]:
# Company names to look up, taken from the 'Company' column of the CSV.
target_companies: List[str] = (
    pandas.read_csv("compareToCompanies.csv")['Company'].to_list()
)

In [None]:
# Datastore query key of dataset 'yjhd-k7tx'; _company_name_to_id queries this
# datastore to resolve company names to entity ids.
profile_info_key = _datastore_id_to_query_key('yjhd-k7tx')
profile_info_key

In [None]:
def _company_name_to_id(name: str) -> List[str]:
    """Resolve a company name to its Open Payments entity id.

    Queries the profile-information datastore for entities of type ``c`` whose
    ``entity_name`` matches ``name``. If ``name`` has more than one word and the
    full name does not give exactly one hit, the first word alone is tried.

    Args:
        name: Company name; surrounding whitespace is ignored.

    Returns:
        A one-element list holding the matching ``entity_id``, or an empty list
        when no search term produced exactly one match (or ``name`` is blank).

    Raises:
        requests.HTTPError: If the datastore query returns an error status.
    """
    name = name.strip()
    if not name:
        # A blank name would turn into a match-everything pattern; skip the call.
        return []

    # Try the full name first, then fall back to its first word.
    search_terms = [name]
    words = name.split()
    if len(words) > 1:
        search_terms.append(words[0])

    url = f"https://openpaymentsdata.cms.gov/api/1/datastore/query/{profile_info_key}"
    for term in search_terms:
        payload = {
            "keys": "true",
            "limit": "10",
            "offset": "0",
            "conditions[0][resource]": "t",
            "conditions[0][property]": "entity_type",
            "conditions[0][value]": "c",
            "conditions[0][operator]": "=",
            "conditions[1][groupOperator]": "or",
            "conditions[1][conditions][0][resource]": "t",
            "conditions[1][conditions][0][property]": "entity_name",
            # NOTE: inside an f-string "%%" is two literal percent signs, so the
            # pattern sent is "%%<term>%%"; kept exactly as the original sent it.
            "conditions[1][conditions][0][value]": f"%%{term}%%",
            "conditions[1][conditions][0][operator]": "like",
            "sorts[0][property]": "entity_name",
            "sorts[0][order]": "asc"
        }
        response = requests.get(url, params=payload)
        # Surface HTTP failures directly, as the download cells do.
        response.raise_for_status()

        matches = [row["entity_id"] for row in response.json()["results"]]
        # Only an unambiguous (single) hit is accepted.
        if len(matches) == 1:
            return matches

    return []
    

In [None]:
# Map every target company name to the list of entity ids found for it
# (one element on an unambiguous match, otherwise empty).
company_names_to_ids = {
    company_name: _company_name_to_id(company_name)
    for company_name in target_companies
}

In [None]:
# Keep only companies with exactly one matching id and unwrap that id.
# The name is rebound in place, so the cell may be re-run after the values
# have already been unwrapped; already-unwrapped (non-list) values are passed
# through instead of being tested with len() and silently dropped.
company_names_to_ids = {
    company: (match[0] if isinstance(match, list) else match)
    for company, match in company_names_to_ids.items()
    if not isinstance(match, list) or len(match) == 1
}

In [None]:
# Distinct entity ids of all companies that were resolved.
company_ids = {*company_names_to_ids.values()}
company_ids

In [None]:
search_col = "applicable_manufacturer_or_applicable_gpo_making_payment_id"

# Download the general-payment rows of every target company from every general
# dataset. Each download is collected in a list and concatenated once at the
# end: DataFrame.append was removed in pandas 2.0, and appending inside the
# loop re-copied the accumulated frame on every iteration.
general_frames = []

for dataset_id in general_master_ids:
    # The download URL depends only on the dataset, so build it once per dataset.
    url = f"https://openpaymentsdata.cms.gov/api/1/datastore/query/{dataset_id}/0/download"
    for company_id in company_ids:
        print(company_id, "beginning query")
        params = {
            "conditions[0][property]": search_col,
            "conditions[0][value]"   : company_id,
            "conditions[0][operator]": '=',
            "format"                 : 'csv'
        }

        r = requests.get(url, params=params)
        r.raise_for_status()

        # Every column is read as str so ids keep their exact text.
        general_frames.append(pandas.read_csv(StringIO(r.text), dtype=str))

# pandas.concat raises on an empty list, so fall back to an empty frame.
df_general = pandas.concat(general_frames) if general_frames else pandas.DataFrame()
del search_col, general_frames

In [None]:
# Treat empty-string profile ids as missing, then drop payments that have no
# physician profile id. The replaced column is assigned back explicitly:
# calling replace(inplace=True) on a selected column is chained in-place
# mutation, which does not update the parent frame under copy-on-write.
df_general['physician_profile_id'] = df_general['physician_profile_id'].replace('', nan)
df_general.dropna(subset=['physician_profile_id'], inplace=True)

In [None]:
# Renumber the rows 0..n-1, discarding the old index.
df_general = df_general.reset_index(drop=True)


In [None]:
search_col = "applicable_manufacturer_or_applicable_gpo_making_payment_id"

# Download the research-payment rows of every target company from every
# research dataset and reshape them so that each of the (up to) five principal
# investigators of a payment gets its own row. Frames are collected in a list
# and concatenated once: DataFrame.append was removed in pandas 2.0, and
# appending inside the loop re-copied the accumulated frame on every iteration.
research_frames = []

for dataset_id in research_master_ids:
    # The download URL depends only on the dataset, so build it once per dataset.
    url = f"https://openpaymentsdata.cms.gov/api/1/datastore/query/{dataset_id}/0/download"
    for company_id in company_ids:
        print(company_id, "beginning query")
        params = {
            "conditions[0][property]": search_col,
            "conditions[0][value]"   : company_id,
            "conditions[0][operator]": '=',
            "format"                 : 'csv'
        }

        r = requests.get(url, params=params)
        r.raise_for_status()

        tmp = pandas.read_csv(StringIO(r.text), dtype=str)

        # After this the index is a fresh 0..n-1 range, which cannot contain
        # nulls, so the former `tmp.index.notnull()` filter was a no-op and
        # has been removed.
        tmp.reset_index(drop=True, inplace=True)

        investigator_columns = [
            col for col in tmp.columns
            if col.startswith('principal_investigator')
        ]

        for i in range(5): # get all 5 principal investigators into a master dataframe
            investigator_key = f'principal_investigator_{i+1}'

            # Columns that belong to the other investigators are dropped ...
            columns_to_drop = [
                col for col in investigator_columns
                if not col.startswith(investigator_key)
            ]

            # ... and this investigator's columns lose their number, so all
            # five slices end up with the same column names.
            columns_to_rename = {
                col: col.replace(investigator_key, 'principal_investigator')
                for col in investigator_columns
                if col.startswith(investigator_key)
            }

            research_frames.append(
                tmp.drop(columns=columns_to_drop).rename(columns=columns_to_rename)
            )

# pandas.concat raises on an empty list, so fall back to an empty frame.
df_research = pandas.concat(research_frames) if research_frames else pandas.DataFrame()

del search_col, research_frames

In [None]:
# Treat empty-string profile ids as missing, then drop the investigator rows
# that have no profile id. The replaced column is assigned back explicitly:
# calling replace(inplace=True) on a selected column is chained in-place
# mutation, which does not update the parent frame under copy-on-write.
df_research['principal_investigator_profile_id'] = (
    df_research['principal_investigator_profile_id'].replace('', nan)
)
df_research.dropna(subset=['principal_investigator_profile_id'], inplace=True)

In [None]:
# Renumber the rows 0..n-1, discarding the old index.
df_research = df_research.reset_index(drop=True)

In [None]:
# Columns kept in the combined output, in output order.
target_columns = [
    # physician identity
    'physician_profile_id',
    'physician_first_name',
    'physician_middle_name',
    'physician_last_name',
    'physician_name_suffix',
    'physician_specialty',
    # recipient address
    'recipient_primary_business_street_address_line1',
    'recipient_primary_business_street_address_line2',
    'recipient_city',
    'recipient_state',
    'recipient_zip_code',
    'recipient_country',
    'recipient_postal_code',
    'physician_primary_type',
    # payment details
    'total_amount_of_payment_usdollars',
    'date_of_payment',
    'number_of_payments_included_in_total_amount',
    'form_of_payment_or_transfer_of_value',
    'nature_of_payment_or_transfer_of_value',
    # record bookkeeping
    'record_id',
    'program_year',
    'payment_publication_date',
    # paying manufacturer / GPO
    'applicable_manufacturer_or_applicable_gpo_making_payment_name',
    'applicable_manufacturer_or_applicable_gpo_making_payment_id',
]

In [None]:
def _research_is_a_target_column(col: str) -> bool:
    for banned_start_name in ['physician', 'recipient']:
        if col.startswith(banned_start_name):
            return False

    if not col.startswith('principal_investigator'):
        return col in target_columns

    return col.replace('principal_investigator', 'physician') in target_columns\
        or col.replace('principal_investigator', 'recipient') in target_columns

def _research_rename_func(col: str) -> str:
    """Map a research-frame column name to its name in ``target_columns``.

    A name already in ``target_columns`` is returned as is. Otherwise the text
    ``principal_investigator`` is replaced by ``physician`` and then by
    ``recipient``, and the first result found in ``target_columns`` is returned.

    Returns:
        The mapped name, or ``col`` unchanged when no mapping applies. The
        function used to fall through and return ``None`` in that case, which
        would have renamed the column to ``None``.
    """
    if col in target_columns:
        return col

    for prefix in ('physician', 'recipient'):
        candidate = col.replace('principal_investigator', prefix)
        if candidate in target_columns:
            return candidate

    # No known mapping: leave the column name untouched.
    return col
    

In [None]:
# Work out which research columns to keep ...
keep_mask = df_research.columns.map(_research_is_a_target_column)
research_target_columns = df_research.columns[keep_mask].to_list()

# ... then restrict the frame to them and rename them to the general-payment names.
df_research = (
    df_research
    .loc[:, research_target_columns]
    .rename(columns=_research_rename_func)
)

In [None]:
# Specialty lookup table. Lines starting with '#' in the CSV are skipped and
# the first column is used as the index.
# NOTE(review): the following set_index(...) call replaces this index, so
# confirm the first CSV column is not the 'Code' column that is read later.
specialty_codes = pandas.read_csv('../../data/util/specialty_codes.csv', comment='#', index_col=0)

In [None]:
# Index the table by its three-level specialty hierarchy so a code can be
# looked up from a (Grouping, Classification, Specialization) tuple.
specialty_codes = specialty_codes.set_index(
    ['Grouping', 'Classification', 'Specialization']
)

In [None]:
def match_specialty_string_to_code(string: Optional[str]) -> Optional[str]:
    """Look up the code for a ``'|'``-separated specialty string.

    The string is split on ``'|'`` and padded with ``None`` to three parts,
    which are used as the key into ``specialty_codes``.

    Returns:
        The ``'Code'`` value for that key, or ``None`` when the input is null
        or the lookup raises ``KeyError``.
    """
    if pandas.isnull(string):
        return None

    parts = string.split('|')
    # Pad up to the three index levels; a negative count adds nothing.
    parts.extend([None] * (3 - len(parts)))

    try:
        code = specialty_codes.at[tuple(parts), 'Code']
    except KeyError:
        return None
    return code

In [None]:
# Flag every general-payment row as non-research before the frames are combined.
df_general['is_research'] = False

In [None]:
# Flag every research-payment row as research before the frames are combined.
df_research['is_research'] = True

In [None]:
# Stack the general rows (restricted to the target columns plus the flag) on
# top of the research rows and renumber the result 0..n-1.
# pandas.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pandas.concat(
    [df_general.loc[:, target_columns + ['is_research']], df_research],
    ignore_index=True,
)


In [None]:
# Derive a specialty code for each row from its '|'-separated specialty string;
# rows with a missing or unmatched specialty get None.
df['specialty_code'] = df['physician_specialty'].apply(match_specialty_string_to_code)

In [None]:
# Persist the combined frame; the row index is written as the first CSV column.
df.to_csv('../../data/processed/open_payments.csv')

In [None]:
# Rows paid by the entity with id '100000131389'
# (presumably Horizon, going by the variable name; confirm).
is_horizon_payer = df['applicable_manufacturer_or_applicable_gpo_making_payment_id'].eq('100000131389')
horizon = df.loc[is_horizon_payer]

In [None]:
# Spot check: show all payments recorded for physician profile id '1083708'.
df.loc[df['physician_profile_id'].eq('1083708')]