In [1]:
import pandas as pd
import os
from dbHelpers import createEngine
import uuid
import numpy as np

In [2]:
engine = createEngine()

In [3]:
path = '../scratch/halloween_data'

In [4]:
fips_state_df = pd.read_sql('select * from npd.fips_state', con = engine)
fips_state_df.set_index('abbreviation', inplace=True)

In [None]:
def show_or_load(df, table_name, schema_name, load=False):
    if load:
        df.to_sql(table_name, schema = schema_name, con = engine, if_exists='append')
    else:
        df.head()

In [6]:
primary_to_bool = {1: True, 0: False}


def convertBool(val):
    if val in primary_to_bool.keys():
        return primary_to_bool[val]
    else:
        return False
    
def val_or_nan(df, index, column):
    if index in df.index:
        return df.loc[index][column]
    else:
        return np.nan

In [7]:
df_dict={}
for f in os.listdir(path):
    if '.csv' in f:
        tablename = f.split('.csv')[0]
        df = pd.read_csv(os.path.join(path,f))
        df_dict[f]=df
        #df.to_sql(tablename, index=False, schema = 'raw_csv', con = engine, if_exists='replace')

  df = pd.read_csv(os.path.join(path,f))


In [8]:
practitioner_df = df_dict['practitioner.csv']
#note: we can do this because each practitioner only appears once in this table
practitioner_df['id'] = [uuid.uuid4() for i in practitioner_df.index]
practitioner_df_renamed = practitioner_df.rename(columns = {'gender_code': 'sex', 'name_prefix': 'prefix', 'name_suffix': 'suffix'})
npi_type1_df = practitioner_df_renamed[['npi']]
npi_type1_df['entity_type_code'] = 1
practitioner_taxonomy_df = df_dict['practitionerrole.csv']
merged_taxonomy_df = practitioner_taxonomy_df.merge(practitioner_df_renamed, left_on = 'practitioner_id', right_on = 'npi', suffixes = ('tax', 'individual')) 
merged_taxonomy_df = merged_taxonomy_df.loc[merged_taxonomy_df['state_code']!='ZZ']
merged_taxonomy_df['state_code'] = merged_taxonomy_df['state_code'].apply(lambda x: val_or_nan(fips_state_df, x, 'id'))
merged_taxonomy_df_renamed = merged_taxonomy_df.rename(columns={'idindividual': 'individual_id', 'taxonomy_code':'nucc_code'})
provider_to_taxonomy_df = merged_taxonomy_df_renamed[['individual_id', 'nucc_code', 'is_primary']]
provider_to_taxonomy_df['is_primary'] = provider_to_taxonomy_df['is_primary'].apply(lambda x: convertBool(x))
dedup_taxonomy_df = provider_to_taxonomy_df.sort_values(by='is_primary', ascending=False)[
        ['individual_id', 'nucc_code', 'is_primary']].drop_duplicates(subset=['nucc_code', 'individual_id'])
dedup_taxonomy_df['id'] = [uuid.uuid4() for i in dedup_taxonomy_df.index]
credential_df = provider_to_taxonomy_df.merge(merged_taxonomy_df_renamed, on = ['individual_id', 'nucc_code'], suffixes = ('tax', 'cred'))
credential_df_renamed = credential_df.rename(columns={'idtax': 'provider_to_taxonomy_id'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  npi_type1_df['entity_type_code'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  provider_to_taxonomy_df['is_primary'] = provider_to_taxonomy_df['is_primary'].apply(lambda x: convertBool(x))


In [9]:
organization_df = df_dict['organization.csv']
organization_df['is_primary'] = True
organization_df_renamed = organization_df.rename(columns={'id':'old_org_id', 'parent_id':'old_parent_id', 'organization_name':'name'})
organization_df_renamed['org_id'] = [uuid.uuid4() for i in organization_df_renamed.index]
organization_df_renamed['org_parent_id'] = organization_df_renamed['old_parent_id'].apply(lambda x: val_or_nan(organization_df_renamed, x, 'org_id'))
organization_npi_df = df_dict['organization_npi.csv']
organization_npi_df_renamed = organization_npi_df.rename(columns={'organization_id':'old_org_id'})
organization_npi_df_renamed['id'] = [uuid.uuid4() for i in organization_npi_df_renamed.index]
npi_type2_df = organization_npi_df_renamed[['npi']]
npi_type2_df['entity_type_code'] = 2
clinical_organization_df = organization_npi_df_renamed.merge(organization_df_renamed, on='old_org_id', how='outer')
clinical_organization_df_renamed = clinical_organization_df.rename(columns={'org_id':'parent_id'})
other_organization_df = organization_df_renamed.rename(columns = {'org_id':'id', 'org_parent_id': 'parent_id'})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  npi_type2_df['entity_type_code'] = 2


In [10]:
npi_df = pd.concat([npi_type1_df,npi_type2_df])

In [11]:
endpoint_df = df_dict['endpoint.csv']
endpoint_df_renamed = endpoint_df.rename(columns={'id':'endpoint_id','fhir_url':'address'})
ehr_vendor_df = endpoint_df.drop_duplicates(subset='vendor_name')
ehr_vendor_df['id'] = [uuid.uuid4() for i in ehr_vendor_df.index]
ehr_vendor_df_renamed = ehr_vendor_df.rename(columns={'vendor_name':'name'})
ehr_vendor_df_renamed.set_index('name', inplace=True, drop=False)
endpoint_df_renamed['ehr_vendor_id'] = endpoint_df_renamed['vendor_name'].apply(lambda x: ehr_vendor_df_renamed.loc[x]['id'])
endpoint_df_renamed['environment_type_id'] = 'prod'
endpoint_df_renamed['endpoint_connection_type_id'] = 'hl7-fhir-rest'
endpoint_df_renamed['id'] = [uuid.uuid4() for i in endpoint_df_renamed.index]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ehr_vendor_df['id'] = [uuid.uuid4() for i in ehr_vendor_df.index]


In [12]:
org_to_endpoint_df = df_dict['organization_endpoint.csv']
merged_org_to_endpoint_df = org_to_endpoint_df.merge(endpoint_df_renamed, on = 'endpoint_id', how='outer').merge(clinical_organization_df_renamed, left_on = 'organization_npi', right_on = 'npi', suffixes = ('endpoint', 'organization'), how='outer')
merged_org_to_endpoint_df= merged_org_to_endpoint_df[['idendpoint', 'idorganization']].rename(columns = {'idendpoint': 'endpoint_instance_id', 'idorganization':'organization_id'})

In [13]:
address_df = df_dict['location.csv']
address_df_renamed = address_df.rename(columns={'id':'address_us_id', 'line':'delivery_line_1', 'postalcode':'zipcode'})
address_df_renamed['id']= [uuid.uuid4() for i in address_df_renamed.index]
address_df_renamed = address_df_renamed.loc[(address_df_renamed['state'] != 'FM') & (address_df_renamed['state'] != '~') & (address_df_renamed['state'] != 'UK') & (address_df['state'] != 'MH')]
address_df_renamed['state_code'] = address_df_renamed['state'].apply(lambda x: fips_state_df.loc[x]['id'])
location_npi_df = df_dict['npi_location.csv']
merged_df_1 = location_npi_df.merge(address_df_renamed, left_on='location_id', right_on = 'address_us_id', how='outer')
merged_df_2 = merged_df_1.merge(npi_df, on = 'npi', suffixes=('address','npi'), how='outer')
merged_df_3 = merged_df_2.merge(practitioner_df_renamed, on = 'npi', suffixes = ('address', 'individual'), how='outer')
merged_location_df = merged_df_3.merge(clinical_organization_df_renamed, on = 'npi', suffixes = ('address', 'organization'), how='outer')
merged_location_df_renamed = merged_location_df.rename(columns={'idaddress':'address_id', 'idindividual':'individual_id', 'id':'organization_id', 'nameaddress':'name'})
merged_location_df_renamed['address_use_id'] = 2
individual_to_address_df = merged_location_df_renamed[['address_id','individual_id', 'address_use_id']].dropna(how='any')
location_df = merged_location_df_renamed[['address_id','organization_id','name', 'address_use_id']].dropna(how='any')
location_df['id'] = [uuid.uuid4() for i in location_df.index]
location_to_endpoint_df = location_df.merge(merged_org_to_endpoint_df, on = 'organization_id', how='outer')[['id', 'endpoint_instance_id']].dropna(how = 'any').rename(columns = {'id':'location_id'})


In [14]:
provider_to_organization_df = df_dict['personal_npi_to_organizational_npi.csv']
merged_provider_to_org_df = provider_to_organization_df.merge(practitioner_df_renamed, left_on = 'personal_npi', right_on = 'npi', how='outer').merge(clinical_organization_df_renamed, left_on = 'organizational_npi', right_on = 'npi', suffixes = ('individual', 'organization'), how='outer')
provider_to_org_df_renamed = merged_provider_to_org_df.rename(columns = {'idindividual':'individual_id', 'idorganization':'organization_id'})
provider_to_org_df_renamed['id'] = [uuid.uuid4() for i in provider_to_org_df_renamed.index]
provider_to_org_df_renamed['relationship_type_id'] = 2
provider_to_location_df = provider_to_org_df_renamed.merge(location_df, on='organization_id', how='outer')
provider_to_location_df['id'] = [uuid.uuid4() for i in provider_to_location_df.index]

In [16]:
schema_name = 'npd'
load = True

# load npi
show_or_load(npi_df, 'npi', schema_name, load)

# load individual
show_or_load(practitioner_df_renamed[['id', 'sex']], 'individual', schema_name, load)

practitioner_df_renamed_renamed = practitioner_df_renamed.rename(columns={'id':'individual_id'})

# load individual_to_name
show_or_load(practitioner_df_renamed_renamed[['individual_id', 'first_name', 'middle_name', 'last_name', 'prefix', 'suffix']], 'individual', schema_name, load)

# load provider
show_or_load(practitioner_df_renamed_renamed[['npi', 'individual_id']], 'provider', schema_name, load)

# load organization
show_or_load(other_organization_df[['id', 'parent_id']], 'organization', schema_name, load)
show_or_load(clinical_organization_df_renamed[['id', 'parent_id']], 'organization', schema_name, load)

other_organization_df_renamed = other_organization_df.rename(columns={'id':'organization_id', 'organization_name':'name'})
clinical_organization_df_renamed_renamed = clinical_organization_df_renamed.rename(columns={'id':'organization_id'})

# load organization_to_name
show_or_load(other_organization_df_renamed[['organization_id', 'name', 'is_primary']], 'organization_to_name', schema_name, load)
show_or_load(clinical_organization_df_renamed_renamed[['organization_id', 'name', 'is_primary']], 'organization_to_name', schema_name, load)

# load clinical_organization
show_or_load(clinical_organization_df_renamed_renamed[['organization_id', 'npi']], 'clinical_organization', schema_name, load)

# load ehr_vendor
show_or_load(ehr_vendor_df_renamed[['id', 'name']], 'ehr_vendor', schema_name, load)

# load endpoint_instance
show_or_load(endpoint_df_renamed[['id', 'ehr_vendor_id', 'address', 'endpoint_connection_type_id', 'environment_type_id']], 'endpoint_instance', schema_name, load)

# load address_us
show_or_load(address_df_renamed[['address_us_id', 'delivery_line_1','city','state_code','zipcode']].rename(columns={'address_us_id':'id'}), 'address_us', schema_name, load)

# load address
show_or_load(address_df_renamed[['id', 'address_us_id']], 'address', schema_name, load)

# load individual_to_address
show_or_load(individual_to_address_df, 'individual_to_address', schema_name, load)

# load organization_to_address
show_or_load(location_df[['address_id','organization_id']], 'organization_to_address', schema_name, load)

# load location
show_or_load(location_df[['id','address_id','organization_id']], 'location', schema_name, load)

# load location_to_endpoint
show_or_load(location_to_endpoint_df, 'location_to_endpoint', schema_name, load)

# load provider_to_organization
show_or_load(provider_to_org_df_renamed.dropna(how='any'), 'provider_to_organization', schema_name, load)

# load provider_to_location
show_or_load(provider_to_location_df.dropna(how='any'), 'provider_to_location', schema_name, load)

# load provider_to_taxonomy
show_or_load(dedup_taxonomy_df, 'provider_to_taxonomy', schema_name, load)

# load provider_to_credential
show_or_load(credential_df_renamed[['license_number', 'state_code', 'provider_to_taxonomy_id']], 'provider_to_credential', schema_name, load)

ValueError: Table 'npi' already exists.