In [35]:
import pandas as pd
import os
from dbHelpers import createEngine
import uuid
import numpy as np

In [9]:
engine = createEngine()

In [None]:
path = '../scratch/halloween_data'

In [18]:
fips_state_df = pd.read_sql('select * from npd.fips_state', con = engine)
fips_state_df.set_index('abbreviation', inplace=True)

In [22]:
def show_or_load(df, table_name, schema_name, load=False):
    if load:
        df.to_sql(table_name, schema = schema_name, con = engine)
    else:
        df.head()

In [36]:
primary_to_bool = {1: True, 0: False}


def convertBool(val):
    if val in primary_to_bool.keys():
        return primary_to_bool[val]
    else:
        return False
    
def val_or_nan(df, index, column):
    if index in df.index:
        return df.loc[index][column]
    else:
        return np.nan

In [106]:
df_dict={}
for f in os.listdir(path):
    if '.csv' in f:
        tablename = f.split('.csv')[0]
        df = pd.read_csv(os.path.join(path,f))
        df_dict[f]=df
        #df.to_sql(tablename, index=False, schema = 'raw_csv', con = engine, if_exists='replace')

  df = pd.read_csv(os.path.join(path,f))


In [108]:
practitioner_df = df_dict['practitioner.csv']
#note: we can do this because each practitioner only appears once in this table
practitioner_df['id'] = [uuid.uuid4() for i in practitioner_df.index]
practitioner_df.rename(columns = {'gender_code': 'sex', 'name_prefix': 'prefix', 'name_suffix': 'suffix'}, inplace=True)
npi1 = practitioner_df[['npi']]
npi1['entity_type_code'] = 1
practitioner_taxonomy_df = df_dict['practitionerrole.csv']
merged_taxonomy_df = practitioner_taxonomy_df.merge(practitioner_df, left_on = 'practitioner_id', right_on = 'npi', suffixes = ('tax', 'individual'), how='outer') 
merged_taxonomy_df = merged_taxonomy_df.loc[merged_taxonomy_df['state_code']!='ZZ']
merged_taxonomy_df['state_code'] = merged_taxonomy_df['state_code'].apply(lambda x: val_or_nan(fips_state_df, x, 'id'))
merged_taxonomy_df.rename(columns={'idindividual': 'individual_id', 'taxonomy_code':'nucc_code'}, inplace=True)
provider_to_taxonomy_df = merged_taxonomy_df[['individual_id', 'nucc_code', 'is_primary']]
provider_to_taxonomy_df['is_primary'] = provider_to_taxonomy_df['is_primary'].apply(lambda x: convertBool(x))
dedup_taxonomy_df = provider_to_taxonomy_df.sort_values(by='is_primary', ascending=False)[
        ['individual_id', 'nucc_code', 'is_primary']].drop_duplicates(subset=['nucc_code', 'individual_id'])
dedup_taxonomy_df['id'] = [uuid.uuid4() for i in dedup_taxonomy_df.index]
credential_df = provider_to_taxonomy_df.merge(dedup_taxonomy_df, on = ['individual_id', 'nucc_code'], suffixes = ('tax', 'cred'), how='outer')
credential_df.rename(columns={'idtax': 'provider_to_taxonomy_id'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  npi1['entity_type_code'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  provider_to_taxonomy_df['is_primary'] = provider_to_taxonomy_df['is_primary'].apply(lambda x: convertBool(x))


In [None]:
organization_df = df_dict['organization.csv']
organization_df['is_primary'] = True
organization_df.rename(columns={'id':'old_org_id', 'parent_id':'old_parent_id'}, inplace=True)
organization_df['org_id'] = [uuid.uuid4() for i in organization_df.index]
organization_df['org_parent_id'] = organization_df['old_parent_id'].apply(lambda x: val_or_nan(organization_df, x, 'org_id'))
organization_npi_df = df_dict['organization_npi.csv']
organization_npi_df.rename(columns={'organization_id':'old_org_id'}, inplace=True)
organization_npi_df['id'] = [uuid.uuid4() for i in organization_npi_df.index]
npi2 = organization_npi_df[['npi']]
npi2['entity_type_code'] = 2
merged_organization_df = organization_npi_df.merge(organization_df, on='old_org_id', how='outer')
merged_organization_df.rename(columns={'org_id':'parent_id'}, inplace=True)
organization_df.rename(columns = {'org_id':'id', 'org_parent_id': 'parent_id', 'organization_name':'name'}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  npi2['entity_type_code'] = 2


In [110]:
npi_df = pd.concat([npi1,npi2])

In [111]:
endpoint_df = df_dict['endpoint.csv']
endpoint_df.rename(columns={'id':'endpoint_id','fhir_url':'address'}, inplace=True)
ehr_vendor_df = endpoint_df.drop_duplicates(subset='vendor_name')
ehr_vendor_df['id'] = [uuid.uuid4() for i in ehr_vendor_df.index]
ehr_vendor_df.rename(columns={'vendor_name':'name'}, inplace=True)
ehr_vendor_df.set_index('name', inplace=True)
endpoint_df['ehr_vendor_id'] = endpoint_df['vendor_name'].apply(lambda x: ehr_vendor_df.loc[x]['id'])
endpoint_df['environment_type_id'] = 'prod'
endpoint_df['endpoint_connection_type_id'] = 'hl7-fhir-rest'
endpoint_df['id'] = [uuid.uuid4() for i in endpoint_df.index]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ehr_vendor_df['id'] = [uuid.uuid4() for i in ehr_vendor_df.index]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ehr_vendor_df.rename(columns={'vendor_name':'name'}, inplace=True)


In [112]:
org_to_endpoint_df = df_dict['organization_endpoint.csv']
merged_org_to_endpoint_df = org_to_endpoint_df.merge(endpoint_df, on = 'endpoint_id', how='outer').merge(organization_npi_df, left_on = 'organization_npi', right_on = 'npi', suffixes = ('endpoint', 'organization'), how='outer')
merged_org_to_endpoint_df= merged_org_to_endpoint_df[['idendpoint', 'idorganization']].rename(columns = {'idendpoint': 'endpoint_instance_id', 'idorganization':'organization_id'})

In [113]:
address_df = df_dict['location.csv']
address_df.rename(columns={'id':'address_us_id', 'line':'delivery_line_1', 'postalcode':'zipcode'}, inplace=True)
address_df['id']= [uuid.uuid4() for i in address_df.index]
address_df = address_df.loc[(address_df['state'] != 'FM') & (address_df['state'] != '~') & (address_df['state'] != 'UK') & (address_df['state'] != 'MH')]
address_df['state_code'] = address_df['state'].apply(lambda x: fips_state_df.loc[x]['id'])
location_npi_df = df_dict['npi_location.csv']
merged_df_1 = location_npi_df.merge(address_df, left_on='location_id', right_on = 'address_us_id', how='outer')
merged_df_2 = merged_df_1.merge(npi_df, on = 'npi', suffixes=('address','npi'), how='outer')
merged_df_3 = merged_df_2.merge(practitioner_df, on = 'npi', suffixes = ('address', 'individual'), how='outer')
merged_location_df = merged_df_3.merge(merged_organization_df, on = 'npi', suffixes = ('address', 'organization'), how='outer')
merged_location_df.rename(columns={'idaddress':'address_id', 'idindividual':'individual_id', 'id':'organization_id'}, inplace=True)
merged_location_df['address_use_id'] = 2
individual_to_address_df = merged_location_df[['address_id','individual_id', 'address_use_id']].dropna(how='any')
location_df = merged_location_df[['address_id','organization_id','name', 'address_use_id']].dropna(how='any')
location_df['id'] = [uuid.uuid4() for i in location_df.index]
location_to_endpoint_df = location_df.merge(merged_org_to_endpoint_df, on = 'organization_id', how='outer')[['id', 'endpoint_instance_id']].dropna(how = 'any').rename(columns = {'id':'location_id'})


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  address_df['state_code'] = address_df['state'].apply(lambda x: fips_state_df.loc[x]['id'])


In [114]:
provider_to_organization_df = df_dict['personal_npi_to_organizational_npi.csv']
merged_provider_to_org_df = provider_to_organization_df.merge(practitioner_df, left_on = 'personal_npi', right_on = 'npi', how='outer').merge(merged_organization_df, left_on = 'organizational_npi', right_on = 'npi', suffixes = ('individual', 'organization'), how='outer')
merged_provider_to_org_df.rename(columns = {'idindividual':'individual_id', 'idorganization':'organization_id'}, inplace=True)
merged_provider_to_org_df['id'] = [uuid.uuid4() for i in merged_provider_to_org_df.index]
merged_provider_to_org_df['relationship_type_id'] = 2
provider_to_location_df = merged_provider_to_org_df.merge(location_df, on='organization_id', how='outer')
provider_to_location_df['id'] = [uuid.uuid4() for i in provider_to_location_df.index]

In [115]:
schema_name = 'npd'
load = False

# load npi
show_or_load(npi_df, 'npi', schema_name, load)

# load individual
show_or_load(practitioner_df[['id', 'sex']], 'individual', schema_name, load)

practitioner_df.rename(columns={'id':'individual_id'}, inplace=True)

# load individual_to_name
show_or_load(practitioner_df[['individual_id', 'first_name', 'middle_name', 'last_name', 'prefix', 'suffix']], 'individual', schema_name, load)

# load provider
show_or_load(practitioner_df[['npi', 'individual_id']], 'provider', schema_name, load)

# load organization
show_or_load(organization_df[['id', 'parent_id']], 'organization', schema_name, load)
show_or_load(merged_organization_df[['id', 'parent_id']], 'organization', schema_name, load)

organization_df.rename(columns={'id':'organization_id'}, inplace=True)
merged_organization_df.rename(columns={'id':'organization_id'}, inplace=True)

# load organization_to_name
show_or_load(organization_df[['organization_id', 'name', 'is_primary']], 'organization_to_name', schema_name, load)
show_or_load(merged_organization_df[['organization_id', 'name', 'is_primary']], 'organization_to_name', schema_name, load)

# load clinical_organization
show_or_load(merged_organization_df[['organization_id', 'npi']], 'clinical_organization', schema='npd', con = engine)

# load ehr_vendor
show_or_load(ehr_vendor_df[['id', 'name']], 'ehr_vendor', schema_name, load)

# load endpoint_instance
show_or_load(endpoint_df[['id', 'ehr_vendor_id', 'address', 'endpoint_connection_type_id', 'environment_type_id']], 'endpoint_instance', schema_name, load)

# load address_us
show_or_load(location_df[['address_us_id', 'delivery_line_1','city','state_code','zipcode']].rename(columns={'address_us_id':'id'}), 'address_us', schema_name, load)

# load address
show_or_load(location_df[['id', 'address_us_id']], 'address', schema_name, load)

# load individual_to_address
show_or_load(individual_to_address_df, 'individual_to_address', schema_name, load)

# load organization_to_address
show_or_load(location_df[['address_id','organization_id']], 'organization_to_address', schema_name, load)

# load location
show_or_load(location_df[['id','address_id','organization_id']], 'location', schema_name, load)

# load location_to_endpoint
show_or_load(location_to_endpoint_df, 'location_to_endpoint', schema_name, load)

# load provider_to_organization
show_or_load(merged_provider_to_org_df.dropna(how='any'), 'provider_to_organization', schema_name, load)

# load provider_to_location
show_or_load(provider_to_location_df.dropna(how='any'), 'provider_to_location', schema_name, load)

# load provider_to_taxonomy
show_or_load(dedup_taxonomy_df, 'provider_to_taxonomy', schema_name, load)

# load provider_to_credential
show_or_load(credential_df[['license_number', 'state_code', 'provider_to_taxonomy_id']], 'provider_to_taxonomy', schema_name, load)

KeyError: "['name'] not in index"

In [116]:
organization_df

Unnamed: 0,old_org_id,organization_name,old_parent_id,is_primary,organization_id,parent_id
0,1,WRIGHT-PATTERSON MEDICAL CENTER,1.0,True,a7b122e0-04d8-411c-a39d-db98676c65b5,884e78c4-403b-4d65-b234-04d684c5cb2d
1,2,WOMACK ARMY MEDICAL CENTER,792910.0,True,884e78c4-403b-4d65-b234-04d684c5cb2d,2d91f8f7-5df1-4206-a640-1e36df3e262a
2,3,WESTERN DENTAL,,True,37c876d2-bf34-4585-9be1-b09b70b0304e,
3,4,BMC NALF SAN CLEMENTE,,True,97215383-8ab4-44b4-9299-05fb4b8e56ff,
4,5,AUDIOLOGY ASSOCIATES OF WESTCHESTER,,True,af11c443-9427-4706-9376-022d716b4d22,
...,...,...,...,...,...,...
1367560,1367561,CITY OF HOPE,,True,f3f5d2aa-02da-4ad0-81b9-dbdc8d2e9a65,
1367561,1367562,CITY OF HOPE,,True,9f9ae95c-1afc-442a-a52d-a9dd97bf8b18,
1367562,1367563,AUTUMN CARE OF SUFFOLK,,True,3485df14-27d6-4836-b282-df16977861aa,
1367563,1367564,CHURCH OF JESUS CHRIST OF LATTER-DAY SAINTS,,True,20775cc3-b9a0-472e-9a0c-d7b29bb012fc,
