In [8]:
import pandas as pd
import os
from dbHelpers import createEngine
import uuid

In [9]:
engine = createEngine()

In [10]:
path = '../scratch/halloween_data'

In [18]:
fips_state_df = pd.read_sql('select * from npd.fips_state', con = engine)
fips_state_df.set_index('abbreviation', inplace=True)

In [12]:
df_dict={}
for f in os.listdir(path):
    if '.csv' in f:
        tablename = f.split('.csv')[0]
        df = pd.read_csv(os.path.join(path,f))
        df_dict[f]=df
        df.to_sql(tablename, index=False, schema = 'raw_csv', con = engine, if_exists='replace')

  df = pd.read_csv(os.path.join(path,f))


In [None]:
practitioner_df = df_dict['practitioner.csv']
#note: we can do this because each practitioner only appears once in this table
practitioner_df[id] = [uuid.uuid4() for i in practitioner_df.index]
practitioner_df.set_index('id', inplace=True)
practitioner_df.rename(columns = {'gender_code': 'sex', 'name_prefix': 'prefix', 'name_suffix': 'suffix'}, inplace=True)
npi1 = practitioner_df[['npi']]
npi1['entity_type_code'] = 1


In [None]:
organization_df = df_dict['organization.csv']
organization_df['is_primary'] = True
organization_df.rename(columns={'id':'old_org_id', 'parent_id':'old_parent_id'}, inplace=True)
organization_df['org_id'] = [uuid.uuid4() for i in organization_df.index]
organization_df.set_index('old_org_id', inplace=True)
organization_df['org_parent_id'] = [organization_df['old_parent_id'].apply(lambda x: organization_df.loc[x]['org_id'])]
organization_npi_df = df_dict['organization_npi.csv']
organization_npi_df.rename(columns={'organization_id':'old_org_id'}, inplace=True)
organization_npi_df['id'] = [uuid.uuid4() for i in organization_npi_df.index]
npi2 = organization_npi_df[['npi']]
npi2['entity_type_code'] = 2
merged_organization_df = organization_npi_df.merge(organization_df, on='old_org_id')
merged_organization_df.rename(columns={'org_id':'parent_id'}, inplace=True)
organization_df.rename(columns = {'org_id':'id', 'org_parent_id': 'parent_id'}, inplace=True)


NameError: name 'organizaiton_df' is not defined

In [None]:
npi_df = pd.concat([npi1,npi2])

In [None]:
endpoint_df = df_dict['endpoint.csv']
ehr_vendor_df = endpoint_df[['vendor_name']].drop_duplicates()
ehr_vendor_df['id'] = [uuid.uuid4() for i in ehr_vendor_df.index]
ehr_vendor_df.rename(columns={'vendor_name':'name'}, inplace=True)
ehr_vendor_df.set_index('vendor_name', inplace=True)
endpoint_df['ehr_vendor_id'] = endpoint_df['vendor_name'].apply(lambda x: ehr_vendor_df.loc[x]['id'])
endpoint_df['environment_type_id'] = 'prod'
endpoint_df['endpoint_connection_type_id'] = 'hl7-fhir-rest'
endpoint_df['id'] = [uuid.uuid4() for i in endpoint_df.index]
endpoint_df.rename(columns={'fhir_url':'address'})

In [None]:
endpoint_to_organization_df = df_dict['organization_endpoint.csv']


In [None]:
address_df = df_dict['location.csv']
address_df.rename(columns={'id':'address_us_id', 'line':'delivery_line_1', 'postalcode':'zipcode'}, inplace=True)
address_df['id']= [uuid.uuid4() for i in address_df.index]
address_df['state_code'] = address_df['state'].apply(lambda x: fips_state_df.loc[x]['id'])
location_npi_df = df_dict['location_npi.csv']
merged_location_df = location_npi_df.merge(address_df, left_on='address_us_id', right_on = 'location_id').merge(npi_df,practitioner_df,organization_df, on = 'npi', suffixes=('npi','individual','organization'))
merged_location_df.rename(columns={'id':'address_id', 'id_individual':'individual_id', 'id_organization':'organization_id'}, inplace=True)
merged_location_df['address_use_id'] = 2
individual_to_address_df = merged_location_df[['address_id','individual_id', 'address_use_id']].dropna(how='any')
location_df = merged_location_df[['address_id','organization_id','name', 'address_use_id']].dropna(how='any')
location_df['id'] = [uuid.uuid4() for i in location_df.index]

In [None]:
# load npi
npi_df.to_sql('npi', schema = 'npd', con = engine)

# load individual
practitioner_df[['id', 'sex']].to_sql('individual', schema = 'npd', con = engine)

practitioner_df.rename(columns={'id':'individual_id'}, inplace=True)

# load individual_to_name
practitioner_df[['individual_id', 'first_name', 'middle_name', 'last_name', 'prefix', 'suffix']].to_sql('individual', schema = 'npd', con = engine)

# load provider
practitioner_df[['npi', 'individual_id']].to_sql('provider', schema='npd', con=engine)

# load organization
organization_df[['id', 'parent_id']].to_sql('organization', schema='npd', con=engine)
merged_organization_df[['id', 'parent_id']].to_sql('organization', schema='npd', con=engine)

organization_df.rename(columns={'id':'organization_id'}, inplace=True)
merged_organization_df.rename(columns={'id':'organization_id'}, inplace=True)

# load organization_to_name
organization_df[['organization_id', 'name', 'is_primary']].to_sql('organization_to_name', schema = 'npd', con = engine)
merged_organization_df[['organization_id', 'name', 'is_primary']].to_sql('organization_to_name', schema = 'npd', con = engine)

# load clinical_organization
merged_organization_df[['organization_id', 'npi']].to_sql('clinical_organization', schema='npd', con = engine)

# load ehr_vendor
ehr_vendor_df[['id', 'name']].to_sql('ehr_vendor', schema = 'npd', con = engine)

# load endpoint_instance
endpoint_df[['id', 'ehr_vendor_id', 'address', 'endpoint_connection_type_id', 'environment_type_id']].to_sql('endpoint_instance', schema = 'npd', con = engine)

# load address_us
location_df[['address_us_id', 'delivery_line_1','city','state_code','zipcode']].rename(columns={'address_us_id':'id'}).to_sql('address_us', schema = 'npd', con = engine)

# load address
location_df[['id', 'address_us_id']].to_sql('address', schema = 'npd', con = engine)

# load individual_to_address
individual_to_address_df.to_sql('individual_to_address', schema = 'npd', con = engine)

# load organization_to_address
location_df[['address_id','organization_id']].to_sql('organization_to_address', schema = 'npd', con = engine)

# load location
location_df[['id','address_id','organization_id']].to_sql('location', schema = 'npd', con = engine)