In [1]:
import sqlite3
import pandas as pd
from tqdm.notebook import tqdm

## NPPES data filtering
The NPPES dataset contains a large number of fields, only a few of which are relevant to this project:

'NPI'
Entity Type, indicated by the 'Entity Type Code' field:
1 = Provider (doctors, nurses, etc.)
2 = Facility (Hospitals, Urgent Care, Doctors Offices)
Entity Name: Either First/Last or Organization or Other Organization Name contained in the following fields:
'Provider Organization Name (Legal Business Name)'
'Provider Last Name (Legal Name)'
'Provider First Name'
'Provider Middle Name'
'Provider Name Prefix Text'
'Provider Name Suffix Text'
'Provider Credential Text'
Address: Business Practice Location (not mailing), contained in the following fields:
'Provider First Line Business Practice Location Address'
'Provider Second Line Business Practice Location Address'
'Provider Business Practice Location Address City Name'
'Provider Business Practice Location Address State Name'
'Provider Business Practice Location Address Postal Code'
The provider's taxonomy code, which is contained in one of the 'Healthcare Provider Taxonomy Code*' columns. A provider can have up to 15 taxonomy codes, but we want the one which has Primary Switch = Y in the associated 'Healthcare Provider Primary Taxonomy Switch*' field. Note that this does not always occur in spot 1.

In [2]:
# Empty DataFrame bound to `nppes`; the name is rebound to the combined query
# results further down in this cell.
nppes = pd.DataFrame()

# Connection to the project's SQLite file. `run_query` reads this module-level
# `db` directly rather than taking it as an argument.
db = sqlite3.connect('data/nppes_lite.sqlite')

def run_query(n):
    """Return the providers whose primary taxonomy is stored in slot ``n``.

    The query reads ``nppes_raw`` and keeps the NPI, entity type, name,
    credential and business-practice-location address columns, plus
    ``healthcare_provider_taxonomy_code_<n>`` aliased to ``taxonomy_code``.
    Only rows where ``healthcare_provider_primary_taxonomy_switch_<n>`` equals
    ``'Y'`` and ``entity_type_code`` is 1 or 2 are returned.

    Parameters
    ----------
    n
        Value formatted into the taxonomy column-name suffix; the notebook
        calls this with the integers 1 through 15.

    Returns
    -------
    pandas.DataFrame
        Result of the query, read through the module-level ``db`` connection.
    """
    # Column names cannot be bound as SQL parameters, so the slot number is
    # formatted into the statement text.
    sql_template = '''
    SELECT npi, 
    entity_type_code,
    [provider_organization_name_(legal_business_name)],
    [provider_last_name_(legal_name)],
    provider_first_name,
    provider_middle_name,
    provider_name_prefix_text,
    provider_name_suffix_text,
    provider_credential_text,
    provider_first_line_business_practice_location_address,
    provider_second_line_business_practice_location_address,
    provider_business_practice_location_address_city_name,
    provider_business_practice_location_address_state_name,
    provider_business_practice_location_address_postal_code,
    healthcare_provider_taxonomy_code_{0} AS taxonomy_code
    FROM nppes_raw
    WHERE healthcare_provider_primary_taxonomy_switch_{0} = 'Y' AND
    entity_type_code IN (1,2)
    '''
    slot_sql = sql_template.format(n)
    return pd.read_sql(slot_sql, db)

# Run the query once per taxonomy slot (1-15) and combine the results with a
# single concat. DataFrame.append was removed in pandas 2.0, and appending
# inside the loop re-copied the accumulated frame on every iteration.
# As before, each slot's frame keeps its own row index (no ignore_index).
slot_frames = [run_query(slot) for slot in range(1, 16)]
nppes = pd.concat(slot_frames)

#### converting a dataframe to a table in the database

# Write the filtered frame to the `nppes` table. 'replace' rather than
# 'append' keeps this step re-runnable: appending on a second run would
# duplicate every row, or fail outright once the table's columns have been
# renamed and no longer match the frame.
nppes.to_sql('nppes', db, if_exists='replace', index=False)

db.close()

nppes.head()

## Checking if all tables went into db

# Preview the first rows of `nppes_raw` to confirm the table is in the database.
db = sqlite3.connect('data/nppes_lite.sqlite')

query = '''
SELECT *
FROM nppes_raw
LIMIT 5
'''
try:
    df = pd.read_sql(query, db)
finally:
    # Close the handle even if the read fails; the next step opens its own
    # connection, so leaving this one open would only leak it.
    db.close()
df.head()

# Preview the first rows of `hop_team` to confirm the table is in the database.
db = sqlite3.connect('data/nppes_lite.sqlite')

query = '''
SELECT *
FROM hop_team
LIMIT 5
'''
try:
    df = pd.read_sql(query, db)
finally:
    # Close the handle even if the read fails; the next step opens its own
    # connection, so leaving this one open would only leak it.
    db.close()
df.head()

# Preview the first rows of `nucc_taxonomy` to confirm the table is in the database.
db = sqlite3.connect('data/nppes_lite.sqlite')

query = '''
SELECT *
FROM nucc_taxonomy
LIMIT 5
'''
try:
    df = pd.read_sql(query, db)
finally:
    # Close the handle even if the read fails; the next step opens its own
    # connection, so leaving this one open would only leak it.
    db.close()
df.head()

# Preview the first rows of `zip_cbsa` to confirm the table is in the database.
db = sqlite3.connect('data/nppes_lite.sqlite')

query = '''
SELECT *
FROM zip_cbsa
LIMIT 5
'''
try:
    df = pd.read_sql(query, db)
finally:
    # Close the handle even if the read fails; the next step opens its own
    # connection, so leaving this one open would only leak it.
    db.close()
df.head()

# Preview the first rows of the derived `nppes` table to confirm it was written.
db = sqlite3.connect('data/nppes_lite.sqlite')

query = '''
SELECT *
FROM nppes
LIMIT 5
'''
try:
    df = pd.read_sql(query, db)
finally:
    # Close the handle even if the read fails; the next step opens its own
    # connection, so leaving this one open would only leak it.
    db.close()
df.head()

### Adjusting nppes column names within sqlite 

# Long NPPES column name (as written in SQL) -> short name for the `nppes` table.
# The two bracketed keys need SQLite's [] quoting because they contain parentheses.
column_renames = {
    '[provider_organization_name_(legal_business_name)]': 'org_name',
    '[provider_last_name_(legal_name)]': 'last_name',
    'provider_first_name': 'first_name',
    'provider_middle_name': 'middle_name',
    'provider_name_prefix_text': 'name_prefix',
    'provider_name_suffix_text': 'name_suffix',
    'provider_credential_text': 'provider_credential',
    'provider_first_line_business_practice_location_address': 'address_1',
    'provider_second_line_business_practice_location_address': 'address_2',
    'provider_business_practice_location_address_city_name': 'city',
    'provider_business_practice_location_address_state_name': 'state',
    'provider_business_practice_location_address_postal_code': 'zip',
}

db = sqlite3.connect('data/nppes_lite.sqlite')
try:
    # Renames run in the same order as the mapping above.
    for old_name, new_name in column_renames.items():
        db.execute(
            'ALTER TABLE nppes RENAME COLUMN {} TO {}'.format(old_name, new_name)
        )
    # Make persistence explicit; this is a no-op if nothing is pending.
    db.commit()
finally:
    # Always release the connection, including when a rename raises
    # (for example because the column was already renamed on an earlier run).
    db.close()

### Database is hopefully complete