******************************************************************************************************************
# ETL Project: Extract, Transform, Load
******************************************************************************************************************

## Cause of Death Data
******

### Step 1: Extract 
******************************************************************************************************************

#### Importing Dependencies

In [12]:
# Import Dependencies
# Standard library
import os
from urllib.parse import quote

# Third-party
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

# Local configuration (database credentials)
from config import username
from config import password

#### Create the CSV Data File Path

In [13]:
# Location of the cause-of-death CSV, assembled from its parts so the
# path separator matches the operating system the notebook runs on:
cause_of_death = os.path.join(
    "Resources",
    "cause_of_death.csv",
)

#### Read the Data File and Store into a Pandas DataFrame

In [14]:
# Read the cause-of-death CSV into a Pandas DataFrame.
# The path comes from the `cause_of_death` variable built in the previous cell
# (instead of a second, hard-coded copy of the same path) so the file location
# is defined in exactly one place.
cause_of_death_df = pd.read_csv(cause_of_death)
# Preview the first rows to confirm the file loaded as expected.
cause_of_death_df.head()

Unnamed: 0,State,Year,Deaths,Population,Crude Rate,Crude Rate Lower 95% Confidence Interval,Crude Rate Upper 95% Confidence Interval,Prescriptions Dispensed by US Retailers in that year (millions)
0,Alabama,1999,39,4430141,0.9,0.6,1.2,116
1,Alabama,2000,46,4447100,1.0,0.8,1.4,126
2,Alabama,2001,67,4467634,1.5,1.2,1.9,138
3,Alabama,2002,75,4480089,1.7,1.3,2.1,142
4,Alabama,2003,54,4503491,1.2,0.9,1.6,149


In [15]:
# Store every 'Crude Rate' value as text so the column has a single, uniform type
# (the original author's note says the column mixes strings and numbers).
# NOTE(review): on many pandas versions astype(str) turns missing values into the
# literal text 'nan', which a later dropna() will not remove — confirm the column
# has no missing values before relying on that clean-up step.
crude_rate_as_text = cause_of_death_df['Crude Rate'].astype(str)
cause_of_death_df['Crude Rate'] = crude_rate_as_text
cause_of_death_df

Unnamed: 0,State,Year,Deaths,Population,Crude Rate,Crude Rate Lower 95% Confidence Interval,Crude Rate Upper 95% Confidence Interval,Prescriptions Dispensed by US Retailers in that year (millions)
0,Alabama,1999,39,4430141,0.9,0.6,1.2,116
1,Alabama,2000,46,4447100,1,0.8,1.4,126
2,Alabama,2001,67,4467634,1.5,1.2,1.9,138
3,Alabama,2002,75,4480089,1.7,1.3,2.1,142
4,Alabama,2003,54,4503491,1.2,0.9,1.6,149
...,...,...,...,...,...,...,...,...
811,Wyoming,2010,49,563626,8.7,6.4,11.5,210
812,Wyoming,2011,47,568158,8.3,6.1,11,219
813,Wyoming,2012,47,576412,8.2,6,10.8,217
814,Wyoming,2013,52,582658,8.9,6.7,11.7,207


### Step 2: Transform

In [16]:
# Discard every row that has a missing value in at least one column;
# the source frame is left untouched and the result gets its own name.
cleaned_df = cause_of_death_df.dropna(axis="index", how="any")
cleaned_df.head()

Unnamed: 0,State,Year,Deaths,Population,Crude Rate,Crude Rate Lower 95% Confidence Interval,Crude Rate Upper 95% Confidence Interval,Prescriptions Dispensed by US Retailers in that year (millions)
0,Alabama,1999,39,4430141,0.9,0.6,1.2,116
1,Alabama,2000,46,4447100,1.0,0.8,1.4,126
2,Alabama,2001,67,4467634,1.5,1.2,1.9,138
3,Alabama,2002,75,4480089,1.7,1.3,2.1,142
4,Alabama,2003,54,4503491,1.2,0.9,1.6,149


In [17]:
# Keep only the columns needed for the planned queries
# (the two confidence-interval columns are left out):
death_columns = [
    "State",
    "Year",
    "Deaths",
    "Population",
    "Crude Rate",
    "Prescriptions Dispensed by US Retailers in that year (millions)",
]
cleaned_subset = cleaned_df[death_columns]
cleaned_subset.head()

Unnamed: 0,State,Year,Deaths,Population,Crude Rate,Prescriptions Dispensed by US Retailers in that year (millions)
0,Alabama,1999,39,4430141,0.9,116
1,Alabama,2000,46,4447100,1.0,126
2,Alabama,2001,67,4467634,1.5,138
3,Alabama,2002,75,4480089,1.7,142
4,Alabama,2003,54,4503491,1.2,149


In [18]:
# Map the original CSV headers to lowercase snake_case column names:
death_column_names = {
    'State': 'state',
    'Year': 'year',
    'Deaths': 'deaths',
    'Population': 'population',
    'Crude Rate': 'death_per_hundred_thousand',
    'Prescriptions Dispensed by US Retailers in that year (millions)': 'us_dispensed_prescriptions_millions',
}
dispensed_df = cleaned_subset.rename(columns=death_column_names)
dispensed_df.head()

Unnamed: 0,state,year,deaths,population,death_per_hundred_thousand,us_dispensed_prescriptions_millions
0,Alabama,1999,39,4430141,0.9,116
1,Alabama,2000,46,4447100,1.0,126
2,Alabama,2001,67,4467634,1.5,138
3,Alabama,2002,75,4480089,1.7,142
4,Alabama,2003,54,4503491,1.2,149


In [19]:
# Take an independent copy of the renamed columns, in their final order,
# so later changes to this frame cannot affect `dispensed_df`.
dispensed_column_order = [
    "state",
    "year",
    "deaths",
    "population",
    "death_per_hundred_thousand",
    "us_dispensed_prescriptions_millions",
]
new_dispensed_df = dispensed_df[dispensed_column_order].copy()
new_dispensed_df.head()

Unnamed: 0,state,year,deaths,population,death_per_hundred_thousand,us_dispensed_prescriptions_millions
0,Alabama,1999,39,4430141,0.9,116
1,Alabama,2000,46,4447100,1.0,126
2,Alabama,2001,67,4467634,1.5,138
3,Alabama,2002,75,4480089,1.7,142
4,Alabama,2003,54,4503491,1.2,149


## Prescriber Info Data

### Step 1: Extract
*********************************************************************

#### Create the CSV Data File Path

In [22]:
# Location of the prescriber-info CSV, assembled from its parts so the
# path separator matches the operating system the notebook runs on:
prescriber_info = os.path.join(
    "Resources",
    "prescriber-info.csv",
)

#### Read the Data File and Store into a Pandas DataFrame

In [23]:
# Read the prescriber-info CSV into a Pandas DataFrame.
# The path comes from the `prescriber_info` variable built in the previous cell
# (instead of a second, hard-coded copy of the same path) so the file location
# is defined in exactly one place.
prescriber_info_df = pd.read_csv(prescriber_info)
# Preview the first rows to confirm the file loaded as expected.
prescriber_info_df.head()

Unnamed: 0,NPI,Gender,State,Credentials,Specialty,ABILIFY,ACETAMINOPHEN.CODEINE,ACYCLOVIR,ADVAIR.DISKUS,AGGRENOX,...,VERAPAMIL.ER,VESICARE,VOLTAREN,VYTORIN,WARFARIN.SODIUM,XARELTO,ZETIA,ZIPRASIDONE.HCL,ZOLPIDEM.TARTRATE,Opioid.Prescriber
0,1710982582,M,TX,DDS,Dentist,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1245278100,F,AL,MD,General Surgery,0,0,0,0,0,...,0,0,0,0,0,0,0,0,35,1
2,1427182161,F,NY,M.D.,General Practice,0,0,0,0,0,...,0,0,0,0,0,0,0,0,25,0
3,1669567541,M,AZ,MD,Internal Medicine,0,43,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1679650949,M,NV,M.D.,Hematology/Oncology,0,0,0,0,0,...,0,0,0,0,17,28,0,0,0,1


### Step 2: Transform

In [24]:
# Discard every row that has a missing value in at least one column;
# the source frame is left untouched and the result gets its own name.
prescribed_df = prescriber_info_df.dropna(axis="index", how="any")
prescribed_df.head()

Unnamed: 0,NPI,Gender,State,Credentials,Specialty,ABILIFY,ACETAMINOPHEN.CODEINE,ACYCLOVIR,ADVAIR.DISKUS,AGGRENOX,...,VERAPAMIL.ER,VESICARE,VOLTAREN,VYTORIN,WARFARIN.SODIUM,XARELTO,ZETIA,ZIPRASIDONE.HCL,ZOLPIDEM.TARTRATE,Opioid.Prescriber
0,1710982582,M,TX,DDS,Dentist,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1245278100,F,AL,MD,General Surgery,0,0,0,0,0,...,0,0,0,0,0,0,0,0,35,1
2,1427182161,F,NY,M.D.,General Practice,0,0,0,0,0,...,0,0,0,0,0,0,0,0,25,0
3,1669567541,M,AZ,MD,Internal Medicine,0,43,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1679650949,M,NV,M.D.,Hematology/Oncology,0,0,0,0,0,...,0,0,0,0,17,28,0,0,0,1


In [25]:
# Keep only the prescriber identity columns and the opioid flag;
# the per-drug prescription count columns are left out:
prescriber_columns = [
    "NPI",
    "Gender",
    "State",
    "Credentials",
    "Specialty",
    "Opioid.Prescriber",
]
prescribed_subset = prescribed_df[prescriber_columns]
prescribed_subset.head()

Unnamed: 0,NPI,Gender,State,Credentials,Specialty,Opioid.Prescriber
0,1710982582,M,TX,DDS,Dentist,1
1,1245278100,F,AL,MD,General Surgery,1
2,1427182161,F,NY,M.D.,General Practice,0
3,1669567541,M,AZ,MD,Internal Medicine,1
4,1679650949,M,NV,M.D.,Hematology/Oncology,1


In [27]:
# Give the prescriber columns more descriptive names.
# NOTE(review): 'State' is not renamed here, and the new names for credentials,
# specialty and the opioid flag contain spaces/parentheses, unlike the snake_case
# names used for `dispensed_df` — confirm this is what the database schema expects.
prescriber_column_names = {
    'NPI': 'national_provider_identifier',
    'Gender': 'gender',
    'Credentials': 'Credentials (Medical Degree)',
    'Specialty': 'Specialty (Type of Practice)',
    'Opioid.Prescriber': 'Prescribed Opiate Drugs More Than 10 Times in the Year',
}
opioid_prescriber_df = prescribed_subset.rename(columns=prescriber_column_names)
opioid_prescriber_df.head()

Unnamed: 0,national_provider_identifier,gender,State,Credentials (Medical Degree),Specialty (Type of Practice),Prescribed Opiate Drugs More Than 10 Times in the Year
0,1710982582,M,TX,DDS,Dentist,1
1,1245278100,F,AL,MD,General Surgery,1
2,1427182161,F,NY,M.D.,General Practice,0
3,1669567541,M,AZ,MD,Internal Medicine,1
4,1679650949,M,NV,M.D.,Hematology/Oncology,1


In [None]:
# TODO: leftover commented-out template. It references `customer_location_df`,
# which none of the cells shown above define. Either replace it with the
# prescriber equivalent (a `.copy()` of `opioid_prescriber_df`) or delete this cell.
# new_customer_location_df = customer_location_df[["id", "address", "us_state"]].copy()
# new_customer_location_df.head()

## Cause of Death and Prescriber Info Data

### Step 3: Load

In [7]:
# Connect to the local PostgreSQL database `opioid_db` on localhost:5432.
# The credentials are imported from the local `config` module. They are
# percent-encoded before being placed in the URL so that special characters
# in the username or password (e.g. "@", ":", "/") cannot be mistaken for
# URL delimiters. `safe=""` makes sure "/" is encoded as well.
opioid_connection_string = (
    f'{quote(str(username), safe="")}:{quote(str(password), safe="")}'
    '@localhost:5432/opioid_db'
)
engine = create_engine(f'postgresql://{opioid_connection_string}')

In [8]:
# List the tables that currently exist in the connected database.
# `Engine.table_names()` is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list of table names and works on both
# older and newer releases.
inspect(engine).get_table_names()

[]