******************************************************************************************************************
# ETL Project: Extract, Transform, Load
******************************************************************************************************************

## Cause of Death Data
******

### Step 1: Extract 
******************************************************************************************************************

#### Importing Dependencies

In [1]:
# Import Dependencies
import pandas as pd
import os 

#### Create the CSV Data File Path

In [2]:
# CSV data file path: 
cause_of_death = os.path.join("Resources", "cause_of_death.csv")

#### Read the Data File and Store into a Pandas DataFrame

In [3]:
# Reading in the data file and storing it into a Pandas DataFrame:
cause_of_death_df = pd.read_csv("Resources/cause_of_death.csv")
cause_of_death_df.head()

Unnamed: 0,State,Year,Deaths,Population,Crude Rate,Crude Rate Lower 95% Confidence Interval,Crude Rate Upper 95% Confidence Interval,Prescriptions Dispensed by US Retailers in that year (millions)
0,Alabama,1999,39,4430141,0.9,0.6,1.2,116
1,Alabama,2000,46,4447100,1.0,0.8,1.4,126
2,Alabama,2001,67,4467634,1.5,1.2,1.9,138
3,Alabama,2002,75,4480089,1.7,1.3,2.1,142
4,Alabama,2003,54,4503491,1.2,0.9,1.6,149


In [7]:
# Updated the object type as both a string and integer are in the column
cause_of_death_df['Crude Rate'] = cause_of_death_df['Crude Rate'].astype(str)
cause_of_death_df

Unnamed: 0,State,Year,Deaths,Population,Crude Rate,Crude Rate Lower 95% Confidence Interval,Crude Rate Upper 95% Confidence Interval,Prescriptions Dispensed by US Retailers in that year (millions)
0,Alabama,1999,39,4430141,0.9,0.6,1.2,116
1,Alabama,2000,46,4447100,1,0.8,1.4,126
2,Alabama,2001,67,4467634,1.5,1.2,1.9,138
3,Alabama,2002,75,4480089,1.7,1.3,2.1,142
4,Alabama,2003,54,4503491,1.2,0.9,1.6,149
...,...,...,...,...,...,...,...,...
811,Wyoming,2010,49,563626,8.7,6.4,11.5,210
812,Wyoming,2011,47,568158,8.3,6.1,11,219
813,Wyoming,2012,47,576412,8.2,6,10.8,217
814,Wyoming,2013,52,582658,8.9,6.7,11.7,207


### Step 2: Transform

In [4]:
# Cleaned dataset and dropped any bad records:
cleaned_df = cause_of_death_df.dropna(how='any')
cleaned_df.head()

Unnamed: 0,State,Year,Deaths,Population,Crude Rate,Crude Rate Lower 95% Confidence Interval,Crude Rate Upper 95% Confidence Interval,Prescriptions Dispensed by US Retailers in that year (millions)
0,Alabama,1999,39,4430141,0.9,0.6,1.2,116
1,Alabama,2000,46,4447100,1.0,0.8,1.4,126
2,Alabama,2001,67,4467634,1.5,1.2,1.9,138
3,Alabama,2002,75,4480089,1.7,1.3,2.1,142
4,Alabama,2003,54,4503491,1.2,0.9,1.6,149


In [5]:
# Filtered dataset by selecting columns needed to answer potential query:
cleaned_subset = cleaned_df[["State", "Year", "Deaths", "Population", "Crude Rate",
                             "Prescriptions Dispensed by US Retailers in that year (millions)" ]]
cleaned_subset.head()

Unnamed: 0,State,Year,Deaths,Population,Crude Rate,Prescriptions Dispensed by US Retailers in that year (millions)
0,Alabama,1999,39,4430141,0.9,116
1,Alabama,2000,46,4447100,1.0,126
2,Alabama,2001,67,4467634,1.5,138
3,Alabama,2002,75,4480089,1.7,142
4,Alabama,2003,54,4503491,1.2,149


In [6]:
# Renaming the subset dataframe columns:
dispensed_df = cleaned_subset.rename(columns={
    'Crude Rate': 'Death per Hundred Thousand',
    'Prescriptions Dispensed by US Retailers in that year (millions)': 'US Dispensed Prescriptions (millions)'})
dispensed_df.head()

Unnamed: 0,State,Year,Deaths,Population,Death per Hundred Thousand,US Dispensed Prescriptions (millions)
0,Alabama,1999,39,4430141,0.9,116
1,Alabama,2000,46,4447100,1.0,126
2,Alabama,2001,67,4467634,1.5,138
3,Alabama,2002,75,4480089,1.7,142
4,Alabama,2003,54,4503491,1.2,149


## Prescriber Info Data

### Step 1: Extract
*********************************************************************

#### Create the CSV Data File Path

In [None]:
# CSV Data File Path:
prescriber_info = os.path.join("Resources", "prescriber-info.csv")