In [1]:
#import dependencies
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
import time

In [2]:
#import the file from github to a dataframe
url = 'https://raw.githubusercontent.com/Bropell/Asthma_Analysis_in_California_Counties/main/Resources/Raw_Data/Air_Pollutant_data.csv'
ap_df = pd.read_csv(url)
ap_df

Unnamed: 0,StateFIPS,State,CountyFIPS,County,Year,Value,Data Comment,Pollutant
0,1,Alabama,1001,Autauga,2005,0.72,,Pollutant: Benzene
1,1,Alabama,1001,Autauga,2005,2.60,,Pollutant: Formaldehyde
2,1,Alabama,1001,Autauga,2005,2.87,,Pollutant: Acetaldehyde
3,1,Alabama,1001,Autauga,2005,0.61,,Pollutant: Carbon tetrachloride
4,1,Alabama,1001,Autauga,2005,0.03,,"Pollutant: 1,3-butadiene"
...,...,...,...,...,...,...,...,...
31415,56,Wyoming,56045,Weston,2011,0.20,,Pollutant: Benzene
31416,56,Wyoming,56045,Weston,2011,0.75,,Pollutant: Formaldehyde
31417,56,Wyoming,56045,Weston,2011,1.14,,Pollutant: Acetaldehyde
31418,56,Wyoming,56045,Weston,2011,0.55,,Pollutant: Carbon tetrachloride


In [3]:
#filter to just states with the name California
ap_df2 = ap_df.loc[ap_df.State == 'California']
ap_df2.head()

Unnamed: 0,StateFIPS,State,CountyFIPS,County,Year,Value,Data Comment,Pollutant
1850,6,California,6001,Alameda,2005,1.11,,Pollutant: Benzene
1851,6,California,6001,Alameda,2005,2.9,,Pollutant: Formaldehyde
1852,6,California,6001,Alameda,2005,1.98,,Pollutant: Acetaldehyde
1853,6,California,6001,Alameda,2005,0.61,,Pollutant: Carbon tetrachloride
1854,6,California,6001,Alameda,2005,0.12,,"Pollutant: 1,3-butadiene"


In [4]:
#convert the year column to type string
ap_df2['Year'] = ap_df2['Year'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
#select specific year of 2011
ap_df3 = ap_df2.loc[ap_df2.Year == '2011']
ap_df3.head()

Unnamed: 0,StateFIPS,State,CountyFIPS,County,Year,Value,Data Comment,Pollutant
1855,6,California,6001,Alameda,2011,0.66,,Pollutant: Benzene
1856,6,California,6001,Alameda,2011,1.33,,Pollutant: Formaldehyde
1857,6,California,6001,Alameda,2011,1.2,,Pollutant: Acetaldehyde
1858,6,California,6001,Alameda,2011,0.55,,Pollutant: Carbon tetrachloride
1859,6,California,6001,Alameda,2011,0.07,,"Pollutant: 1,3-butadiene"


In [6]:
#reduce dataframe to just County Value Pollutant columns
ap_df4 = ap_df3[['County','Value','Pollutant']]
#ap_df4 = ap_df4.set_index("County")
ap_df4

Unnamed: 0,County,Value,Pollutant
1855,Alameda,0.66,Pollutant: Benzene
1856,Alameda,1.33,Pollutant: Formaldehyde
1857,Alameda,1.20,Pollutant: Acetaldehyde
1858,Alameda,0.55,Pollutant: Carbon tetrachloride
1859,Alameda,0.07,"Pollutant: 1,3-butadiene"
...,...,...,...
2425,Yuba,0.45,Pollutant: Benzene
2426,Yuba,2.37,Pollutant: Formaldehyde
2427,Yuba,2.28,Pollutant: Acetaldehyde
2428,Yuba,0.55,Pollutant: Carbon tetrachloride


In [7]:
ap_df4.dtypes

County        object
Value        float64
Pollutant     object
dtype: object

In [8]:
#export the cleaned data to a csv file change to "DataSource to: ../Clean_Data/"
ap_df4.to_csv('../Updated_Clean_Data/CLEAN_air_pollutant_data.csv', index=False)

In [9]:
#import the dataframe to a table in sql

#Make sure to add a config.py file in the folder where this script is that has the following
#db_password = "Your Password for SQL"
#also change the database name to the database we are importing the table to.
#we use final_project for our database name and it is create prior to running this script
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/final_project"

engine = create_engine(db_string)

rows_imported = 0
# get the start_time from time.time()
start_time = time.time()
#replace the file location to that of which the cleaned data was saved to csv.
#file line should match that of the export file location and name
for data in pd.read_csv(f'../Updated_Clean_Data/CLEAN_air_pollutant_data.csv', chunksize=1000000):
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    #name the table of which the dataframe is imported to in sql
    data.to_sql(name='air_pollutants', con=engine, index=False)
    rows_imported += len(data)

    # add elapsed time to final print out
    print(f'Done. {time.time() - start_time} total seconds elapsed')

importing rows 0 to 290...

ValueError: Table 'air_pollutants' already exists.

In [None]:
#all done file imported from github 
#dataframe cleaned
#file exported to csv
#file imported to sql