In [1]:
#import dependencies
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
import time

In [2]:
#read the raw California electric substations csv from github into a dataframe
url = 'https://raw.githubusercontent.com/Bropell/Asthma_Analysis_in_California_Counties/main/Resources/Raw_Data/California_Electric_Substations.csv'
electric_substation_df = pd.read_csv(filepath_or_buffer=url)
#last expression of the cell: display the loaded dataframe
electric_substation_df

Unnamed: 0,X,Y,OBJECTID,Owner,Path,Source,Type,HIFLD_ID,Name,Max_Voltage,ZIP_CODE,STATE,COUNTY,CITY,Lon,Lat
0,-13608019.18,4547328.302,1,Other,,CEC,SUBSTATION,310025.0,Jenney,115.0,94501,CA,Alameda County,Alameda,-122.242916,37.772436
1,-13654155.81,4616931.162,2,PG&E,,CEC,SUBSTATION,306474.0,Corona,115.0,94954,CA,Sonoma County,Petaluma,-122.657369,38.265013
2,-13539338.97,4548179.784,3,PG&E,,CEC,SUBSTATION,310124.0,South Bay 1,,94514,CA,Alameda County,Unincorporated,-121.625951,37.778482
3,-13545044.87,4543940.580,4,PG&E,,CEC,SUBSTATION,306256.0,Altamont,60.0,94551,CA,Alameda County,Unincorporated,-121.677208,37.748376
4,-13587799.24,4535905.499,5,PG&E,,CEC,SUBSTATION,303892.0,Castro Valley,230.0,94546,CA,Alameda County,Unincorporated,-122.061277,37.691281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4437,-13029912.97,3855039.869,4438,,,HIFLD,TAP,310978.0,Unknown,69.0,92114,CA,San Diego County,San Diego,-117.049700,32.696133
4438,-13557362.21,4579364.894,4439,,,HIFLD,TAP,310981.0,Unknown,115.0,94509,CA,Contra Costa County,Antioch,-121.787857,37.999569
4439,-13566821.45,4577639.055,4440,,,HIFLD,TAP,310982.0,Unknown,115.0,94565,CA,Contra Costa County,Unincorporated,-121.872831,37.987351
4440,-13026193.37,4007926.361,4441,,,HIFLD,TAP,310983.0,Unknown,33.0,92555,CA,Riverside County,Unincorporated,-117.016286,33.844389


In [3]:
#keep only the eight columns listed below; every other column of the raw dataframe is left out
elect_df = electric_substation_df.loc[
    :,
    ['OBJECTID', 'COUNTY', 'Max_Voltage', 'Source', 'STATE', 'ZIP_CODE', 'Lon', 'Lat'],
]
#last expression of the cell: display the reduced dataframe
elect_df

Unnamed: 0,OBJECTID,COUNTY,Max_Voltage,Source,STATE,ZIP_CODE,Lon,Lat
0,1,Alameda County,115.0,CEC,CA,94501,-122.242916,37.772436
1,2,Sonoma County,115.0,CEC,CA,94954,-122.657369,38.265013
2,3,Alameda County,,CEC,CA,94514,-121.625951,37.778482
3,4,Alameda County,60.0,CEC,CA,94551,-121.677208,37.748376
4,5,Alameda County,230.0,CEC,CA,94546,-122.061277,37.691281
...,...,...,...,...,...,...,...,...
4437,4438,San Diego County,69.0,HIFLD,CA,92114,-117.049700,32.696133
4438,4439,Contra Costa County,115.0,HIFLD,CA,94509,-121.787857,37.999569
4439,4440,Contra Costa County,115.0,HIFLD,CA,94565,-121.872831,37.987351
4440,4441,Riverside County,33.0,HIFLD,CA,92555,-117.016286,33.844389


In [4]:
#drop every row that has a missing value in any of the remaining columns
elect_df = elect_df.dropna(axis='index', how='any')
#last expression of the cell: display the dataframe after the rows were dropped
elect_df

Unnamed: 0,OBJECTID,COUNTY,Max_Voltage,Source,STATE,ZIP_CODE,Lon,Lat
0,1,Alameda County,115.0,CEC,CA,94501,-122.242916,37.772436
1,2,Sonoma County,115.0,CEC,CA,94954,-122.657369,38.265013
3,4,Alameda County,60.0,CEC,CA,94551,-121.677208,37.748376
4,5,Alameda County,230.0,CEC,CA,94546,-122.061277,37.691281
5,6,Alameda County,115.0,CEC,CA,94618,-122.228585,37.847373
...,...,...,...,...,...,...,...,...
4436,4437,Los Angeles County,66.0,HIFLD,CA,91722,-117.925235,34.095981
4437,4438,San Diego County,69.0,HIFLD,CA,92114,-117.049700,32.696133
4438,4439,Contra Costa County,115.0,HIFLD,CA,94509,-121.787857,37.999569
4439,4440,Contra Costa County,115.0,HIFLD,CA,94565,-121.872831,37.987351


In [5]:
#export the cleaned data to a csv file, without the dataframe index
#NOTE(review): the original comment mentions "../Clean_Data/" as an alternative output folder - confirm which folder is intended
elect_df.to_csv(path_or_buf='../Updated_Clean_Data/CLEAN_elec_substation_data.csv', index=False)

In [6]:
#import the cleaned csv into a table in sql
#Make sure to add a config.py file in the folder where this script is that has the following
#db_password = "Your Password for SQL"
#also change the database name in db_string to the database we are importing the table to.
#we use asthma_final_project for our database name and it must be created prior to running this script
#NOTE(review): db_password is interpolated into the URL unescaped; a password containing
#characters such as '@', ':' or '/' would need URL-encoding (or sqlalchemy.engine.URL.create) - confirm
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/asthma_final_project"

engine = create_engine(db_string)

rows_imported = 0
# get the start_time from time.time()
start_time = time.time()
#replace the file location to that of which the cleaned data was saved to csv.
#file line should match that of the export file location and name
chunks = pd.read_csv('../Updated_Clean_Data/CLEAN_elec_substation_data.csv', chunksize=1000000)
for chunk_number, data in enumerate(chunks):
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    #the first chunk replaces any existing table so the notebook can be re-run;
    #later chunks append, because the default if_exists='fail' would raise once the table exists
    write_mode = 'replace' if chunk_number == 0 else 'append'
    #name the table of which the dataframe is imported to in sql
    data.to_sql(name='electric_substations', con=engine, index=False, if_exists=write_mode)
    rows_imported += len(data)

    # add elapsed time to final print out
    print(f'Done. {time.time() - start_time} total seconds elapsed')

importing rows 0 to 3371...Done. 0.17847824096679688 total seconds elapsed
