In [1]:
# import dependencies
import pandas as pd
from sqlalchemy import create_engine
from config import db_password
import time

- This script imports the raw data from the GitHub repository's `Raw_Data` folder (read via a relative path)
- Organizes and cleans the dataframe
- Exports the clean data to a CSV file
- Imports the clean data directly into a SQL table

In [2]:
# Read the raw clean-energy CSV (relative path below) into a dataframe.
url = '../Raw_Data/cali_clean_energy.csv'
ce_df = pd.read_csv(filepath_or_buffer=url)
# Bare last expression so the notebook renders the loaded frame.
ce_df

Unnamed: 0,County,Biomass,Small Hydro,Geothermal,Solar Thermal,Solar Photovoltaic,Wind,Total
0,Alameda,116.0,,,,29.0,764.0,909
1,Alpine,0.0,0.0,0.0,0.0,0.0,0.0,0
2,Amador,,78.0,,,2.0,,80
3,Butte,3.0,225.0,,,16.0,,244
4,Calaveras,,202.0,,,3.0,,205
5,Colusa,188.0,,,,,,188
6,Contra Costa,31.0,,,,83.0,88.0,201
7,Del Norte,0.0,0.0,0.0,0.0,0.0,0.0,0
8,El Dorado,,190.0,,,,,190
9,Fresno,188.0,22.0,,,1981.0,,2190


In [3]:
# Substitute 0 for every missing (NaN) cell in the frame.
ce_df = ce_df.fillna(value=0)

In [4]:
# Keep only the rows whose County value is not the literal 'Total'
# (presumably a summary row in the raw file -- confirm against the CSV).
is_not_total = ce_df['County'].ne('Total')
ce_df = ce_df[is_not_total]

In [5]:
# Lower-case the 'County' column label; all other columns keep their names.
ce_df = ce_df.rename({'County': 'county'}, axis='columns')

In [6]:
# Show the dtype pandas holds for every column of ce_df
# (left as the last expression so the notebook displays it).
ce_df.dtypes

county                  object
Biomass                float64
Small Hydro            float64
Geothermal             float64
Solar Thermal          float64
Solar Photovoltaic     float64
Wind                   float64
Total                    int64
dtype: object

In [7]:
# Write the cleaned dataframe to a CSV file under ../Clean_Data/.
# index=False keeps the dataframe index out of the file.
ce_df.to_csv(path_or_buf='../Clean_Data/CLEAN_cali_clean_energy.csv', index=False)

In [8]:
# Import the cleaned CSV into a table in SQL (PostgreSQL).
#
# Prerequisites:
#   * Add a config.py file in the folder where this script is that has the following:
#       db_password = "Your Password for SQL"
#   * You must create the database prior to running this script. We use
#     final_project; change the database name in db_string if you are importing
#     the table into a different database.
# NOTE: db_password is interpolated into the URL verbatim, so a password that
# contains URL-reserved characters (e.g. '@' or '/') must be URL-escaped first.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/final_project"

engine = create_engine(db_string)

rows_imported = 0
# get the start_time from time.time()
start_time = time.time()
# The file location below must match the path and name the cleaned data was
# exported to in the to_csv cell.
for data in pd.read_csv('../Clean_Data/CLEAN_cali_clean_energy.csv', chunksize=1000000):
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    # Name of the SQL table the dataframe is imported to: clean_energy.
    # to_sql defaults to if_exists='fail', which raises ValueError as soon as the
    # table exists -- i.e. on every chunk after the first, and on every re-run of
    # this cell. Replace the table on the first chunk (so the notebook can be
    # re-run from a fresh kernel) and append on all later chunks.
    # WARNING: 'replace' drops any existing clean_energy table before loading.
    data.to_sql(
        name='clean_energy',
        con=engine,
        index=False,
        if_exists='replace' if rows_imported == 0 else 'append',
    )
    rows_imported += len(data)

    # add elapsed time to final print out
    print(f'Done. {time.time() - start_time} total seconds elapsed')

importing rows 0 to 58...Done. 0.05794119834899902 total seconds elapsed
