# SDGE 
Data is from the [SDG&E energy data site](https://energydata.sdge.com/).

In [2]:
# Notebook setup
import os
import numpy as np
import pandas as pd
import re

# directories
# Each fragment ends with '/' because the cells below build paths by plain
# string concatenation (e.g. root + raw + 'SDGE/', root + clean).
# Folder for cleaned outputs, relative to root
clean = 'Clean_Data/'
# Folder for raw inputs, relative to root
raw = 'Foundation/Raw Data/'
# Base directory that the two folders above are appended to
root = '/root/'

def concatenator(base_dir, folder):
  '''
  Read every data file in ``base_dir/folder`` and stack them into one DataFrame.

  Parameters
  ----------
  base_dir : str
      Directory that contains ``folder`` (trailing separator optional).
  folder : str
      Name of the sub-folder whose files are read with ``pd.read_csv``.

  Returns
  -------
  pd.DataFrame
      Row-wise concatenation of all files, re-indexed 0..n-1.

  Raises
  ------
  ValueError
      If the folder holds no readable files (raised by ``pd.concat``).
  '''
  # Build the folder path once with os.path.join so that listing and reading
  # agree even when base_dir has no trailing separator.
  folder_path = os.path.join(base_dir, folder)

  dfs = []

  # os.listdir returns entries in arbitrary order; sort for reproducible rows.
  for file_ in sorted(os.listdir(folder_path)):
    full_path = os.path.join(folder_path, file_)

    # Skip hidden entries and sub-directories (e.g. .ipynb_checkpoints,
    # .DS_Store), which would otherwise make pd.read_csv fail.
    if file_.startswith('.') or not os.path.isfile(full_path):
      continue

    dfs.append(pd.read_csv(full_path))

  return pd.concat(dfs, axis=0, ignore_index=True)


# Zip Code Ground Truth
San Diego COUNTY Zip Code list is from [sdcourt.ca.gov/sdcourt/familyandchildren2/wheretofile/zipcodedirectory](https://www.sdcourt.ca.gov/sdcourt/familyandchildren2/wheretofile/zipcodedirectory).

<br><br>
**The Zip Code Ground Truth will be utilized to filter out any unnecessary Zip Codes from other Data Tables.**

In [3]:
# Load the ground-truth zip code table and keep its 'ZIP CODE' column
file_path = ''.join((root, raw, 'Other/', 'Zip_Codes_by_Court_District_GROUND_TRUTH.csv'))
official_zip_code_df = pd.read_csv(file_path)
official_zip_codes = official_zip_code_df['ZIP CODE']
# Preview the first three entries
official_zip_codes.head(3)

0    91901.0
1    91902.0
2    91903.0
Name: ZIP CODE, dtype: float64

# Gas

In [4]:
# Stack every file in the SDGE 'Gas' folder into a single frame
base_sdge_dir = ''.join((root, raw, 'SDGE/'))
folder = 'Gas'
gas_df = concatenator(base_dir=base_sdge_dir, folder=folder)

# Report the combined size, then preview the first rows
print(f'Shape {gas_df.shape}\n', '--' * 20)
gas_df.head(3)

Shape (29250, 8)
 ----------------------------------------


Unnamed: 0,ZipCode,Month,Year,CustomerClass,Combined,TotalCustomers,TotalTherms,AverageTherms
0,91901,10,2019,C,Y,0,0,0
1,91901,10,2019,R,Y,2098,42738,20
2,91902,10,2019,C,Y,0,0,0


In [5]:
# Keep only rows whose zip code appears in the ground-truth list
in_county = gas_df['ZipCode'].isin(official_zip_codes)
gas_df = gas_df.loc[in_county].copy()
gas_df.shape

(28548, 8)

In [6]:
# Discard rows that report no customers
gas_df = gas_df.loc[gas_df['TotalCustomers'].ne(0)]
gas_df.shape

(13867, 8)

In [7]:
# Combine commercial & residential consumers into one row per
# (ZipCode, Year, Month).
# numeric_only=True is required on pandas >= 2.0: a bare .sum() no longer drops
# the string columns (CustomerClass, Combined) but concatenates them instead.
# NOTE(review): AverageTherms is summed together with the totals, so the result
# is a sum of per-class averages rather than an average - confirm this is intended.
gas_df = (
    gas_df
    .groupby(['ZipCode', 'Year', 'Month'])
    .sum(numeric_only=True)
    .reset_index()
)
gas_df.head(3)

Unnamed: 0,ZipCode,Year,Month,TotalCustomers,TotalTherms,AverageTherms
0,91901,2012,1,2711,148354,292
1,91901,2012,2,2704,138102,283
2,91901,2012,3,2674,159852,309


# Electric

In [None]:
# Stack every file in the SDGE 'Electric' folder into a single frame
base_sdge_dir = ''.join((root, raw, 'SDGE/'))
folder = 'Electric'
elect_df = concatenator(base_dir=base_sdge_dir, folder=folder)

# Report the combined size, then preview the first rows
print(f'Shape {elect_df.shape}\n', '--' * 20)

elect_df.head(3)

In [None]:
# Keep only rows whose zip code appears in the ground-truth list
in_county = elect_df['ZipCode'].isin(official_zip_codes)
elect_df = elect_df.loc[in_county].copy()
elect_df.shape

In [None]:
# Discard rows that report no customers
has_customers = elect_df['TotalCustomers'].ne(0)
elect_df = elect_df.loc[has_customers]
elect_df.shape

In [None]:
# Combine commercial & residential consumers into one row per
# (ZipCode, Year, Month).
# numeric_only=True is required on pandas >= 2.0: a bare .sum() no longer drops
# string columns but concatenates them instead.
# NOTE(review): any per-class average column is summed along with the totals -
# confirm this is intended.
elect_df = (
    elect_df
    .groupby(['ZipCode', 'Year', 'Month'])
    .sum(numeric_only=True)
    .reset_index()
)
elect_df.head(3)

# Merge Gas & Electric

In [None]:
# Inner-join the gas and electric tables on zip code, year and month;
# overlapping column names receive the _gas / _elect suffixes
sdge_df = gas_df.merge(
    elect_df,
    how='inner',
    on=['ZipCode', 'Year', 'Month'],
    suffixes=('_gas', '_elect'),
)
sdge_df.head(3)

In [None]:
# Row/column count of the merged gas + electric table
sdge_df.shape

In [None]:
# Destination folder for cleaned outputs
clean_data_dir = root + clean

# Create the folder if it is missing so the write below does not fail in a
# fresh environment
os.makedirs(clean_data_dir, exist_ok=True)

# Persist the merged table as a gzip-compressed parquet file, without the index
sdge_df.to_parquet(clean_data_dir + 'sdge.parquet.gzip', compression='gzip', index=False)

In [None]:
# Completion marker printed once the cleaned SDGE file has been written
print('SDGE Data is READY TO ROCK!!!!')

In [None]:
import boto3
import sagemaker

# Open a SageMaker session and use its default S3 bucket
# (assign a literal such as "my-bucket" instead to target a specific bucket)
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Local file name and the S3 object key it is stored under
file_name = 'sdge.parquet.gzip'
file_path = f'Clean_Data/{file_name}'

# Upload the cleaned parquet file to S3
s3_client = boto3.client("s3")
s3_client.upload_file(
    Filename=clean_data_dir + file_name,
    Bucket=bucket,
    Key=file_path,
)