## This script uploads manually downloaded data to the AWS bucket for the California Climate Risk Index

In [1]:
# Import libraries
import boto3
import pandas as pd
import os
import sys

sys.path.append(os.path.expanduser('../../'))
from scripts.utils.write_metadata import append_metadata

In [2]:
# Create the S3 handles and bucket constants shared by the functions below.
# No credentials are passed here, so boto3 resolves them from the user's own
# AWS configuration (see the note in the manual_to_aws docstring)
s3 = boto3.resource('s3') # resource interface, used to list the objects in the bucket
s3_cl = boto3.client('s3') # for lower-level processes (used for the file upload)
bucket_name = 'ca-climate-index'
raw_path = '1_pull_data/' # path to raw datafiles in AWS bucket

In [3]:
def aws_datasource_dirs(domain, datasource):
    """
    Returns the AWS key prefix ("folder") of a datasource within a domain.

    The keys currently stored under the domain prefix are listed, and a notice
    is printed when none of them is exactly the datasource prefix. Nothing is
    written to the bucket: the placeholder-object creation is disabled.

    Parameters
    ----------
    domain: string
        Domain folder under the raw data path
    datasource: string
        Datasource folder under the domain folder

    Returns
    -------
    string
        Prefix of the form '<raw_path><domain>/<datasource>/'
    """
    # every object stored under this domain, fetched lazily by boto3
    domain_objects = s3.Bucket(bucket_name).objects.filter(Prefix=raw_path + domain + '/')

    # path to folder in aws
    datasource_dir = '{0}{1}/{2}/'.format(raw_path, domain, datasource)

    existing_keys = [str(obj.key) for obj in domain_objects]
    folder_missing = datasource_dir not in existing_keys

    if folder_missing:
        # only a notice: the matching bucket.put_object(Key=datasource_dir)
        # call is intentionally not executed
        print('Creating folder for {}'.format(datasource_dir))

    return datasource_dir

@append_metadata
def manual_to_aws(domain, datasource, loc, filename, export=False, varname=''):
    """
    Uploads data that was manually downloaded to AWS bucket.
    
    Parameters
    ----------
    domain: string
        built_environment, governance, natural_systems, society_economy, climate_risk
    datasource: string
        Organization of datasource
    loc: string
        Local path to filename to upload
    filename: string
        Name of the data file, reported in the message printed when export is False
    export: bool
        If True, exports file to specified AWS bucket
    varname: string
        Name of the variable the file belongs to (not used inside this function;
        presumably read by the append_metadata decorator)
    
    Script
    ------
    manual_pull_upload.ipynb

    Note:
    This function assumes users have configured the AWS CLI such that their access key / secret key pair are
    stored in ~/.aws/credentials.
    See https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html for guidance.
    """

    if export:
        # normalise Windows separators, then keep the last path component as the object name
        loc = loc.replace('\\', '/')
        fname = loc.split('/')[-1]

        # resolve the destination prefix once and reuse it for the object key and
        # the message, so the bucket is listed only a single time per upload
        path_to_save = aws_datasource_dirs(domain, datasource)

        # point to location of file(s) locally and upload to aws
        try:
            s3_cl.upload_file(loc, bucket_name, path_to_save + fname)
        except Exception as e:
            # best effort: report the failure and carry on, so one bad file
            # does not stop a batch of uploads
            print(e)
        else:
            print('{0} saved to {1}'.format(fname, path_to_save))
    else:
        # Metadata update optionality: nothing is uploaded and the bucket is not
        # queried, only the message for an already uploaded file is printed
        print(f'{filename} uploaded to AWS')

### Read the data pipeline file to obtain all variable names for metadata generation

In [4]:
# Locate the pipeline notes relative to the repository root, i.e. the same '../../'
# location that is appended to sys.path for the project imports, so the notebook
# does not depend on an absolute path on one user's machine
repo_root = os.path.expanduser('../../')
ref_file = os.path.join(repo_root, 'metadata', 'Full Data Pipeline Notes - 1_ Pull.csv')
df = pd.read_csv(ref_file)

# keep only the columns that have a header label
df = df.loc[:, df.columns.notna()]

# drop the columns that are not needed here; errors='ignore' keeps the cell
# working if the positional 'Unnamed: 15' column is absent from the export
df = df.drop(columns=['Link', 'Unnamed: 15'], errors='ignore')

# copy in which every missing cell is replaced by the 'N/A' marker
ref_df = df.fillna('N/A')

### Isolate the variable and file name column entries
* list the variables that were pulled/uploaded with other methods so they can be excluded
* return the remaining list of manually pulled files and their variable names

In [5]:
# Build the list of (variable, file name) pairs, excluding rows whose variable is 'N/A'.
# The 'N/A'-filled table (ref_df) has to be used for this: in the unfilled frame a
# missing entry is NaN, which never compares equal to 'N/A' and so is never excluded
has_variable = ref_df['Variable'] != 'N/A'
variable_names = list(
    ref_df.loc[has_variable, ['Variable', 'File Name']].itertuples(index=False, name=None)
)

# Define files that used other methods for pulling/uploading to be excluded
skip_vars = ['natural_epa_air_quality',
             'governance_edd_responder_firefighter',
             'governance_edd_responder_nurse',
             'governance_edd_responder_parametics',
             'governance_edd_responder_police',
             'climate_noaa_flood_fatalities',
             'climate_usda_heat_crop_loss',
             'climate_usda_heat_crop_cost']

# Exclude variables from the list along with their corresponding file names
included_vars = [(var, fname) for var, fname in variable_names if var not in skip_vars]

# Store the remaining variable and corresponding file names to be run through the manual pull function
included_vars_df = pd.DataFrame(included_vars, columns=['Variable', 'File Name'])

included_vars_df


Unnamed: 0,Variable,File Name
0,built_cpuc_internet,broadband_internet.gdb.zip
1,built_hifld_radio_towers,FM_Transmission_Towers.zip
2,built_hifld_cellular_towers,Cellular_Towers.zip
3,built_hifld_microwave_towers,Microwave_Service_Towers.zip
4,built_hifld_paging_towers,Paging_Transmission_Towers.zip
...,...,...
97,society_acs_demographic_over_65,demographic_DP05.csv
98,society_acs_demographic_under_5,demographic_DP05.csv
99,society_acs_demographic_american_indian,demographic_DP05.csv
100,society_acs_health_insurance,health_insurance_B27010.csv


### Generating metadata for all metrics that were manually downloaded and uploaded to AWS
* loop through each variable name not excluded and generate metadata, including filename
* use var[1] for filenames and var[0] for varnames

In [6]:
# One call per remaining variable with export=False, so nothing is uploaded here;
# the 'all' / 'any' arguments look like placeholders for that mode
for var in included_vars:
    variable, fname = var  # var[0] is the variable name, var[1] the file name
    manual_to_aws(
        domain='all',
        datasource='all',
        loc='any',
        filename=fname,
        export=False,
        varname=variable,
    )