# BusObservatory Compactor (no Spark)

In [1]:
#TODO change print statements to logging?
#TODO convert to a lambda and test (will need a lot of memory)
#TODO debug the pip dependencies for boto3 and s3fs (notebook on laptop used conda)
#TODO put that lambda in a top-level folder in the project
#TODO add the lambda to template.yaml

In [2]:
# Settings: AWS region, S3 bucket, and the key of the JSON config object in that bucket
region = "us-east-1"
bucket = "busobservatory-migration"
config_object_key = "_bus_observatory_config.json"

In [3]:
import json
import boto3
import pandas as pd
import s3fs
import datetime

In [4]:
# get config / list of systems
def get_system_list(bucket, region, config_object_key):
    """Download the config object from S3 and return it parsed from JSON.

    Args:
        bucket: Name of the S3 bucket holding the config object.
        region: Accepted for interface compatibility; not used by this function.
        config_object_key: Key of the JSON config object inside ``bucket``.

    Returns:
        The decoded JSON document (the caller iterates it with ``.items()``,
        so it is expected to be a dict keyed by system id).
    """
    config_obj = boto3.resource('s3').Object(bucket, config_object_key)
    body_stream = config_obj.get()['Body']
    return json.load(body_stream)

In [5]:
def list_s3_files_using_paginator(bucket, system_id):
    """Return the keys of every object under ``incoming/{system_id}`` in ``bucket``.

    Uses the ``list_objects_v2`` paginator so that listings longer than one
    page (1000 keys) are followed to the end.

    Args:
        bucket: Name of the S3 bucket to list.
        system_id: System identifier; keys are matched on the string prefix
            ``incoming/{system_id}``.

    Returns:
        A list of object keys (strings). Empty when nothing matches the prefix.
    """
    s3_client = boto3.client("s3")
    paginator = s3_client.get_paginator("list_objects_v2")
    pages = paginator.paginate(
        Bucket=bucket,
        PaginationConfig={"PageSize": 1000},
        Prefix=f"incoming/{system_id}",
    )

    object_list = []
    for page in pages:
        # S3 omits "Contents" entirely when a page holds no keys, so fall
        # back to an empty list instead of iterating over None.
        contents = page.get("Contents") or []
        object_list.extend(entry["Key"] for entry in contents)

    return object_list

In [6]:
def delete_list_of_objects_s3(bucket, object_list):
    """Delete the given keys from ``bucket`` using batched ``delete_objects`` calls.

    Args:
        bucket: Name of the S3 bucket to delete from.
        object_list: Iterable of object keys (strings) to delete.

    Returns:
        None.

    Raises:
        RuntimeError: If S3 reports a per-key failure for any object. All
            batches are attempted before the error is raised.
    """
    keys = list(object_list)
    if not keys:
        # An empty Delete request is rejected by S3; nothing to do anyway.
        return

    s3_client = boto3.client("s3")
    batch_size = 1000  # S3 accepts at most 1000 keys per delete_objects request
    failures = []
    for start in range(0, len(keys), batch_size):
        batch = [{"Key": key} for key in keys[start:start + batch_size]]
        response = s3_client.delete_objects(
            Bucket=bucket,
            Delete={"Objects": batch},
        )
        # delete_objects can succeed overall while individual keys fail;
        # those failures are only visible in the "Errors" list.
        failures.extend(response.get("Errors") or [])

    if failures:
        raise RuntimeError(
            f"failed to delete {len(failures)} of {len(keys)} objects "
            f"from s3://{bucket}; first error: {failures[0]}"
        )
    return
    

In [7]:
# compact a single system
def compact_and_deliver_incoming(bucket, system_id):
    """Merge one system's incoming parquet files into a single file, then delete them.

    Steps:
      1. List every key under ``incoming/{system_id}``.
      2. Read each key as parquet and concatenate everything into one DataFrame
         (all files are held in memory at once).
      3. Write the result to ``s3://{bucket}/{system_id}/{timestamp}.parquet``.
      4. Delete the source objects that were listed in step 1.

    If no incoming files exist for the system, a message is printed and the
    function returns without writing or deleting anything.

    Args:
        bucket: Name of the S3 bucket holding both incoming and compacted data.
        system_id: Identifier of the system to compact.

    Returns:
        None.
    """
    # get list of objects in incoming
    object_list = list_s3_files_using_paginator(bucket, system_id)

    # pd.concat raises ValueError on an empty sequence; a system with no new
    # files must not abort the caller's loop over the remaining systems.
    if not object_list:
        print(f'{system_id}: no files found in s3://{bucket}/incoming/{system_id}, nothing to compact.')
        return

    # read each listed file as parquet and combine them into one dataframe
    df = pd.concat(
        pd.read_parquet(f's3://{bucket}/{key}')
        for key in object_list
    )
    print(f'{system_id}: read {df.shape} dataframe from {len(object_list)} files.')

    # write the new df to the lake s3://{bucket}/{system_id}
    # The file name is the local wall-clock time with ":" and "-" replaced by "_".
    timestamp = datetime.datetime.now().isoformat().replace(":", "_").replace("-", "_")
    outfile = f"s3://{bucket}/{system_id}/{timestamp}.parquet"

    # S3 has no real folders: the "{system_id}/" prefix comes into existence
    # with the first object written under it, so no explicit creation is needed.
    df.to_parquet(outfile, compression='snappy')
    print(f'{system_id}: wrote {df.shape} dataframe to {outfile}')

    # delete only the files that were listed (and therefore compacted) above
    delete_list_of_objects_s3(bucket, object_list)
    print(f'{system_id}: deleted {len(object_list)} files from s3://{bucket}/incoming/{system_id}')

    return

In [8]:
# Load the config; its top-level keys are the system ids to process
bus_observatory_config = get_system_list(bucket, region, config_object_key)

# Compact each configured system in turn (the per-system config values are not needed here)
for system_id in bus_observatory_config.keys():
    compact_and_deliver_incoming(bucket, system_id)

TEST_tfnsw_bus: read (86311, 21) dataframe from 403 files.
TEST_tfnsw_bus: wrote (86311, 21) dataframe to s3://busobservatory-migration/TEST_tfnsw_bus/2022_08_30T16_02_03.443283.parquet
TEST_tfnsw_bus: deleted 403 files from s3://busobservatory-migration/incoming/TEST_tfnsw_bus
TEST_nyct_mta_bus_siri: read (730695, 23) dataframe from 400 files.
TEST_nyct_mta_bus_siri: wrote (730695, 23) dataframe to s3://busobservatory-migration/TEST_nyct_mta_bus_siri/2022_08_30T16_06_32.253397.parquet
TEST_nyct_mta_bus_siri: deleted 400 files from s3://busobservatory-migration/incoming/TEST_nyct_mta_bus_siri
TEST_nyct_mta_bus_gtfsrt: read (842342, 11) dataframe from 339 files.
TEST_nyct_mta_bus_gtfsrt: wrote (842342, 11) dataframe to s3://busobservatory-migration/TEST_nyct_mta_bus_gtfsrt/2022_08_30T16_10_18.256205.parquet
TEST_nyct_mta_bus_gtfsrt: deleted 339 files from s3://busobservatory-migration/incoming/TEST_nyct_mta_bus_gtfsrt
TEST_njtransit_bus: read (190901, 24) dataframe from 301 files.
TEST_