In [9]:
import pathlib
import getpass
import pandas as pd
import geopandas as gpd
from shapely import wkt
from mtcpy.geospatial import geom_to_hexwkb
from mtcpy.aws import post_df_to_s3, create_redshift_table_via_s3, list_s3_buckets
from mtcpy.analytics import create_column_type_dict

user = getpass.getuser()

In [3]:
work_dir = pathlib.Path(
    f"/Users/{user}/Library/CloudStorage/Box-Box/DataViz Projects/Data Services/FasTrak Data"
)
gc_data = work_dir / "Fastrak Accounts Cleaned" / "bay_area_fastrak_accounts_geocoded.csv"


In [16]:
def publish_df_s3_redshift(data_name, df, bucket=bucket, schema="accounts"):
    """Given a DataFrame, or a GeoDataFrame, and Data Name, Push to S3
    then publish to Redshift
    """
    df = df.copy()
    # get column types
    ctypes = create_column_type_dict(df)
    # convert geometry to well-known binary (wkb) format
    if "geometry" in ctypes.keys():
        if not df.crs == 4326:
            df = df.to_crs(4326)
        df["geometry"] = df["geometry"].apply(lambda x: geom_to_hexwkb(x) if x != None else None)
        ctypes["geometry"] = "geometry"

    ## Push to S3
    key = f"{schema}/{data_name}.csv"
    post_df_to_s3(df, bucket, key)

    ## Push to Redshift
    tablename = f"{schema}.{data_name}"
    s3_path = f"s3://{bucket}/{key}"
    create_redshift_table_via_s3(
        tablename=tablename, s3_path=s3_path, ctypes=ctypes, dbname="fastrak_ds"
    )

In [17]:
# read in gc_data and convert geometry with wkt to geodataframe
df = pd.read_csv(gc_data)
df['geometry'] = df['geometry'].apply(wkt.loads)
# Convert the DataFrame to a GeoDataFrame
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='EPSG:4326')


In [18]:
bucket = "eps-upload"
publish_df_s3_redshift(
    data_name="fastrak_addresses_geocoded", df=gdf, bucket=bucket, schema="accounts"
)

  self[k1] = value[k2]
  df["geometry"] = df["geometry"].apply(lambda x: geom_to_hexwkb(x) if x != None else None)


Info: Set AWS creds using 'default' creds.
dataframe on S3 at eps-upload:accounts/fastrak_addresses_geocoded.csv
DROP TABLE IF EXISTS accounts.fastrak_addresses_geocoded


CREATE TABLE accounts.fastrak_addresses_geocoded(
address_orig varchar(136),
formatted_address varchar(228),
geometry_location_type varchar(36),
types varchar(162),
partial_match varchar(8),
geometry geometry);


COPY accounts.fastrak_addresses_geocoded
                FROM 's3://eps-upload/accounts/fastrak_addresses_geocoded.csv'
                CREDENTIALS 'aws_access_key_id=XXX;aws_secret_access_key=XXX'
                EMPTYASNULL
                FILLRECORD
                TIMEFORMAT as auto
                DATEFORMAT as auto
                NULL AS nan
                CSV
                IGNOREHEADER 1;


table created on Redshift: accounts.fastrak_addresses_geocoded
