In [1]:
#!/usr/bin/env python
# vim: set fileencoding=utf8 :
#```

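# Prerequisites (run once, in the kernel's environment):
#   %pip install -U boto3 retrying
#   %env AWS_DEFAULT_PROFILE=test
# (`!export ...` runs in a throw-away subshell and would not affect the kernel.)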


## Use Athena to extract features on all data

<p>The dataset we are working with contains 55M records, which makes it too heavy to handle on a single machine.</p>
<p>Using a distributed computing engine like&nbsp;<a href="https://aws.amazon.com/athena/">AWS Athena</a>&nbsp;will enable you to extract features and save data efficiently.&nbsp;</p>
<p>In order to work on the data, we upload it to S3 and then partition it using AWS Glue. Partitioning is critical for Athena to run efficiently. For examples of how to use Glue, see the&nbsp;<a href="https://github.com/doitintl/aws-glue-workshop">AWS Glue workshop</a>.</p>

### Extract features 

<p>With the data partitioned (say, by year and month), run the Athena statements below: first register the raw data as an external table, then run the feature-extraction query.</p>
<p>After extracting features, partition the query results using Glue (again)</p>

-- Create the database that holds the workshop tables (no-op if it already exists).
CREATE DATABASE IF NOT EXISTS taxinyc;

-- Register the raw CSV files stored in S3 as an external table.
-- Fields are comma separated and the first line of each file is a header row,
-- skipped via 'skip.header.line.count'. pickup_datetime is declared as text.
-- The LOCATION must be a folder URI ending in a trailing slash so that it
-- names the prefix unambiguously.
CREATE EXTERNAL TABLE IF NOT EXISTS taxinyc.raw_data (
               key VARCHAR(255),
               fare_amount FLOAT,
               pickup_datetime VARCHAR(255),
               pickup_longitude FLOAT,
               pickup_latitude FLOAT,
               dropoff_longitude FLOAT,
               dropoff_latitude FLOAT,
               passenger_count INT
               )
               ROW FORMAT DELIMITED
               FIELDS TERMINATED BY ","
               LINES TERMINATED BY "\n"
               LOCATION 's3://aws-worskhop-data/taxi-nyc/'
               TBLPROPERTIES (
               'skip.header.line.count' = '1'
               );

-- Sanity check: preview a few rows of the newly registered table.
SELECT * FROM "taxinyc"."raw_data" limit 10;

In [7]:
# Athena (Presto) feature-extraction query, kept as a plain Python string.
#
# Structure of the query:
#   * `dataset` CTE: selects every column of `train_v3` and adds
#       - `est`: pickup_datetime cast to a timestamp with time zone and
#         shifted to America/New_York,
#       - `pickup_point` / `dropoff_point`: ST_POINT geometries built from
#         the longitude/latitude pairs,
#       - `epoch`: the pickup time as Unix seconds (to_unixtime),
#       - `seconds_in_day`: the constant 24*60*60.
#   * `airports` CTE: collapses the `usa_airports` rows whose city is
#     'New York' into one name -> point map (map_agg) and exposes three
#     entries of it as the columns LaGuardia, Manhattan (the Downtown
#     Manhattan/Wall St. Heliport) and JFK. It therefore yields a single row.
#   * Final SELECT: implicit cross join of `dataset` with that single
#     `airports` row; emits the target (`fare_amount`), calendar fields,
#     sine/cosine encodings of `epoch` with a one-day and a seven-day period,
#     coordinate differences, pickup-to-dropoff distance, distances from both
#     trip ends to each of the three reference points, and the raw columns.
#
# NOTE(review): `day` and `dayofmonth` look redundant -- Presto documents
#   day() as an alias of day_of_month(); confirm both are wanted downstream.
# NOTE(review): to_unixtime presumably returns the same instant regardless of
#   the AT TIME ZONE shift, so sin_day/cos_day would be phased to UTC
#   midnight rather than New York midnight -- confirm this is intended.
# NOTE(review): ST_Distance on lon/lat points presumably yields planar
#   degrees, not meters -- confirm against the Athena geospatial docs.
# NOTE(review): no other cell visible here references this variable; the
#   cells below run athena_taxi_raw.sql, whose printed text reads from
#   `raw_data` rather than `train_v3`.
SQL = '''
WITH 
    dataset AS 
    (SELECT CAST (pickup_datetime AS TIMESTAMP WITH time zone) AT TIME ZONE 'America/New_York' AS est, 
                  ST_POINT(pickup_longitude,pickup_latitude) pickup_point,
                  ST_POINT(dropoff_longitude,dropoff_latitude) dropoff_point,
                  to_unixtime( CAST (pickup_datetime AS TIMESTAMP WITH time zone) AT TIME ZONE 'America/New_York') AS                     epoch,
                  24*60*60 as seconds_in_day,
                  *
      FROM train_v3),
    
    airports AS (SELECT 
                  kv['LaGuardia'] AS LaGuardia,
                  kv['Downtown Manhattan/Wall St. Heliport'] AS Manhattan,
                  kv['John F Kennedy Intl'] AS JFK
    FROM (SELECT map_agg(name, point_location) kv
        FROM 
            (SELECT name,
         ST_POINT(longitude,
         latitude) point_location
            FROM usa_airports
            WHERE city = 'New York' )
            ))
        SELECT 
        
        -- Target
         fare_amount,
         
         -- time features
         day(est) day,
         day_of_week(est) dayofweek ,
         year(est) year ,
         month(est) month ,
         day_of_month(est) dayofmonth ,
         hour(est) hour ,
         minute(est) minute ,
         
         -- cyclclical variables
         sin(2*pi()*epoch/seconds_in_day) sin_day,
         cos(2*pi()*epoch/seconds_in_day) cos_day,
         sin(2*pi()*epoch/(seconds_in_day*7)) sin_week,
         cos(2*pi()*epoch/(seconds_in_day*7)) cos_week,
         
         
         -- Distance features
         pickup_longitude - dropoff_longitude diff_longitude,
         pickup_latitude - dropoff_latitude diff_latitude,
         ST_Distance(pickup_point, dropoff_point) dist,
         
         -- Airports features
         ST_DISTANCE(airports.LaGuardia, dropoff_point) dropoff_laguardia,
         ST_DISTANCE(airports.LaGuardia, pickup_point ) pickup_laguardia,
         ST_DISTANCE(airports.JFK, dropoff_point) dropoff_JFK,
         ST_DISTANCE(airports.JFK, pickup_point) pickup_JFK,
         ST_DISTANCE(airports.Manhattan, dropoff_point) dropoff_manhattan,
         ST_DISTANCE(airports.Manhattan, pickup_point) pickup_manhattan,
         
         -- Raw features
         pickup_longitude,
         pickup_latitude,
         dropoff_longitude,
         dropoff_latitude,
         passenger_count
         
    FROM dataset, airports
'''

In [8]:
# Print the query file that is passed to athena.py in the next cell.
!cat athena_taxi_raw.sql


WITH 
    dataset AS 
    (SELECT CAST (pickup_datetime AS TIMESTAMP WITH time zone) AT TIME ZONE 'America/New_York' AS est, 
                  ST_POINT(pickup_longitude,pickup_latitude) pickup_point,
                  ST_POINT(dropoff_longitude,dropoff_latitude) dropoff_point,
                  to_unixtime( CAST (pickup_datetime AS TIMESTAMP WITH time zone) AT TIME ZONE 'America/New_York') AS                     epoch,
                  24*60*60 as seconds_in_day,
                  *
     FROM raw_data)
    
     SELECT
     
        -- Target
        fare_amount,
        
        -- time features
        day(est) day,
        day_of_week(est) dayofweek ,
        year(est) year ,
        month(est) month ,
        day_of_month(est) dayofmonth ,
        hour(est) hour ,
        minute(est) minute ,
         
        -- cyclclical variables
        sin(2*pi()*epoch/seconds_in_day) sin_day,
        cos(2*pi()*epoch/seconds_in_day) cos_day,
        sin(2*pi()*ep

In [3]:
# Run athena.py with the feature-extraction query file as its only argument.
# NOTE(review): presumably this submits the query to Athena and writes
# <file>.csv / <file>.log next to the input (both appear in the directory
# listing further down) -- confirm in athena.py.
!python athena.py athena_taxi_raw.sql

athena_taxi_raw.sql


In [4]:
# Run athena.py again, this time with foo.sql as the query file.
!python athena.py foo.sql

foo.sql


In [5]:
# List the working directory, one entry per line, to show the files present
# after the runs above.
!ls -1

athena.log
athena.py
athena_taxi_raw.sql
athena_taxi_raw.sql.csv
athena_taxi_raw.sql.log
carparts49
foo.sql
foo.sql.csv
foo.sql.log
lost+found
run_athena_query.ipynb
taxi_fare_prediction.ipynb


athena.log  # program log
athena.py   # main program
foo.sql     # input query passed to athena.py
foo.sql.log # query execution result
foo.sql.csv # sql output