# Read data from BigQuery and write it to GCS

The original data comes from the BigQuery public dataset table `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`.

* create a query that cleans the data and keeps only the columns I need
* create a BigQuery dataset within the `event-driven-ml` project
* save the dataset as partitioned CSV files in GCS (I don't know yet whether I need this step)

In [1]:
from google.cloud import bigquery as bq

In [2]:
# Instantiate the BigQuery Python client; no project or credentials are passed explicitly.
# NOTE(review): none of the cells visible in this notebook use bq_client -- the
# queries and exports below go through the `bq` CLI. Confirm it is still needed.
bq_client = bq.Client()

In [3]:
import os

# Google Cloud settings for this notebook -- change them for your own setup.
PROJECT = 'event-driven-ml'
# Bucket name followed by an object prefix; the export cells write to
# gs://$BUCKET/<split>/... . Use a regional bucket in the region you selected.
BUCKET = 'edml/data/taxi-trips'
# Choose an available region for Cloud MLE from
# https://cloud.google.com/ml-engine/docs/regions.
# NOTE(review): REGION is not referenced by any cell visible here -- confirm 'eu' is intended.
REGION = 'eu'

# Export the settings so the %%bash cells can read them as $PROJECT, $BUCKET, $REGION.
os.environ.update(PROJECT=PROJECT, BUCKET=BUCKET, REGION=REGION)

In [4]:
%%bash
# Point the gcloud CLI at the project exported by the configuration cell.
# "${PROJECT:?...}" aborts with a clear message when PROJECT is unset or empty
# (e.g. the configuration cell was not run after a kernel restart); the quotes
# prevent word splitting of the value.
gcloud config set project "${PROJECT:?PROJECT is not set - run the configuration cell first}"

Updated property [core/project].


In [None]:
# Column names that appear in the BigQuery queries of this notebook and could
# serve as model features. They are stored as strings: as bare identifiers the
# names are undefined and the cell fails with NameError.
possible_features = [
    'uuid', 'dayofweek', 'hourofday', 'pickup_borough', 'dropoff_borough', 'trip_duration',
    'pickup_zone_id', 'dropoff_zone_id', 'pickup_zone_name', 'dropoff_zone_name',
]

## TRAINING SET

In [14]:
%%bash
# Build the TRAINING table tlc_yellow_trips_2018_train (overwritten on each run
# because of --replace).
#
# Rows: cleaned 2018 yellow-taxi trips whose pickup_datetime hashes into
# buckets 0-7 (of 10), joined to the zone lookup table for the pickup and
# dropoff zone names.
#
# The bucket is computed as ABS(MOD(hash, 10)) rather than MOD(ABS(hash), 10):
# both give the same bucket for every hash except INT64 min, where ABS(hash)
# raises an overflow error; MOD keeps the sign of its first argument, so taking
# ABS afterwards is always safe.
bq query \
--destination_table event-driven-ml:new_york_taxi_trips.tlc_yellow_trips_2018_train \
--replace \
--use_legacy_sql=false \
--allow_large_results \
'SELECT GENERATE_UUID() as uuid, dayofweek, hourofday, pickup_zone_name, dropoff_zone_name, passenger_count, trip_duration
FROM
(SELECT
    pickup_datetime,
    dropoff_datetime,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) -1 AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    passenger_count,
    CAST(trip_distance AS FLOAT64) AS trip_distance,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id,
    CAST(rate_code AS INT64) AS rate_code,
    CAST(payment_type AS INT64) AS payment_type,
    CAST(fare_amount AS FLOAT64) AS fare_amount,
    CAST(tolls_amount AS FLOAT64) AS tolls_amount,
    CASE WHEN tolls_amount > 0.0 THEN 1.0 ELSE 0.0 END AS flag_tolls,
    CAST(total_amount AS FLOAT64) AS total_amount
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0.0
    AND trip_distance < 3000.0
    AND fare_amount >= 2.5
    AND fare_amount < 6000.0
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR from pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440
    AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), 10)) < 8
) AS trips

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS pickup_zone_id, zone_name AS pickup_zone_name, borough AS pickup_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS pu_zones
ON trips.pickup_location_id = pu_zones.pickup_zone_id

INNER JOIN (SELECT DISTINCT CAST(zone_id AS INT64) AS dropoff_zone_id, zone_name AS dropoff_zone_name, borough AS dropoff_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS do_zones
ON trips.dropoff_location_id = do_zones.dropoff_zone_id'


+--------------------------------------+-----------+-----------+--------------------------------+--------------------------------+-----------------+---------------+
|                 uuid                 | dayofweek | hourofday |        pickup_zone_name        |       dropoff_zone_name        | passenger_count | trip_duration |
+--------------------------------------+-----------+-----------+--------------------------------+--------------------------------+-----------------+---------------+
| bdb1d1a9-aea8-45c9-98e7-f118a2529686 |         3 |        14 | JFK Airport                    | Bloomfield/Emerson Hill        |               1 |            50 |
| 9e99ff96-10f5-4737-ba09-6f840e030569 |         5 |        23 | LaGuardia Airport              | Times Sq/Theatre District      |               2 |            24 |
| 860232a7-646a-457a-a70c-b52d9203613e |         5 |        18 | Gramercy                       | Brooklyn Heights               |               1 |            26 |
| 6636ef4

Waiting on bqjob_r6a90b6fc6131891a_0000016e5c4ced2f_1 ... (136s) Current status: DONE   

In [15]:
%%bash
# Export the training table to GCS as gzip-compressed, header-less,
# comma-delimited CSV shards under gs://$BUCKET/train/.
#
# The destination URI is double-quoted so the shell hands the "*" wildcard to
# bq untouched instead of attempting pathname expansion, and ${BUCKET:?...}
# aborts with a clear message when BUCKET was not exported (otherwise the URI
# would silently become gs:///train/...).
# NOTE: the shards are gzip-compressed although their names end in ".csv".
bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
'event-driven-ml:new_york_taxi_trips.tlc_yellow_trips_2018_train' \
"gs://${BUCKET:?BUCKET is not set - run the configuration cell first}/train/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_r3a45e3a0a2e3bd0b_0000016e5c510f86_1 ... (110s) Current status: DONE   

## TEST SET

In [16]:
%%bash
# Build the TEST table tlc_yellow_trips_2018_test (overwritten on each run
# because of --replace).
#
# Rows: cleaned 2018 yellow-taxi trips whose pickup_datetime hashes into
# bucket 8 (of 10), joined to the zone lookup table for the pickup and dropoff
# zone names.
#
# The bucket is computed as ABS(MOD(hash, 10)) rather than MOD(ABS(hash), 10):
# both give the same bucket for every hash except INT64 min, where ABS(hash)
# raises an overflow error; MOD keeps the sign of its first argument, so taking
# ABS afterwards is always safe.
bq query \
--destination_table event-driven-ml:new_york_taxi_trips.tlc_yellow_trips_2018_test \
--replace \
--use_legacy_sql=false \
--allow_large_results \
'SELECT GENERATE_UUID() as uuid, dayofweek, hourofday, pickup_zone_name, dropoff_zone_name, passenger_count, trip_duration
FROM
(SELECT
    pickup_datetime,
    dropoff_datetime,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) -1 AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    passenger_count,
    CAST(trip_distance AS FLOAT64) AS trip_distance,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id,
    CAST(rate_code AS INT64) AS rate_code,
    CAST(payment_type AS INT64) AS payment_type,
    CAST(fare_amount AS FLOAT64) AS fare_amount,
    CAST(tolls_amount AS FLOAT64) AS tolls_amount,
    CASE WHEN tolls_amount > 0.0 THEN 1.0 ELSE 0.0 END AS flag_tolls,
    CAST(total_amount AS FLOAT64) AS total_amount
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0.0
    AND trip_distance < 3000.0
    AND fare_amount >= 2.5
    AND fare_amount < 6000.0
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR from pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440
    AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), 10)) = 8
) AS trips

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS pickup_zone_id, zone_name AS pickup_zone_name, borough AS pickup_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS pu_zones
ON trips.pickup_location_id = pu_zones.pickup_zone_id

INNER JOIN (SELECT DISTINCT CAST(zone_id AS INT64) AS dropoff_zone_id, zone_name AS dropoff_zone_name, borough AS dropoff_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS do_zones
ON trips.dropoff_location_id = do_zones.dropoff_zone_id'


+--------------------------------------+-----------+-----------+--------------------------------+-----------------------------+-----------------+---------------+
|                 uuid                 | dayofweek | hourofday |        pickup_zone_name        |      dropoff_zone_name      | passenger_count | trip_duration |
+--------------------------------------+-----------+-----------+--------------------------------+-----------------------------+-----------------+---------------+
| fa9534b6-301f-495e-8e5d-a4b0d1ddc06d |         6 |         0 | Seaport                        | East Harlem North           |               5 |            18 |
| 33c53879-1bbf-4c0b-92e5-932a84c000b9 |         5 |         0 | Maspeth                        | Fort Greene                 |               1 |            18 |
| 8ddcf41a-8baf-4bee-8ad4-8d320c134d8c |         0 |         0 | East Harlem North              | Yorkville West              |               3 |             3 |
| 91aee70f-70bc-43d4-89e4-1

Waiting on bqjob_r379f7987f8163685_0000016e5c542fac_1 ... (67s) Current status: DONE   

In [17]:
%%bash
# Export the test table to GCS as gzip-compressed, header-less,
# comma-delimited CSV shards under gs://$BUCKET/test/.
#
# The destination URI is double-quoted so the shell hands the "*" wildcard to
# bq untouched instead of attempting pathname expansion, and ${BUCKET:?...}
# aborts with a clear message when BUCKET was not exported (otherwise the URI
# would silently become gs:///test/...).
# NOTE: the shards are gzip-compressed although their names end in ".csv".
bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
'event-driven-ml:new_york_taxi_trips.tlc_yellow_trips_2018_test' \
"gs://${BUCKET:?BUCKET is not set - run the configuration cell first}/test/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_rfc177fd86d0cf57_0000016e5c554312_1 ... (49s) Current status: DONE   

## VALIDATION SET

In [18]:
%%bash
# Build the VALIDATION table tlc_yellow_trips_2018_eval (overwritten on each
# run because of --replace).
#
# Rows: cleaned 2018 yellow-taxi trips whose pickup_datetime hashes into
# bucket 9 (of 10), joined to the zone lookup table for the pickup and dropoff
# zone names.
#
# The bucket is computed as ABS(MOD(hash, 10)) rather than MOD(ABS(hash), 10):
# both give the same bucket for every hash except INT64 min, where ABS(hash)
# raises an overflow error; MOD keeps the sign of its first argument, so taking
# ABS afterwards is always safe.
bq query \
--destination_table event-driven-ml:new_york_taxi_trips.tlc_yellow_trips_2018_eval \
--replace \
--use_legacy_sql=false \
--allow_large_results \
'SELECT GENERATE_UUID() as uuid, dayofweek, hourofday, pickup_zone_name, dropoff_zone_name, passenger_count, trip_duration
FROM
(SELECT
    pickup_datetime,
    dropoff_datetime,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) -1 AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    passenger_count,
    CAST(trip_distance AS FLOAT64) AS trip_distance,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id,
    CAST(rate_code AS INT64) AS rate_code,
    CAST(payment_type AS INT64) AS payment_type,
    CAST(fare_amount AS FLOAT64) AS fare_amount,
    CAST(tolls_amount AS FLOAT64) AS tolls_amount,
    CASE WHEN tolls_amount > 0.0 THEN 1.0 ELSE 0.0 END AS flag_tolls,
    CAST(total_amount AS FLOAT64) AS total_amount
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0.0
    AND trip_distance < 3000.0
    AND fare_amount >= 2.5
    AND fare_amount < 6000.0
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR from pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440
    AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), 10)) = 9
) AS trips

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS pickup_zone_id, zone_name AS pickup_zone_name, borough AS pickup_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS pu_zones
ON trips.pickup_location_id = pu_zones.pickup_zone_id

INNER JOIN (SELECT DISTINCT CAST(zone_id AS INT64) AS dropoff_zone_id, zone_name AS dropoff_zone_name, borough AS dropoff_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS do_zones
ON trips.dropoff_location_id = do_zones.dropoff_zone_id'


+--------------------------------------+-----------+-----------+----------------------------------+-------------------------------+-----------------+---------------+
|                 uuid                 | dayofweek | hourofday |         pickup_zone_name         |       dropoff_zone_name       | passenger_count | trip_duration |
+--------------------------------------+-----------+-----------+----------------------------------+-------------------------------+-----------------+---------------+
| 28a2ecb4-1b87-4e39-99ff-d289533adb14 |         0 |         0 | Central Harlem North             | Inwood                        |               4 |            15 |
| 1095f0ee-9ece-4313-be40-33bdf7831069 |         6 |         0 | DUMBO/Vinegar Hill               | Union Sq                      |               1 |            13 |
| 6fb0db6b-59c5-40a7-87d3-dc48944f64a4 |         0 |         0 | East Williamsburg                | Williamsburg (South Side)     |               1 |             6 |
| 5

Waiting on bqjob_r23da41b0c5ec5275_0000016e5c560e4a_1 ... (67s) Current status: DONE   

In [19]:
%%bash
# Export the validation table to GCS as gzip-compressed, header-less,
# comma-delimited CSV shards under gs://$BUCKET/eval/.
#
# The destination URI is double-quoted so the shell hands the "*" wildcard to
# bq untouched instead of attempting pathname expansion, and ${BUCKET:?...}
# aborts with a clear message when BUCKET was not exported (otherwise the URI
# would silently become gs:///eval/...).
# NOTE: the shards are gzip-compressed although their names end in ".csv".
bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
'event-driven-ml:new_york_taxi_trips.tlc_yellow_trips_2018_eval' \
"gs://${BUCKET:?BUCKET is not set - run the configuration cell first}/eval/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_r219e04bb237aec84_0000016e5c57202b_1 ... (49s) Current status: DONE   

## TABLE FOR LOLO

In [6]:
%%bash

# Build the table tlc_yellow_trips_2018_more_features (overwritten on each run
# because of --replace).
#
# Unlike the train/test/eval queries there is no hash-bucket filter here (it is
# commented out in the WHERE clause), so every trip that passes the cleaning
# filters is kept. The output has borough names instead of zone names, plus
# total_amount, trip_distance and the raw pickup/dropoff timestamps.
# NOTE(review): the section title says "LOLO" but the notebook does not explain
# the term -- confirm what this table is used for.
bq query \
--destination_table event-driven-ml:new_york_taxi_trips.tlc_yellow_trips_2018_more_features \
--replace \
--use_legacy_sql=false \
--allow_large_results \
'SELECT GENERATE_UUID() as uuid, dayofweek, hourofday, pickup_borough, dropoff_borough, trip_duration, total_amount, trip_distance, pickup_datetime, dropoff_datetime FROM
(SELECT
    pickup_datetime,
    dropoff_datetime,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) -1 AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    passenger_count,
    CAST(trip_distance AS FLOAT64) AS trip_distance,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id,
    CAST(rate_code AS INT64) AS rate_code,
    CAST(payment_type AS INT64) AS payment_type,
    CAST(fare_amount AS FLOAT64) AS fare_amount,
    CAST(tolls_amount AS FLOAT64) AS tolls_amount,
    CASE WHEN tolls_amount > 0.0 THEN 1.0 ELSE 0.0 END AS flag_tolls,
    CAST(total_amount AS FLOAT64) AS total_amount
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0.0
    AND trip_distance < 3000.0
    AND fare_amount >= 2.5
    AND fare_amount < 6000.0
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR from pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440 
    -- AND MOD(ABS(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING))), 10) < 8
) AS trips

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, zone_name, borough AS pickup_borough FROM 
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS pu_zones
ON trips.pickup_location_id = pu_zones.zone_id

INNER JOIN (SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, zone_name, borough AS dropoff_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS do_zones
ON trips.dropoff_location_id = do_zones.zone_id'


+--------------------------------------+-----------+-----------+----------------+-----------------+---------------+--------------------+---------------+---------------------+---------------------+
|                 uuid                 | dayofweek | hourofday | pickup_borough | dropoff_borough | trip_duration |    total_amount    | trip_distance |   pickup_datetime   |  dropoff_datetime   |
+--------------------------------------+-----------+-----------+----------------+-----------------+---------------+--------------------+---------------+---------------------+---------------------+
| 9775268b-e7ca-4a7e-8dad-97630e11a385 |         0 |         7 | Queens         | Brooklyn        |          1384 |              48.96 |         13.58 | 2018-07-15T07:55:55 | 2018-07-16T06:59:46 |
| a80e7c2d-221b-4ac9-9832-8968e90e983a |         0 |        21 | Manhattan      | Manhattan       |            24 |              29.15 |           6.8 | 2018-03-25T21:38:44 | 2018-03-25T22:02:11 |
| c6608b7a-71a

Waiting on bqjob_r17f3a0e4c4b4090f_0000016e4c3860d6_1 ... (196s) Current status: DONE   

## Read zone csv 

This gives a complete list of the zone names that are present in `event-driven-ml:new_york_taxi_trips.tlc_yellow_trips_2018_*`.

In [20]:
import pandas as pd

In [22]:
pwd

'/home/jupyter/event-driven-ml/edml-notebooks/gbi'

In [23]:
# Load the zone-name list from the CSV stored next to this notebook
# (presumably a BigQuery console export, judging by the "bq-results-" file name -- confirm).
# The path is relative to the kernel's working directory, i.e. the notebook
# folder, so the cell no longer depends on the /home/jupyter checkout location.
zones_df = pd.read_csv("bq-results-20191111-221646-lp2si3f88xlw.csv")

In [25]:
# Display every pickup zone name from the loaded CSV as a plain Python list.
zones_df["pickup_zone_name"].tolist()

['Allerton/Pelham Gardens',
 'Alphabet City',
 'Arden Heights',
 'Arrochar/Fort Wadsworth',
 'Astoria',
 'Astoria Park',
 'Auburndale',
 'Baisley Park',
 'Bath Beach',
 'Battery Park',
 'Battery Park City',
 'Bay Ridge',
 'Bay Terrace/Fort Totten',
 'Bayside',
 'Bedford',
 'Bedford Park',
 'Bellerose',
 'Belmont',
 'Bensonhurst East',
 'Bensonhurst West',
 'Bloomfield/Emerson Hill',
 'Bloomingdale',
 'Boerum Hill',
 'Borough Park',
 'Breezy Point/Fort Tilden/Riis Beach',
 'Briarwood/Jamaica Hills',
 'Brighton Beach',
 'Broad Channel',
 'Bronx Park',
 'Bronxdale',
 'Brooklyn Heights',
 'Brooklyn Navy Yard',
 'Brownsville',
 'Bushwick North',
 'Bushwick South',
 'Cambria Heights',
 'Canarsie',
 'Carroll Gardens',
 'Central Harlem',
 'Central Harlem North',
 'Central Park',
 'Charleston/Tottenville',
 'Chinatown',
 'City Island',
 'Claremont/Bathgate',
 'Clinton East',
 'Clinton Hill',
 'Clinton West',
 'Co-Op City',
 'Cobble Hill',
 'College Point',
 'Columbia Street',
 'Coney Island',
 

In [26]:
# Number of non-null entries in each column of zones_df.
zones_df.count()

pickup_zone_name    259
dtype: int64