# Read data from BigQuery and write it to GCS

The original data comes from a table in the BigQuery public datasets: `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`.

* Create a query to clean the data and keep only the columns I need
* Create a BigQuery dataset within the `event-driven-ml` project
* Save the dataset as partitioned CSV files in GCS (I don't know if I need this step yet)

In [1]:
from google.cloud import bigquery as bq

In [2]:
# Create a BigQuery client using the constructor's default arguments.
# NOTE(review): bq_client is not referenced by any other cell visible in this
# notebook; the queries and exports below go through the `bq` command-line tool.
bq_client = bq.Client()

In [3]:
import os

# Google Cloud project selected by `gcloud config set project` in the next cell.
# CHANGE THIS when running against another project.
PROJECT = 'event-driven-ml'

# Cloud Storage destination, interpolated as gs://$BUCKET/... by the export cells.
# The value is a bucket name followed by an object prefix, not a bare bucket name.
# REPLACE WITH YOUR OWN BUCKET (and prefix).
BUCKET = 'edml/data/taxi-trips'

# Location setting; see https://cloud.google.com/ml-engine/docs/regions for Cloud MLE regions.
# NOTE(review): REGION is not read by any cell visible in this notebook, and 'eu'
# looks like a multi-region location rather than a single region - confirm before reuse.
REGION = 'eu'

# Export the settings so the %%bash cells can read them as $PROJECT, $BUCKET and $REGION.
os.environ.update(PROJECT=PROJECT, BUCKET=BUCKET, REGION=REGION)

In [4]:
%%bash
# Point the gcloud CLI at the project exported by the configuration cell.
# The expansion is quoted so the value is passed as a single argument, and
# ${VAR:?} aborts with a clear message when the configuration cell has not
# been run in this kernel.
gcloud config set project "${PROJECT:?PROJECT is not set - run the configuration cell first}"

Updated property [core/project].


## TRAINING SET

In [10]:
%%bash
# Build the TRAINING split and store it in the BigQuery table
# ${PROJECT}:new_york_taxi_trips.tlc_yellow_trips_2018_train
# (--replace overwrites the table if it already exists).
#
# Output columns: uuid, dayofweek, hourofday, pickup_borough, dropoff_borough,
# trip_duration (minutes).
#
# Query notes:
#   * dayofweek is EXTRACT(DAYOFWEEK ...) - 1. BigQuery numbers days 1-7 starting
#     on Sunday, so the stored value is 0-6 with Sunday = 0.
#   * Rows are kept only when trip_distance is in (0, 3000), fare_amount is in
#     [2.5, 6000), total_amount is in (0, 6000), passenger_count > 0, the pickup
#     year is 2018 and the duration is strictly between 0 and 1440 minutes.
#   * Split: a hash of the pickup timestamp is reduced to a bucket 0-9; buckets
#     0-7 form the training set (the test and validation cells use 8 and 9).
#     ABS is applied AFTER MOD so that it can never overflow on the smallest
#     INT64 fingerprint; bucket assignment is otherwise unchanged.
#   * The zone lookups are de-duplicated on (zone_id, borough) only, so a zone id
#     that appears more than once in taxi_zone_geom cannot duplicate trip rows
#     through differences in a column the query never reads.
#   * Only the columns consumed by the outer SELECT and the joins are projected.
#
# The destination project comes from $PROJECT (exported by the configuration cell).
: "${PROJECT:?PROJECT is not set - run the configuration cell first}"

bq query \
--destination_table "${PROJECT}:new_york_taxi_trips.tlc_yellow_trips_2018_train" \
--replace \
--use_legacy_sql=false \
--allow_large_results \
'SELECT
    GENERATE_UUID() AS uuid,
    trips.dayofweek,
    trips.hourofday,
    pu_zones.pickup_borough,
    do_zones.dropoff_borough,
    trips.trip_duration
FROM
(SELECT
    EXTRACT(DAYOFWEEK FROM pickup_datetime) - 1 AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0.0
    AND trip_distance < 3000.0
    AND fare_amount >= 2.5
    AND fare_amount < 6000.0
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR FROM pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440
    AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), 10)) < 8
) AS trips

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, borough AS pickup_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS pu_zones
ON trips.pickup_location_id = pu_zones.zone_id

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, borough AS dropoff_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS do_zones
ON trips.dropoff_location_id = do_zones.zone_id'


+--------------------------------------+-----------+-----------+----------------+-----------------+---------------+
|                 uuid                 | dayofweek | hourofday | pickup_borough | dropoff_borough | trip_duration |
+--------------------------------------+-----------+-----------+----------------+-----------------+---------------+
| 3997a3fc-b655-4557-bf5e-525fe965e739 |         6 |         1 | Manhattan      | Brooklyn        |            31 |
| bf8b465f-c2df-4e2a-920e-50e7ca2509ac |         3 |        13 | Manhattan      | Brooklyn        |            37 |
| 63fae36d-7d79-4447-aa30-76973e0fb3a6 |         6 |        20 | Manhattan      | Brooklyn        |            29 |
| 42cc9786-6428-4d15-8346-d8d32f2eb207 |         3 |        18 | Queens         | Manhattan       |            54 |
| b81834ae-e662-42df-ad70-dcf14dc97946 |         1 |         9 | Manhattan      | Queens          |            49 |
| 9737eb18-9fe9-4a82-bb8c-6a5735c43f2e |         5 |        16 | Manhat

Waiting on bqjob_r374923bca1d2487d_0000016e41eab3e4_1 ... (87s) Current status: DONE   

In [11]:
%%bash
# Export the training table to Cloud Storage as comma-delimited, headerless,
# gzip-compressed CSV shards under gs://$BUCKET/train/.
#
# The "*" in the object name is a BigQuery export wildcard, not a shell glob,
# so the URI is quoted to keep the shell from attempting pathname expansion.
# The source project comes from $PROJECT instead of being hard-coded.
#
# NOTE(review): the objects are gzip-compressed but keep a plain .csv suffix;
# readers that infer compression from the extension may need it set explicitly.
# Confirm with downstream consumers before renaming.
: "${PROJECT:?PROJECT is not set - run the configuration cell first}"
: "${BUCKET:?BUCKET is not set - run the configuration cell first}"

bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
"${PROJECT}:new_york_taxi_trips.tlc_yellow_trips_2018_train" \
"gs://${BUCKET}/train/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_r1a6e0c4a34fe586b_0000016e41ece2b4_1 ... (67s) Current status: DONE   

## TEST SET

In [12]:
%%bash
# Build the TEST split and store it in the BigQuery table
# ${PROJECT}:new_york_taxi_trips.tlc_yellow_trips_2018_test
# (--replace overwrites the table if it already exists).
#
# Output columns: uuid, dayofweek, hourofday, pickup_borough, dropoff_borough,
# trip_duration (minutes).
#
# Query notes:
#   * dayofweek is EXTRACT(DAYOFWEEK ...) - 1. BigQuery numbers days 1-7 starting
#     on Sunday, so the stored value is 0-6 with Sunday = 0.
#   * Rows are kept only when trip_distance is in (0, 3000), fare_amount is in
#     [2.5, 6000), total_amount is in (0, 6000), passenger_count > 0, the pickup
#     year is 2018 and the duration is strictly between 0 and 1440 minutes.
#   * Split: a hash of the pickup timestamp is reduced to a bucket 0-9; bucket 8
#     forms the test set (training uses buckets 0-7, validation uses 9).
#     ABS is applied AFTER MOD so that it can never overflow on the smallest
#     INT64 fingerprint; bucket assignment is otherwise unchanged.
#   * The zone lookups are de-duplicated on (zone_id, borough) only, so a zone id
#     that appears more than once in taxi_zone_geom cannot duplicate trip rows
#     through differences in a column the query never reads.
#   * Only the columns consumed by the outer SELECT and the joins are projected.
#
# The destination project comes from $PROJECT (exported by the configuration cell).
: "${PROJECT:?PROJECT is not set - run the configuration cell first}"

bq query \
--destination_table "${PROJECT}:new_york_taxi_trips.tlc_yellow_trips_2018_test" \
--replace \
--use_legacy_sql=false \
--allow_large_results \
'SELECT
    GENERATE_UUID() AS uuid,
    trips.dayofweek,
    trips.hourofday,
    pu_zones.pickup_borough,
    do_zones.dropoff_borough,
    trips.trip_duration
FROM
(SELECT
    EXTRACT(DAYOFWEEK FROM pickup_datetime) - 1 AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0.0
    AND trip_distance < 3000.0
    AND fare_amount >= 2.5
    AND fare_amount < 6000.0
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR FROM pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440
    AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), 10)) = 8
) AS trips

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, borough AS pickup_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS pu_zones
ON trips.pickup_location_id = pu_zones.zone_id

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, borough AS dropoff_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS do_zones
ON trips.dropoff_location_id = do_zones.zone_id'


+--------------------------------------+-----------+-----------+----------------+-----------------+---------------+
|                 uuid                 | dayofweek | hourofday | pickup_borough | dropoff_borough | trip_duration |
+--------------------------------------+-----------+-----------+----------------+-----------------+---------------+
| 97a55009-d21b-413a-bfc2-99bd69a9f117 |         5 |         1 | Queens         | Queens          |            25 |
| 04f5b556-1f4a-4582-8cbc-aa46dfcc1e96 |         1 |        19 | Queens         | Manhattan       |            50 |
| 807b53c8-2f1e-4589-a549-2ff1be66922f |         3 |        13 | Manhattan      | Manhattan       |            20 |
| 16f5bf7d-e9d0-40f0-a59e-dfd1c84aaf6d |         5 |         5 | Manhattan      | Queens          |            20 |
| b00a8853-0846-4f09-97aa-5511883cbfc7 |         3 |        17 | Manhattan      | EWR             |            74 |
| 0cdb3806-195b-4d5a-864a-598470c4a4f4 |         2 |        21 | Manhat

Waiting on bqjob_r2d184e8cbc6b0af_0000016e41f0c816_1 ... (49s) Current status: DONE   

In [13]:
%%bash
# Export the test table to Cloud Storage as comma-delimited, headerless,
# gzip-compressed CSV shards under gs://$BUCKET/test/.
#
# The "*" in the object name is a BigQuery export wildcard, not a shell glob,
# so the URI is quoted to keep the shell from attempting pathname expansion.
# The source project comes from $PROJECT instead of being hard-coded.
#
# NOTE(review): the objects are gzip-compressed but keep a plain .csv suffix;
# readers that infer compression from the extension may need it set explicitly.
# Confirm with downstream consumers before renaming.
: "${PROJECT:?PROJECT is not set - run the configuration cell first}"
: "${BUCKET:?BUCKET is not set - run the configuration cell first}"

bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
"${PROJECT}:new_york_taxi_trips.tlc_yellow_trips_2018_test" \
"gs://${BUCKET}/test/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_r681120a9dd727d8e_0000016e41f27f96_1 ... (66s) Current status: DONE   

## VALIDATION SET

In [8]:
%%bash
# Build the VALIDATION split and store it in the BigQuery table
# ${PROJECT}:new_york_taxi_trips.tlc_yellow_trips_2018_eval
# (--replace overwrites the table if it already exists).
#
# Output columns: uuid, dayofweek, hourofday, pickup_borough, dropoff_borough,
# trip_duration (minutes).
#
# Query notes:
#   * dayofweek is EXTRACT(DAYOFWEEK ...) - 1. BigQuery numbers days 1-7 starting
#     on Sunday, so the stored value is 0-6 with Sunday = 0.
#   * Rows are kept only when trip_distance is in (0, 3000), fare_amount is in
#     [2.5, 6000), total_amount is in (0, 6000), passenger_count > 0, the pickup
#     year is 2018 and the duration is strictly between 0 and 1440 minutes.
#   * Split: a hash of the pickup timestamp is reduced to a bucket 0-9; bucket 9
#     forms the validation set (training uses buckets 0-7, test uses 8).
#     ABS is applied AFTER MOD so that it can never overflow on the smallest
#     INT64 fingerprint; bucket assignment is otherwise unchanged.
#   * The zone lookups are de-duplicated on (zone_id, borough) only, so a zone id
#     that appears more than once in taxi_zone_geom cannot duplicate trip rows
#     through differences in a column the query never reads.
#   * Only the columns consumed by the outer SELECT and the joins are projected.
#
# The destination project comes from $PROJECT (exported by the configuration cell).
: "${PROJECT:?PROJECT is not set - run the configuration cell first}"

bq query \
--destination_table "${PROJECT}:new_york_taxi_trips.tlc_yellow_trips_2018_eval" \
--replace \
--use_legacy_sql=false \
--allow_large_results \
'SELECT
    GENERATE_UUID() AS uuid,
    trips.dayofweek,
    trips.hourofday,
    pu_zones.pickup_borough,
    do_zones.dropoff_borough,
    trips.trip_duration
FROM
(SELECT
    EXTRACT(DAYOFWEEK FROM pickup_datetime) - 1 AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0.0
    AND trip_distance < 3000.0
    AND fare_amount >= 2.5
    AND fare_amount < 6000.0
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR FROM pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440
    AND ABS(MOD(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING)), 10)) = 9
) AS trips

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, borough AS pickup_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS pu_zones
ON trips.pickup_location_id = pu_zones.zone_id

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, borough AS dropoff_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS do_zones
ON trips.dropoff_location_id = do_zones.zone_id'


+--------------------------------------+-----------+-----------+----------------+-----------------+---------------+
|                 uuid                 | dayofweek | hourofday | pickup_borough | dropoff_borough | trip_duration |
+--------------------------------------+-----------+-----------+----------------+-----------------+---------------+
| e1bdc6de-5ff2-447d-a42d-54e6a7e79929 |         2 |        14 | Manhattan      | Manhattan       |            36 |
| fedee430-20e8-48e9-b247-39b18a59f8b5 |         4 |        14 | Queens         | Brooklyn        |            46 |
| 63a7c7f5-1045-48be-8839-59fd060957b2 |         2 |         5 | Queens         | Queens          |            21 |
| 6b5c7beb-b71b-491e-8e31-0deda7e49bea |         4 |        14 | Manhattan      | Queens          |            38 |
| 7556bcc7-2f6f-4010-b650-0d189c9f2810 |         5 |        21 | Queens         | Manhattan       |            24 |
| 7efe8042-507e-4c18-b971-dabfef2c3b06 |         2 |        13 | Queens

Waiting on bqjob_r343f694b03424994_0000016e41e60915_1 ... (49s) Current status: DONE   

In [9]:
%%bash
# Export the validation table to Cloud Storage as comma-delimited, headerless,
# gzip-compressed CSV shards under gs://$BUCKET/eval/.
#
# The "*" in the object name is a BigQuery export wildcard, not a shell glob,
# so the URI is quoted to keep the shell from attempting pathname expansion.
# The source project comes from $PROJECT instead of being hard-coded.
#
# NOTE(review): the objects are gzip-compressed but keep a plain .csv suffix;
# readers that infer compression from the extension may need it set explicitly.
# Confirm with downstream consumers before renaming.
: "${PROJECT:?PROJECT is not set - run the configuration cell first}"
: "${BUCKET:?BUCKET is not set - run the configuration cell first}"

bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
"${PROJECT}:new_york_taxi_trips.tlc_yellow_trips_2018_eval" \
"gs://${BUCKET}/eval/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_reaabd14f49d98e5_0000016e41e8a770_1 ... (66s) Current status: DONE   