# Read data from BigQuery and write it to GCS

The original data comes from a public BigQuery dataset: `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`.

* create a query that cleans the data and keeps only the columns I need
* write the query results to tables in a BigQuery dataset within the `event-driven-ml` project
* save each table as partitioned (sharded) CSV files in GCS (I don't know if I need this step yet)

In [1]:
from google.cloud import bigquery as bq

In [2]:
# BigQuery Python client, constructed with no explicit arguments.
# NOTE(review): none of the cells visible here reference bq_client (they shell out
# to the `bq` CLI instead) — confirm it is still needed.
bq_client = bq.Client()

In [3]:
import os
# Settings shared by the cells below; edit these for your own environment.
PROJECT = 'event-driven-ml'      # GCP project id, passed to `gcloud config set project`
BUCKET = 'edml/data/taxi-trips'  # expanded as gs://$BUCKET/<split>/... by the extract cells
REGION = 'eu'                    # region setting; Cloud MLE regions: https://cloud.google.com/ml-engine/docs/regions

# Expose the settings to %%bash cells, which read them as $PROJECT, $BUCKET and $REGION.
os.environ.update(PROJECT=PROJECT, BUCKET=BUCKET, REGION=REGION)

In [4]:
%%bash
# Point the gcloud CLI at the project exported as $PROJECT by the config cell.
# ${PROJECT:?...} aborts with a clear message when the variable is unset or empty,
# and the quotes keep the value as a single argument.
gcloud config set project "${PROJECT:?PROJECT is not set - run the config cell first}"

Updated property [core/project].


## TRAINING SET

In [12]:
%%bash

# Build the TRAINING table: filtered 2018 yellow-taxi trips joined to the taxi-zone
# table, keeping dayofweek, hourofday, pickup_borough, dropoff_borough and
# trip_duration (minutes between pickup and dropoff).
# Split rule: rows whose MOD(ABS(FARM_FINGERPRINT(pickup_datetime string)), 10) is
# below 8 land here; the test and validation cells take the values 8 and 9.
# The destination project is read from $PROJECT (exported by the config cell) so it
# follows that setting instead of a hard-coded project id. --replace overwrites the
# table on every run.
# --allow_large_results is omitted: it only applies to legacy SQL, and this query
# runs with --use_legacy_sql=false.
# NOTE(review): assumes the dataset `new_york_taxi_trips` already exists in the
# target project — confirm, or create it first.
bq query \
--destination_table "${PROJECT:?PROJECT is not set}:new_york_taxi_trips.tlc_yellow_trips_2018_train" \
--replace \
--use_legacy_sql=false \
'SELECT dayofweek, hourofday, pickup_borough, dropoff_borough, trip_duration FROM
(SELECT
    pickup_datetime,
    dropoff_datetime,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    passenger_count,
    CAST(trip_distance AS FLOAT64) AS trip_distance,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id,
    CAST(rate_code AS INT64) AS rate_code,
    CAST(payment_type AS INT64) AS payment_type,
    CAST(fare_amount AS FLOAT64) AS fare_amount,
    CAST(tolls_amount AS FLOAT64) AS tolls_amount,
    CASE WHEN tolls_amount > 0.0 THEN 1.0 ELSE 0.0 END AS flag_tolls,
    CAST(total_amount AS FLOAT64) AS total_amount
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0.0
    AND trip_distance < 3000.0
    AND fare_amount >= 2.5
    AND fare_amount < 6000.0
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR from pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440 
    AND MOD(ABS(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING))), 10) < 8
) AS trips

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, zone_name, borough AS pickup_borough FROM 
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS pu_zones
ON trips.pickup_location_id = pu_zones.zone_id

INNER JOIN (SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, zone_name, borough AS dropoff_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS do_zones
ON trips.dropoff_location_id = do_zones.zone_id'


+-----------+-----------+---------------+--------------------+---------------------+
| dayofweek | hourofday | trip_duration | pickup_location_id | dropoff_location_id |
+-----------+-----------+---------------+--------------------+---------------------+
|         7 |         0 |            17 |                 80 |                  89 |
|         7 |         0 |            18 |                 41 |                  45 |
|         1 |         0 |            23 |                 88 |                 163 |
|         2 |         0 |            13 |                 25 |                 133 |
|         7 |         0 |            28 |                 88 |                  67 |
|         2 |         0 |            30 |                 49 |                 112 |
|         7 |         0 |            27 |                  7 |                 216 |
|         7 |         0 |            33 |                193 |                 225 |
|         7 |         0 |            40 |                 74 |  

Waiting on bqjob_r19971faa0b3d2781_0000016ca4711671_1 ... (66s) Current status: DONE   

In [19]:
%%bash

# Export the training table to GCS as gzip-compressed, comma-delimited CSV files
# without a header row.
# The table's project is read from $PROJECT (exported by the config cell) instead of
# a hard-coded project id, and the destination URI is quoted so the shell passes
# the `*` wildcard to bq untouched rather than attempting pathname expansion.
# NOTE(review): the files are gzip-compressed but keep a plain .csv suffix; the
# name is left unchanged in case downstream readers depend on it.
bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
"${PROJECT:?PROJECT is not set}:new_york_taxi_trips.tlc_yellow_trips_2018_train" \
"gs://${BUCKET:?BUCKET is not set}/train/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_r1bd025f531ecdec3_0000016ca4a80698_1 ... (49s) Current status: DONE   

## TEST SET

In [14]:
%%bash

# Build the TEST table: filtered 2018 yellow-taxi trips joined to the taxi-zone
# table, keeping dayofweek, hourofday, pickup_borough, dropoff_borough and
# trip_duration (minutes between pickup and dropoff).
# Split rule: rows whose MOD(ABS(FARM_FINGERPRINT(pickup_datetime string)), 10)
# equals 8 land here; the training cell takes values below 8 and the validation
# cell takes 9.
# The destination project is read from $PROJECT (exported by the config cell) so it
# follows that setting instead of a hard-coded project id. --replace overwrites the
# table on every run.
# --allow_large_results is omitted: it only applies to legacy SQL, and this query
# runs with --use_legacy_sql=false.
# NOTE(review): assumes the dataset `new_york_taxi_trips` already exists in the
# target project — confirm, or create it first.
bq query \
--destination_table "${PROJECT:?PROJECT is not set}:new_york_taxi_trips.tlc_yellow_trips_2018_test" \
--replace \
--use_legacy_sql=false \
'SELECT dayofweek, hourofday, pickup_borough, dropoff_borough, trip_duration FROM
(SELECT
    pickup_datetime,
    dropoff_datetime,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    passenger_count,
    CAST(trip_distance AS FLOAT64) AS trip_distance,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id,
    CAST(rate_code AS INT64) AS rate_code,
    CAST(payment_type AS INT64) AS payment_type,
    CAST(fare_amount AS FLOAT64) AS fare_amount,
    CAST(tolls_amount AS FLOAT64) AS tolls_amount,
    CASE WHEN tolls_amount > 0.0 THEN 1.0 ELSE 0.0 END AS flag_tolls,
    CAST(total_amount AS FLOAT64) AS total_amount
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0.0
    AND trip_distance < 3000.0
    AND fare_amount >= 2.5
    AND fare_amount < 6000.0
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR from pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440 
    AND MOD(ABS(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING))), 10) = 8
) AS trips

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, zone_name, borough AS pickup_borough FROM 
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS pu_zones
ON trips.pickup_location_id = pu_zones.zone_id

INNER JOIN (SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, zone_name, borough AS dropoff_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS do_zones
ON trips.dropoff_location_id = do_zones.zone_id'


+-----------+-----------+---------------+--------------------+---------------------+
| dayofweek | hourofday | trip_duration | pickup_location_id | dropoff_location_id |
+-----------+-----------+---------------+--------------------+---------------------+
|         7 |         0 |             2 |                255 |                 255 |
|         2 |         0 |            26 |                  7 |                 102 |
|         5 |         0 |            15 |                 26 |                  14 |
|         5 |         0 |            11 |                167 |                 159 |
|         5 |         0 |             6 |                116 |                  42 |
|         6 |         0 |             8 |                 12 |                  87 |
|         7 |         0 |            12 |                224 |                  79 |
|         6 |         0 |             4 |                209 |                  45 |
|         1 |         0 |            24 |                181 |  

Waiting on bqjob_r109edc43619f815b_0000016ca4741131_1 ... (49s) Current status: DONE   

In [20]:
%%bash

# Export the test table to GCS as gzip-compressed, comma-delimited CSV files
# without a header row.
# The table's project is read from $PROJECT (exported by the config cell) instead of
# a hard-coded project id, and the destination URI is quoted so the shell passes
# the `*` wildcard to bq untouched rather than attempting pathname expansion.
# NOTE(review): the files are gzip-compressed but keep a plain .csv suffix; the
# name is left unchanged in case downstream readers depend on it.
bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
"${PROJECT:?PROJECT is not set}:new_york_taxi_trips.tlc_yellow_trips_2018_test" \
"gs://${BUCKET:?BUCKET is not set}/test/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_r20f6ded4de78eb14_0000016ca4a9582b_1 ... (35s) Current status: DONE   

## VALIDATION SET

In [16]:
%%bash

# Build the VALIDATION table: filtered 2018 yellow-taxi trips joined to the
# taxi-zone table, keeping dayofweek, hourofday, pickup_borough, dropoff_borough
# and trip_duration (minutes between pickup and dropoff).
# Split rule: rows whose MOD(ABS(FARM_FINGERPRINT(pickup_datetime string)), 10)
# equals 9 land here; the training cell takes values below 8 and the test cell
# takes 8.
# The destination project is read from $PROJECT (exported by the config cell) so it
# follows that setting instead of a hard-coded project id. --replace overwrites the
# table on every run.
# --allow_large_results is omitted: it only applies to legacy SQL, and this query
# runs with --use_legacy_sql=false.
# NOTE(review): assumes the dataset `new_york_taxi_trips` already exists in the
# target project — confirm, or create it first.
bq query \
--destination_table "${PROJECT:?PROJECT is not set}:new_york_taxi_trips.tlc_yellow_trips_2018_val" \
--replace \
--use_legacy_sql=false \
'SELECT dayofweek, hourofday, pickup_borough, dropoff_borough, trip_duration FROM
(SELECT
    pickup_datetime,
    dropoff_datetime,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    passenger_count,
    CAST(trip_distance AS FLOAT64) AS trip_distance,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id,
    CAST(rate_code AS INT64) AS rate_code,
    CAST(payment_type AS INT64) AS payment_type,
    CAST(fare_amount AS FLOAT64) AS fare_amount,
    CAST(tolls_amount AS FLOAT64) AS tolls_amount,
    CASE WHEN tolls_amount > 0.0 THEN 1.0 ELSE 0.0 END AS flag_tolls,
    CAST(total_amount AS FLOAT64) AS total_amount
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0.0
    AND trip_distance < 3000.0
    AND fare_amount >= 2.5
    AND fare_amount < 6000.0
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR from pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440 
    AND MOD(ABS(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING))), 10) = 9
) AS trips

INNER JOIN
(SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, zone_name, borough AS pickup_borough FROM 
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS pu_zones
ON trips.pickup_location_id = pu_zones.zone_id

INNER JOIN (SELECT DISTINCT CAST(zone_id AS INT64) AS zone_id, zone_name, borough AS dropoff_borough FROM
 `bigquery-public-data.new_york_taxi_trips.taxi_zone_geom`) AS do_zones
ON trips.dropoff_location_id = do_zones.zone_id'


+-----------+-----------+---------------+--------------------+---------------------+
| dayofweek | hourofday | trip_duration | pickup_location_id | dropoff_location_id |
+-----------+-----------+---------------+--------------------+---------------------+
|         7 |         0 |            20 |                146 |                 209 |
|         7 |         0 |            29 |                 97 |                  90 |
|         1 |         0 |            23 |                 42 |                 119 |
|         4 |         0 |             9 |                 88 |                 144 |
|         7 |         0 |            10 |                 80 |                 217 |
|         2 |         0 |            15 |                181 |                 228 |
|         2 |         0 |             9 |                145 |                 112 |
|         4 |         0 |             1 |                 74 |                  75 |
|         2 |         0 |            15 |                181 |  

Waiting on bqjob_r272007d72ed7f4a6_0000016ca475deed_1 ... (49s) Current status: DONE   

In [21]:
%%bash

# Export the validation table to GCS as gzip-compressed, comma-delimited CSV files
# without a header row.
# The table's project is read from $PROJECT (exported by the config cell) instead of
# a hard-coded project id, and the destination URI is quoted so the shell passes
# the `*` wildcard to bq untouched rather than attempting pathname expansion.
# NOTE(review): the files are gzip-compressed but keep a plain .csv suffix; the
# name is left unchanged in case downstream readers depend on it.
bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
"${PROJECT:?PROJECT is not set}:new_york_taxi_trips.tlc_yellow_trips_2018_val" \
"gs://${BUCKET:?BUCKET is not set}/val/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_r64d94997d5243573_0000016ca4a9eb33_1 ... (24s) Current status: DONE   

In [5]:
# NOTE(review): prints a fixed string and nothing else; looks like a leftover
# scratch/smoke-test cell — confirm it can be deleted.
print("ciao")

ciao
