# Read data from BigQuery and write it to GCS

The original data comes from the BigQuery public dataset `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`.

* Create a query to clean the data and keep only the columns I need
* Create a BigQuery dataset within the `event-driven-ml` project
* Save the dataset as partitioned CSV files in GCS (I don't know if I need this step yet)

In [1]:
from google.cloud import bigquery as bq

In [2]:
# BigQuery client used for the exploratory query below. No project or
# credentials are passed here, so both come from whatever the environment
# provides (NOTE(review): confirm this resolves to the intended project).
bq_client = bq.Client()

In [5]:
# Exploratory preview query against the public 2018 yellow-taxi table.
# - Derives dayofweek / hourofday from pickup_datetime and trip_duration
#   (whole minutes between pickup and dropoff).
# - Casts distance, location ids, rate code, payment type and the money columns
#   to FLOAT64 / INT64, and adds flag_tolls (1.0 when tolls_amount > 0, else 0.0).
# - Filters out rows with non-positive or very large distance, fare, total or
#   duration, rows with no passengers, and pickups outside 2018.
# - LIMIT 10 keeps only a small sample; there is no ORDER BY, so which ten rows
#   come back is not fixed.
query = """
SELECT
    pickup_datetime,
    dropoff_datetime,
    EXTRACT(DAYOFWEEK FROM pickup_datetime) AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    passenger_count,
    CAST(trip_distance AS FLOAT64) AS trip_distance,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id,
    CAST(rate_code AS INT64) AS rate_code,
    CAST(payment_type AS INT64) AS payment_type,
    CAST(fare_amount AS FLOAT64) AS fare_amount,
    CAST(tolls_amount AS FLOAT64) AS tolls_amount,
    CASE WHEN tolls_amount > 0.0 THEN 1.0 ELSE 0.0 END AS flag_tolls,
    CAST(total_amount AS FLOAT64) AS total_amount
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0
    AND trip_distance < 3000
    AND fare_amount >= 2.5
    AND fare_amount < 6000.0
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR from pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440
    
LIMIT 10
"""

In [6]:
# Run the preview query and load its result into a DataFrame
# (at most 10 rows, because of the LIMIT 10 in `query`).
yellow_2018_df = bq_client.query(query).to_dataframe()

In [14]:
# Column names returned by the preview query.
yellow_2018_df.columns

Index(['pickup_datetime', 'dropoff_datetime', 'dayofweek', 'hourofday',
       'trip_duration', 'passenger_count', 'trip_distance',
       'pickup_location_id', 'dropoff_location_id', 'rate_code',
       'payment_type', 'fare_amount', 'tolls_amount', 'flag_tolls',
       'total_amount'],
      dtype='object')

In [7]:
# Check that the CASTs in the query produced the expected column dtypes.
yellow_2018_df.dtypes

pickup_datetime        datetime64[ns]
dropoff_datetime       datetime64[ns]
dayofweek                       int64
hourofday                       int64
trip_duration                   int64
passenger_count                 int64
trip_distance                 float64
pickup_location_id              int64
dropoff_location_id             int64
rate_code                       int64
payment_type                    int64
fare_amount                   float64
tolls_amount                  float64
flag_tolls                    float64
total_amount                  float64
dtype: object

In [8]:
# Eyeball the first rows of the sample.
yellow_2018_df.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,dayofweek,hourofday,trip_duration,passenger_count,trip_distance,pickup_location_id,dropoff_location_id,rate_code,payment_type,fare_amount,tolls_amount,flag_tolls,total_amount
0,2018-11-27 21:07:28,2018-11-27 21:35:20,3,21,28,1,10.69,138,114,1,1,32.0,5.76,1.0,46.87
1,2018-11-27 19:22:39,2018-11-27 19:50:49,3,19,28,1,10.55,138,231,1,1,33.5,0.0,0.0,42.36
2,2018-11-27 18:09:38,2018-11-27 18:37:17,3,18,28,1,9.53,138,229,1,1,28.5,5.76,1.0,43.27
3,2018-11-27 18:08:46,2018-11-27 18:57:39,3,18,49,1,10.56,138,50,1,1,39.5,5.76,1.0,54.12
4,2018-11-27 17:43:48,2018-11-27 18:27:00,3,17,44,4,6.8,65,230,1,1,33.0,0.0,0.0,41.75


In [9]:
# Summary statistics of the numeric columns (computed on the 10-row sample only).
yellow_2018_df.describe()

Unnamed: 0,dayofweek,hourofday,trip_duration,passenger_count,trip_distance,pickup_location_id,dropoff_location_id,rate_code,payment_type,fare_amount,tolls_amount,flag_tolls,total_amount
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,3.0,17.2,42.4,1.6,11.165,126.9,138.2,1.6,1.0,45.05,5.394,0.6,61.527
std,0.0,1.932184,14.615061,1.074968,2.512198,23.311657,95.827159,1.349897,0.0,19.417132,5.955092,0.516398,26.260521
min,3.0,15.0,28.0,1.0,6.8,65.0,1.0,1.0,1.0,28.5,0.0,0.0,41.75
25%,3.0,15.5,29.25,1.0,10.1,128.25,64.0,1.0,1.0,33.125,0.0,0.0,44.17
50%,3.0,17.0,41.0,1.0,10.625,138.0,161.5,1.0,1.0,38.75,5.76,1.0,50.94
75%,3.0,18.0,50.5,1.75,12.2175,138.0,224.5,1.0,1.0,46.25,5.76,1.0,61.55
max,3.0,21.0,74.0,4.0,16.2,138.0,231.0,5.0,1.0,90.0,15.5,1.0,115.0


In [3]:
import os

# Settings shared by the shell cells below.
PROJECT = 'event-driven-ml'  # CHANGE THIS to your own GCP project id
# REPLACE WITH YOUR BUCKET. The extract cells interpolate this value as
# gs://$BUCKET/<split>/..., so it may carry an object prefix after the bucket
# name (as it does here). Use a regional bucket in the region you selected.
BUCKET = 'edml/data/taxi-trips'
# Choose an available region for Cloud MLE from
# https://cloud.google.com/ml-engine/docs/regions.
# NOTE(review): 'eu' looks like a multi-region label - confirm it is accepted
# wherever REGION is consumed.
REGION = 'eu'

# Export the settings so that %%bash cells can read them as $PROJECT, $BUCKET, $REGION.
os.environ.update(PROJECT=PROJECT, BUCKET=BUCKET, REGION=REGION)

In [4]:
%%bash
# Point the gcloud CLI at the project chosen in the config cell.
# ${PROJECT:?...} aborts the cell with a clear message if PROJECT was never
# exported; the quotes keep the value as a single argument.
gcloud config set project "${PROJECT:?PROJECT is not set - run the config cell first}"

Updated property [core/project].


## TRAINING SET

In [12]:
%%bash
# Build the TRAINING table: rows whose pickup_datetime hashes into buckets 0-7
# (8 of the 10 FARM_FINGERPRINT buckets). The test and validation cells use
# buckets 8 and 9 with the same filters, so the three splits do not overlap.
#
# Only the five columns used downstream are selected; the preview query near
# the top of the notebook lists the other columns that are available.
#
# The destination project is taken from $PROJECT (exported by the config cell)
# so that changing PROJECT there really redirects the output. ${PROJECT:?...}
# aborts the cell if the variable is unset instead of building a malformed
# table id. --allow_large_results is not passed: it only applies to legacy SQL
# and this query runs with --use_legacy_sql=false.

bq query \
--destination_table "${PROJECT:?PROJECT is not set - run the config cell first}:new_york_taxi_trips.tlc_yellow_trips_2018_train" \
--replace \
--use_legacy_sql=false \
'SELECT
    EXTRACT(DAYOFWEEK FROM pickup_datetime) AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0
    AND trip_distance < 3000
    AND fare_amount >= 2.5
    AND fare_amount < 6000
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR from pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440
    AND MOD(ABS(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING))), 10) < 8'


+-----------+-----------+---------------+--------------------+---------------------+
| dayofweek | hourofday | trip_duration | pickup_location_id | dropoff_location_id |
+-----------+-----------+---------------+--------------------+---------------------+
|         7 |         0 |            17 |                 80 |                  89 |
|         7 |         0 |            18 |                 41 |                  45 |
|         1 |         0 |            23 |                 88 |                 163 |
|         2 |         0 |            13 |                 25 |                 133 |
|         7 |         0 |            28 |                 88 |                  67 |
|         2 |         0 |            30 |                 49 |                 112 |
|         7 |         0 |            27 |                  7 |                 216 |
|         7 |         0 |            33 |                193 |                 225 |
|         7 |         0 |            40 |                 74 |  

Waiting on bqjob_r19971faa0b3d2781_0000016ca4711671_1 ... (66s) Current status: DONE   

In [19]:
%%bash
# Export the training table to GCS as gzip-compressed, header-less CSV shards.
#
# The destination URI is quoted so the shell hands the `*` wildcard to bq
# untouched (bq substitutes a shard number for it) instead of attempting
# filename expansion, and so $BUCKET is never word-split. ${VAR:?...} aborts
# the cell if PROJECT or BUCKET was not exported by the config cell.
#
# NOTE(review): the objects are gzip-compressed but keep a plain .csv suffix.
# The name is left unchanged here in case downstream readers match on it.

bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
"${PROJECT:?PROJECT is not set - run the config cell first}:new_york_taxi_trips.tlc_yellow_trips_2018_train" \
"gs://${BUCKET:?BUCKET is not set - run the config cell first}/train/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_r1bd025f531ecdec3_0000016ca4a80698_1 ... (49s) Current status: DONE   

## TEST SET

In [14]:
%%bash
# Build the TEST table: rows whose pickup_datetime hashes into bucket 8 of the
# 10 FARM_FINGERPRINT buckets. The training cell uses buckets 0-7 and the
# validation cell bucket 9, with the same filters, so the splits do not overlap.
#
# Only the five columns used downstream are selected; the preview query near
# the top of the notebook lists the other columns that are available.
#
# The destination project is taken from $PROJECT (exported by the config cell)
# so that changing PROJECT there really redirects the output. ${PROJECT:?...}
# aborts the cell if the variable is unset instead of building a malformed
# table id. --allow_large_results is not passed: it only applies to legacy SQL
# and this query runs with --use_legacy_sql=false.

bq query \
--destination_table "${PROJECT:?PROJECT is not set - run the config cell first}:new_york_taxi_trips.tlc_yellow_trips_2018_test" \
--replace \
--use_legacy_sql=false \
'SELECT
    EXTRACT(DAYOFWEEK FROM pickup_datetime) AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0
    AND trip_distance < 3000
    AND fare_amount >= 2.5
    AND fare_amount < 6000
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR from pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440
    AND MOD(ABS(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING))), 10) = 8'


+-----------+-----------+---------------+--------------------+---------------------+
| dayofweek | hourofday | trip_duration | pickup_location_id | dropoff_location_id |
+-----------+-----------+---------------+--------------------+---------------------+
|         7 |         0 |             2 |                255 |                 255 |
|         2 |         0 |            26 |                  7 |                 102 |
|         5 |         0 |            15 |                 26 |                  14 |
|         5 |         0 |            11 |                167 |                 159 |
|         5 |         0 |             6 |                116 |                  42 |
|         6 |         0 |             8 |                 12 |                  87 |
|         7 |         0 |            12 |                224 |                  79 |
|         6 |         0 |             4 |                209 |                  45 |
|         1 |         0 |            24 |                181 |  

Waiting on bqjob_r109edc43619f815b_0000016ca4741131_1 ... (49s) Current status: DONE   

In [20]:
%%bash
# Export the test table to GCS as gzip-compressed, header-less CSV shards.
#
# The destination URI is quoted so the shell hands the `*` wildcard to bq
# untouched (bq substitutes a shard number for it) instead of attempting
# filename expansion, and so $BUCKET is never word-split. ${VAR:?...} aborts
# the cell if PROJECT or BUCKET was not exported by the config cell.
#
# NOTE(review): the objects are gzip-compressed but keep a plain .csv suffix.
# The name is left unchanged here in case downstream readers match on it.

bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
"${PROJECT:?PROJECT is not set - run the config cell first}:new_york_taxi_trips.tlc_yellow_trips_2018_test" \
"gs://${BUCKET:?BUCKET is not set - run the config cell first}/test/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_r20f6ded4de78eb14_0000016ca4a9582b_1 ... (35s) Current status: DONE   

## VALIDATION SET

In [16]:
%%bash
# Build the VALIDATION table: rows whose pickup_datetime hashes into bucket 9
# of the 10 FARM_FINGERPRINT buckets. The training cell uses buckets 0-7 and
# the test cell bucket 8, with the same filters, so the splits do not overlap.
#
# Only the five columns used downstream are selected; the preview query near
# the top of the notebook lists the other columns that are available.
#
# The destination project is taken from $PROJECT (exported by the config cell)
# so that changing PROJECT there really redirects the output. ${PROJECT:?...}
# aborts the cell if the variable is unset instead of building a malformed
# table id. --allow_large_results is not passed: it only applies to legacy SQL
# and this query runs with --use_legacy_sql=false.

bq query \
--destination_table "${PROJECT:?PROJECT is not set - run the config cell first}:new_york_taxi_trips.tlc_yellow_trips_2018_val" \
--replace \
--use_legacy_sql=false \
'SELECT
    EXTRACT(DAYOFWEEK FROM pickup_datetime) AS dayofweek,
    EXTRACT(HOUR FROM pickup_datetime) AS hourofday,
    DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) AS trip_duration,
    CAST(pickup_location_id AS INT64) AS pickup_location_id,
    CAST(dropoff_location_id AS INT64) AS dropoff_location_id
FROM
    `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018`
WHERE
    trip_distance > 0
    AND trip_distance < 3000
    AND fare_amount >= 2.5
    AND fare_amount < 6000
    AND total_amount > 0.0
    AND total_amount < 6000.0
    AND passenger_count > 0
    AND EXTRACT(YEAR from pickup_datetime) = 2018
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) > 0
    AND DATETIME_DIFF(dropoff_datetime, pickup_datetime, MINUTE) < 1440
    AND MOD(ABS(FARM_FINGERPRINT(CAST(pickup_datetime AS STRING))), 10) = 9'


+-----------+-----------+---------------+--------------------+---------------------+
| dayofweek | hourofday | trip_duration | pickup_location_id | dropoff_location_id |
+-----------+-----------+---------------+--------------------+---------------------+
|         7 |         0 |            20 |                146 |                 209 |
|         7 |         0 |            29 |                 97 |                  90 |
|         1 |         0 |            23 |                 42 |                 119 |
|         4 |         0 |             9 |                 88 |                 144 |
|         7 |         0 |            10 |                 80 |                 217 |
|         2 |         0 |            15 |                181 |                 228 |
|         2 |         0 |             9 |                145 |                 112 |
|         4 |         0 |             1 |                 74 |                  75 |
|         2 |         0 |            15 |                181 |  

Waiting on bqjob_r272007d72ed7f4a6_0000016ca475deed_1 ... (49s) Current status: DONE   

In [21]:
%%bash
# Export the validation table to GCS as gzip-compressed, header-less CSV shards.
#
# The destination URI is quoted so the shell hands the `*` wildcard to bq
# untouched (bq substitutes a shard number for it) instead of attempting
# filename expansion, and so $BUCKET is never word-split. ${VAR:?...} aborts
# the cell if PROJECT or BUCKET was not exported by the config cell.
#
# NOTE(review): the objects are gzip-compressed but keep a plain .csv suffix.
# The name is left unchanged here in case downstream readers match on it.

bq extract \
--destination_format CSV \
--compression GZIP \
--field_delimiter ',' \
--print_header=false \
"${PROJECT:?PROJECT is not set - run the config cell first}:new_york_taxi_trips.tlc_yellow_trips_2018_val" \
"gs://${BUCKET:?BUCKET is not set - run the config cell first}/val/tlc_yellow_trips_2018-000*.csv"




Waiting on bqjob_r64d94997d5243573_0000016ca4a9eb33_1 ... (24s) Current status: DONE   