In [4]:
from urllib.request import urlretrieve
import os

output_relative_dir = '../data/raw/'

# check if it exists as it makedir will raise an error if it does exist
if not os.path.exists(output_relative_dir):
    os.makedirs(output_relative_dir)

# create folders
target_dir = 'tlc_data/tlc_data_fhv'
if not os.path.exists(output_relative_dir + target_dir):
    os.makedirs(output_relative_dir + target_dir)

In [9]:
YEARS = ('2021', '2022')
MONTHS = range(1, 13)
# this is the URL template as of 07/2022
URL_TEMPLATE = "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_"#year-month.parquet

In [10]:
tlc_output_dir = output_relative_dir + 'tlc_data/tlc_data_fhv'
for year in YEARS:
    for month in MONTHS:
        # 0-fill i.e 1 -> 01, 2 -> 02, etc
        month = str(month).zfill(2)
        print(f"Begin month {month}")

        # generate url
        url = f'{URL_TEMPLATE}{year}-{month}.parquet'
        # generate output location and filename
        output_dir = f"{tlc_output_dir}/{year}-{month}.parquet"
        # download
        urlretrieve(url, output_dir)

        print(f"Completed month {month}")

Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03
Begin month 04
Completed month 04
Begin month 05
Completed month 05
Begin month 06
Completed month 06
Begin month 07
Completed month 07
Begin month 08
Completed month 08
Begin month 09
Completed month 09
Begin month 10
Completed month 10
Begin month 11
Completed month 11
Begin month 12
Completed month 12
Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03
Begin month 04
Completed month 04
Begin month 05


HTTPError: HTTP Error 403: Forbidden

In [1]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034_FHV")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

22/08/15 16:34:09 WARN Utils: Your hostname, DESKTOP-85B961I resolves to a loopback address: 127.0.1.1; using 172.17.43.250 instead (on interface eth0)
22/08/15 16:34:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/15 16:34:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
sdf = spark.read.parquet('../data/raw/tlc_data/tlc_data_fhv')
sdf.show(1, vertical=True, truncate=100)




-RECORD 0-----------------------------------
 hvfhs_license_num    | HV0003              
 dispatching_base_num | B02682              
 originating_base_num | B02682              
 request_datetime     | 2021-01-01 11:28:09 
 on_scene_datetime    | 2021-01-01 11:31:42 
 pickup_datetime      | 2021-01-01 11:33:44 
 dropoff_datetime     | 2021-01-01 11:49:07 
 PULocationID         | 230                 
 DOLocationID         | 166                 
 trip_miles           | 5.26                
 trip_time            | 923                 
 base_passenger_fare  | 22.28               
 tolls                | 0.0                 
 bcf                  | 0.67                
 sales_tax            | 1.98                
 congestion_surcharge | 2.75                
 airport_fee          | null                
 tips                 | 0.0                 
 driver_pay           | 14.99               
 shared_request_flag  | N                   
 shared_match_flag    | N                   
 access_a_

                                                                                

In [2]:
sdf.count() # 2.4m samples

NameError: name 'sdf' is not defined

In [5]:
# subsample step for preliminary anylsis
sdf = sdf.sample(fraction=0.1, seed=0)

In [6]:
sdf.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp (nullable = true)
 |-- on_scene_datetime: timestamp (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_ride_flag: string (nul

Useful features:
* hvfhs_license_num
* pickup_datetime
* dropoff_datetime
* request_datetime
* on_scene_datetime
* PULocationID
* DOLocationID
* trip_miles
* trip_time
* base_passenger_fare
* tolls
* bcf
* sales_tax
* congestion_surcharge
* airport_fee (change to a boolean)
* tips
* driver_pay
* shared_request_flag
* shared_match_flag
* access_a_ride_flag (inspect this)
* wav_request_flag (might remove this)
* wav_match_flag (and this)

In [7]:
from pyspark.sql import functions as F

In [31]:
# Cast to appropriate types
for field in ("PULocationID", "DOLocationID"):
    sdf = sdf.withColumn(
        field,
        F.col(field).cast('INT')
    )


# Cast to boolean ->
for field in ("shared_request_flag", "access_a_ride_flag", "wav_request_flag", "wav_match_flag"):
    sdf = sdf.withColumn(
        field,
        (F.col(field) == 'Y').cast('BOOLEAN')
    )

sdf = sdf.withColumn(
    'access_a_ride_flag',
    (F.col("access_a_ride_flag") == 'Y').cast('BOOLEAN')
)

In [52]:

for x in sdf.schema:
    print(x)

StructField('hvfhs_license_num', StringType(), True)
StructField('dispatching_base_num', StringType(), True)
StructField('originating_base_num', StringType(), True)
StructField('request_datetime', TimestampType(), True)
StructField('on_scene_datetime', TimestampType(), True)
StructField('pickup_datetime', TimestampType(), True)
StructField('dropoff_datetime', TimestampType(), True)
StructField('PULocationID', IntegerType(), True)
StructField('DOLocationID', IntegerType(), True)
StructField('trip_miles', DoubleType(), True)
StructField('trip_time', LongType(), True)
StructField('base_passenger_fare', DoubleType(), True)
StructField('tolls', DoubleType(), True)
StructField('bcf', DoubleType(), True)
StructField('sales_tax', DoubleType(), True)
StructField('congestion_surcharge', DoubleType(), True)
StructField('airport_fee', DoubleType(), True)
StructField('tips', DoubleType(), True)
StructField('driver_pay', DoubleType(), True)
StructField('shared_request_flag', BooleanType(), True)
Str

In [32]:
# remove records with this
sdf.groupBy('shared_request_flag').count().show()



+-------------------+------+
|shared_request_flag| count|
+-------------------+------+
|               true|   272|
|              false|240538|
+-------------------+------+



                                                                                

In [33]:
sdf.groupBy('access_a_ride_flag').count().show()



+------------------+------+
|access_a_ride_flag| count|
+------------------+------+
|              true|173812|
|             false| 66998|
+------------------+------+



                                                                                

In [14]:
# remove records with this
sdf.groupBy('wav_request_flag').count().show()



+----------------+------+
|wav_request_flag| count|
+----------------+------+
|               Y|   306|
|               N|240504|
+----------------+------+



                                                                                

In [39]:
# Not needed
# HV0002: Juno
# HV0003: Uber
# HV0004: Via
# HV0005: Lyft
sdf.groupBy('hvfhs_license_num').count().show()



+-----------------+------+
|hvfhs_license_num| count|
+-----------------+------+
|           HV0004|   892|
|           HV0005| 66106|
|           HV0003|173812|
+-----------------+------+



                                                                                

In [18]:
sdf.describe("trip_miles", "trip_time", "base_passenger_fare", "tolls", "bcf", "sales_tax", "tips", "driver_pay").show()



+-------+-----------------+-----------------+-------------------+------------------+------------------+------------------+------------------+------------------+
|summary|       trip_miles|        trip_time|base_passenger_fare|             tolls|               bcf|         sales_tax|              tips|        driver_pay|
+-------+-----------------+-----------------+-------------------+------------------+------------------+------------------+------------------+------------------+
|  count|           240810|           240810|             240810|            240810|            240810|            240810|            240810|            240810|
|   mean|4.842819193555083|1102.628262945891| 21.978488517918468|0.9759024957435193| 0.687381337984299|1.8801823844524836|0.8756362692579213| 17.60095863959137|
| stddev|5.625440051263941|775.0215288658828| 17.554128839895313| 3.562595956684641|0.5955054534976646|1.4977553101442909| 2.730340127146867|14.003614560919987|
|    min|              0.0|       

                                                                                

Flags
* trip_miles > 0
* 3*60*60 (3 hours) > trip_time > 0
* base_passenger_far > 0 (inspect this?)
* driver_pay > 0
* tolls < 20 seemed reasonable? more likely to change to 2 std deviations


* Shared rides removed
* Invalid PULocationID's removed
* Invalid DOLocationID's removed
* wheelchair requested rides removed

In [20]:
sdf.count()

                                                                                

240810

In [35]:
sdf = sdf.withColumn(
    'is_valid_record',
    F.when(
        (F.col('trip_miles') > 0)
        & (F.col('trip_time') > 0)
        & (F.col('trip_time') < 3*60*60)
        & (F.col('base_passenger_fare') > 0)
        & (F.col('tolls') < 20)
        & (F.col('PULocationID') > 0)
        & (F.col('PULocationID') < 264)
        & (F.col('DOLocationID') > 0)
        & (F.col('DOLocationID') < 264)
        & (F.col('shared_request_flag') == False)
        & (F.col('shared_match_flag') == False)
        & (F.col('wav_request_flag') == False)
        & (F.col('driver_pay') > 0),
        True
    ).otherwise(False)
)

In [36]:
sdf.groupBy('is_valid_record').count().show()



+---------------+------+
|is_valid_record| count|
+---------------+------+
|           true|228457|
|          false| 12353|
+---------------+------+



                                                                                

Drop invalid records

In [40]:
sdf = sdf.filter(sdf.is_valid_record == True)

In [41]:
sdf.count()

                                                                                

228457

In [80]:
# export data -> for analysis
export_relative_dir = '../data/curated/'

sdf.write.parquet(export_relative_dir + "fhv_cleaned.parquet")



                                                                                