In [19]:
import pyspark
from pyspark.sql import SparkSession
import warnings
from pyspark.sql.types import StructField, StructType, IntegerType, DateType, StringType, TimestampType, DoubleType, LongType
import pyspark.sql.functions as F
warnings.filterwarnings("ignore")

In [5]:
# Build a Spark session (or reuse the running one) with master "local[*]" and
# application name "intro", then display the Spark version.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("intro")
    .getOrCreate()
)
spark.version

'3.3.2'

In [8]:
# Load the raw trip file. Parquet stores its schema inside the file and has no
# header row, so the CSV-style "header" / "inferschema" settings are dropped:
# they had no effect on a Parquet read.
df = spark.read.parquet("./data/fhvhv_tripdata_2021-06.parquet")
df.show(5)



+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------------+-----------------+------------------+----------------+--------------+
|hvfhs_license_num|dispatching_base_num|originating_base_num|   request_datetime|  on_scene_datetime|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|
+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+--

                                                                                

In [14]:
# Explicit schema for the trip records: every field is declared nullable.
# NOTE(review): no reader in the visible cells is given this schema - confirm
# whether it is still needed.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("hvfhs_license_num", StringType()),
        ("dispatching_base_num", StringType()),
        ("originating_base_num", StringType()),
        ("request_datetime", TimestampType()),
        ("on_scene_datetime", TimestampType()),
        ("pickup_datetime", TimestampType()),
        ("dropoff_datetime", TimestampType()),
        ("PULocationID", LongType()),
        ("DOLocationID", LongType()),
        ("trip_miles", DoubleType()),
        ("trip_time", LongType()),
        ("base_passenger_fare", DoubleType()),
        ("tolls", DoubleType()),
        ("bcf", DoubleType()),
        ("sales_tax", DoubleType()),
        ("congestion_surcharge", DoubleType()),
        ("airport_fee", DoubleType()),
        ("tips", DoubleType()),
        ("driver_pay", DoubleType()),
        ("shared_request_flag", StringType()),
        ("shared_match_flag", StringType()),
        ("access_a_ride_flag", StringType()),
        ("wav_request_flag", StringType()),
        ("wav_match_flag", StringType()),
    )
])

In [15]:
# Repartition into 12 partitions and write them out as a Parquet dataset.
# mode("overwrite") replaces the output of an earlier run; with the default save
# mode the write raises an error once the target path exists, so the cell could
# not be re-run.
# NOTE(review): run this while `df` still points at the raw file - Spark is
# expected to refuse overwriting a path it is currently reading from.
df.repartition(12).write.mode("overwrite").parquet("./data/fhvhv_tripdata_partitioned_2021-06.parquet")

                                                                                

In [18]:
# Re-load the trips from the repartitioned Parquet dataset and print its schema.
# The CSV-only "header" option is dropped because it has no effect on a Parquet read.
df = spark.read.parquet("./data/fhvhv_tripdata_partitioned_2021-06.parquet")
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp (nullable = true)
 |-- on_scene_datetime: timestamp (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullable = true)
 |-- access_a_ride_flag: string (nul

In [29]:
# Keep the licence number and both location ids, and replace the pickup and
# drop-off timestamps with their calendar dates.
pre_df_1 = df.select(
    "hvfhs_license_num",
    F.to_date("pickup_datetime").alias("pickup_date"),
    F.to_date("dropoff_datetime").alias("dropoff_date"),
    "PULocationID",
    "DOLocationID",
)
pre_df_1.show(5)

+-----------------+-----------+------------+------------+------------+
|hvfhs_license_num|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-----------------+-----------+------------+------------+------------+
|           HV0003| 2021-06-27|  2021-06-27|         255|         186|
|           HV0003| 2021-06-06|  2021-06-06|          95|         263|
|           HV0003| 2021-06-13|  2021-06-13|          76|          76|
|           HV0005| 2021-06-24|  2021-06-24|          76|          33|
|           HV0005| 2021-06-10|  2021-06-10|         148|          49|
+-----------------+-----------+------------+------------+------------+
only showing top 5 rows



In [32]:
# Preview trips whose pickup date equals 2021-06-15.
(
    pre_df_1
    .where(F.col("pickup_date") == F.lit("2021-06-15"))
    .show(5)
)

+-----------------+-----------+------------+------------+------------+
|hvfhs_license_num|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-----------------+-----------+------------+------------+------------+
|           HV0003| 2021-06-15|  2021-06-15|           7|         229|
|           HV0005| 2021-06-15|  2021-06-15|         142|          75|
|           HV0003| 2021-06-15|  2021-06-15|          42|         244|
|           HV0005| 2021-06-15|  2021-06-15|          35|         101|
|           HV0003| 2021-06-15|  2021-06-15|         174|         169|
+-----------------+-----------+------------+------------+------------+
only showing top 5 rows



In [33]:
# UDF helper: label a location id by the parity of its value.
def even_odd(x):
    """Classify an integer as "even" or "odd".

    Args:
        x: Integer to classify, or None (a null coming from a nullable column).

    Returns:
        "even" when x is divisible by 2, "odd" otherwise, and None when x is None.
    """
    # A null input would otherwise raise TypeError on `None % 2`.
    if x is None:
        return None
    return "even" if x % 2 == 0 else "odd"

In [35]:
# Wrap even_odd as a Spark UDF that produces a string column.
udf1 = F.udf(even_odd, returnType=StringType())
# Refer to the column by name so it is resolved against pre_df_1 itself rather
# than through the separate `df` DataFrame.
pre_df_1.withColumn("PULocationID_Type", udf1(F.col("PULocationID"))).show(5)

[Stage 12:>                                                         (0 + 1) / 1]

+-----------------+-----------+------------+------------+------------+-----------------+
|hvfhs_license_num|pickup_date|dropoff_date|PULocationID|DOLocationID|PULocationID_Type|
+-----------------+-----------+------------+------------+------------+-----------------+
|           HV0003| 2021-06-27|  2021-06-27|         255|         186|              odd|
|           HV0003| 2021-06-06|  2021-06-06|          95|         263|              odd|
|           HV0003| 2021-06-13|  2021-06-13|          76|          76|             even|
|           HV0005| 2021-06-24|  2021-06-24|          76|          33|             even|
|           HV0005| 2021-06-10|  2021-06-10|         148|          49|             even|
+-----------------+-----------+------------+------------+------------+-----------------+
only showing top 5 rows



                                                                                