In [9]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession
from pyspark.sql import types

In [2]:
# Create (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/05 11:52:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Question 1:

Install Spark and PySpark

In [3]:
# Confirm the installation by printing the version reported by the active session.
print(f'The PySpark {spark.version} version is running...')

The PySpark 3.4.0 version is running...


### Question 2:

Read the FHVHV February 2021 trip data

Repartition the Dataframe and save it to parquet.

What is the average size, in MB, of the Parquet files (those ending with the .parquet extension) that were created? Answer: 33 MB

In [11]:
# Load the fhvhv trip records for 2021-02 from the local Parquet file.
# Parquet files carry their own schema, so the CSV-only "header" option
# that used to be set here had no effect and has been dropped.
df = spark.read.parquet("fhvhv_tripdata_2021-02.parquet")

In [12]:
# Peek at the first row to check the column names and value types.
df.head(1)

                                                                                

[Row(hvfhs_license_num='HV0003', dispatching_base_num='B02764', originating_base_num='B02764', request_datetime=datetime.datetime(2021, 1, 31, 23, 59), on_scene_datetime=datetime.datetime(2021, 2, 1, 0, 10, 19), pickup_datetime=datetime.datetime(2021, 2, 1, 0, 10, 40), dropoff_datetime=datetime.datetime(2021, 2, 1, 0, 21, 9), PULocationID=35, DOLocationID=39, trip_miles=2.06, trip_time=629, base_passenger_fare=17.14, tolls=0.0, bcf=0.51, sales_tax=1.52, congestion_surcharge=0.0, airport_fee=None, tips=0.0, driver_pay=9.79, shared_request_flag='N', shared_match_flag='N', access_a_ride_flag=' ', wav_request_flag='N', wav_match_flag='N')]

In [13]:
# Redistribute the rows into 24 partitions before writing them out.
# NOTE(review): this rebinds `df`, so re-running the cell repartitions the
# already-repartitioned frame.
df = df.repartition(24)

In [15]:
# Persist the repartitioned frame as Parquet, replacing any earlier output.
df.write.mode('overwrite').parquet('data/pq/fhvhv/2021/02/')

                                                                                

In [16]:
# Re-read the data from the repartitioned Parquet output; the queries below run on this frame.
df = spark.read.parquet('data/pq/fhvhv/2021/02/')

### Question 3:
How many taxi trips were there on February 15?


In [17]:
# Wrong way (kept as a counter-example): the filter compares the raw
# `pickup_datetime` timestamp with the date string instead of using the
# derived `pickup_date` column, so the count is far too small.
# NOTE(review): presumably the string is cast to a timestamp at midnight, so
# only pickups at exactly 2021-02-15 00:00:00 match — confirm.
df \
    .withColumn('pickup_date', F.to_date(df.pickup_datetime)) \
    .filter(df.pickup_datetime == '2021-02-15') \
    .count()

5

In [18]:
# Derive a date-only column from the pickup timestamp, keep the rows whose
# pickup date is 2021-02-15, and count them.
(
    df
    .withColumn('pickup_date', F.to_date(F.col('pickup_datetime')))
    .filter("pickup_date = '2021-02-15'")
    .count()
)

                                                                                

367170

In [19]:
# Expose the trips frame to Spark SQL under the name `fhvhv_2021_02`.
# `registerTempTable` is deprecated; `createOrReplaceTempView` is the
# supported replacement and can be re-run safely because it replaces any
# existing view with the same name.
df.createOrReplaceTempView('fhvhv_2021_02')



In [20]:
# Same February 15 count as the DataFrame version, expressed in Spark SQL
# against the `fhvhv_2021_02` temp view.
feb15_count_query = """
SELECT
    COUNT(1)
FROM 
    fhvhv_2021_02
WHERE
    to_date(pickup_datetime) = '2021-02-15';
"""
spark.sql(feb15_count_query).show()



+--------+
|count(1)|
+--------+
|  367170|
+--------+



                                                                                

### Question 4:
Longest trip for each day


In [21]:
# For each pickup date take the largest `trip_time`, then show the five
# dates with the highest maxima.
(
    df
    .withColumn('pickup_date', F.to_date(df.pickup_datetime))
    .groupBy('pickup_date')
    .agg(F.max('trip_time'))
    .orderBy('max(trip_time)', ascending=False)
    .limit(5)
    .show()
)



+-----------+--------------+
|pickup_date|max(trip_time)|
+-----------+--------------+
| 2021-02-11|         75540|
| 2021-02-17|         57220|
| 2021-02-20|         44038|
| 2021-02-03|         40653|
| 2021-02-19|         37577|
+-----------+--------------+



                                                                                

In [25]:
# Spark SQL version of the per-day maximum `trip_time`, top five days.
longest_trip_query = """
SELECT
    to_date(pickup_datetime) AS pickup_date,
    MAX(trip_time) as day_trip_time
FROM 
    fhvhv_2021_02
GROUP BY
    pickup_date
ORDER BY
    day_trip_time DESC
LIMIT
    5
;
"""
spark.sql(longest_trip_query).show()



+-----------+-------------+
|pickup_date|day_trip_time|
+-----------+-------------+
| 2021-02-11|        75540|
| 2021-02-17|        57220|
| 2021-02-20|        44038|
| 2021-02-03|        40653|
| 2021-02-19|        37577|
+-----------+-------------+



                                                                                

### Question 5:
Most frequent dispatching_base_num

How many stages does this Spark job have? 1

In [26]:
# Count trips per dispatching base and show the single most frequent one.
(
    df
    .groupBy('dispatching_base_num')
    .count()
    .orderBy('count', ascending=False)
    .limit(1)
    .show()
)



+--------------------+-------+
|dispatching_base_num|  count|
+--------------------+-------+
|              B02510|3233664|
+--------------------+-------+



                                                                                

In [29]:
# Spark SQL version: trip counts per dispatching base, five most frequent.
base_frequency_query = """
SELECT
    dispatching_base_num,
    COUNT(1) as freq
FROM 
    fhvhv_2021_02
GROUP BY
    dispatching_base_num
ORDER BY
    freq DESC
LIMIT
    5
;
"""
spark.sql(base_frequency_query).show()



+--------------------+-------+
|dispatching_base_num|   freq|
+--------------------+-------+
|              B02510|3233664|
|              B02764| 965568|
|              B02872| 882689|
|              B02875| 685390|
|              B02765| 559768|
+--------------------+-------+



                                                                                

### Question 6:
Most common location pair

In [30]:
# Load the zone lookup data from the local `zones` Parquet directory.
df_zones = spark.read.parquet('zones')

In [31]:
# Peek at the first lookup row to check the column names.
df_zones.head(1)

[Row(LocationID='1', Borough='EWR', Zone='Newark Airport', service_zone='EWR')]

In [33]:
# Expose the zone lookup to Spark SQL under the name `zones`.
# `registerTempTable` is deprecated; `createOrReplaceTempView` is the
# supported replacement and can be re-run safely because it replaces any
# existing view with the same name.
df_zones.createOrReplaceTempView('zones')

In [34]:
# Join the trips view to the zones view twice (once on the pickup location
# ID, once on the drop-off location ID), build a "pickup / drop-off" zone
# name label, and show the five most frequent labels.
zone_pair_query = """
SELECT
    CONCAT(pul.Zone, ' / ', dol.Zone) AS pu_do_pair,
    COUNT(1) as freq
FROM 
    fhvhv_2021_02 as fhv LEFT JOIN zones as pul ON fhv.PULocationID = pul.LocationID
                         LEFT JOIN zones dol ON fhv.DOLocationID = dol.LocationID
GROUP BY 
    pu_do_pair
ORDER BY
    freq DESC
LIMIT 5;
"""
spark.sql(zone_pair_query).show()





+--------------------+-----+
|          pu_do_pair| freq|
+--------------------+-----+
|East New York / E...|45041|
|Borough Park / Bo...|37329|
| Canarsie / Canarsie|28026|
|Crown Heights Nor...|25976|
|Bay Ridge / Bay R...|17934|
+--------------------+-----+



                                                                                