In [4]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('24121003_yellowtaxi_trip_count').getOrCreate()

In [5]:
import os
trip_files = '/trips/*'
zone_file = 'taxi+_zone_lookup.csv'
directory = os.path.join(os.getcwd(), 'data')

In [6]:
trips_df = spark.read.csv(f'file:///{directory}/{trip_files}', inferSchema = True, header = True)

                                                                                

In [7]:
zone_df = spark.read.csv(f'file:///{directory}/{zone_file}', inferSchema = True, header = True)

In [12]:
trips_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [13]:
zone_df.printSchema()

root
 |-- LocationID: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- service_zone: string (nullable = true)



In [14]:
trips_df.createOrReplaceTempView('trips')
zone_df.createOrReplaceTempView('zone')

In [26]:
comb_df = spark.sql(
    '''
    select t.VendorID, 
        TO_DATE(t.tpep_pickup_datetime) as pickup_date,
        TO_DATE(t.tpep_dropoff_datetime) as dropoff_date,
        HOUR(t.tpep_pickup_datetime) as pickup_time,
        HOUR(t.tpep_dropoff_datetime) as dropoff_time,
        t.passenger_count,
        t.trip_distance,
        t.payment_type,
        t.tip_amount,
        t.total_amount,
        t.payment_type,
        pz.Zone as pickup_zone,
        dz.Zone as dropoff_zone
        
        
    from trips t
    LEFT JOIN zone pz ON t.PULocationID = pz.LocationID
    LEFT JOIN zone dz ON t.DOLocationID = dz.LocationID
    '''
)

In [27]:
comb_df.count()

                                                                                

8921358

In [28]:
comb_df.show(5)

+--------+-----------+------------+-----------+------------+---------------+-------------+------------+----------+------------+------------+-----------------+--------------+
|VendorID|pickup_date|dropoff_date|pickup_time|dropoff_time|passenger_count|trip_distance|payment_type|tip_amount|total_amount|payment_type|      pickup_zone|  dropoff_zone|
+--------+-----------+------------+-----------+------------+---------------+-------------+------------+----------+------------+------------+-----------------+--------------+
|       2| 2021-03-01|  2021-03-01|          0|           0|              1|          0.0|           2|       0.0|         4.3|           2|               NV|            NV|
|       2| 2021-03-01|  2021-03-01|          0|           0|              1|          0.0|           2|       0.0|         3.8|           2|   Manhattanville|Manhattanville|
|       2| 2021-03-01|  2021-03-01|          0|           0|              1|          0.0|           2|       0.0|         4.8|   

In [30]:
comb_df.createOrReplaceTempView('comb')

In [35]:
spark.sql(
    '''
    select distinct pickup_time
    from comb
    order by pickup_time asc
    '''
).show(30)



+-----------+
|pickup_time|
+-----------+
|       null|
|          0|
|          1|
|          2|
|          3|
|          4|
|          5|
|          6|
|          7|
|          8|
|          9|
|         10|
|         11|
|         12|
|         13|
|         14|
|         15|
|         16|
|         17|
|         18|
|         19|
|         20|
|         21|
|         22|
|         23|
+-----------+



                                                                                

In [37]:
comb_df.describe().show()



+-------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+--------------------+--------------------+
|summary|          VendorID|       pickup_time|      dropoff_time|   passenger_count|     trip_distance|       payment_type|        tip_amount|      total_amount|       payment_type|         pickup_zone|        dropoff_zone|
+-------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+--------------------+--------------------+
|  count|           8596840|           8921357|           8921357|           8596839|           8921357|            8596838|           8921355|           8921354|            8596838|             8921357|             8921356|
|   mean| 1.682631874037437|14.074268185882485|14.178853844768234|1.4186727237767276|3.7238418706928

                                                                                

In [38]:
# 실행계획, 실행결과(4040)
query2 = '''
select pickup_date, pickup_time
from comb
where pickup_time > 0 and pickup_time <= 12
'''
spark.sql(query2).show(5)

+-----------+-----------+
|pickup_date|pickup_time|
+-----------+-----------+
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
| 2021-03-01|          1|
+-----------+-----------+
only showing top 5 rows



In [39]:
spark.sql(query2).explain()

== Physical Plan ==
*(3) Project [cast(tpep_pickup_datetime#17 as date) AS pickup_date#419, hour(cast(tpep_pickup_datetime#17 as timestamp), Some(Asia/Seoul)) AS pickup_time#421]
+- *(3) BroadcastHashJoin [DOLocationID#24], [LocationID#425], LeftOuter, BuildRight, false
   :- *(3) Project [tpep_pickup_datetime#17, DOLocationID#24]
   :  +- *(3) BroadcastHashJoin [PULocationID#23], [LocationID#68], LeftOuter, BuildRight, false
   :     :- *(3) Filter ((isnotnull(tpep_pickup_datetime#17) AND (hour(cast(tpep_pickup_datetime#17 as timestamp), Some(Asia/Seoul)) > 0)) AND (hour(cast(tpep_pickup_datetime#17 as timestamp), Some(Asia/Seoul)) <= 12))
   :     :  +- FileScan csv [tpep_pickup_datetime#17,PULocationID#23,DOLocationID#24] Batched: false, DataFilters: [isnotnull(tpep_pickup_datetime#17), (hour(cast(tpep_pickup_datetime#17 as timestamp), Some(Asia/..., Format: CSV, Location: InMemoryFileIndex[file:/home/lab09/git/src/data/trips/yellow_tripdata_2021-01.csv, file:/home/lab..., Partition

In [40]:
# 실행계획, 실행결과(4040)
query3 = '''
select pickup_date, count(*) as trip_count
from comb
where pickup_time > 0
group by pickup_date
order by pickup_date
'''
spark.sql(query3).show(5)



+-----------+----------+
|pickup_date|trip_count|
+-----------+----------+
| 2002-12-31|         1|
| 2004-04-04|         1|
| 2008-12-31|        15|
| 2009-01-01|        23|
| 2020-12-31|        16|
+-----------+----------+
only showing top 5 rows



                                                                                

In [41]:
spark.sql(query3).explain()

== Physical Plan ==
*(5) Sort [pickup_date#419 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(pickup_date#419 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#1840]
   +- *(4) HashAggregate(keys=[pickup_date#419], functions=[count(1)])
      +- Exchange hashpartitioning(pickup_date#419, 200), ENSURE_REQUIREMENTS, [id=#1836]
         +- *(3) HashAggregate(keys=[pickup_date#419], functions=[partial_count(1)])
            +- *(3) Project [cast(tpep_pickup_datetime#17 as date) AS pickup_date#419]
               +- *(3) BroadcastHashJoin [DOLocationID#24], [LocationID#425], LeftOuter, BuildRight, false
                  :- *(3) Project [tpep_pickup_datetime#17, DOLocationID#24]
                  :  +- *(3) BroadcastHashJoin [PULocationID#23], [LocationID#68], LeftOuter, BuildRight, false
                  :     :- *(3) Filter (isnotnull(tpep_pickup_datetime#17) AND (hour(cast(tpep_pickup_datetime#17 as timestamp), Some(Asia/Seoul)) > 0))
                  :     :  +- FileScan csv [