In [1]:
from pyspark.sql import SparkSession, types, functions as F

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
# The "local[*]" master runs Spark locally rather than on a cluster.
session_builder = SparkSession.builder.master("local[*]").appName("test")
spark = session_builder.getOrCreate()

25/04/05 10:45:28 WARN Utils: Your hostname, Bastiens-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.243.218.32 instead (on interface en0)
25/04/05 10:45:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/05 10:45:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Group By
## Green Trips

In [3]:
# Load all green-trip parquet files found two directory levels below ./data/pq/green.
df_green = spark.read.format('parquet').load('./data/pq/green/*/*')

                                                                                

In [4]:
# Expose the green trips to Spark SQL under the name `green`.
# createOrReplaceTempView keeps this cell re-runnable: plain createTempView
# raises once a view called `green` already exists in the session.
df_green.createOrReplaceTempView('green')

In [5]:
# Hourly revenue per pickup zone for green trips picked up from 2020-01-01 onwards:
# one row per (hour, zone) with the summed total_amount and the number of trips.
green_revenue_query = """
  SELECT 
      date_trunc('hour', lpep_pickup_datetime) AS hour, 
      PULocationID AS zone,

      SUM(total_amount) AS amount,
      COUNT(1) AS number_records
  FROM
      green
  WHERE
      lpep_pickup_datetime >= '2020-01-01 00:00:00'
  GROUP BY
      1, 2
"""
df_green_revenue = spark.sql(green_revenue_query)

In [6]:
# Redistribute the green revenue into 20 partitions and write it as parquet,
# replacing whatever is already stored at the report path.
(
    df_green_revenue
    .repartition(20)
    .write
    .mode('overwrite')
    .parquet('./data/report/revenue/green')
)

25/04/05 10:45:48 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/04/05 10:45:49 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

## Yellow Trips

In [7]:
# Load all yellow-trip parquet files found two directory levels below ./data/pq/yellow.
df_yellow = spark.read.format('parquet').load('./data/pq/yellow/*/*')

In [8]:
# Expose the yellow trips to Spark SQL under the name `yellow`.
# createOrReplaceTempView keeps this cell re-runnable: plain createTempView
# raises once a view called `yellow` already exists in the session.
df_yellow.createOrReplaceTempView('yellow')

In [9]:
# Hourly revenue per pickup zone for yellow trips picked up from 2020-01-01 onwards:
# one row per (hour, zone) with the summed total_amount and the number of trips.
yellow_revenue_query = """
  SELECT 
      date_trunc('hour', tpep_pickup_datetime) AS hour, 
      PULocationID AS zone,

      SUM(total_amount) AS amount,
      COUNT(1) AS number_records
  FROM
      yellow
  WHERE
      tpep_pickup_datetime >= '2020-01-01 00:00:00'
  GROUP BY
      1, 2
"""
df_yellow_revenue = spark.sql(yellow_revenue_query)

In [None]:
# Redistribute the yellow revenue into 20 partitions and write it as parquet,
# replacing whatever is already stored at the report path.
(
    df_yellow_revenue
    .repartition(20)
    .write
    .mode('overwrite')
    .parquet('./data/report/revenue/yellow')
)

25/04/05 10:46:06 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/04/05 10:46:07 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [None]:
# Rebind both revenue frames to the parquet reports written above, so the
# join below reads the stored aggregates instead of recomputing the SQL.
df_green_revenue, df_yellow_revenue = (
    spark.read.parquet('./data/report/revenue/green'),
    spark.read.parquet('./data/report/revenue/yellow'),
)

# Join
## Sort-merge join

In [11]:
# Prefix the metric columns with `green_` so they remain distinguishable
# from the yellow metrics once both frames are joined.
df_green_revenue_tmp = (
    df_green_revenue
    .withColumnRenamed('amount', 'green_amount')
    .withColumnRenamed('number_records', 'green_number_records')
)

In [12]:
# Prefix the metric columns with `yellow_` so they remain distinguishable
# from the green metrics once both frames are joined.
df_yellow_revenue_tmp = (
    df_yellow_revenue
    .withColumnRenamed('amount', 'yellow_amount')
    .withColumnRenamed('number_records', 'yellow_number_records')
)

In [13]:
# Full outer join on (hour, zone): keeps hours/zones that appear in only one
# of the two services, leaving the other service's metrics NULL.
df_join = df_green_revenue_tmp.join(
    other=df_yellow_revenue_tmp,
    on=['hour', 'zone'],
    how='outer',
)

In [14]:
# Preview the first five rows of the combined green/yellow revenue.
df_join.show(5)

                                                                                

+-------------------+----+------------------+--------------------+------------------+---------------------+
|               hour|zone|      green_amount|green_number_records|     yellow_amount|yellow_number_records|
+-------------------+----+------------------+--------------------+------------------+---------------------+
|2020-01-01 00:00:00|  22|              15.8|                   1|              NULL|                 NULL|
|2020-01-01 00:00:00|  25|             531.0|                  26|            324.35|                   16|
|2020-01-01 00:00:00|  55|129.29000000000002|                   4|              NULL|                 NULL|
|2020-01-01 00:00:00|  56|             99.69|                   3|              18.1|                    2|
|2020-01-01 00:00:00|  60|            160.04|                   6|57.620000000000005|                    2|
+-------------------+----+------------------+--------------------+------------------+---------------------+
only showing top 5 rows



In [15]:
# Persist the joined revenue as parquet, replacing any previous output at this path.
(
    df_join
    .write
    .mode('overwrite')
    .parquet('./data/report/revenue/total')
)

25/04/05 10:46:45 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

## Broadcast join

In [19]:
# Read the taxi zone lookup CSV, taking the column names from its header row.
df_zones = spark.read.csv('./data/raw/taxi_zone_lookup.csv', header=True)

In [20]:
# Preview the first five rows of the zone lookup table.
df_zones.show(5)

+----------+-------------+--------------------+------------+
|LocationID|      Borough|                Zone|service_zone|
+----------+-------------+--------------------+------------+
|         1|          EWR|      Newark Airport|         EWR|
|         2|       Queens|         Jamaica Bay|   Boro Zone|
|         3|        Bronx|Allerton/Pelham G...|   Boro Zone|
|         4|    Manhattan|       Alphabet City| Yellow Zone|
|         5|Staten Island|       Arden Heights|   Boro Zone|
+----------+-------------+--------------------+------------+
only showing top 5 rows



In [24]:
# Enrich the joined revenue with the zone lookup (Borough, Zone, service_zone).
#
# * F.broadcast marks the lookup table for a broadcast join explicitly, so the
#   join strategy does not depend on Spark's automatic size threshold.
# * The two join keys are dropped by column reference rather than by name:
#   Spark resolves column names case-insensitively by default, so dropping the
#   string 'zone' also removes the lookup's `Zone` column and the zone name is
#   lost from the result.
df_result = (
    df_join
    .join(F.broadcast(df_zones), df_join.zone == df_zones.LocationID)
    .drop(df_zones.LocationID)
    .drop(df_join.zone)
)

In [25]:
# Persist the zone-enriched revenue as parquet, replacing any previous output at this path.
(
    df_result
    .write
    .mode('overwrite')
    .parquet('tmp/revenue-zones')
)

25/04/05 11:02:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [73]:
# Shut down the Spark session now that all reports have been written.
spark.stop()