In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col
# Build (or reuse) the Spark session and register an Iceberg catalog named
# `local`, backed by a Hadoop catalog whose warehouse lives on the local filesystem.
spark = (
    SparkSession.builder
    .appName("Jupyter")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.catalog-impl", "org.apache.iceberg.hadoop.HadoopCatalog")
    .config("spark.sql.catalog.local.warehouse", "file:///tmp/iceberg_warehouse")
    .getOrCreate()
)

# Raw events, plus an `event_date` column holding event_time truncated to the day.
events = (
    spark.read
    .option("header", "true")
    .csv("/home/iceberg/data/events.csv")
    .withColumn("event_date", expr("DATE_TRUNC('day', event_time)"))
)

# Device dimension data, joined onto events below via device_id.
devices = spark.read.option("header", "true").csv("/home/iceberg/data/devices.csv")

# Left join keeps every event row; then rename the browser/OS columns to *_family.
df = (
    events
    .join(devices, on="device_id", how="left")
    .withColumnsRenamed({'browser_type': 'browser_family', 'os_type': 'os_family'})
)

# Peek at a single joined row.
df.show(1)

25/07/28 19:23:12 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+---------+----------+--------+-------------------+---+--------------------+-------------------+--------------+---------+-----------+
|device_id|   user_id|referrer|               host|url|          event_time|         event_date|browser_family|os_family|device_type|
+---------+----------+--------+-------------------+---+--------------------+-------------------+--------------+---------+-----------+
|532630305|1037710827|    NULL|www.zachwilson.tech|  /|2021-03-08 17:27:...|2021-03-08 00:00:00|         Other|    Other|      Other|
+---------+----------+--------+-------------------+---+--------------------+-------------------+--------------+---------+-----------+
only showing top 1 row



In [1]:
# Bare expression: show the active SparkSession object as this cell's output.
spark

In [3]:
# Compare two ways of ordering rows after repartitioning `df` by event_date.
# The DataFrames are deliberately NOT named `sorted`: that would shadow Python's
# built-in sorted() for every later cell in this kernel session.

# Variant 1: order rows only inside each of the 10 partitions.
sorted_within_partitions_df = (
    df.repartition(10, col("event_date"))
    .sortWithinPartitions(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Variant 2: global sort over the whole DataFrame.
globally_sorted_df = (
    df.repartition(10, col("event_date"))
    .sort(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Show the first row of each variant.
sorted_within_partitions_df.show(1)
globally_sorted_df.show(1)


                                                                                

+---------+----------+--------+--------------------+---+--------------------+-------------------+--------------+---------+-----------+
|device_id|   user_id|referrer|                host|url|          event_time|         event_date|browser_family|os_family|device_type|
+---------+----------+--------+--------------------+---+--------------------+-------------------+--------------+---------+-----------+
|532630305|1129583063|    NULL|admin.zachwilson....|  /|2021-01-07 09:21:...|2021-01-07 00:00:00|         Other|    Other|      Other|
+---------+----------+--------+--------------------+---+--------------------+-------------------+--------------+---------+-----------+
only showing top 1 row

+----------+----------+--------+--------------------+---+--------------------+-------------------+--------------+---------+-----------+
| device_id|   user_id|referrer|                host|url|          event_time|         event_date|browser_family|os_family|device_type|
+----------+----------+------

In [None]:
# .sortWithinPartitions() sorts rows only within each partition, whereas .sort() is a global sort across all partitions, which requires an extra shuffle and is very slow on large data

# Note: in the physical plan, "Exchange" is synonymous with a shuffle

In [4]:
# Build the partition-local and the global sort variants and print their physical
# plans so the two can be compared.
# The DataFrames are deliberately NOT named `sorted`: that would shadow Python's
# built-in sorted() for every later cell in this kernel session.

# Variant 1: order rows only inside each of the 10 partitions.
sorted_within_partitions_df = (
    df.repartition(10, col("event_date"))
    .sortWithinPartitions(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Variant 2: global sort over the whole DataFrame.
globally_sorted_df = (
    df.repartition(10, col("event_date"))
    .sort(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Print the physical plan of each variant.
sorted_within_partitions_df.explain()
globally_sorted_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [device_id#155, user_id#154, referrer#156, host#157, url#158, cast(event_time#159 as timestamp) AS event_time#398, event_date#166, browser_family#211, os_family#212, device_type#195]
   +- Sort [event_date#166 ASC NULLS FIRST, host#157 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(event_date#166, 10), REPARTITION_BY_NUM, [plan_id=525]
         +- Project [device_id#155, user_id#154, referrer#156, host#157, url#158, event_time#159, event_date#166, browser_type#193 AS browser_family#211, os_type#194 AS os_family#212, device_type#195]
            +- BroadcastHashJoin [device_id#155], [device_id#192], LeftOuter, BuildRight, false
               :- Project [user_id#154, device_id#155, referrer#156, host#157, url#158, event_time#159, date_trunc(day, cast(event_time#159 as timestamp), Some(Etc/UTC)) AS event_date#166]
               :  +- FileScan csv [user_id#154,device_id#155,referrer#156,host#157,url#158,eve

In [5]:
%%sql

-- Create the `bootcamp` database (namespace) if it does not exist yet.
-- NOTE(review): the name is not catalog-qualified, so it resolves against the
-- session's current catalog, while the tables in this notebook are created
-- under `local.bootcamp` — confirm this targets the intended catalog.
CREATE DATABASE IF NOT EXISTS bootcamp

In [6]:
%%sql

-- Reset step: drop the events table so the CREATE TABLE cell starts from scratch.
-- Qualified with the `local` catalog so the DROP targets the same table that the
-- CREATE TABLE cell defines (local.bootcamp.events) rather than whatever the
-- session's current catalog happens to be.
DROP TABLE IF EXISTS local.bootcamp.events

In [7]:
%%sql

-- Reset step: drop the sorted events table so it can be recreated from scratch.
-- Qualified with the `local` catalog so the DROP targets the same table that the
-- CREATE TABLE cell defines (local.bootcamp.events_sorted) rather than whatever
-- the session's current catalog happens to be.
DROP TABLE IF EXISTS local.bootcamp.events_sorted

In [12]:
%%sql

-- Iceberg table for events in the `local` catalog, partitioned by the year of event_date.
-- NOTE(review): this schema lists `device_family`, while the joined DataFrame shown
-- earlier has `device_type` (plus `device_id` / `user_id`) — confirm the intended columns.
CREATE TABLE IF NOT EXISTS local.bootcamp.events (
    url             STRING,
    referrer        STRING,
    browser_family  STRING,
    os_family       STRING,
    device_family   STRING,
    host            STRING,
    event_time      TIMESTAMP,
    event_date      DATE
)
USING iceberg
PARTITIONED BY (years(event_date));



In [13]:
%%sql

-- Iceberg table that receives the partition-sorted copy of the events data;
-- lives in the `local` catalog and is partitioned by the year of event_date.
CREATE TABLE IF NOT EXISTS local.bootcamp.events_sorted (
    url             STRING,
    referrer        STRING,
    browser_family  STRING,
    os_family       STRING,
    device_family   STRING,
    host            STRING,
    event_time      TIMESTAMP,
    event_date      DATE
)
USING iceberg
PARTITIONED BY (years(event_date));

In [14]:
%%sql

-- Iceberg table that receives the unsorted copy of the events data; lives in the
-- `local` catalog and is partitioned by the year of event_date.
-- Uses the `years(...)` transform spelling, matching the `events` and
-- `events_sorted` table definitions in this notebook.
CREATE TABLE IF NOT EXISTS local.bootcamp.events_unsorted (
    url STRING,
    referrer STRING,
    browser_family STRING,
    os_family STRING,
    device_family STRING,
    host STRING,
    event_time TIMESTAMP,
    event_date DATE
)
USING iceberg
PARTITIONED BY (years(event_date));

In [19]:

# Baseline: 4 partitions keyed on event_date, with event_time cast to a timestamp.
# Written as a parenthesised chain (no trailing line-continuation backslash), so
# the statement cannot accidentally absorb the line that follows it.
start_df = (
    df.repartition(4, col("event_date"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Same rows, additionally ordered inside each partition by date, OS family and host.
first_sort_df = start_df.sortWithinPartitions(col("event_date"), col('os_family'), col("host"))

# Persist both variants so their on-disk file sizes can be compared.
# NOTE(review): mode("overwrite") + saveAsTable presumably replaces the tables
# created by the CREATE TABLE cells (schema and partition spec included) — confirm.
start_df.write.mode("overwrite").saveAsTable("local.bootcamp.events_unsorted")
first_sort_df.write.mode("overwrite").saveAsTable("local.bootcamp.events_sorted")

                                                                                

In [20]:
%%sql

-- Compare total data-file bytes and data-file counts of the sorted and unsorted
-- tables, read from each table's `files` metadata table. The trailing string
-- literal labels which table each result row belongs to.
SELECT
    SUM(file_size_in_bytes) AS size,
    COUNT(1) AS num_files,
    'sorted'
FROM local.bootcamp.events_sorted.files
UNION ALL
SELECT
    SUM(file_size_in_bytes) AS size,
    COUNT(1) AS num_files,
    'unsorted'
FROM local.bootcamp.events_unsorted.files

size,num_files,sorted
5194260,4,sorted
5556664,4,unsorted


In [90]:
%%sql
-- Total data-file bytes and data-file count for bootcamp.events, read from its
-- `files` metadata table.
-- NOTE(review): this query uses the `demo` catalog, whereas the CREATE TABLE
-- cells in this notebook use `local` — confirm which catalog is intended.
SELECT
    SUM(file_size_in_bytes) AS size,
    COUNT(1) AS num_files
FROM demo.bootcamp.events.files;

size,num_files
3145713,5


In [None]:
%%sql 
-- Count the entries in the `files` metadata table of bootcamp.matches_bucketed.
-- NOTE(review): matches_bucketed is not created anywhere in this notebook and the
-- name is not catalog-qualified — presumably it comes from another notebook; confirm.
SELECT COUNT(1) FROM bootcamp.matches_bucketed.files

count(1)
3665
