In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col
# Reuse the already-running Spark session if there is one, otherwise start one.
spark = SparkSession.builder.appName("Jupyter").getOrCreate()

# Web events. No schema is supplied, so the CSV columns are loaded as strings.
# event_date is event_time truncated to the day; it is the repartition / sort key used below.
events = (
    spark.read.option("header", "true")
    .csv("/home/iceberg/data/events.csv")
    .withColumn("event_date", expr("DATE_TRUNC('day', event_time)"))
)

# Device lookup table (browser / OS / device type per device_id).
devices = spark.read.option("header", "true").csv("/home/iceberg/data/devices.csv")

# Left join keeps every event even when its device_id has no match in `devices`;
# the two device columns are then renamed to the *_family names used by the table DDL.
df = events.join(devices, on="device_id", how="left").withColumnsRenamed(
    {"browser_type": "browser_family", "os_type": "os_family"}
)

df.show()

25/08/20 11:07:53 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+----------+-----------+--------+--------------------+----------+--------------------+-------------------+--------------+---------+-----------+
| device_id|    user_id|referrer|                host|       url|          event_time|         event_date|browser_family|os_family|device_type|
+----------+-----------+--------+--------------------+----------+--------------------+-------------------+--------------+---------+-----------+
| 532630305| 1037710827|    NULL| www.zachwilson.tech|         /|2021-03-08 17:27:...|2021-03-08 00:00:00|         Other|    Other|      Other|
| 532630305|  925588856|    NULL|    www.eczachly.com|         /|2021-05-10 11:26:...|2021-05-10 00:00:00|         Other|    Other|      Other|
| 532630305|-1180485268|    NULL|admin.zachwilson....|         /|2021-02-17 16:19:...|2021-02-17 00:00:00|         Other|    Other|      Other|
| 532630305|-1044833855|    NULL| www.zachwilson.tech|         /|2021-09-24 15:53:...|2021-09-24 00:00:00|         Other|    Other|     

In [2]:
# Both variants start from the same hash repartition on event_date (10 partitions)
# and finish by casting event_time to a timestamp; they differ only in the sort step.

# Local sort: rows are ordered by (event_date, host) inside each partition only.
# The name deliberately avoids `sorted`, which would shadow the Python builtin
# for every later cell run in this kernel.
partition_sorted_df = (
    df.repartition(10, col("event_date"))
    .sortWithinPartitions(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Global sort: the whole DataFrame is ordered by (event_date, host).
global_sorted_df = (
    df.repartition(10, col("event_date"))
    .sort(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

partition_sorted_df.show()
global_sorted_df.show()


                                                                                

+-----------+-----------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------+---------+------------------+
|  device_id|    user_id|            referrer|                host|                 url|          event_time|         event_date|browser_family|os_family|       device_type|
+-----------+-----------+--------------------+--------------------+--------------------+--------------------+-------------------+--------------+---------+------------------+
|  532630305| 1129583063|                NULL|admin.zachwilson....|                   /|2021-01-07 09:21:...|2021-01-07 00:00:00|         Other|    Other|             Other|
| 1088283544| -648945006|                NULL|    www.eczachly.com|                   /|2021-01-07 02:58:...|2021-01-07 00:00:00|      PetalBot|  Android|Generic Smartphone|
| -158310583|-1871780024|                NULL|    www.eczachly.com|                   /|2021-01-07 04:17:...|2021-01-07 00:00:00| 

In [None]:
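# .sortWithinPartitions() orders rows only inside each existing partition, so it needs no extra shuffle.
# .sort() is a global sort: Spark must shuffle the whole dataset again to produce a total order, which is much slower at scale.

# Note: in a physical plan, "Exchange" is Spark's name for a shuffle.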

In [3]:
# Build the two sort variants and print their physical plans so the shuffle
# (Exchange) steps can be compared side by side.

# Local sort: rows are ordered by (event_date, host) inside each partition only.
# The name deliberately avoids `sorted`, which would shadow the Python builtin
# for every later cell run in this kernel.
partition_sorted_df = (
    df.repartition(10, col("event_date"))
    .sortWithinPartitions(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Global sort: the whole DataFrame is ordered by (event_date, host).
global_sorted_df = (
    df.repartition(10, col("event_date"))
    .sort(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

partition_sorted_df.explain()
global_sorted_df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [device_id#18, user_id#17, referrer#19, host#20, url#21, cast(event_time#22 as timestamp) AS event_time#261, event_date#29, browser_family#74, os_family#75, device_type#58]
   +- Sort [event_date#29 ASC NULLS FIRST, host#20 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(event_date#29, 10), REPARTITION_BY_NUM, [plan_id=410]
         +- Project [device_id#18, user_id#17, referrer#19, host#20, url#21, event_time#22, event_date#29, browser_type#56 AS browser_family#74, os_type#57 AS os_family#75, device_type#58]
            +- BroadcastHashJoin [device_id#18], [device_id#55], LeftOuter, BuildRight, false
               :- Project [user_id#17, device_id#18, referrer#19, host#20, url#21, event_time#22, date_trunc(day, cast(event_time#22 as timestamp), Some(Etc/UTC)) AS event_date#29]
               :  +- FileScan csv [user_id#17,device_id#18,referrer#19,host#20,url#21,event_time#22] Batched: false, DataFilters:

In [4]:
%%sql

-- Make sure the `bootcamp` database exists; IF NOT EXISTS keeps the cell safe to re-run.
CREATE DATABASE IF NOT EXISTS bootcamp

In [5]:
%%sql

-- Remove any earlier copy of bootcamp.events; IF EXISTS makes this a no-op on a fresh warehouse.
DROP TABLE IF EXISTS bootcamp.events

In [6]:
%%sql

-- Remove any earlier copy of bootcamp.events_sorted; IF EXISTS makes this a no-op on a fresh warehouse.
DROP TABLE IF EXISTS bootcamp.events_sorted

In [7]:
%%sql

-- Iceberg table for event rows, partitioned by the year of event_date.
-- NOTE(review): no visible cell writes to bootcamp.events — confirm it is still needed.
CREATE TABLE IF NOT EXISTS bootcamp.events (
    url            STRING,
    referrer       STRING,
    browser_family STRING,
    os_family      STRING,
    device_family  STRING,
    host           STRING,
    event_time     TIMESTAMP,
    event_date     DATE
)
USING iceberg
PARTITIONED BY (years(event_date));


In [8]:
%%sql

-- Iceberg table that receives the within-partition-sorted copy of the events,
-- partitioned by the year of event_date.
CREATE TABLE IF NOT EXISTS bootcamp.events_sorted (
    url            STRING,
    referrer       STRING,
    browser_family STRING,
    os_family      STRING,
    device_family  STRING,
    host           STRING,
    event_time     TIMESTAMP,
    event_date     DATE
)
USING iceberg
PARTITIONED BY (years(event_date));

In [9]:
%%sql

-- Iceberg table that receives the unsorted copy of the events.
-- The partition transform is spelled `years(...)` to match bootcamp.events and
-- bootcamp.events_sorted; `year(...)` and `years(...)` name the same yearly transform.
CREATE TABLE IF NOT EXISTS bootcamp.events_unsorted (
    url STRING,
    referrer STRING,
    browser_family STRING,
    os_family STRING,
    device_family STRING,
    host STRING,
    event_time TIMESTAMP,
    event_date DATE
)
USING iceberg
PARTITIONED BY (years(event_date));

In [10]:

# Hash-repartition into 4 partitions by event_date and cast event_time to a timestamp.
# (The original line ended in a dangling "\" continuation; it is removed so the
# statement cannot accidentally fuse with the next one.)
start_df = df.repartition(4, col("event_date")).withColumn(
    "event_time", col("event_time").cast("timestamp")
)

# Same rows, additionally ordered by (event_date, host) inside each partition.
first_sort_df = start_df.sortWithinPartitions(col("event_date"), col("host"))

# Persist both variants so their on-disk footprint can be compared.
# NOTE(review): `df` carries columns the CREATE TABLE statements do not declare
# (user_id, device_id, device_type) and lacks device_family; presumably
# overwrite-mode saveAsTable replaces the table definition instead of validating
# against it — confirm, and select/rename columns explicitly if the DDL should win.
start_df.write.mode("overwrite").saveAsTable("bootcamp.events_unsorted")
first_sort_df.write.mode("overwrite").saveAsTable("bootcamp.events_sorted")

                                                                                

In [11]:
%%sql

-- Compare total data-file bytes and file counts of the two tables using
-- Iceberg's `files` metadata table.
-- The label column gets an explicit alias: without one, the result header is
-- taken from the first branch's literal and reads "sorted" for both rows.
SELECT SUM(file_size_in_bytes) AS size, COUNT(1) AS num_files, 'sorted' AS table_name
FROM demo.bootcamp.events_sorted.files

UNION ALL

SELECT SUM(file_size_in_bytes) AS size, COUNT(1) AS num_files, 'unsorted' AS table_name
FROM demo.bootcamp.events_unsorted.files





size,num_files,sorted
5444946,4,sorted
5556664,4,unsorted


In [17]:
%%sql
-- Number of entries in the `files` metadata table of events_sorted.
SELECT
    count(1)
FROM demo.bootcamp.events_sorted.files;

count(1)
4


In [3]:
%%sql
-- Number of entries in the `files` metadata table of matches_bucketed.
-- NOTE(review): bootcamp.matches_bucketed is not created in any visible cell —
-- presumably it comes from another notebook; confirm before relying on Run All.
SELECT
    COUNT(1)
FROM bootcamp.matches_bucketed.files

                                                                                

count(1)
3665


In [2]:
%%sql
-- Full `files` metadata listing for match_details_bucketed: paths, partition
-- values, record counts, sizes and per-column statistics, one row per file.
-- NOTE(review): bootcamp.match_details_bucketed is not created in any visible cell —
-- presumably it comes from another notebook; confirm before relying on Run All.
SELECT
    *
FROM bootcamp.match_details_bucketed.files


content,file_path,file_format,spec_id,partition,record_count,file_size_in_bytes,column_sizes,value_counts,null_value_counts,nan_value_counts,lower_bounds,upper_bounds,key_metadata,split_offsets,equality_ids,sort_order_id,referenced_data_file,content_offset,content_size_in_bytes,readable_metrics
0,s3://warehouse/bootcamp/match_details_bucketed/data/match_id_bucket=6/00000-9799-c6e5223a-0d53-4b9e-aa65-ddc616062327-0-00004.parquet,PARQUET,0,Row(match_id_bucket=6),9377,121677,"{1: 29615, 2: 77986, 3: 6550, 4: 6118}","{1: 9377, 2: 9377, 3: 9377, 4: 9377}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: bytearray(b'004acec5-dced-4d'), 2: bytearray(b'A 2 tailed fox'), 3: bytearray(b'\x00\x00\x00\x00'), 4: bytearray(b'\x00\x00\x00\x00')}","{1: bytearray(b'ff28b78b-4073-4:'), 2: bytearray(b'zzzKusohakokids'), 3: bytearray(b'=\x00\x00\x00'), 4: bytearray(b';\x00\x00\x00')}",,[4],,0,,,,"Row(match_id=Row(column_size=29615, value_count=9377, null_value_count=0, nan_value_count=None, lower_bound='004acec5-dced-4d', upper_bound='ff28b78b-4073-4:'), player_gamertag=Row(column_size=77986, value_count=9377, null_value_count=0, nan_value_count=None, lower_bound='A 2 tailed fox', upper_bound='zzzKusohakokids'), player_total_deaths=Row(column_size=6118, value_count=9377, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=59), player_total_kills=Row(column_size=6550, value_count=9377, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=61))"
0,s3://warehouse/bootcamp/match_details_bucketed/data/match_id_bucket=12/00000-9799-c6e5223a-0d53-4b9e-aa65-ddc616062327-0-00001.parquet,PARQUET,0,Row(match_id_bucket=12),9679,125048,"{1: 30496, 2: 80204, 3: 6577, 4: 6367}","{1: 9679, 2: 9679, 3: 9679, 4: 9679}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: bytearray(b'00592b3d-ae26-45'), 2: bytearray(b'A American Monk'), 3: bytearray(b'\x00\x00\x00\x00'), 4: bytearray(b'\x00\x00\x00\x00')}","{1: bytearray(b'fff9b2dc-1391-4e'), 2: bytearray(b'zxXDEATH3Xxz'), 3: bytearray(b':\x00\x00\x00'), 4: bytearray(b'/\x00\x00\x00')}",,[4],,0,,,,"Row(match_id=Row(column_size=30496, value_count=9679, null_value_count=0, nan_value_count=None, lower_bound='00592b3d-ae26-45', upper_bound='fff9b2dc-1391-4e'), player_gamertag=Row(column_size=80204, value_count=9679, null_value_count=0, nan_value_count=None, lower_bound='A American Monk', upper_bound='zxXDEATH3Xxz'), player_total_deaths=Row(column_size=6367, value_count=9679, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=47), player_total_kills=Row(column_size=6577, value_count=9679, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=58))"
0,s3://warehouse/bootcamp/match_details_bucketed/data/match_id_bucket=13/00000-9799-c6e5223a-0d53-4b9e-aa65-ddc616062327-0-00003.parquet,PARQUET,0,Row(match_id_bucket=13),9785,126191,"{1: 30863, 2: 80768, 3: 6699, 4: 6473}","{1: 9785, 2: 9785, 3: 9785, 4: 9785}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: bytearray(b'0019da4a-125b-4d'), 2: bytearray(b'A BALLS86'), 3: bytearray(b'\x00\x00\x00\x00'), 4: bytearray(b'\x00\x00\x00\x00')}","{1: bytearray(b'ffddb9c6-8c61-48'), 2: bytearray(b'zzRICKEYzz'), 3: bytearray(b'G\x00\x00\x00'), 4: bytearray(b'@\x00\x00\x00')}",,[4],,0,,,,"Row(match_id=Row(column_size=30863, value_count=9785, null_value_count=0, nan_value_count=None, lower_bound='0019da4a-125b-4d', upper_bound='ffddb9c6-8c61-48'), player_gamertag=Row(column_size=80768, value_count=9785, null_value_count=0, nan_value_count=None, lower_bound='A BALLS86', upper_bound='zzRICKEYzz'), player_total_deaths=Row(column_size=6473, value_count=9785, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=64), player_total_kills=Row(column_size=6699, value_count=9785, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=71))"
0,s3://warehouse/bootcamp/match_details_bucketed/data/match_id_bucket=1/00000-9799-c6e5223a-0d53-4b9e-aa65-ddc616062327-0-00002.parquet,PARQUET,0,Row(match_id_bucket=1),9017,117085,"{1: 28687, 2: 74555, 3: 6338, 4: 6107}","{1: 9017, 2: 9017, 3: 9017, 4: 9017}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: bytearray(b'00227b83-7a2c-4a'), 2: bytearray(b'A 2tha nimal'), 3: bytearray(b'\x00\x00\x00\x00'), 4: bytearray(b'\x00\x00\x00\x00')}","{1: bytearray(b'fffc65f4-bc88-41'), 2: bytearray(b'zzachnurface'), 3: bytearray(b'G\x00\x00\x00'), 4: bytearray(b'<\x00\x00\x00')}",,[4],,0,,,,"Row(match_id=Row(column_size=28687, value_count=9017, null_value_count=0, nan_value_count=None, lower_bound='00227b83-7a2c-4a', upper_bound='fffc65f4-bc88-41'), player_gamertag=Row(column_size=74555, value_count=9017, null_value_count=0, nan_value_count=None, lower_bound='A 2tha nimal', upper_bound='zzachnurface'), player_total_deaths=Row(column_size=6107, value_count=9017, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=60), player_total_kills=Row(column_size=6338, value_count=9017, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=71))"
0,s3://warehouse/bootcamp/match_details_bucketed/data/match_id_bucket=9/00001-9800-c6e5223a-0d53-4b9e-aa65-ddc616062327-0-00004.parquet,PARQUET,0,Row(match_id_bucket=9),9405,120962,"{1: 29841, 2: 76982, 3: 6512, 4: 6233}","{1: 9405, 2: 9405, 3: 9405, 4: 9405}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: bytearray(b'0000e3cf-727c-49'), 2: bytearray(b'A BRIGHT SHADOW'), 3: bytearray(b'\x00\x00\x00\x00'), 4: bytearray(b'\x00\x00\x00\x00')}","{1: bytearray(b'ffecf124-637c-42'), 2: bytearray(b'zztonii'), 3: bytearray(b'<\x00\x00\x00'), 4: bytearray(b'0\x00\x00\x00')}",,[4],,0,,,,"Row(match_id=Row(column_size=29841, value_count=9405, null_value_count=0, nan_value_count=None, lower_bound='0000e3cf-727c-49', upper_bound='ffecf124-637c-42'), player_gamertag=Row(column_size=76982, value_count=9405, null_value_count=0, nan_value_count=None, lower_bound='A BRIGHT SHADOW', upper_bound='zztonii'), player_total_deaths=Row(column_size=6233, value_count=9405, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=48), player_total_kills=Row(column_size=6512, value_count=9405, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=60))"
0,s3://warehouse/bootcamp/match_details_bucketed/data/match_id_bucket=15/00001-9800-c6e5223a-0d53-4b9e-aa65-ddc616062327-0-00003.parquet,PARQUET,0,Row(match_id_bucket=15),9482,123263,"{1: 29752, 2: 79300, 3: 6544, 4: 6275}","{1: 9482, 2: 9482, 3: 9482, 4: 9482}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: bytearray(b'000d9c16-68a1-48'), 2: bytearray(b'A Blue TV'), 3: bytearray(b'\x00\x00\x00\x00'), 4: bytearray(b'\x00\x00\x00\x00')}","{1: bytearray(b'ffded49f-acf3-48'), 2: bytearray(b'zz MaTeRiiAL'), 3: bytearray(b'S\x00\x00\x00'), 4: bytearray(b'<\x00\x00\x00')}",,[4],,0,,,,"Row(match_id=Row(column_size=29752, value_count=9482, null_value_count=0, nan_value_count=None, lower_bound='000d9c16-68a1-48', upper_bound='ffded49f-acf3-48'), player_gamertag=Row(column_size=79300, value_count=9482, null_value_count=0, nan_value_count=None, lower_bound='A Blue TV', upper_bound='zz MaTeRiiAL'), player_total_deaths=Row(column_size=6275, value_count=9482, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=60), player_total_kills=Row(column_size=6544, value_count=9482, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=83))"
0,s3://warehouse/bootcamp/match_details_bucketed/data/match_id_bucket=5/00001-9800-c6e5223a-0d53-4b9e-aa65-ddc616062327-0-00002.parquet,PARQUET,0,Row(match_id_bucket=5),9435,121878,"{1: 30423, 2: 77442, 3: 6399, 4: 6220}","{1: 9435, 2: 9435, 3: 9435, 4: 9435}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: bytearray(b'0002e220-f603-4d'), 2: bytearray(b'A COW 9'), 3: bytearray(b'\x00\x00\x00\x00'), 4: bytearray(b'\x00\x00\x00\x00')}","{1: bytearray(b'fff8b649-17a1-4g'), 2: bytearray(b'zzzBUSDRIVERzzz'), 3: bytearray(b'm\x00\x00\x00'), 4: bytearray(b'R\x00\x00\x00')}",,[4],,0,,,,"Row(match_id=Row(column_size=30423, value_count=9435, null_value_count=0, nan_value_count=None, lower_bound='0002e220-f603-4d', upper_bound='fff8b649-17a1-4g'), player_gamertag=Row(column_size=77442, value_count=9435, null_value_count=0, nan_value_count=None, lower_bound='A COW 9', upper_bound='zzzBUSDRIVERzzz'), player_total_deaths=Row(column_size=6220, value_count=9435, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=82), player_total_kills=Row(column_size=6399, value_count=9435, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=109))"
0,s3://warehouse/bootcamp/match_details_bucketed/data/match_id_bucket=3/00001-9800-c6e5223a-0d53-4b9e-aa65-ddc616062327-0-00001.parquet,PARQUET,0,Row(match_id_bucket=3),9795,125915,"{1: 31234, 2: 79868, 3: 6859, 4: 6560}","{1: 9795, 2: 9795, 3: 9795, 4: 9795}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: bytearray(b'005a11f3-2adc-44'), 2: bytearray(b'A BOOTY TAP'), 3: bytearray(b'\x00\x00\x00\x00'), 4: bytearray(b'\x00\x00\x00\x00')}","{1: bytearray(b'ffea4b1d-24bf-47'), 2: bytearray(b'zzzgameszzz'), 3: bytearray(b'Z\x00\x00\x00'), 4: bytearray(b'L\x00\x00\x00')}",,[4],,0,,,,"Row(match_id=Row(column_size=31234, value_count=9795, null_value_count=0, nan_value_count=None, lower_bound='005a11f3-2adc-44', upper_bound='ffea4b1d-24bf-47'), player_gamertag=Row(column_size=79868, value_count=9795, null_value_count=0, nan_value_count=None, lower_bound='A BOOTY TAP', upper_bound='zzzgameszzz'), player_total_deaths=Row(column_size=6560, value_count=9795, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=76), player_total_kills=Row(column_size=6859, value_count=9795, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=90))"
0,s3://warehouse/bootcamp/match_details_bucketed/data/match_id_bucket=8/00002-9801-c6e5223a-0d53-4b9e-aa65-ddc616062327-0-00002.parquet,PARQUET,0,Row(match_id_bucket=8),9302,119858,"{1: 29612, 2: 76537, 3: 6346, 4: 5971}","{1: 9302, 2: 9302, 3: 9302, 4: 9302}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: bytearray(b'0022accb-4bd2-4c'), 2: bytearray(b'A BRIGHT SHADOW'), 3: bytearray(b'\x00\x00\x00\x00'), 4: bytearray(b'\x00\x00\x00\x00')}","{1: bytearray(b'ff837e2a-2709-46'), 2: bytearray(b'zzSOzz'), 3: bytearray(b'I\x00\x00\x00'), 4: bytearray(b'E\x00\x00\x00')}",,[4],,0,,,,"Row(match_id=Row(column_size=29612, value_count=9302, null_value_count=0, nan_value_count=None, lower_bound='0022accb-4bd2-4c', upper_bound='ff837e2a-2709-46'), player_gamertag=Row(column_size=76537, value_count=9302, null_value_count=0, nan_value_count=None, lower_bound='A BRIGHT SHADOW', upper_bound='zzSOzz'), player_total_deaths=Row(column_size=5971, value_count=9302, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=69), player_total_kills=Row(column_size=6346, value_count=9302, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=73))"
0,s3://warehouse/bootcamp/match_details_bucketed/data/match_id_bucket=7/00002-9801-c6e5223a-0d53-4b9e-aa65-ddc616062327-0-00003.parquet,PARQUET,0,Row(match_id_bucket=7),9642,123864,"{1: 30595, 2: 79121, 3: 6543, 4: 6209}","{1: 9642, 2: 9642, 3: 9642, 4: 9642}","{1: 0, 2: 0, 3: 0, 4: 0}",{},"{1: bytearray(b'00dfb4ef-bafd-45'), 2: bytearray(b'A 0 N Eclipse'), 3: bytearray(b'\x00\x00\x00\x00'), 4: bytearray(b'\x00\x00\x00\x00')}","{1: bytearray(b'ff5590a3-d8ca-4b'), 2: bytearray(b'zzbucketzz'), 3: bytearray(b'^\x00\x00\x00'), 4: bytearray(b';\x00\x00\x00')}",,[4],,0,,,,"Row(match_id=Row(column_size=30595, value_count=9642, null_value_count=0, nan_value_count=None, lower_bound='00dfb4ef-bafd-45', upper_bound='ff5590a3-d8ca-4b'), player_gamertag=Row(column_size=79121, value_count=9642, null_value_count=0, nan_value_count=None, lower_bound='A 0 N Eclipse', upper_bound='zzbucketzz'), player_total_deaths=Row(column_size=6209, value_count=9642, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=59), player_total_kills=Row(column_size=6543, value_count=9642, null_value_count=0, nan_value_count=None, lower_bound=0, upper_bound=94))"
