In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col, lit
# Obtain (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName("Jupyter").getOrCreate()

# Events: CSV with a header row, plus an event_date column derived by
# truncating event_time to the day. DATE_TRUNC yields a midnight timestamp
# (e.g. 2021-03-08 00:00:00 in the output below), not a DATE value.
df = (
    spark.read.option("header", "true")
    .csv("/home/iceberg/data/events.csv")
    .withColumn("event_date", expr("DATE_TRUNC('day', event_time)"))
)

# Devices: CSV with a header row, read as-is.
df2 = spark.read.option("header", "true").csv("/home/iceberg/data/devices.csv")

# Left join keeps every event; the devices-side device_id is dropped so the
# result carries a single device_id column.
df3 = df.join(df2, df["device_id"] == df2["device_id"], "left").drop(df2.device_id)

df3.show()

+-----------+----------+--------+--------------------+----------+--------------------+-------------------+------------+-------+-----------+
|    user_id| device_id|referrer|                host|       url|          event_time|         event_date|browser_type|os_type|device_type|
+-----------+----------+--------+--------------------+----------+--------------------+-------------------+------------+-------+-----------+
| 1037710827| 532630305|    NULL| www.zachwilson.tech|         /|2021-03-08 17:27:...|2021-03-08 00:00:00|       Other|  Other|      Other|
|  925588856| 532630305|    NULL|    www.eczachly.com|         /|2021-05-10 11:26:...|2021-05-10 00:00:00|       Other|  Other|      Other|
|-1180485268| 532630305|    NULL|admin.zachwilson....|         /|2021-02-17 16:19:...|2021-02-17 00:00:00|       Other|  Other|      Other|
|-1044833855| 532630305|    NULL| www.zachwilson.tech|         /|2021-09-24 15:53:...|2021-09-24 00:00:00|       Other|  Other|      Other|
|  747494706| 532630

In [11]:
# Repartition the joined events into 10 partitions keyed on event_date, order
# the rows inside each partition by (event_date, host, browser_type), and
# cast event_time from string to timestamp.
#
# The result is named sorted_df rather than `sorted` so that Python's builtin
# sorted() is not shadowed for the rest of the kernel session. The chain is
# parenthesized instead of ending in a dangling backslash.
sorted_df = (
    df3.repartition(10, col("event_date"))
    .sortWithinPartitions(col("event_date"), col("host"), col("browser_type"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

sorted_df.show()



+-----------+-----------+--------------------+--------------------+--------------------+--------------------+-------------------+------------+-------+------------------+
|    user_id|  device_id|            referrer|                host|                 url|          event_time|         event_date|browser_type|os_type|       device_type|
+-----------+-----------+--------------------+--------------------+--------------------+--------------------+-------------------+------------+-------+------------------+
| 1129583063|  532630305|                NULL|admin.zachwilson....|                   /|2021-01-07 09:21:...|2021-01-07 00:00:00|       Other|  Other|             Other|
|-1180485268|  532630305|                NULL|    www.eczachly.com|                   /|2021-01-07 18:45:...|2021-01-07 00:00:00|       Other|  Other|             Other|
| 1129583063|  532630305|                NULL|    www.eczachly.com|                   /|2021-01-07 21:57:...|2021-01-07 00:00:00|       Other|  Other|

                                                                                

In [12]:
%%sql

-- Create the bootcamp database used by the tables below; a no-op if it already exists.
CREATE DATABASE IF NOT EXISTS bootcamp

In [36]:
%%sql

-- Remove any existing bootcamp.events so the CREATE TABLE in the next cell
-- starts from a fresh definition; a no-op if the table is absent.
DROP TABLE IF EXISTS bootcamp.events

In [37]:
%%sql

-- Iceberg table for events, partitioned by event_date.
-- No visible cell writes to this table; the .files queries further down
-- report 0 files for it.
CREATE TABLE IF NOT EXISTS bootcamp.events (
    url            STRING,
    referrer       STRING,
    browser_family STRING,
    os_family      STRING,
    device_family  STRING,
    host           STRING,
    event_time     TIMESTAMP,
    event_date     DATE
)
USING iceberg
PARTITIONED BY (event_date);


In [38]:
%%sql

-- Iceberg table that receives the within-partition-sorted events
-- (first_sort_df in the write cell), declared as partitioned by event_date.
-- NOTE(review): the columns declared here (browser_family, os_family,
-- device_family) differ from the DataFrame columns shown by df3.show()
-- (browser_type, os_type, device_type, plus user_id and device_id) --
-- confirm which schema is intended.
CREATE TABLE IF NOT EXISTS bootcamp.events_sorted (
    url            STRING,
    referrer       STRING,
    browser_family STRING,
    os_family      STRING,
    device_family  STRING,
    host           STRING,
    event_time     TIMESTAMP,
    event_date     DATE
)
USING iceberg
PARTITIONED BY (event_date);

In [39]:
%%sql

-- Iceberg table that receives the unsorted events (start_df in the write
-- cell), declared as partitioned by event_date.
-- NOTE(review): the columns declared here (browser_family, os_family,
-- device_family) differ from the DataFrame columns shown by df3.show()
-- (browser_type, os_type, device_type, plus user_id and device_id) --
-- confirm which schema is intended.
CREATE TABLE IF NOT EXISTS bootcamp.events_unsorted (
    url            STRING,
    referrer       STRING,
    browser_family STRING,
    os_family      STRING,
    device_family  STRING,
    host           STRING,
    event_time     TIMESTAMP,
    event_date     DATE
)
USING iceberg
PARTITIONED BY (event_date);

In [40]:

# Baseline: repartition the joined events into 4 partitions keyed on
# event_date and cast event_time from string to timestamp. No ordering is
# applied inside the partitions.
start_df = (
    df3.repartition(4, col("event_date"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Same rows and partitioning, with each partition additionally ordered by
# (event_date, browser_type, host).
first_sort_df = start_df.sortWithinPartitions(
    col("event_date"), col("browser_type"), col("host")
)

# Persist both variants so their on-disk footprint can be compared.
# NOTE(review): mode("overwrite") + saveAsTable appears to replace the table
# definitions created in the earlier CREATE TABLE cells (the .files query
# reports 4 files per table, matching repartition(4)) -- confirm whether the
# PARTITIONED BY (event_date) spec is still in effect after this write.
start_df.write.mode("overwrite").saveAsTable("bootcamp.events_unsorted")
first_sort_df.write.mode("overwrite").saveAsTable("bootcamp.events_sorted")

                                                                                

In [41]:
%%sql

-- Compare the total data-file size and the number of data files of the
-- sorted and unsorted tables, using each table's Iceberg `files` metadata table.
-- The label column has an explicit alias: without one, the result column is
-- named after the first branch's literal ('sorted'), which mislabels the
-- 'unsorted' row.
SELECT SUM(file_size_in_bytes) AS size, COUNT(1) AS num_files, 'sorted' AS table_variant
FROM demo.bootcamp.events_sorted.files

UNION ALL

SELECT SUM(file_size_in_bytes) AS size, COUNT(1) AS num_files, 'unsorted' AS table_variant
FROM demo.bootcamp.events_unsorted.files





size,num_files,sorted
5091412,4,sorted
5552970,4,unsorted


In [42]:
%%sql
-- Total data-file size and file count for bootcamp.events, read from its
-- `files` metadata table. With no files the SUM is NULL and the count is 0,
-- which is what the output below shows.
SELECT SUM(file_size_in_bytes) as size, COUNT(1) as num_files 
FROM demo.bootcamp.events.files;

size,num_files
,0


In [43]:
%%sql 
-- Count the data files currently tracked for bootcamp.events.
-- NOTE(review): this cell omits the `demo.` catalog prefix used by the other
-- .files queries -- presumably `demo` is the default catalog; confirm.
SELECT COUNT(1) FROM bootcamp.events.files

                                                                                

count(1)
0


count(1)
3665
