In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col
# Reuse the active Spark session if there is one; otherwise start one named "Jupyter".
spark = SparkSession.builder.appName("Jupyter").getOrCreate()

# Location of the raw events extract; its first row is used as the column header.
EVENTS_CSV_PATH = "/home/iceberg/data/events.csv"

# No schema is supplied and schema inference is not enabled, so the CSV columns
# are loaded as strings. event_date is derived by truncating event_time to the day.
# NOTE(review): the displayed event_date values carry a 00:00:00 time component,
# while the tables created in this notebook declare event_date as DATE -- confirm
# whether an explicit cast to DATE is wanted here.
df = (
    spark.read
    .option("header", "true")
    .csv(EVENTS_CSV_PATH)
    .withColumn("event_date", expr("DATE_TRUNC('day', event_time)"))
)

# Print a preview of the loaded rows.
df.show()

+--------------------+--------------------+--------------+---------+-------------+-------------------+--------------------+-------------------+
|                 url|            referrer|browser_family|os_family|device_family|               host|          event_time|         event_date|
+--------------------+--------------------+--------------+---------+-------------+-------------------+--------------------+-------------------+
|                   /|                null|        Chrome| Mac OS X|        Other|     localhost:3000|2019-01-12 20:04:...|2019-01-12 00:00:00|
|                   /|                null|        Chrome| Mac OS X|        Other|www.zachwilson.tech|2019-01-13 04:06:...|2019-01-13 00:00:00|
|              /about|https://www.zachw...|        Chrome| Mac OS X|        Other|www.zachwilson.tech|2019-01-13 04:06:...|2019-01-13 00:00:00|
|/images/zach-prof...|https://www.zachw...|        Chrome| Mac OS X|        Other|www.zachwilson.tech|2019-01-13 04:06:...|2019-01-13 00

In [54]:
# Redistribute the rows into 10 partitions keyed on event_date, order the rows
# inside each partition by (event_date, host, browser_family), and then convert
# event_time to a timestamp.
# NOTE(review): the name `sorted` hides Python's builtin sorted() for the rest of
# the kernel session; it is kept here because the name is part of the notebook state.
sorted = (
    df.repartition(10, "event_date")
    .sortWithinPartitions("event_date", "host", "browser_family")
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

sorted.show()

+--------------------+--------+----------------+---------+-------------+-------------------+--------------------+-------------------+
|                 url|referrer|  browser_family|os_family|device_family|               host|          event_time|         event_date|
+--------------------+--------+----------------+---------+-------------+-------------------+--------------------+-------------------+
|                   /|    null|          Chrome| Mac OS X|        Other|   www.eczachly.com|2019-01-14 11:23:...|2019-01-14 00:00:00|
|               /blog|    null|     FacebookBot|    Other|       Spider|   www.eczachly.com|2019-01-14 04:41:...|2019-01-14 00:00:00|
|               /blog|    null|     FacebookBot|    Other|       Spider|   www.eczachly.com|2019-01-14 04:41:...|2019-01-14 00:00:00|
|         /robots.txt|    null|       Googlebot|    Other|       Spider|   www.eczachly.com|2019-01-14 01:10:...|2019-01-14 00:00:00|
|        /sitemap.xml|    null|       Googlebot|    Other|    

In [9]:
%%sql

-- Make sure the bootcamp namespace exists before any table is created in it.
-- IF NOT EXISTS makes this statement safe to re-run.
CREATE DATABASE IF NOT EXISTS bootcamp

In [55]:
%%sql

-- Remove any previous copy of bootcamp.events.
-- IF EXISTS turns this into a no-op when the table is absent.
DROP TABLE IF EXISTS bootcamp.events

In [56]:
%%sql

-- Iceberg table for the events data. The column list matches the columns of the
-- DataFrame loaded from events.csv, including the derived event_date.
-- The table is partitioned with the years() transform on event_date.
-- NOTE(review): the DataFrame preview shows event_date with a time component,
-- whereas this schema declares DATE -- confirm the intended type.
CREATE TABLE IF NOT EXISTS bootcamp.events (
    url STRING,
    referrer STRING,
    browser_family STRING,
    os_family STRING,
    device_family STRING,
    host STRING,
    event_time TIMESTAMP,
    event_date DATE
)
USING iceberg
PARTITIONED BY (years(event_date));


In [112]:
%%sql


-- Unpartitioned Iceberg table that is the target of the within-partition-sorted
-- write (first_sort_df) later in this notebook.
-- Same column list as bootcamp.events_unsorted so the two can be compared.
CREATE TABLE IF NOT EXISTS bootcamp.events_sorted (
    url STRING,
    referrer STRING,
    browser_family STRING,
    os_family STRING,
    device_family STRING,
    host STRING,
    event_time TIMESTAMP,
    event_date DATE
)
USING iceberg;

In [113]:
%%sql


-- Unpartitioned Iceberg table that is the target of the unsorted baseline write
-- (start_df) later in this notebook.
-- Same column list as bootcamp.events_sorted so the two can be compared.
CREATE TABLE IF NOT EXISTS bootcamp.events_unsorted (
    url STRING,
    referrer STRING,
    browser_family STRING,
    os_family STRING,
    device_family STRING,
    host STRING,
    event_time TIMESTAMP,
    event_date DATE
)
USING iceberg;

In [120]:

# Baseline frame: 4 partitions keyed on event_date, with event_time converted
# to a timestamp. No ordering is applied inside the partitions.
start_df = (
    df.repartition(4, "event_date")
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Same rows and partitioning as start_df, additionally ordered inside each
# partition by (event_date, browser_family, host).
first_sort_df = start_df.sortWithinPartitions("event_date", "browser_family", "host")

# NOTE(review): this frame is neither written nor displayed in this cell, and its
# name hides the builtin sorted() -- confirm whether it is still needed.
sorted = (
    df.repartition(10, "event_date")
    .sortWithinPartitions("event_date")
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Persist both variants in overwrite mode so their file sizes can be compared.
start_df.write.mode("overwrite").saveAsTable("bootcamp.events_unsorted")
first_sort_df.write.mode("overwrite").saveAsTable("bootcamp.events_sorted")

In [121]:
%%sql

-- Total bytes and number of data files for each table, read from the Iceberg
-- files metadata table of events_sorted and events_unsorted.
-- The third column is an unaliased literal that labels which table a row describes.
SELECT
    SUM(file_size_in_bytes) AS size,
    COUNT(1) AS num_files,
    'sorted'
FROM demo.bootcamp.events_sorted.files
UNION ALL
SELECT
    SUM(file_size_in_bytes) AS size,
    COUNT(1) AS num_files,
    'unsorted'
FROM demo.bootcamp.events_unsorted.files





size,num_files,sorted
2896920,4,sorted
3211534,4,unsorted


In [90]:
%%sql
-- Total bytes and number of data files for bootcamp.events, read from its
-- Iceberg files metadata table.
SELECT
    SUM(file_size_in_bytes) AS size,
    COUNT(1) AS num_files
FROM demo.bootcamp.events.files;

size,num_files
3145713,5


In [None]:
%%sql
-- Number of data files in bootcamp.matches_bucketed, read from its Iceberg
-- files metadata table.
-- The catalog is spelled out (demo.) like the other metadata queries in this
-- notebook, so the result does not depend on the session's current catalog.
SELECT COUNT(1) FROM demo.bootcamp.matches_bucketed.files

count(1)
3665
