In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col
spark = SparkSession.builder.appName("Jupyter").getOrCreate()

# Load the raw events CSV (first line is the header). No schema is supplied,
# so every source column is read as a string. event_date is derived by
# truncating event_time to the start of its day.
df = (
    spark.read
    .option("header", "true")
    .csv("/home/iceberg/data/events.csv")
    .withColumn("event_date", expr("DATE_TRUNC('day', event_time)"))
)

# Preview a handful of rows only. Never call df.collect() on an unfiltered
# DataFrame: it pulls every row into the driver's memory. Always filter the
# data down first, or use show() / take() instead.
df.show()

+-----------+----------+--------+--------------------+----------+--------------------+-------------------+
|    user_id| device_id|referrer|                host|       url|          event_time|         event_date|
+-----------+----------+--------+--------------------+----------+--------------------+-------------------+
| 1037710827| 532630305|    NULL| www.zachwilson.tech|         /|2021-03-08 17:27:...|2021-03-08 00:00:00|
|  925588856| 532630305|    NULL|    www.eczachly.com|         /|2021-05-10 11:26:...|2021-05-10 00:00:00|
|-1180485268| 532630305|    NULL|admin.zachwilson....|         /|2021-02-17 16:19:...|2021-02-17 00:00:00|
|-1044833855| 532630305|    NULL| www.zachwilson.tech|         /|2021-09-24 15:53:...|2021-09-24 00:00:00|
|  747494706| 532630305|    NULL| www.zachwilson.tech|         /|2021-09-26 16:03:...|2021-09-26 00:00:00|
|  747494706| 532630305|    NULL|admin.zachwilson....|         /|2021-02-21 16:08:...|2021-02-21 00:00:00|
| -824540328| 532630305|    NULL|admi

In [14]:
# Both variants first hash-repartition the data by event_date into 10
# partitions; they differ only in how rows are ordered afterwards.
# The first variant is named sorted_df (not `sorted`) so it does not shadow
# Python's builtin sorted() for the rest of the kernel session.

# Local sort: rows are ordered by (event_date, host) inside each partition.
sorted_df = (
    df.repartition(10, col("event_date"))
    .sortWithinPartitions(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Global sort: rows are ordered by (event_date, host) across the whole DataFrame.
sortedTwo = (
    df.repartition(10, col("event_date"))
    .sort(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# At large scale:
# sorted_df.show()  # sorts the data within each of the 10 partitions
# sortedTwo.show()  # global sort, not within the partitions - very slow

# Print both physical plans for comparison.
# An "Exchange" node in the explain output means a shuffle.
sorted_df.explain()
sortedTwo.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [user_id#134, device_id#135, referrer#136, host#137, url#138, cast(event_time#139 as timestamp) AS event_time#319, event_date#146]
   +- Sort [event_date#146 ASC NULLS FIRST, host#137 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(event_date#146, 10), REPARTITION_BY_NUM, [plan_id=244]
         +- Project [user_id#134, device_id#135, referrer#136, host#137, url#138, event_time#139, date_trunc(day, cast(event_time#139 as timestamp), Some(Etc/UTC)) AS event_date#146]
            +- FileScan csv [user_id#134,device_id#135,referrer#136,host#137,url#138,event_time#139] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<user_id:string,device_id:string,referrer:string,host:string,url:string,event_time:string>


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [user_id#

In [15]:
%%sql

-- Create the bootcamp database used by the tables below (no-op if it already exists).
CREATE DATABASE IF NOT EXISTS bootcamp

In [16]:
%%sql

-- Remove any existing bootcamp.events table so it can be recreated (no-op if it is absent).
DROP TABLE IF EXISTS bootcamp.events

In [17]:
%%sql

-- Iceberg table for events, partitioned by the year of event_date.
CREATE TABLE IF NOT EXISTS bootcamp.events (
    url            STRING,
    referrer       STRING,
    browser_family STRING,
    os_family      STRING,
    device_family  STRING,
    host           STRING,
    event_time     TIMESTAMP,
    event_date     DATE
)
USING iceberg
PARTITIONED BY (years(event_date));


In [21]:
%%sql


-- Iceberg table that receives the sorted copy of the events,
-- partitioned by the year of event_date.
CREATE TABLE IF NOT EXISTS bootcamp.events_sorted (
    url            STRING,
    referrer       STRING,
    browser_family STRING,
    os_family      STRING,
    device_family  STRING,
    host           STRING,
    event_time     TIMESTAMP,
    event_date     DATE
)
USING iceberg
PARTITIONED BY (years(event_date));

In [19]:
%%sql


-- Iceberg table that receives the unsorted copy of the events.
-- Same columns as bootcamp.events_sorted, but declared without a partition spec.
CREATE TABLE IF NOT EXISTS bootcamp.events_unsorted (
    url            STRING,
    referrer       STRING,
    browser_family STRING,
    os_family      STRING,
    device_family  STRING,
    host           STRING,
    event_time     TIMESTAMP,
    event_date     DATE
)
USING iceberg;

In [22]:

# Unsorted baseline: hash-repartition by event_date into 4 partitions and cast
# event_time from string to a timestamp.
start_df = (
    df.repartition(4, col("event_date"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Same rows and partitioning, but ordered by (event_date, host) inside each
# partition before being written out.
first_sort_df = start_df.sortWithinPartitions(col("event_date"), col("host"))

# Write both variants so their on-disk file sizes can be compared.
# NOTE(review): the DataFrame's columns (user_id, device_id, ...) differ from the
# columns declared in the CREATE TABLE cells; mode("overwrite") with saveAsTable
# presumably replaces the table definition rather than inserting into it - confirm.
start_df.write.mode("overwrite").saveAsTable("bootcamp.events_unsorted")
first_sort_df.write.mode("overwrite").saveAsTable("bootcamp.events_sorted")

                                                                                

In [24]:
%%sql

-- Compare the total file size and file count of the sorted and unsorted tables
-- using each table's Iceberg files metadata table.
-- The label column is aliased explicitly: without an alias the result column is
-- named after the first literal, which mislabels the unsorted row.
SELECT SUM(file_size_in_bytes) AS size, COUNT(1) AS num_files, 'sorted' AS table_name
FROM demo.bootcamp.events_sorted.files

UNION ALL

SELECT SUM(file_size_in_bytes) AS size, COUNT(1) AS num_files, 'unsorted' AS table_name
FROM demo.bootcamp.events_unsorted.files





size,num_files,sorted
4955506,4,sorted
5050846,4,unsorted


In [90]:
%%sql
-- Total file size and file count for bootcamp.events, read from its files metadata table.
SELECT
    SUM(file_size_in_bytes) AS size,
    COUNT(1) AS num_files
FROM demo.bootcamp.events.files;

size,num_files
3145713,5


In [None]:
%%sql 
-- Number of entries in the files metadata table of bootcamp.matches_bucketed.
-- NOTE(review): this table is not created in the cells shown here - confirm where it comes from.
SELECT COUNT(1)
FROM bootcamp.matches_bucketed.files

count(1)
3665
