In [2]:
import org.apache.spark.sql.functions.{broadcast, split, lit}

import org.apache.spark.sql.functions.{broadcast, split, lit}


In [3]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col}
import org.apache.spark.storage.StorageLevel

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.storage.StorageLevel


In [4]:
val spark = SparkSession.builder()
  .appName("IcebergTableManagement") 
  .config("spark.executor.memory", "4g")
  .config("spark.driver.memory", "4g")
  .config("spark.sql.shuffle.partitions", "200") // Fine for large datasets
  .config("spark.sql.files.maxPartitionBytes", "134217728") // Optional: 128 MB is default
  .config("spark.sql.autoBroadcastJoinThreshold", "-1") // Optional: Disable broadcast join
  .config("spark.dynamicAllocation.enabled", "true") // Helps with resource allocation
  .config("spark.dynamicAllocation.minExecutors", "1") // Ensure minimum resources
  .config("spark.dynamicAllocation.maxExecutors", "50") // Scalable resource allocation
  .getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@28497b49


In [5]:
val matchesBucketed = spark.read.option("header", "true") 
                        .option("inferSchema", "true")
                        .csv("/home/iceberg/data/matches.csv")
// matchesBucketed is of type org.apache.spark.sql.DataFrame

matchesBucketed: org.apache.spark.sql.DataFrame = [match_id: string, mapid: string ... 8 more fields]


In [6]:
spark.sql("""DROP TABLE IF EXISTS bootcamp.matches_bucketed""") // bootcamp.matches_bucketed is an iceberg table
val bucketedDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
     match_id STRING,
     is_team_game BOOLEAN,
     playlist_id STRING,
     completion_date TIMESTAMP
 )
 USING iceberg
 PARTITIONED BY (completion_date, bucket(16, match_id));
 """
spark.sql(bucketedDDL)                        

bucketedDDL: String =
"
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
     match_id STRING,
     is_team_game BOOLEAN,
     playlist_id STRING,
     completion_date TIMESTAMP
 )
 USING iceberg
 PARTITIONED BY (completion_date, bucket(16, match_id));
 "
res1: org.apache.spark.sql.DataFrame = []


In [7]:
// optimizing loading the data 
// Get distinct completion dates
val distinctDates = matchesBucketed.select("completion_date").distinct().collect()
// Process data in chunks based on completion_date
distinctDates.foreach { row =>
  val date = row.getAs[java.sql.Timestamp]("completion_date")
  val filteredMatches = matchesBucketed.filter(col("completion_date") === date)
  
  // Repartition and persist the filtered data
  val optimizedMatches = filteredMatches
    .select($"match_id", $"is_team_game", $"playlist_id", $"completion_date")
    .repartition(16, $"match_id")
    .persist(StorageLevel.MEMORY_AND_DISK)
    
  optimizedMatches.write
    .mode("append")
    .bucketBy(16, "match_id")
    .partitionBy("completion_date")
    .saveAsTable("bootcamp.matches_bucketed")
}

distinctDates: Array[org.apache.spark.sql.Row] = Array([2016-03-13 00:00:00.0], [2016-03-11 00:00:00.0], [2016-03-10 00:00:00.0], [2016-01-30 00:00:00.0], [2016-03-27 00:00:00.0], [2016-04-10 00:00:00.0], [2016-01-18 00:00:00.0], [2016-02-01 00:00:00.0], [2015-12-14 00:00:00.0], [2016-02-03 00:00:00.0], [2016-04-30 00:00:00.0], [2016-03-05 00:00:00.0], [2016-04-15 00:00:00.0], [2016-05-21 00:00:00.0], [2015-10-31 00:00:00.0], [2016-01-22 00:00:00.0], [2016-02-09 00:00:00.0], [2016-03-17 00:00:00.0], [2016-04-04 00:00:00.0], [2016-05-08 00:00:00.0], [2016-01-21 00:00:00.0], [2015-10-28 00:00:00.0], [2016-03-30 00:00:00.0], [2016-05-03 00:00:00.0], [2016-02-04 00:00:00.0], [2015-11-25 00:00:00.0], [2016-01-13 00:00:00.0], [2016-04-29 00:00:00.0], [2016-05-18 00:00:00.0], [2016-03-24 00:00...


In [8]:
// the following code takes ages:
// matchesBucketed.select(
//  $"match_id", $"is_team_game", $"playlist_id", $"completion_date"
   //   )
 //     .write.mode("append")
//      .partitionBy("completion_date")
//    .bucketBy(16, "match_id").saveAsTable("bootcamp.matches_bucketed")

In [9]:
// Verify the data in the table
val result = spark.sql("SELECT * FROM bootcamp.matches_bucketed")
result.show()

+--------------------+------------+--------------------+-------------------+
|            match_id|is_team_game|         playlist_id|    completion_date|
+--------------------+------------+--------------------+-------------------+
|eb119803-e635-499...|        true|c98949ae-60a8-43d...|2016-05-29 00:00:00|
|78a58006-c7fc-4c7...|        true|f72e0ef0-7c4a-430...|2016-07-24 00:00:00|
|4ba179a4-f2ff-4b1...|        true|f72e0ef0-7c4a-430...|2016-07-24 00:00:00|
|9527669f-a292-4f6...|        true|c98949ae-60a8-43d...|2016-08-17 00:00:00|
|5e701faf-1462-48e...|        true|f72e0ef0-7c4a-430...|2016-07-24 00:00:00|
|c8b5b039-4889-4e2...|        true|f72e0ef0-7c4a-430...|2016-05-29 00:00:00|
|2c330fcd-c1dc-405...|        true|f72e0ef0-7c4a-430...|2016-07-24 00:00:00|
|cc671941-8a2d-40e...|        true|f72e0ef0-7c4a-430...|2016-07-24 00:00:00|
|d384060a-ea52-42b...|        true|c98949ae-60a8-43d...|2016-05-29 00:00:00|
|1f1aed8d-9c52-458...|        true|f72e0ef0-7c4a-430...|2016-05-29 00:00:00|

result: org.apache.spark.sql.DataFrame = [match_id: string, is_team_game: boolean ... 2 more fields]


In [10]:
// To know the number of distinct dates:
val distinctDatesCount = matchesBucketed.select("completion_date").distinct().count()

distinctDatesCount: Long = 269


In [11]:
spark.sql("select count(1) from bootcamp.matches_bucketed.partitions").show()

+--------+
|count(1)|
+--------+
|    3665|
+--------+



In [12]:
val df = spark.sql("SELECT * FROM bootcamp.matches_bucketed.partitions")
println(df.schema.fieldNames.mkString(", "))
df.collect().foreach { row =>
  println(row.mkString(", "))
}
// spark.sql("select * from bootcamp.matches_bucketed.partitions").show() gives difficult-to-read output

partition, spec_id, record_count, file_count, total_data_file_size_in_bytes, position_delete_record_count, position_delete_file_count, equality_delete_record_count, equality_delete_file_count, last_updated_at, last_updated_snapshot_id
[2016-01-13 00:00:00.0,6], 0, 11, 1, 1930, 0, 0, 0, 0, 2024-12-09 12:44:11.326, 1307418336203784632
[2016-01-13 00:00:00.0,7], 0, 4, 1, 1724, 0, 0, 0, 0, 2024-12-09 12:44:11.326, 1307418336203784632
[2016-01-13 00:00:00.0,8], 0, 9, 1, 1860, 0, 0, 0, 0, 2024-12-09 12:44:11.326, 1307418336203784632
[2016-01-13 00:00:00.0,9], 0, 7, 1, 1844, 0, 0, 0, 0, 2024-12-09 12:44:11.326, 1307418336203784632
[2016-09-09 00:00:00.0,3], 0, 1, 1, 1679, 0, 0, 0, 0, 2024-12-09 12:47:46.507, 8849439159814292096
[2016-01-13 00:00:00.0,2], 0, 10, 1, 1930, 0, 0, 0, 0, 2024-12-09 12:44:11.326, 1307418336203784632
[2016-01-13 00:00:00.0,3], 0, 10, 1, 1908, 0, 0, 0, 0, 2024-12-09 12:44:11.326, 1307418336203784632
[2016-01-13 00:00:00.0,4], 0, 5, 1, 1751, 0, 0, 0, 0, 2024-12-09 12:4

df: org.apache.spark.sql.DataFrame = [partition: struct<completion_date: timestamp, match_id_bucket: int>, spec_id: int ... 9 more fields]


In [13]:
spark.sql("select * from bootcamp.matches_bucketed.files").show()

+-------+--------------------+-----------+-------+--------------------+------------+------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+------------+-------------+------------+-------------+--------------------+
|content|           file_path|file_format|spec_id|           partition|record_count|file_size_in_bytes|        column_sizes|        value_counts|   null_value_counts|nan_value_counts|        lower_bounds|        upper_bounds|key_metadata|split_offsets|equality_ids|sort_order_id|    readable_metrics|
+-------+--------------------+-----------+-------+--------------------+------------+------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+------------+-------------+------------+-------------+--------------------+
|      0|s3://warehouse/bo...|    PARQUET|      0|{2016-08-31 00:00...|           1|             

In [14]:
spark.sql("SELECT COUNT(1) as num_files FROM bootcamp.matches_bucketed.files").show()

+---------+
|num_files|
+---------+
|     3665|
+---------+



In [15]:
val matchDetailsBucketed =  spark.read.option("header", "true")
                        .option("inferSchema", "true")
                        .csv("/home/iceberg/data/match_details.csv")
spark.sql("""DROP TABLE IF EXISTS bootcamp.match_details_bucketed""")
val bucketedDetailsDDL = """
 CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
     match_id STRING,
     player_gamertag STRING,
     player_total_kills INTEGER,
     player_total_deaths INTEGER
 )
 USING iceberg
 PARTITIONED BY (bucket(16, match_id));
 """
 spark.sql(bucketedDetailsDDL)

matchDetailsBucketed: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 34 more fields]
bucketedDetailsDDL: String =
"
 CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
     match_id STRING,
     player_gamertag STRING,
     player_total_kills INTEGER,
     player_total_deaths INTEGER
 )
 USING iceberg
 PARTITIONED BY (bucket(16, match_id));
 "
res9: org.apache.spark.sql.DataFrame = []


In [16]:
matchDetailsBucketed.select(
   $"match_id", $"player_gamertag", $"player_total_kills", $"player_total_deaths")
   .write.mode("append")
  .bucketBy(16, "match_id").saveAsTable("bootcamp.match_details_bucketed")

In [17]:
spark.sql("SELECT COUNT(1) as num_files FROM bootcamp.match_details_bucketed.files").show()

+---------+
|num_files|
+---------+
|       16|
+---------+



In [18]:
spark.sql("select * from bootcamp.match_details_bucketed.files").show()

+-------+--------------------+-----------+-------+---------+------------+------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+------------+-------------+------------+-------------+--------------------+
|content|           file_path|file_format|spec_id|partition|record_count|file_size_in_bytes|        column_sizes|        value_counts|   null_value_counts|nan_value_counts|        lower_bounds|        upper_bounds|key_metadata|split_offsets|equality_ids|sort_order_id|    readable_metrics|
+-------+--------------------+-----------+-------+---------+------------+------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+------------+-------------+------------+-------------+--------------------+
|      0|s3://warehouse/bo...|    PARQUET|      0|      {6}|        9377|            121547|{1 -> 29609, 2 ->...|{1 -> 9377, 2 -> 

In [17]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")
//disables Sparks auto broadcast join mechanism

matchesBucketed.createOrReplaceTempView("matches")
matchDetailsBucketed.createOrReplaceTempView("match_details")
// once you've created the temporary view of a Spark data frame, you can use it directly in your SQL queries as if it were a table

spark.sql("""
    SELECT * FROM bootcamp.match_details_bucketed mdb JOIN bootcamp.matches_bucketed md 
    ON mdb.match_id = md.match_id
    AND md.completion_date = DATE('2016-01-01')
        
""").explain()
// this does merge join but within the buckets
// Batch scan means bucketed scan.
// There are no exchange

spark.sql("""
    SELECT * FROM match_details mdb JOIN matches md ON mdb.match_id = md.match_id
        
""").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [match_id#37356], [match_id#37360], Inner
   :- Sort [match_id#37356 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(match_id#37356, 200), ENSURE_REQUIREMENTS, [plan_id=16105]
   :     +- BatchScan demo.bootcamp.match_details_bucketed[match_id#37356, player_gamertag#37357, player_total_kills#37358, player_total_deaths#37359] demo.bootcamp.match_details_bucketed (branch=null) [filters=match_id IS NOT NULL, groupedBy=] RuntimeFilters: []
   +- Sort [match_id#37360 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(match_id#37360, 200), ENSURE_REQUIREMENTS, [plan_id=16106]
         +- BatchScan demo.bootcamp.matches_bucketed[match_id#37360, is_team_game#37361, playlist_id#37362, completion_date#37363] demo.bootcamp.matches_bucketed (branch=null) [filters=completion_date IS NOT NULL, completion_date = 1451606400000000, match_id IS NOT NULL, groupedBy=] RuntimeFilters: []


== Physical Plan ==


In [16]:
# // we didnt do any of the following rest of code in the lab
# // spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "1000000000000")

# // val broadcastFromThreshold = matches.as("m").join(matchDetails.as("md"), $"m.match_id" === $"md.match_id")
# //   .select($"m.completion_date", $"md.player_gamertag",  $"md.player_total_kills")
# //   .take(5)

# // val explicitBroadcast = matches.as("m").join(broadcast(matchDetails).as("md"), $"m.match_id" === $"md.match_id")
# //   .select($"md.*", split($"completion_date", " ").getItem(0).as("ds"))

# // val bucketedValues = matchDetailsBucketed.as("mdb").join(matchesBucketed.as("mb"), $"mb.match_id" === $"mdb.match_id").explain()
# // // .take(5)

# // val values = matchDetailsBucketed.as("m").join(matchesBucketed.as("md"), $"m.match_id" === $"md.match_id").explain()

# // explicitBroadcast.write.mode("overwrite").insertInto("match_details_bucketed")

# // matches.withColumn("ds", split($"completion_date", " ").getItem(0)).write.mode("overwrite").insertInto("matches_bucketed")

# // spark.sql(bucketedSQL)

<console>: 2: error: ';' expected but '#' found.