In [None]:

spark.sql("DROP table bootcamp.matches_bucketed")

In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col}
import org.apache.spark.storage.StorageLevel

val spark = SparkSession.builder()
  .appName("IcebergTableManagement") 
  .config("spark.executor.memory", "4g")
  .config("spark.driver.memory", "4g")
  .config("spark.sql.shuffle.partitions", "200") // Fine for large datasets
  .config("spark.sql.files.maxPartitionBytes", "134217728") // Optional: 128 MB is default
  .config("spark.sql.autoBroadcastJoinThreshold", "-1") // Optional: Disable broadcast join
  .config("spark.dynamicAllocation.enabled", "true") // Helps with resource allocation
  .config("spark.dynamicAllocation.minExecutors", "1") // Ensure minimum resources
  .config("spark.dynamicAllocation.maxExecutors", "50") // Scalable resource allocation
  .getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://1ea50b4f30b2:4041
SparkContext available as 'sc' (version = 3.5.5, master = local[*], app id = local-1755388306260)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.storage.StorageLevel
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7c29127a


In [1]:



val matchesBucketedselect = spark.read.option("header", "true")
  .option("inferSchema", "true")
  .csv("/home/iceberg/data/matches.csv")

// Get distinct completion dates
val distinctDates = matchesBucketedselect.select("completion_date").distinct().collect()

// Create the Iceberg table if it doesn't exist
spark.sql("""DROP TABLE IF EXISTS bootcamp.matches_bucketed""")
val bucketedDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
    match_id STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    completion_date TIMESTAMP
)
USING iceberg
PARTITIONED BY (completion_date, bucket(16, match_id))
"""
spark.sql(bucketedDDL)

// Process data in chunks based on completion_date
distinctDates.foreach { row =>
  val date = row.getAs[java.sql.Timestamp]("completion_date")
  val filteredMatches = matchesBucketedselect.filter(col("completion_date") === date)
  
  // Repartition and persist the filtered data
  val optimizedMatches = filteredMatches
    .select($"match_id", $"is_team_game", $"playlist_id", $"completion_date")
    .repartition(16, $"match_id")
    .persist(StorageLevel.MEMORY_AND_DISK)
    
  optimizedMatches.write
    .mode("append")
    .bucketBy(16, "match_id")
    .partitionBy("completion_date")
    .saveAsTable("bootcamp.matches_bucketed")
}

// Verify the data in the table
val result = spark.sql("SELECT * FROM bootcamp.matches_bucketed")
result.show()


Intitializing Scala interpreter ...

Spark Web UI available at http://1ea50b4f30b2:4045
SparkContext available as 'sc' (version = 3.5.5, master = local[*], app id = local-1755367894001)
SparkSession available as 'spark'


+--------------------+------------+--------------------+-------------------+
|            match_id|is_team_game|         playlist_id|    completion_date|
+--------------------+------------+--------------------+-------------------+
|d6f3478c-255f-43d...|        true|892189e9-d712-4bd...|2016-08-28 00:00:00|
|2b7b17c4-716f-42d...|        true|355dc154-9809-4ed...|2016-02-25 00:00:00|
|821762ff-49b7-4a7...|        true|f72e0ef0-7c4a-430...|2016-02-25 00:00:00|
|d683cc2b-eef6-442...|        true|355dc154-9809-4ed...|2016-02-25 00:00:00|
|383f4a85-eb1f-49d...|        true|355dc154-9809-4ed...|2016-02-25 00:00:00|
|da4d676b-7e95-4f7...|        true|f72e0ef0-7c4a-430...|2016-02-25 00:00:00|
|a1160c68-bbcb-4ab...|        true|f72e0ef0-7c4a-430...|2016-02-25 00:00:00|
|c72bc126-4eba-4df...|        true|f72e0ef0-7c4a-430...|2016-02-25 00:00:00|
|ab0f875b-0d43-4b9...|        true|355dc154-9809-4ed...|2016-02-25 00:00:00|
|7958d054-1d78-4ff...|        true|355dc154-9809-4ed...|2016-02-25 00:00:00|

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.storage.StorageLevel
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6bdbcfff
matchesBucketedselect: org.apache.spark.sql.DataFrame = [match_id: string, mapid: string ... 8 more fields]
distinctDates: Array[org.apache.spark.sql.Row] = Array([2016-03-13 00:00:00.0], [2016-03-11 00:00:00.0], [2016-03-10 00:00:00.0], [2016-01-30 00:00:00.0], [2016-03-27 00:00:00.0], [2016-04-10 00:00:00.0], [2016-01-18 00:00:00.0], [2016-02-01 00:00:00.0], [2015-12-14 00:00:00.0], [2016-02-03 00:00:00.0], [2016-04-30 00:00:00.0], [2016-03-05 00:00:00.0], [2016-04-15 00:00:00.0], [2016-05-21 00:00:00.0], [2015-10-31 00:00:00.0], [2016-01-22 00:00:00.0], [2016-02-09 00:00:00...


In [2]:

spark.sql("SELECT COUNT(1) as num_files FROM bootcamp.matches_bucketed.files").show()

+---------+
|num_files|
+---------+
|     3665|
+---------+



In [4]:
val matchDetailsBucketed =  spark.read.option("header", "true")
                        .option("inferSchema", "true")
                        .csv("/home/iceberg/data/match_details.csv")

val bucketedDetailsDDL = """
 CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
     match_id STRING,
     player_gamertag STRING,
     player_total_kills INTEGER,
     player_total_deaths INTEGER
 )
 USING iceberg
 PARTITIONED BY (bucket(16, match_id));
 """
spark.sql(bucketedDetailsDDL)

matchDetailsBucketed.select(
    $"match_id", $"player_gamertag", $"player_total_kills", $"player_total_deaths")
    .write.mode("append")
    .bucketBy(16, "match_id").saveAsTable("bootcamp.match_details_bucketed")

matchDetailsBucketed: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 34 more fields]
bucketedDetailsDDL: String =
"
 CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
     match_id STRING,
     player_gamertag STRING,
     player_total_kills INTEGER,
     player_total_deaths INTEGER
 )
 USING iceberg
 PARTITIONED BY (bucket(16, match_id));
 "


In [6]:
// Verify the data in the table
val result = spark.sql("SELECT * FROM bootcamp.match_details_bucketed")
result.show()


+--------------------+---------------+------------------+-------------------+
|            match_id|player_gamertag|player_total_kills|player_total_deaths|
+--------------------+---------------+------------------+-------------------+
|f8852913-2ccf-46f...|    OneWingKing|                 7|                  6|
|155cfd23-4f97-4f1...|   BigChubSmith|                15|                 11|
|155cfd23-4f97-4f1...|  JakeWilson801|                18|                  9|
|155cfd23-4f97-4f1...|      taterbase|                 1|                 12|
|155cfd23-4f97-4f1...| BeyondHumanx39|                13|                 14|
|155cfd23-4f97-4f1...|   Twinsnakes05|                16|                 11|
|155cfd23-4f97-4f1...|  Maverick62011|                 9|                 14|
|155cfd23-4f97-4f1...|       EcZachly|                16|                 16|
|155cfd23-4f97-4f1...|      WhiteSpic|                10|                 12|
|b8d81721-befb-427...|  JakeWilson801|                16|       

result: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 2 more fields]


In [5]:
spark.sql("SELECT COUNT(1) as num_files FROM bootcamp.match_details_bucketed.files").show()

+---------+
|num_files|
+---------+
|       16|
+---------+



In [8]:
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

matchesBucketedselect.createOrReplaceTempView("matches")
matchDetailsBucketed.createOrReplaceTempView("match_details")

spark.sql("""
    SELECT * FROM bootcamp.match_details_bucketed mdb JOIN bootcamp.matches_bucketed md 
    ON mdb.match_id = md.match_id
    AND md.completion_date = DATE('2016-01-01')
        
""").explain()


spark.sql("""
    SELECT * FROM match_details mdb JOIN matches md ON mdb.match_id = md.match_id
        
""").explain()


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [match_id#37221], [match_id#37225], Inner
   :- Sort [match_id#37221 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(match_id#37221, 200), ENSURE_REQUIREMENTS, [plan_id=15575]
   :     +- BatchScan demo.bootcamp.match_details_bucketed[match_id#37221, player_gamertag#37222, player_total_kills#37223, player_total_deaths#37224] demo.bootcamp.match_details_bucketed (branch=null) [filters=match_id IS NOT NULL, groupedBy=] RuntimeFilters: []
   +- Sort [match_id#37225 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(match_id#37225, 200), ENSURE_REQUIREMENTS, [plan_id=15576]
         +- BatchScan demo.bootcamp.matches_bucketed[match_id#37225, is_team_game#37226, playlist_id#37227, completion_date#37228] demo.bootcamp.matches_bucketed (branch=null) [filters=completion_date IS NOT NULL, completion_date = 1451606400000000, match_id IS NOT NULL, groupedBy=] RuntimeFilters: []


== Physical Plan ==
