### Giving more resources to SPARK

In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col}
import org.apache.spark.storage.StorageLevel

val spark = SparkSession.builder()
  .appName("IcebergTableManagement") 
  .config("spark.executor.memory", "32g")
  .config("spark.driver.memory", "32g")
  .config("spark.sql.shuffle.partitions", "200") // Fine for large datasets
  .config("spark.sql.files.maxPartitionBytes", "134217728") // Optional: 128 MB is default
  .config("spark.sql.autoBroadcastJoinThreshold", "-1") // Optional: Disable broadcast join
  .config("spark.dynamicAllocation.enabled", "true") // Helps with resource allocation
  .config("spark.dynamicAllocation.minExecutors", "1") // Ensure minimum resources
  .config("spark.dynamicAllocation.maxExecutors", "50") // Scalable resource allocation
  .getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://d772c787a954:4041
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1734576327409)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.storage.StorageLevel
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@73d0495


### Loading Data

In [62]:
// In python use: from pyspark.sql.functions import broadcast, split, lit
import org.apache.spark.sql.functions.{broadcast, split, lit}


val matchesBucketed = spark.read.option("header", "true")
                        .option("inferSchema", "true")
                        .csv("/home/iceberg/data/matches.csv")
                        
val matchDetailsBucketed =  spark.read.option("header", "true")
                        .option("inferSchema", "true")
                        .csv("/home/iceberg/data/match_details.csv")

import org.apache.spark.sql.functions.{broadcast, split, lit}
matchesBucketed: org.apache.spark.sql.DataFrame = [match_id: string, mapid: string ... 8 more fields]
matchDetailsBucketed: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 34 more fields]


In [63]:
matchesBucketed.select($"match_id", $"is_team_game", $"playlist_id", $"completion_date").show(5)
matchesBucketed.count()

+--------------------+------------+--------------------+-------------------+
|            match_id|is_team_game|         playlist_id|    completion_date|
+--------------------+------------+--------------------+-------------------+
|11de1a94-8d07-416...|        true|f72e0ef0-7c4a-430...|2016-02-22 00:00:00|
|d3643e71-3e51-43e...|       false|d0766624-dbd7-453...|2016-02-14 00:00:00|
|d78d2aae-36e4-48a...|        true|f72e0ef0-7c4a-430...|2016-03-24 00:00:00|
|b440069e-ec5f-4f5...|        true|f72e0ef0-7c4a-430...|2015-12-23 00:00:00|
|1dd475fc-ee6b-4e1...|        true|0e39ead4-383b-445...|2016-04-07 00:00:00|
+--------------------+------------+--------------------+-------------------+
only showing top 5 rows



res41: Long = 24025


In [64]:
matchDetailsBucketed.select($"match_id", $"player_gamertag", $"player_total_kills", $"player_total_deaths").show(5)
matchDetailsBucketed.count()

+--------------------+---------------+------------------+-------------------+
|            match_id|player_gamertag|player_total_kills|player_total_deaths|
+--------------------+---------------+------------------+-------------------+
|71d79b23-4143-435...|      taterbase|                 6|                 13|
|71d79b23-4143-435...| SuPeRSaYaInG0D|                 7|                 18|
|71d79b23-4143-435...|       EcZachly|                12|                 10|
|71d79b23-4143-435...|    johnsnake04|                13|                  9|
|71d79b23-4143-435...| Super Mac Bros|                13|                 15|
+--------------------+---------------+------------------+-------------------+
only showing top 5 rows



res42: Long = 151761


In [65]:
matchDetailsBucketed.select($"match_id", $"player_gamertag", $"player_total_kills", $"player_total_deaths").describe()

res43: org.apache.spark.sql.DataFrame = [summary: string, match_id: string ... 3 more fields]


In [66]:
val matches = matchesBucketed.select($"match_id", $"is_team_game", $"playlist_id", $"completion_date")
val matchDetails = matchDetailsBucketed.select($"match_id", $"player_gamertag", $"player_total_kills", $"player_total_deaths")

matches: org.apache.spark.sql.DataFrame = [match_id: string, is_team_game: boolean ... 2 more fields]
matchDetails: org.apache.spark.sql.DataFrame = [match_id: string, player_gamertag: string ... 2 more fields]


In [67]:
matches.show(5)
matchDetails.show(5)

+--------------------+------------+--------------------+-------------------+
|            match_id|is_team_game|         playlist_id|    completion_date|
+--------------------+------------+--------------------+-------------------+
|11de1a94-8d07-416...|        true|f72e0ef0-7c4a-430...|2016-02-22 00:00:00|
|d3643e71-3e51-43e...|       false|d0766624-dbd7-453...|2016-02-14 00:00:00|
|d78d2aae-36e4-48a...|        true|f72e0ef0-7c4a-430...|2016-03-24 00:00:00|
|b440069e-ec5f-4f5...|        true|f72e0ef0-7c4a-430...|2015-12-23 00:00:00|
|1dd475fc-ee6b-4e1...|        true|0e39ead4-383b-445...|2016-04-07 00:00:00|
+--------------------+------------+--------------------+-------------------+
only showing top 5 rows

+--------------------+---------------+------------------+-------------------+
|            match_id|player_gamertag|player_total_kills|player_total_deaths|
+--------------------+---------------+------------------+-------------------+
|71d79b23-4143-435...|      taterbase|          

### Loading data from CSV -> Iceberg Table with buckets

**Table 1 - Matches**

In [5]:
spark.sql("""DROP TABLE IF EXISTS bootcamp.matches_bucketed;""")

res2: org.apache.spark.sql.DataFrame = []


In [6]:
val bucketedDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
     match_id STRING,
     is_team_game BOOLEAN,
     playlist_id STRING,
     completion_date TIMESTAMP
 )
 USING iceberg
 PARTITIONED BY (completion_date, bucket(16, match_id));
 """
spark.sql(bucketedDDL)

bucketedDDL: String =
"
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
     match_id STRING,
     is_team_game BOOLEAN,
     playlist_id STRING,
     completion_date TIMESTAMP
 )
 USING iceberg
 PARTITIONED BY (completion_date, bucket(16, match_id));
 "
res3: org.apache.spark.sql.DataFrame = []


**Directly trying to save does not work!**

In [7]:
//matchesBucketed.select($"match_id", $"is_team_game", $"playlist_id", $"completion_date")
//.write.mode("append")
//.partitionBy("completion_date")
//.bucketBy(4, "match_id")
//.saveAsTable("bootcamp.matches_bucketed")

**Processing in Batches**

In [8]:
// Get distinct completion dates
val distinctDates = matchesBucketed.select("completion_date").distinct().collect()

distinctDates: Array[org.apache.spark.sql.Row] = Array([2016-03-13 00:00:00.0], [2016-03-11 00:00:00.0], [2016-03-10 00:00:00.0], [2016-01-30 00:00:00.0], [2016-03-27 00:00:00.0], [2016-04-10 00:00:00.0], [2016-01-18 00:00:00.0], [2016-02-01 00:00:00.0], [2015-12-14 00:00:00.0], [2016-02-03 00:00:00.0], [2016-04-30 00:00:00.0], [2016-03-05 00:00:00.0], [2016-04-15 00:00:00.0], [2016-05-21 00:00:00.0], [2015-10-31 00:00:00.0], [2016-01-22 00:00:00.0], [2016-02-09 00:00:00.0], [2016-03-17 00:00:00.0], [2016-04-04 00:00:00.0], [2016-05-08 00:00:00.0], [2016-01-21 00:00:00.0], [2015-10-28 00:00:00.0], [2016-03-30 00:00:00.0], [2016-05-03 00:00:00.0], [2016-02-04 00:00:00.0], [2015-11-25 00:00:00.0], [2016-01-13 00:00:00.0], [2016-04-29 00:00:00.0], [2016-05-18 00:00:00.0], [2016-03-24 00:00...


In [9]:
// Process data in chunks based on completion_date
distinctDates.foreach { row =>
  val date = row.getAs[java.sql.Timestamp]("completion_date")
  val filteredMatches = matchesBucketed.filter(col("completion_date") === date)
  
  // Repartition and persist the filtered data
  val optimizedMatches = filteredMatches
    .select($"match_id", $"is_team_game", $"playlist_id", $"completion_date")
    .repartition(16, $"match_id")
    .persist(StorageLevel.MEMORY_AND_DISK)
    
  optimizedMatches.write
    .mode("append")
    .bucketBy(16, "match_id")
    .partitionBy("completion_date")
    .saveAsTable("bootcamp.matches_bucketed")
}

In [10]:
// Verify the data in the table
spark.sql("SELECT * FROM bootcamp.matches_bucketed").show(5)

+--------------------+------------+--------------------+-------------------+
|            match_id|is_team_game|         playlist_id|    completion_date|
+--------------------+------------+--------------------+-------------------+
|faf37c7f-3f3a-4f0...|        true|b617e24f-71aa-432...|2016-05-16 00:00:00|
|cbb50ffc-714d-438...|       false|d0766624-dbd7-453...|2016-09-26 00:00:00|
|d6aea3be-8d6f-408...|        true|2323b76a-db98-4e0...|2016-08-13 00:00:00|
|9be0f082-b7fa-47f...|        true|892189e9-d712-4bd...|2015-11-12 00:00:00|
|4a7fcf11-1d90-4c9...|        true|2323b76a-db98-4e0...|2016-09-22 00:00:00|
+--------------------+------------+--------------------+-------------------+
only showing top 5 rows



In [11]:
// Verify number of files
spark.sql("SELECT COUNT(1) as num_files FROM bootcamp.matches_bucketed.files").show()

+---------+
|num_files|
+---------+
|     3665|
+---------+



**Table 2 - Match Details**

In [12]:
spark.sql("""DROP TABLE IF EXISTS bootcamp.match_details_bucketed;""")

res8: org.apache.spark.sql.DataFrame = []


In [13]:
val bucketedDetailsDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
     match_id STRING,
     player_gamertag STRING,
     player_total_kills INTEGER,
     player_total_deaths INTEGER
)
USING iceberg
PARTITIONED BY (bucket(16, match_id));
"""
spark.sql(bucketedDetailsDDL)


bucketedDetailsDDL: String =
"
CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
     match_id STRING,
     player_gamertag STRING,
     player_total_kills INTEGER,
     player_total_deaths INTEGER
)
USING iceberg
PARTITIONED BY (bucket(16, match_id));
"
res9: org.apache.spark.sql.DataFrame = []


In [53]:
//matchDetailsBucketed.select($"match_id", $"player_gamertag", $"player_total_kills", $"player_total_deaths")
//.write.mode("append")
//.bucketBy(16, "match_id").saveAsTable("bootcamp.match_details_bucketed")

In [45]:
val someField = matchDetails.select("player_total_kills").distinct().collect()

someField: Array[org.apache.spark.sql.Row] = Array([31], [34], [28], [27], [26], [44], [12], [22], [47], [1], [13], [6], [16], [3], [20], [57], [48], [5], [19], [64], [41], [15], [37], [9], [17], [35], [4], [8], [23], [39], [7], [10], [50], [45], [38], [25], [24], [29], [21], [32], [11], [33], [14], [42], [2], [30], [46], [0], [18], [36], [52], [40], [94], [54], [43], [61], [59], [49], [51], [63], [82], [62], [60], [75], [109], [58], [83], [67], [69], [56], [71], [53], [76], [96], [55], [73], [90], [66], [65], [68])


In [52]:
// Process data in chunks based on completion_date
someField.foreach { row =>
  val fieldValue = row.getAs[Int]("player_total_kills") 
  val filteredData = matchDetailsBucketed.filter(col("player_total_kills") === fieldValue)

  // Repartition and persist the filtered data
  val optimizedData = filteredData
    .select($"match_id", $"player_gamertag", $"player_total_kills", $"player_total_deaths")
    .repartition(16, $"match_id")
    .persist(StorageLevel.MEMORY_AND_DISK)
    
  optimizedData.write
    .mode("append")
    .bucketBy(16, "match_id")
    //.partitionBy("completion_date")
    .saveAsTable("bootcamp.match_details_bucketed")
}

In [54]:
// Shutting off Broadcast join to have Spark pick Bucketing 
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

//matchesBucketed.createOrReplaceTempView("matches")
//matchDetailsBucketed.createOrReplaceTempView("match_details")

spark.sql("""
    SELECT * FROM bootcamp.match_details_bucketed mdb JOIN bootcamp.matches_bucketed md 
    ON mdb.match_id = md.match_id
    AND md.completion_date = DATE('2016-01-01')        
""").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [match_id#48578], [match_id#48582], Inner
   :- Sort [match_id#48578 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(match_id#48578, 200), ENSURE_REQUIREMENTS, [plan_id=21199]
   :     +- BatchScan demo.bootcamp.match_details_bucketed[match_id#48578, player_gamertag#48579, player_total_kills#48580, player_total_deaths#48581] demo.bootcamp.match_details_bucketed (branch=null) [filters=match_id IS NOT NULL, groupedBy=] RuntimeFilters: []
   +- Sort [match_id#48582 ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(match_id#48582, 200), ENSURE_REQUIREMENTS, [plan_id=21200]
         +- BatchScan demo.bootcamp.matches_bucketed[match_id#48582, is_team_game#48583, playlist_id#48584, completion_date#48585] demo.bootcamp.matches_bucketed (branch=null) [filters=completion_date IS NOT NULL, completion_date = 1451606400000000, match_id IS NOT NULL, groupedBy=] RuntimeFilters: []




In [59]:
//Not needed to save to table. we can join on csv files directly.
//matches.write.mode("overwrite").saveAsTable("bootcamp.matches_raw")
//matchDetails.write.mode("overwrite").saveAsTable("bootcamp.match_details_raw")

In [68]:
matchesBucketed.createOrReplaceTempView("matches")
matchDetailsBucketed.createOrReplaceTempView("match_details")

In [70]:
spark.sql("""
    SELECT * FROM match_details mdb JOIN matches md ON mdb.match_id = md.match_id    
""").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- SortMergeJoin [match_id#48790], [match_id#48753], Inner
   :- Sort [match_id#48790 ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(match_id#48790, 200), ENSURE_REQUIREMENTS, [plan_id=21459]
   :     +- Filter isnotnull(match_id#48790)
   :        +- FileScan csv [match_id#48790,player_gamertag#48791,previous_spartan_rank#48792,spartan_rank#48793,previous_total_xp#48794,total_xp#48795,previous_csr_tier#48796,previous_csr_designation#48797,previous_csr#48798,previous_csr_percent_to_next_tier#48799,previous_csr_rank#48800,current_csr_tier#48801,current_csr_designation#48802,current_csr#48803,current_csr_percent_to_next_tier#48804,current_csr_rank#48805,player_rank_on_team#48806,player_finished#48807,player_average_life#48808,player_total_kills#48809,player_total_headshots#48810,player_total_weapon_damage#48811,player_total_shots_landed#48812,player_total_melee_kills#48813,... 12 more fields] Batched: false, DataFilte

In [None]:


// spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "1000000000000")

// val broadcastFromThreshold = matches.as("m").join(matchDetails.as("md"), $"m.match_id" === $"md.match_id")
//   .select($"m.completion_date", $"md.player_gamertag",  $"md.player_total_kills")
//   .take(5)

// val explicitBroadcast = matches.as("m").join(broadcast(matchDetails).as("md"), $"m.match_id" === $"md.match_id")
//   .select($"md.*", split($"completion_date", " ").getItem(0).as("ds"))

// val bucketedValues = matchDetailsBucketed.as("mdb").join(matchesBucketed.as("mb"), $"mb.match_id" === $"mdb.match_id").explain()
// // .take(5)

// val values = matchDetailsBucketed.as("m").join(matchesBucketed.as("md"), $"m.match_id" === $"md.match_id").explain()

// explicitBroadcast.write.mode("overwrite").insertInto("match_details_bucketed")

// matches.withColumn("ds", split($"completion_date", " ").getItem(0)).write.mode("overwrite").insertInto("matches_bucketed")

// spark.sql(bucketedSQL)

