In [1]:
// Start from a clean slate: remove any previous copy of the bucketed table.
// IF EXISTS keeps this cell re-runnable when the table has not been created yet
// (a plain DROP TABLE raises an error for a missing table).
spark.sql("DROP TABLE IF EXISTS bootcamp.matches_bucketed")

Initializing Scala interpreter ...

Spark Web UI available at http://88fd5f590760:4042
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1733674486493)
SparkSession available as 'spark'


res0: org.apache.spark.sql.DataFrame = []


In [2]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col}
import org.apache.spark.storage.StorageLevel

// Build (or reuse) the SparkSession used by the rest of this notebook.
// The settings are applied to the builder in the order listed below.
// NOTE(review): the kernel banner reports a SparkSession is already available as
// 'spark', so getOrCreate() presumably returns that existing session; JVM-level
// settings such as driver/executor memory may then not take effect - confirm.
val spark = {
  val settings = Seq(
    "spark.executor.memory"                -> "4g",
    "spark.driver.memory"                  -> "4g",
    "spark.sql.shuffle.partitions"         -> "200",       // number of shuffle partitions
    "spark.sql.files.maxPartitionBytes"    -> "134217728", // 128 MB per file-scan partition
    "spark.sql.autoBroadcastJoinThreshold" -> "-1",        // -1 turns automatic broadcast joins off
    "spark.dynamicAllocation.enabled"      -> "true",
    "spark.dynamicAllocation.minExecutors" -> "1",
    "spark.dynamicAllocation.maxExecutors" -> "50"
  )

  settings
    .foldLeft(SparkSession.builder().appName("IcebergTableManagement")) {
      case (builder, (key, value)) => builder.config(key, value)
    }
    .getOrCreate()
}


// Load the raw matches CSV: column names come from the header row and column
// types are inferred by Spark.
// The DataFrame is bound to `matchesBucketed`, which is the name the statements
// further down in this cell read from; previously only `matchesBucketedselect`
// was declared, leaving `matchesBucketed` undefined.
val matchesBucketed = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("/home/iceberg/data/matches.csv")

// Backward-compatible alias for the name this statement originally declared.
val matchesBucketedselect = matchesBucketed

// Pull every distinct completion_date value back to the driver as an Array[Row];
// each entry drives one chunked write below.
val distinctDates = matchesBucketed
  .select(col("completion_date"))
  .distinct()
  .collect()

// Create the Iceberg table if it doesn't exist.
// The table keeps four columns (match_id, is_team_game, playlist_id, completion_date)
// and is partitioned by the completion_date value plus a 16-way bucket of match_id.
// IF NOT EXISTS makes the statement a no-op when the table is already present.
val bucketedDDL = """
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
    match_id STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    completion_date TIMESTAMP
)
USING iceberg
PARTITIONED BY (completion_date, bucket(16, match_id))
"""
spark.sql(bucketedDDL)

// Process data in chunks based on completion_date: one append per distinct date,
// so only a single date's rows are shuffled and written at a time.
distinctDates.foreach { row =>
  // May be null if the source contains rows without a completion date.
  val date = row.getAs[java.sql.Timestamp]("completion_date")

  val matchesForDate = matchesBucketed
    // `<=>` is null-safe equality. With `===`, a null date compares to NULL and
    // matches nothing, so rows lacking a completion_date would be silently dropped.
    .filter(col("completion_date") <=> date)
    // Keep only the four columns the target table declares. `col` is used rather
    // than the `$"..."` interpolator so the cell does not depend on
    // `spark.implicits._` being in scope.
    .select(col("match_id"), col("is_team_game"), col("playlist_id"), col("completion_date"))
    // Hash-distribute by match_id into 16 partitions, so at most 16 tasks write
    // files for this date.
    .repartition(16, col("match_id"))

  // No persist() here: the DataFrame is consumed exactly once by the write below,
  // and caching it on every iteration without a matching unpersist() would leave
  // one cached copy per date behind for the lifetime of the session.
  matchesForDate.write
    .mode("append")
    // NOTE(review): bucketBy/partitionBy mirror the table's
    // PARTITIONED BY (completion_date, bucket(16, match_id)) clause; keep them in
    // sync with the DDL if the partitioning ever changes.
    .bucketBy(16, "match_id")
    .partitionBy("completion_date")
    .saveAsTable("bootcamp.matches_bucketed")
}

// Sanity check: read the table back and print the first 20 rows
// (long cell values are truncated in the printed output).
val result = spark.sql("SELECT * FROM bootcamp.matches_bucketed")
result.show(20, truncate = true)

+--------------------+------------+--------------------+-------------------+
|            match_id|is_team_game|         playlist_id|    completion_date|
+--------------------+------------+--------------------+-------------------+
|0df7e36f-9501-483...|        true|2323b76a-db98-4e0...|2016-08-07 00:00:00|
|a582acd7-aea5-419...|        NULL|f72e0ef0-7c4a-430...|2015-12-26 00:00:00|
|7d2b104b-af02-49b...|        NULL|f72e0ef0-7c4a-430...|2015-12-26 00:00:00|
|fe41a901-7afe-408...|        NULL|2323b76a-db98-4e0...|2015-12-26 00:00:00|
|0e05752a-10f2-493...|        true|bc0f8ad6-31e6-4a1...|2015-12-26 00:00:00|
|ceeeefd4-ce81-49e...|        NULL|2323b76a-db98-4e0...|2015-12-26 00:00:00|
|d7a45423-226b-47a...|        NULL|d0766624-dbd7-453...|2015-12-26 00:00:00|
|7d72b72e-3864-403...|        NULL|f72e0ef0-7c4a-430...|2015-12-26 00:00:00|
|340905d8-f5ce-45c...|        true|bc0f8ad6-31e6-4a1...|2015-12-26 00:00:00|
|6e49636a-e9d1-4f1...|        true|2323b76a-db98-4e0...|2016-08-07 00:00:00|

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.storage.StorageLevel
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7e56f5e9
matchesBucketedselect: org.apache.spark.sql.DataFrame = [match_id: string, mapid: string ... 8 more fields]
distinctDates: Array[org.apache.spark.sql.Row] = Array([2016-03-13 00:00:00.0], [2016-03-11 00:00:00.0], [2016-03-10 00:00:00.0], [2016-01-30 00:00:00.0], [2016-03-27 00:00:00.0], [2016-04-10 00:00:00.0], [2016-01-18 00:00:00.0], [2016-02-01 00:00:00.0], [2015-12-14 00:00:00.0], [2016-02-03 00:00:00.0], [2016-04-30 00:00:00.0], [2016-03-05 00:00:00.0], [2016-04-15 00:00:00.0], [2016-05-21 00:00:00.0], [2015-10-31 00:00:00.0], [2016-01-22 00:00:00.0], [2016-02-09 00:00:00...


In [3]:
// Count the rows of the table's `files` metadata view, i.e. how many data files
// the per-date appends produced, and print the single-row result.
spark.sql(
  "SELECT COUNT(1) as num_files FROM bootcamp.matches_bucketed.files"
).show(20, truncate = true)

+---------+
|num_files|
+---------+
|     3665|
+---------+

