In [21]:
# spark_job_pyspark.py
# PySpark script (no main function) for Spark Fundamentals Week
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, split, col, avg, count, lit

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Spark Fundamentals Week - HW1") \
    .config("spark.sql.autoBroadcastJoinThreshold", "-1") \
    .config("spark.ui.port", "4050") \
    .config("spark.sql.catalog.bootcamp", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.bootcamp.type", "hadoop") \
    .config("spark.sql.catalog.bootcamp.warehouse", "/tmp/iceberg_warehouse") \
    .getOrCreate()

# Paths to source CSVs
data_dir = "/home/iceberg/data"
matches_csv = f"{data_dir}/matches.csv"
details_csv = f"{data_dir}/match_details.csv"
mmp_csv     = f"{data_dir}/medals_matches_players.csv"
medals_csv  = f"{data_dir}/medals.csv"
maps_csv    = f"{data_dir}/maps.csv"

# Read raw CSVs
matches_df = spark.read.option("header", True).option("inferSchema", True).csv(matches_csv)
details_df = spark.read.option("header", True).option("inferSchema", True).csv(details_csv)
mmp_df     = spark.read.option("header", True).option("inferSchema", True).csv(mmp_csv)
medals_df  = spark.read.option("header", True).option("inferSchema", True).csv(medals_csv)
maps_df    = spark.read.option("header", True).option("inferSchema", True).csv(maps_csv)



                                                                                

In [22]:
maps_df.show(1)

+--------------------+-----+--------------------+
|               mapid| name|         description|
+--------------------+-----+--------------------+
|c93d708f-f206-11e...|Urban|Andesia was the c...|
+--------------------+-----+--------------------+
only showing top 1 row



In [23]:
spark.sql("DROP TABLE IF EXISTS bootcamp.matches_bucketed")
spark.sql("DROP TABLE IF EXISTS bootcamp.match_details_bucketed")
spark.sql("DROP TABLE IF EXISTS bootcamp.medal_matches_players_bucketed")


spark.sql("""
CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
    match_id STRING,
    is_team_game BOOLEAN,
    mapid STRING,
    playlist_id STRING,
    completion_date TIMESTAMP
)
USING iceberg
PARTITIONED BY (bucket(16, match_id))
""")

spark.sql("""
CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
    match_id STRING,
    player_id STRING,
    kills INT,
    deaths INT
)
USING iceberg
PARTITIONED BY (bucket(16, match_id))
""")

spark.sql("""
CREATE TABLE IF NOT EXISTS bootcamp.medal_matches_players_bucketed (
    match_id STRING,
    player_id STRING,
    medal_id STRING
)
USING iceberg
PARTITIONED BY (bucket(16, match_id))
""")

DataFrame[]

In [24]:
matches_df.select(
    col("match_id"), col("is_team_game"), col('mapid'), col("playlist_id"), col("completion_date")
).write\
  .mode("overwrite")\
  .insertInto("bootcamp.matches_bucketed")

details_df.select(
    col("match_id"),
    col("player_gamertag"),
    col("player_total_kills").alias("kills"),
    col("player_total_deaths").alias("deaths")
).write\
  .mode("overwrite")\
  .insertInto("bootcamp.match_details_bucketed")

mmp_df.select(
    col("match_id"), col("player_gamertag"), col("medal_id")
).write\
  .mode("overwrite")\
  .insertInto("bootcamp.medal_matches_players_bucketed")

                                                                                

In [25]:
# Prepare broadcast dims
broadcast_medals = broadcast(medals_df)
broadcast_maps   = broadcast(maps_df)

# Load bucketed tables
matches_bucketed  = spark.table("bootcamp.matches_bucketed")
match_details_bucketed  = spark.table("bootcamp.match_details_bucketed")
medal_matches_players_bucketed = spark.table("bootcamp.medal_matches_players_bucketed")

# 1) Explain bucketed join plan
print("=== Bucketed join plan ===")
matches_bucketed.join(match_details_bucketed, on="match_id").explain(True)

# 2) Explain non-bucketed temp view join
matches_df.createOrReplaceTempView("matches")
details_df.createOrReplaceTempView("details")
print("=== TempView non-bucketed join plan ===")
spark.sql("SELECT * FROM details d JOIN matches m ON d.match_id = m.match_id").explain(True)

# 3) Explain explicit broadcast join
print("=== Explicit broadcast join plan ===")
matches_bucketed.join(broadcast(match_details_bucketed), on="match_id").explain(True)

medal_matches_players_bucketed = medal_matches_players_bucketed.withColumnRenamed("player_id", "mpm_player_id")
broadcast_maps = broadcast_maps.withColumnRenamed("description", "maps_description")\
                               .withColumnRenamed("name", "map_name")

# 4) Multi-way join with dimensional tables
joined = match_details_bucketed \
    .join(matches_bucketed, on="match_id") \
    .join(medal_matches_players_bucketed, on="match_id") \
    .join(broadcast_medals, on="medal_id") \
    .join(broadcast_maps, on = "mapid")

=== Bucketed join plan ===
== Parsed Logical Plan ==
'Join UsingJoin(Inner, [match_id])
:- SubqueryAlias bootcamp.matches_bucketed
:  +- RelationV2[match_id#1915, is_team_game#1916, mapid#1917, playlist_id#1918, completion_date#1919] bootcamp.matches_bucketed bootcamp.matches_bucketed
+- SubqueryAlias bootcamp.match_details_bucketed
   +- RelationV2[match_id#1925, player_id#1926, kills#1927, deaths#1928] bootcamp.match_details_bucketed bootcamp.match_details_bucketed

== Analyzed Logical Plan ==
match_id: string, is_team_game: boolean, mapid: string, playlist_id: string, completion_date: timestamp, player_id: string, kills: int, deaths: int
Project [match_id#1915, is_team_game#1916, mapid#1917, playlist_id#1918, completion_date#1919, player_id#1926, kills#1927, deaths#1928]
+- Join Inner, (match_id#1915 = match_id#1925)
   :- SubqueryAlias bootcamp.matches_bucketed
   :  +- RelationV2[match_id#1915, is_team_game#1916, mapid#1917, playlist_id#1918, completion_date#1919] bootcamp.matches

In [26]:
joined.head(1)

                                                                                

[Row(mapid='cc040aa1-f206-11e4-a3e0-24be05e24f7e', medal_id='3261908037', match_id='00169217-cca6-4b47-8df0-559ee424143f', player_id='King Terror V', kills=14, deaths=7, is_team_game=True, playlist_id='2323b76a-db98-4e03-aa37-e171cfbdd1a4', completion_date=datetime.datetime(2016, 3, 13, 0, 0), mpm_player_id='King Terror V', sprite_uri='https://content.halocdn.com/media/Default/games/halo-5-guardians/sprites/medalspritesheet-be288ea5c0994a4e9d36f43aee7bc631.png', sprite_left=375, sprite_top=525, sprite_sheet_width=74, sprite_sheet_height=74, sprite_width=1125, sprite_height=899, classification='WeaponProficiency', description='Kill an opponent by shooting them in the head.', name='Headshot', difficulty=60, map_name='Fathom', maps_description='The UNSC explores Beta Gabriel’s vast oceans in hopes of recovering wreckage hidden in the crushing depths.')]

In [27]:
# Aggregations
# a) Player with highest avg kills
top_kills = joined.groupBy(match_details_bucketed["player_id"]).agg(avg("kills").alias("avg_kills")).orderBy(col("avg_kills").desc())
print("Top players by average kills:")
top_kills.show(1)

# b) Most-played playlist
top_playlists = joined.groupBy(col("playlist_id")).agg(count("match_id").alias("play_count")).orderBy(col("play_count").desc())
print("Top playlists:")
top_playlists.show(1)

# c) Most-played map
top_maps = joined.groupBy(broadcast_maps["mapid"]).agg(count("match_id").alias("play_count")).orderBy(col("play_count").desc())
print("Top maps:")
top_maps.show(1)

# d) Map with most Killing Spree medals
top_spree_maps = joined.filter(broadcast_medals["name"] == "Killing Spree") \
    .groupBy(broadcast_maps["mapid"]).agg(count("medal_id").alias("spree_count")).orderBy(col("spree_count").desc())
print("Maps with most Killing Spree medals:")
top_spree_maps.show(1)

Top players by average kills:


                                                                                

+------------+---------+
|   player_id|avg_kills|
+------------+---------+
|gimpinator14|    109.0|
+------------+---------+
only showing top 1 row

Top playlists:


                                                                                

+--------------------+----------+
|         playlist_id|play_count|
+--------------------+----------+
|f72e0ef0-7c4a-430...|   1565529|
+--------------------+----------+
only showing top 1 row

Top maps:


                                                                                

+--------------------+----------+
|               mapid|play_count|
+--------------------+----------+
|c74c9d0f-f206-11e...|   1445545|
+--------------------+----------+
only showing top 1 row

Maps with most Killing Spree medals:




+--------------------+-----------+
|               mapid|spree_count|
+--------------------+-----------+
|c74c9d0f-f206-11e...|      56908|
+--------------------+-----------+
only showing top 1 row



                                                                                

In [19]:
def get_dir_size(path: str) -> int:
    total = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.exists(fp):
                total += os.path.getsize(fp)
    return total

In [20]:
# sortWithinPartitions experiments on low-cardinality columns
partition_sizes = [4, 8, 16]
keys = ["playlist_id", "mapid"]
results = []

for num_parts in partition_sizes:
    for key in keys:
        sorted_df = (
            joined.select(*joined.columns)
            .repartition(num_parts, col(key))
            .sortWithinPartitions(col(key))
        )
        out_path = f"/tmp/sorted_by_{key}_{num_parts}"
        sorted_df.write.mode("overwrite").parquet(out_path)
        size = get_dir_size(out_path)
        print(f"Written sorted by {key} ({num_parts} parts) -> size: {size} bytes")
        results.append((key, num_parts, size))

# Display results
spark.createDataFrame(results, schema=["method", "partitions", "size_bytes"]).show()

                                                                                

Written sorted by playlist_id (4 parts) -> size: 51389518 bytes


                                                                                

Written sorted by mapid (4 parts) -> size: 54499926 bytes


Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/lib/python3.10/socket.py", line 717, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
                                                                                

KeyboardInterrupt: 