In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast, split, lit, col
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType, DoubleType, LongType

# Initialize a Spark Session
spark = SparkSession.builder.appName("Assignment3").getOrCreate()

In [2]:
# Task 1: Disable automatic broadcast join

spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

In [4]:
# Task 2: Explicitly Broadcast Join medals and maps
# Define required case classes that represent schemas for the data

# Define schema for Matches
matches_schema = StructType([
    StructField("match_id", StringType(), True),
    StructField("mapid", StringType(), True),
    StructField("is_team_game", BooleanType(), True),
    StructField("playlist_id", StringType(), True),
    StructField("game_variant_id", StringType(), True),
    StructField("is_match_over", BooleanType(), True),
    StructField("completion_date", StringType(), True),
    StructField("match_duration", StringType(), True),
    StructField("game_mode", StringType(), True),
    StructField("map_variant_id", StringType(), True)
])

# Define schema for MatchDetails
match_details_schema = StructType([
    StructField("match_id", StringType(), True),
    StructField("player_gamertag", StringType(), True),
    StructField("previous_spartan_rank", IntegerType(), True),
    StructField("spartan_rank", IntegerType(), True),
    StructField("previous_total_xp", IntegerType(), True),
    StructField("total_xp", IntegerType(), True),
    StructField("previous_csr_tier", IntegerType(), True),
    StructField("previous_csr_designation", IntegerType(), True),
    StructField("previous_csr", IntegerType(), True),
    StructField("previous_csr_percent_to_next_tier", IntegerType(), True),
    StructField("previous_csr_rank", IntegerType(), True),
    StructField("current_csr_tier", IntegerType(), True),
    StructField("current_csr_designation", IntegerType(), True),
    StructField("current_csr", IntegerType(), True),
    StructField("current_csr_percent_to_next_tier", IntegerType(), True),
    StructField("current_csr_rank", IntegerType(), True),
    StructField("player_rank_on_team", IntegerType(), True),
    StructField("player_finished", BooleanType(), True),
    StructField("player_average_life", StringType(), True),
    StructField("player_total_kills", IntegerType(), True),
    StructField("player_total_headshots", IntegerType(), True),
    StructField("player_total_weapon_damage", DoubleType(), True),
    StructField("player_total_shots_landed", DoubleType(), True),
    StructField("player_total_melee_kills", DoubleType(), True),
    StructField("player_total_melee_damage", DoubleType(), True),
    StructField("player_total_assassinations", DoubleType(), True),
    StructField("player_total_ground_pound_kills", DoubleType(), True),
    StructField("player_total_shoulder_bash_kills", DoubleType(), True),
    StructField("player_total_grenade_damage", DoubleType(), True),
    StructField("player_total_power_weapon_damage", DoubleType(), True),
    StructField("player_total_power_weapon_grabs", DoubleType(), True),
    StructField("player_total_deaths", DoubleType(), True),
    StructField("player_total_assists", DoubleType(), True),
    StructField("player_total_grenade_kills", DoubleType(), True),
    StructField("did_win", DoubleType(), True),
    StructField("team_id", DoubleType(), True)
])

# Define schema for Maps
maps_schema = StructType([
    StructField("mapid", StringType(), True),
    StructField("name", StringType(), True),
    StructField("description", StringType(), True)
])

# Define schema for Medals
medals_schema = StructType([
    StructField("medal_id", LongType(), True),
    StructField("sprite_uri", StringType(), True),
    StructField("sprite_left", IntegerType(), True),
    StructField("sprite_top", IntegerType(), True),
    StructField("sprite_sheet_width", IntegerType(), True),
    StructField("sprite_sheet_height", IntegerType(), True),
    StructField("sprite_width", IntegerType(), True),
    StructField("sprite_height", IntegerType(), True),
    StructField("classification", StringType(), True),
    StructField("description", StringType(), True),
    StructField("name", StringType(), True),
    StructField("difficulty", IntegerType(), True)
])

# Define schema for MedalsMatchesPlayers
medals_matches_players_schema = StructType([
    StructField("match_id", StringType(), True),
    StructField("player_gamertag", StringType(), True),
    StructField("medal_id", LongType(), True),
    StructField("count", IntegerType(), True)
])

In [5]:
# Read all required .csv files into DataFrames
matches = spark.read.option("header", "true") \
                    .option("inferSchema", "true") \
                    .csv("/home/iceberg/data/matches.csv")

# Show the first 2 rows
matches.show(2)

                                                                                

+--------------------+--------------------+------------+--------------------+--------------------+-------------+-------------------+--------------+---------+--------------+
|            match_id|               mapid|is_team_game|         playlist_id|     game_variant_id|is_match_over|    completion_date|match_duration|game_mode|map_variant_id|
+--------------------+--------------------+------------+--------------------+--------------------+-------------+-------------------+--------------+---------+--------------+
|11de1a94-8d07-416...|c7edbf0f-f206-11e...|        true|f72e0ef0-7c4a-430...|1e473914-46e4-408...|         true|2016-02-22 00:00:00|          NULL|     NULL|          NULL|
|d3643e71-3e51-43e...|cb914b9e-f206-11e...|       false|d0766624-dbd7-453...|257a305e-4dd3-41f...|         true|2016-02-14 00:00:00|          NULL|     NULL|          NULL|
+--------------------+--------------------+------------+--------------------+--------------------+-------------+-------------------+---

In [6]:
match_details = spark.read.option("header", "true") \
                          .option("inferSchema", "true") \
                          .csv("/home/iceberg/data/match_details.csv")

# Show the first 2 rows
match_details.show(2)

25/01/06 11:27:41 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+--------------------+---------------+---------------------+------------+-----------------+--------+-----------------+------------------------+------------+---------------------------------+-----------------+----------------+-----------------------+-----------+--------------------------------+----------------+-------------------+---------------+-------------------+------------------+----------------------+--------------------------+-------------------------+------------------------+-------------------------+---------------------------+-------------------------------+--------------------------------+---------------------------+--------------------------------+-------------------------------+-------------------+--------------------+--------------------------+-------+-------+
|            match_id|player_gamertag|previous_spartan_rank|spartan_rank|previous_total_xp|total_xp|previous_csr_tier|previous_csr_designation|previous_csr|previous_csr_percent_to_next_tier|previous_csr_rank|current_

In [7]:
maps = spark.read.option("header", "true") \
                 .option("inferSchema", "true") \
                 .csv("/home/iceberg/data/maps.csv")

# Show the first 2 rows
maps.show(2)

+--------------------+--------------+--------------------+
|               mapid|          name|         description|
+--------------------+--------------+--------------------+
|c93d708f-f206-11e...|         Urban|Andesia was the c...|
|cb251c51-f206-11e...|Raid on Apex 7|This unbroken rin...|
+--------------------+--------------+--------------------+
only showing top 2 rows



In [8]:
medals = spark.read.option("header", "true") \
                   .option("inferSchema", "true") \
                   .csv("/home/iceberg/data/medals.csv")

# Show the first 2 rows
medals.show(2)

+----------+----------+-----------+----------+------------------+-------------------+------------+-------------+--------------+-----------+----+----------+
|  medal_id|sprite_uri|sprite_left|sprite_top|sprite_sheet_width|sprite_sheet_height|sprite_width|sprite_height|classification|description|name|difficulty|
+----------+----------+-----------+----------+------------------+-------------------+------------+-------------+--------------+-----------+----+----------+
|2315448068|      NULL|       NULL|      NULL|              NULL|               NULL|        NULL|         NULL|          NULL|       NULL|NULL|      NULL|
|3565441934|      NULL|       NULL|      NULL|              NULL|               NULL|        NULL|         NULL|          NULL|       NULL|NULL|      NULL|
+----------+----------+-----------+----------+------------------+-------------------+------------+-------------+--------------+-----------+----+----------+
only showing top 2 rows



In [9]:
medals_matches_players = spark.read.option("header", "true") \
                                   .option("inferSchema", "true") \
                                   .csv("/home/iceberg/data/medals_matches_players.csv")

# Show the first 2 rows
medals_matches_players.show(2)

                                                                                

+--------------------+---------------+----------+-----+
|            match_id|player_gamertag|  medal_id|count|
+--------------------+---------------+----------+-----+
|009fdac5-e15c-47c...|       EcZachly|3261908037|    7|
|009fdac5-e15c-47c...|       EcZachly| 824733727|    2|
+--------------------+---------------+----------+-----+
only showing top 2 rows



In [10]:
# Create Temporary Views for the required DataFrames

matches.createOrReplaceTempView("matchesView")

match_details.createOrReplaceTempView("matchDetailsView")

maps.createOrReplaceTempView("mapsView")

medals_matches_players.createOrReplaceTempView("medalsMatchesPlayersView")

medals.createOrReplaceTempView("medalsView")

In [11]:
# Create the long table (medals) using SQL

matches_medal_maps_agg = spark.sql("""
    SELECT mmp.medal_id,
           me.name AS medal_name,
           mmp.match_id,
           ma.mapid,
           COLLECT_LIST(DISTINCT mmp.player_gamertag) AS player_gamertag_array
    FROM medalsMatchesPlayersView mmp
    JOIN matchesView ma ON mmp.match_id = ma.match_id
    JOIN medalsView me ON mmp.medal_id = me.medal_id
    GROUP BY mmp.medal_id, me.name, mmp.match_id, ma.mapid
""")

# Show the first 5 rows
matches_medal_maps_agg.show(5)

[Stage 25:>                                                         (0 + 1) / 1]

+---------+----------+--------------------+--------------------+---------------------+
| medal_id|medal_name|            match_id|               mapid|player_gamertag_array|
+---------+----------+--------------------+--------------------+---------------------+
|121048710|   Rampage|073a5745-339b-4f7...|c74c9d0f-f206-11e...|              [Lanqe]|
|121048710|   Rampage|143a653a-27f6-4d2...|c74c9d0f-f206-11e...|     [False EnvisioN]|
|121048710|   Rampage|259119e2-e79c-443...|c7805740-f206-11e...|     [Lvl 61 Scyther]|
|121048710|   Rampage|3453a66d-6a56-434...|cebd854f-f206-11e...|    [BiscuitAnanas31]|
|121048710|   Rampage|35e30ec5-7e71-43c...|c7b7baf0-f206-11e...|        [XCornholeoX]|
+---------+----------+--------------------+--------------------+---------------------+
only showing top 5 rows



                                                                                

In [12]:
# Explicitly broadcast join Medals & Maps

medals_maps = matches_medal_maps_agg.alias("m") \
    .join(broadcast(maps).alias("mp"), matches_medal_maps_agg["mapid"] == maps["mapid"]) \
    .select("m.*", "mp.name", "mp.description")

# Show the first 5 rows
medals_maps.show(5)



+---------+----------+--------------------+--------------------+---------------------+--------+--------------------+
| medal_id|medal_name|            match_id|               mapid|player_gamertag_array|    name|         description|
+---------+----------+--------------------+--------------------+---------------------+--------+--------------------+
|121048710|   Rampage|073a5745-339b-4f7...|c74c9d0f-f206-11e...|              [Lanqe]|  Alpine|These vistas are ...|
|121048710|   Rampage|143a653a-27f6-4d2...|c74c9d0f-f206-11e...|     [False EnvisioN]|  Alpine|These vistas are ...|
|121048710|   Rampage|259119e2-e79c-443...|c7805740-f206-11e...|     [Lvl 61 Scyther]| Glacier|Each of Halo's mi...|
|121048710|   Rampage|3453a66d-6a56-434...|cebd854f-f206-11e...|    [BiscuitAnanas31]|Coliseum|Forerunner Warrio...|
|121048710|   Rampage|35e30ec5-7e71-43c...|c7b7baf0-f206-11e...|        [XCornholeoX]|Parallax|The Orion Arm of ...|
+---------+----------+--------------------+--------------------+

                                                                                

In [13]:
# Task 3: Bucket join match_details, matches, and medal_matches_players on match_id with 16 buckets

# Create DDL for bucketed tables

# Matches
spark.sql("""DROP TABLE IF EXISTS bootcamp.matches_bucketed""")

bucketed_matches_ddl = """
                        CREATE TABLE IF NOT EXISTS bootcamp.matches_bucketed (
                            match_id STRING,
                            is_team_game BOOLEAN,
                            playlist_id STRING,
                            mapid STRING
                        )
                        USING iceberg
                        PARTITIONED BY (bucket(16, match_id))
                        """
spark.sql(bucketed_matches_ddl)



# Match Details
spark.sql("""DROP TABLE IF EXISTS bootcamp.match_details_bucketed""")

bucketed_match_details_ddl = """
                                CREATE TABLE IF NOT EXISTS bootcamp.match_details_bucketed (
                                    match_id STRING,
                                    player_gamertag STRING,
                                    player_total_kills INTEGER,
                                    player_total_deaths INTEGER
                                )
                                USING iceberg
                                PARTITIONED BY (bucket(16, match_id))
                                """
spark.sql(bucketed_match_details_ddl)



# Medals Matches Players
spark.sql("""DROP TABLE IF EXISTS bootcamp.medal_matches_players_bucketed""")

bucketed_medal_matches_players_ddl = """
                                        CREATE TABLE IF NOT EXISTS bootcamp.medal_matches_players_bucketed (
                                            match_id STRING,
                                            player_gamertag STRING,
                                            medal_id BIGINT
                                        )
                                        USING iceberg
                                        PARTITIONED BY (bucket(16, match_id))
                                        """
spark.sql(bucketed_medal_matches_players_ddl)

DataFrame[]

In [14]:
# Write data from the above DataFrames to the corresponding bucketed tables

# Matches
matches.select("match_id", "is_team_game", "playlist_id", "mapid") \
       .write.mode("append") \
       .bucketBy(16, "match_id") \
       .saveAsTable("bootcamp.matches_bucketed")

# Match Details
match_details.select("match_id", "player_gamertag", "player_total_kills", "player_total_deaths") \
             .write.mode("append") \
             .bucketBy(16, "match_id") \
             .saveAsTable("bootcamp.match_details_bucketed")

# Medals Matches Players
medals_matches_players.select("match_id", "player_gamertag", "medal_id") \
                      .write.mode("append") \
                      .bucketBy(16, "match_id") \
                      .saveAsTable("bootcamp.medal_matches_players_bucketed")

                                                                                

In [17]:
# Bucket Join match_details, matches, and medal_matches_players

bucketed_matches_medals = spark.sql("""
                                    SELECT mb.match_id,
                                           mb.playlist_id,
                                           mb.mapid,
                                           mdb.player_gamertag,
                                           mdb.player_total_kills,
                                           mdb.player_total_deaths,
                                           mmpb.medal_id
                                    FROM bootcamp.matches_bucketed mb
                                    JOIN bootcamp.match_details_bucketed mdb ON mb.match_id = mdb.match_id
                                    JOIN bootcamp.medal_matches_players_bucketed mmpb ON mb.match_id = mmpb.match_id
                                    WHERE mdb.player_gamertag IS NOT NULL
                                """)

# Show the first 5 rows
bucketed_matches_medals.show(5)



+--------------------+--------------------+--------------------+---------------+------------------+-------------------+----------+
|            match_id|         playlist_id|               mapid|player_gamertag|player_total_kills|player_total_deaths|  medal_id|
+--------------------+--------------------+--------------------+---------------+------------------+-------------------+----------+
|00169217-cca6-4b4...|2323b76a-db98-4e0...|cc040aa1-f206-11e...|  King Terror V|                14|                  7|3261908037|
|00169217-cca6-4b4...|2323b76a-db98-4e0...|cc040aa1-f206-11e...|  King Terror V|                14|                  7|3001183151|
|00169217-cca6-4b4...|2323b76a-db98-4e0...|cc040aa1-f206-11e...|  King Terror V|                14|                  7| 824733727|
|00169217-cca6-4b4...|2323b76a-db98-4e0...|cc040aa1-f206-11e...|  King Terror V|                14|                  7|2078758684|
|00169217-cca6-4b4...|2323b76a-db98-4e0...|cc040aa1-f206-11e...|  King Terror V|   

                                                                                

In [18]:
# Task 4: Aggregate the joined data frame to figure out questions like:

# Save the bucketed table as a temporary view

bucketed_matches_medals.createOrReplaceTempView("bucketed_matches_medals")

In [19]:
# I. Which player averages the most kills per game?

players_avg_kills = spark.sql("""
                                SELECT player_gamertag, 
                                       AVG(player_total_kills) AS average_kills 
                                FROM bucketed_matches_medals
                                GROUP BY player_gamertag
                                ORDER BY average_kills DESC
                            """)

players_avg_kills.show(1)



+---------------+-------------+
|player_gamertag|average_kills|
+---------------+-------------+
|   gimpinator14|        109.0|
+---------------+-------------+
only showing top 1 row



                                                                                

In [20]:
# II. Which playlist gets played the most?

playlist_most_played = spark.sql("""
                                    SELECT playlist_id, 
                                           COUNT(match_id) AS playlist_plays 
                                    FROM bucketed_matches_medals
                                    GROUP BY playlist_id
                                    ORDER BY playlist_plays DESC
                                """)

playlist_most_played.show(1)

                                                                                

+--------------------+--------------+
|         playlist_id|playlist_plays|
+--------------------+--------------+
|f72e0ef0-7c4a-430...|       1565529|
+--------------------+--------------+
only showing top 1 row



In [21]:
# III. Which map gets played the most?

map_most_played = spark.sql("""
                            SELECT map.name AS map, 
                                   COUNT(match.match_id) AS map_plays 
                            FROM bucketed_matches_medals match
                            JOIN mapsView map ON match.mapid = map.mapid
                            GROUP BY map.name
                            ORDER BY map_plays DESC
                        """)

map_most_played.show(1)

25/01/06 11:38:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:38:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:38:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:38:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:38:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:38:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:38:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:38:43 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+------+---------+
|   map|map_plays|
+------+---------+
|Alpine|  1445545|
+------+---------+
only showing top 1 row



                                                                                

In [28]:
# IV. Which map do players get the most Killing Spree medals on?

map_most_killing_spree_medals = spark.sql("""
                                        SELECT map.name AS map, 
                                               COUNT(match.medal_id) AS killing_spree_medals 
                                        FROM bucketed_matches_medals match
                                        JOIN mapsView map ON match.mapid = map.mapid
                                        JOIN medalsView med ON match.medal_id = med.medal_id

                                        WHERE med.name = 'Killing Spree'
                                        
                                        GROUP BY map.name
                                        ORDER BY killing_spree_medals DESC
                                    """)

map_most_killing_spree_medals.show(1)

25/01/06 11:56:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:56:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+------+--------------------+
|   map|killing_spree_medals|
+------+--------------------+
|Alpine|               56908|
+------+--------------------+
only showing top 1 row



                                                                                

In [24]:
# Task 5: With the aggregated data set, try different .sortWithinPartitions to see which has the smallest data size (hint: playlists and maps are both very low cardinality)

# I. playlistMostPlayed

sort_partition_playlist_most_played = playlist_most_played.repartition(col("playlist_id")) \
                                                          .sortWithinPartitions(col("playlist_plays").desc())

sort_partition_playlist_most_played.show(1)



+--------------------+--------------+
|         playlist_id|playlist_plays|
+--------------------+--------------+
|f72e0ef0-7c4a-430...|       1565529|
+--------------------+--------------+
only showing top 1 row



                                                                                

In [25]:
# II. mapMostPlayed

sort_partition_map_most_played = map_most_played.repartition(col("map")) \
                                                .sortWithinPartitions(col("map_plays").desc())

sort_partition_map_most_played.show(1)

25/01/06 11:41:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:41:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:41:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:41:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:41:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:41:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:41:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+------+---------+
|   map|map_plays|
+------+---------+
|Alpine|  1445545|
+------+---------+
only showing top 1 row



                                                                                

In [30]:
# III. mapMostKillingMedals

sort_partition_map_most_killing_spree_medals = map_most_killing_spree_medals.repartition(col("map")) \
                                                                .sortWithinPartitions(col("killing_spree_medals").desc())

sort_partition_map_most_killing_spree_medals.show(1)

25/01/06 11:57:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/06 11:57:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+------+--------------------+
|   map|killing_spree_medals|
+------+--------------------+
|Alpine|               56908|
+------+--------------------+
only showing top 1 row



                                                                                