In [22]:
from pyspark.sql.functions import col, avg, count, desc, broadcast
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
# Disable automatic broadcast join
spark = SparkSession.builder.appName("Jupyter").getOrCreate()
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")


In [26]:
# Load datasets (assuming they are in CSV format)
match_details = spark.read.option("header", "true").csv("/home/iceberg/data/match_details.csv")
matches = spark.read.option("header", "true").csv("/home/iceberg/data/matches.csv")
medals_matches_players = spark.read.option("header", "true").csv("/home/iceberg/data/medals_matches_players.csv")
medals = spark.read.option("header", "true").csv("/home/iceberg/data/medals.csv")
maps = spark.read.option("header", "true").csv("/home/iceberg/data/maps.csv")

In [24]:
# Convert appropriate columns to integers where necessary
match_details = match_details.withColumn("match_id", col("match_id").cast("int")) \
                             .withColumn("player_total_kills", col("player_total_kills").cast("int"))

matches = matches.withColumn("match_id", col("match_id").cast("int"))

medals_matches_players = medals_matches_players.withColumn("match_id", col("match_id").cast("int")) \
                                               .withColumn("medal_id", col("medal_id").cast("int")) \
                                               .withColumn("count", col("count").cast("int"))

medals = medals.withColumn("medal_id", col("medal_id").cast("int"))

In [39]:
# Rename the 'name' column in the 'medals' DataFrame to 'medal_name' to avoid ambiguity
medals_renamed = medals.withColumnRenamed("name", "medal_name")

In [40]:
# Explicitly broadcast the 'medals' and 'maps' DataFrames
medals_broadcasted = broadcast(medals_renamed)
maps_broadcasted = broadcast(maps)


In [41]:
# Bucket join for match_details, matches, and medals_matches_players on match_id with 16 buckets
match_details = match_details.repartitionByRange(16, col("match_id"))
matches = matches.repartitionByRange(16, col("match_id"))
medals_matches_players = medals_matches_players.repartitionByRange(16, col("match_id"))


In [42]:
# Perform bucket join with broadcasted DataFrames
joined_df = match_details.join(matches, "match_id") \
                         .join(medals_matches_players, ["match_id", "player_gamertag"]) \
                         .join(medals_broadcasted, "medal_id") \
                         .join(maps_broadcasted, "mapid", "left")

In [43]:
# Aggregation: Which player averages the most kills per game?
kills_per_game = joined_df.groupBy("player_gamertag") \
                          .agg(avg("player_total_kills").alias("avg_kills")) \
                          .orderBy(desc("avg_kills"))

In [44]:
# Aggregation: Which playlist gets played the most?
playlist_count = joined_df.groupBy("playlist_id") \
                          .agg(count("match_id").alias("match_count")) \
                          .orderBy(desc("match_count"))


In [45]:
# Aggregation: Which map gets played the most?
map_count = joined_df.groupBy("mapid") \
                     .agg(count("match_id").alias("match_count")) \
                     .orderBy(desc("match_count"))

In [46]:
# Aggregation: Which map do players get the most Killing Spree medals on?
killing_spree_maps = joined_df.filter(col("medal_name") == "Killing Spree") \
                              .groupBy("mapid") \
                              .agg(count("medal_id").alias("killing_spree_count")) \
                              .orderBy(desc("killing_spree_count"))

In [48]:
# Sorting within partitions to minimize data size
kills_per_game = kills_per_game.sortWithinPartitions("player_gamertag")
playlist_count = playlist_count.sortWithinPartitions("playlist_id")
map_count = map_count.sortWithinPartitions("mapid")
killing_spree_maps = killing_spree_maps.sortWithinPartitions("mapid")



In [49]:
# Show results
print("Players with the highest average kills per game:")
kills_per_game.show(truncate=False)

print("Most played playlists:")
playlist_count.show(truncate=False)

print("Most played maps:")
map_count.show(truncate=False)

print("Maps with the most Killing Spree medals:")
killing_spree_maps.show(truncate=False)

Players with the highest average kills per game:


                                                                                

+---------------+------------------+
|player_gamertag|avg_kills         |
+---------------+------------------+
|A 0 N Eclipse  |5.5               |
|A 2 tailed fox |23.0              |
|A 29 Delivery  |11.0              |
|A 2tha nimal   |9.066666666666666 |
|A 7 Gram Blunt |12.0              |
|A 9ja Ch0p     |5.0               |
|A American Monk|1.0               |
|A B4ller1      |5.0               |
|A BALLS86      |1.0               |
|A BIG mC       |5.6               |
|A BOOTY TAP    |2.0               |
|A BOSS Coyote  |10.0              |
|A BRIGHT SHADOW|4.6923076923076925|
|A Baby Lynx    |12.0              |
|A BacKWaRdsManN|6.0               |
|A Bad Dream x  |11.0              |
|A Baggie       |20.0              |
|A Bear of Doom |8.5               |
|A Big Blue Ape |13.0              |
|A Blind Kid 8P |29.0              |
+---------------+------------------+
only showing top 20 rows

Most played playlists:


                                                                                

+------------------------------------+-----------+
|playlist_id                         |match_count|
+------------------------------------+-----------+
|0504ca3c-de41-48f3-b9c8-3aab534d69e5|2254       |
|0bcf2be1-3168-4e42-9fb5-3551d7dbce77|66477      |
|2323b76a-db98-4e03-aa37-e171cfbdd1a4|92148      |
|2e812e09-912f-458b-a659-4ccb84232c65|1493       |
|355dc154-9809-4edb-8ed4-fff910c6ae9c|17194      |
|4b12472e-2a06-4235-ba58-f376be6c1b39|3839       |
|5728f612-3f20-4459-98bd-3478c79c4861|8001       |
|7385b4a1-86bf-4aec-b9c2-411a6aa48633|9490       |
|780cc101-005c-4fca-8ce7-6f36d7156ffe|66081      |
|7b7e892c-d9b7-4b03-bef8-c6a071df28ef|10685      |
|819eb188-1a1c-48b4-9af3-283d2447ff6f|5487       |
|88b7de19-113c-4beb-af7f-8553aeda3f4c|2116       |
|892189e9-d712-4bdb-afa7-1ccab43fbed4|86496      |
|b5d5a242-ffa5-4d88-a229-5031916be036|3778       |
|bc0f8ad6-31e6-4a18-87d9-ad5a2dbc8212|11071      |
|c98949ae-60a8-43dc-85d7-0feb0b92e719|107422     |
|d0766624-dbd7-4536-ba39-2d890a

                                                                                

+------------------------------------+-----------+
|mapid                               |match_count|
+------------------------------------+-----------+
|c74c9d0f-f206-11e4-8330-24be05e24f7e|105658     |
|c7805740-f206-11e4-982c-24be05e24f7e|70182      |
|c7b7baf0-f206-11e4-ae9a-24be05e24f7e|14515      |
|c7edbf0f-f206-11e4-aa52-24be05e24f7e|186118     |
|ca737f8f-f206-11e4-a7e2-24be05e24f7e|20994      |
|caacb800-f206-11e4-81ab-24be05e24f7e|38731      |
|cb914b9e-f206-11e4-b447-24be05e24f7e|41098      |
|cbcea2c0-f206-11e4-8c4a-24be05e24f7e|17926      |
|cc040aa1-f206-11e4-a3e0-24be05e24f7e|33786      |
|cc74f4e1-f206-11e4-ad66-24be05e24f7e|16648      |
|cd844200-f206-11e4-9393-24be05e24f7e|34368      |
|cdb934b0-f206-11e4-8810-24be05e24f7e|51845      |
|cdee4e70-f206-11e4-87a2-24be05e24f7e|32247      |
|ce1dc2de-f206-11e4-a646-24be05e24f7e|39699      |
|ce89a40f-f206-11e4-b83f-24be05e24f7e|8759       |
|cebd854f-f206-11e4-b46e-24be05e24f7e|38560      |
+------------------------------



+------------------------------------+-------------------+
|mapid                               |killing_spree_count|
+------------------------------------+-------------------+
|c74c9d0f-f206-11e4-8330-24be05e24f7e|4317               |
|c7805740-f206-11e4-982c-24be05e24f7e|2611               |
|c7b7baf0-f206-11e4-ae9a-24be05e24f7e|517                |
|c7edbf0f-f206-11e4-aa52-24be05e24f7e|6553               |
|ca737f8f-f206-11e4-a7e2-24be05e24f7e|916                |
|caacb800-f206-11e4-81ab-24be05e24f7e|1654               |
|cb914b9e-f206-11e4-b447-24be05e24f7e|1733               |
|cbcea2c0-f206-11e4-8c4a-24be05e24f7e|809                |
|cc040aa1-f206-11e4-a3e0-24be05e24f7e|1439               |
|cc74f4e1-f206-11e4-ad66-24be05e24f7e|771                |
|cd844200-f206-11e4-9393-24be05e24f7e|1544               |
|cdb934b0-f206-11e4-8810-24be05e24f7e|1991               |
|cdee4e70-f206-11e4-87a2-24be05e24f7e|1540               |
|ce1dc2de-f206-11e4-a646-24be05e24f7e|1751              

                                                                                