### Setting Up Environment

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col
# Attach to the active Spark session if there is one; otherwise start one named "Jupyter".
spark = (
    SparkSession.builder
    .appName("Jupyter")
    .getOrCreate()
)

# Bare expression so the notebook renders the session as the cell output.
spark

# Query 1

disable the default behavior of broadcast joins

In [None]:
# A threshold of -1 switches off size-based automatic broadcast joins,
# so a table is only broadcast when the query asks for it with broadcast().
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")

# Query 2

join the `medals` and `maps` tables using an explicitly specified broadcast join

In [None]:
from pyspark.sql.functions import broadcast, lit, col

### Medals Broadcast Join

In [None]:
# Load the raw medals CSV; the first row is the header and column types are inferred.
medalsBucketed = spark.read.option("header", "true") \
                        .option("inferSchema", "true") \
                        .csv("/home/iceberg/data/medals.csv")


# Drop and recreate the Iceberg table so re-running this cell starts from an empty table.
spark.sql("""DROP TABLE IF EXISTS abbad.medals_bucketed""")
bucketedDDL = """
 CREATE TABLE IF NOT EXISTS abbad.medals_bucketed (
    medal_id STRING,
     sprite_uri STRING,
     sprite_left INTEGER,
     sprite_top	INTEGER,
     sprite_sheet_width	INTEGER,
     sprite_sheet_height INTEGER,
     sprite_width INTEGER,
     sprite_height INTEGER,
     classification	STRING, 
     description STRING,	
     name STRING,
     difficulty INTEGER
 )
 USING iceberg
 PARTITIONED BY (difficulty, bucket(16, medal_id));
 """
spark.sql(bucketedDDL)

# Appending to an existing table matches columns by name, so every column declared in the
# DDL must be present in the DataFrame; selecting only a subset makes the write fail.
# NOTE(review): assumes medals.csv carries all twelve columns named in the DDL - confirm
# against the file header.
medalsBucketed.select(
     col("medal_id"), col("sprite_uri"), col("sprite_left"), col("sprite_top"),
     col("sprite_sheet_width"), col("sprite_sheet_height"),
     col("sprite_width"), col("sprite_height"),
     col("classification"), col("description"), col("name"), col("difficulty")
     ) \
     .write.mode("append")  \
     .partitionBy("difficulty") \
     .bucketBy(16, "medal_id").saveAsTable("abbad.medals_bucketed")

In [None]:
# Raw medals-per-player-per-match rows, read from CSV with a header row and inferred types.
medalsMatchesPlayersBucketed = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/home/iceberg/data/medals_matches_players.csv")
)

# Drop and recreate the Iceberg table, bucketed into 16 buckets on medal_id.
spark.sql("DROP TABLE IF EXISTS abbad.medals_matches_players_bucketed")
bucketedDDL = """
 CREATE TABLE IF NOT EXISTS abbad.medals_matches_players_bucketed (
    match_id STRING,
    player_gamertag STRING,
    medal_id STRING,
    count INTEGER
 )
 USING iceberg
 PARTITIONED BY (bucket(16, medal_id));
 """
spark.sql(bucketedDDL)

# Append the four declared columns, using the same bucketing the DDL declares.
(
    medalsMatchesPlayersBucketed
    .select("match_id", "player_gamertag", "medal_id", "count")
    .write
    .mode("append")
    .bucketBy(16, "medal_id")
    .saveAsTable("abbad.medals_matches_players_bucketed")
)

In [None]:
# Inner-join the medals-per-player rows ("mmp") to the medals rows ("m") on medal_id,
# marking the medals side for broadcast, and keep only the "mmp" columns.
explicitBroadcast = (
    medalsMatchesPlayersBucketed.alias("mmp")
    .join(
        broadcast(medalsBucketed).alias("m"),
        col("mmp.medal_id") == col("m.medal_id"),
    )
    .select("mmp.*")
)

# insertInto matches columns by position; overwrite mode replaces existing table data.
(
    explicitBroadcast.write
    .mode("overwrite")
    .insertInto("abbad.medals_matches_players_bucketed")
)

### Maps Broadcast Join

In [None]:
# Load the raw maps CSV; the first row is the header and column types are inferred.
mapsBucketed = spark.read.option("header", "true") \
                        .option("inferSchema", "true") \
                        .csv("/home/iceberg/data/maps.csv")

# Drop and recreate the Iceberg table so re-running this cell starts from an empty table.
spark.sql("""DROP TABLE IF EXISTS abbad.maps_bucketed""")
bucketedDDL = """
 CREATE TABLE IF NOT EXISTS abbad.maps_bucketed (
    mapid STRING,
    name STRING,
    description STRING
 )
 USING iceberg
 PARTITIONED BY (name, bucket(16, mapid));
 """
spark.sql(bucketedDDL)

# The write must mirror the table definition: all three declared columns, partitioned by
# `name` (the table has no `difficulty` column) and bucketed on `mapid`.
# NOTE(review): assumes maps.csv has a `description` column, as the DDL implies - confirm.
mapsBucketed.select(
     col("mapid"), col("name"), col("description")
     ) \
     .write.mode("append")  \
     .partitionBy("name") \
     .bucketBy(16, "mapid").saveAsTable("abbad.maps_bucketed")

In [None]:
# Raw matches rows, read from CSV with a header row and inferred column types.
matchesBucketed = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/home/iceberg/data/matches.csv")
)

# Drop and recreate the Iceberg table, partitioned by completion_date and bucketed on mapid.
spark.sql("DROP TABLE IF EXISTS abbad.matches_bucketed")
bucketedDDL = """
 CREATE TABLE IF NOT EXISTS abbad.matches_bucketed (
    match_id STRING,
    mapid STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    game_variant_id STRING,
    is_match_over BOOLEAN,
    completion_date TIMESTAMP,
    match_duration STRING,
    game_mode STRING,
    map_variant_id STRING
 )
 USING iceberg
 PARTITIONED BY (completion_date, bucket(16, mapid));
 """
spark.sql(bucketedDDL)

# Append the ten declared columns with the same partitioning and bucketing as the DDL.
(
    matchesBucketed
    .select(
        "match_id",
        "mapid",
        "is_team_game",
        "playlist_id",
        "game_variant_id",
        "is_match_over",
        "completion_date",
        "match_duration",
        "game_mode",
        "map_variant_id",
    )
    .write
    .mode("append")
    .partitionBy("completion_date")
    .bucketBy(16, "mapid")
    .saveAsTable("abbad.matches_bucketed")
)

In [None]:
# Inner-join matches to the maps rows on the map key, marking the maps side for broadcast.
# The key on both sides is `mapid`; maps has no `medal_id` column.
explicitBroadcast = matchesBucketed.alias("matches").join(broadcast(mapsBucketed).alias("maps"), col("matches.mapid") == col("maps.mapid")) \
   .select(col("matches.*"))

# insertInto matches columns by position; overwrite mode replaces existing table data.
explicitBroadcast.write.mode("overwrite").insertInto("abbad.matches_bucketed")

# Query 3 

join the `match_details`, `matches` and `medals_matches_players` tables using a bucket join on `match_id` with 16 buckets

In [None]:
# Load the raw match_details CSV; the first row is the header and column types are inferred.
matchDetailsBucketed = spark.read.option("header", "true") \
                        .option("inferSchema", "true") \
                        .csv("/home/iceberg/data/match_details.csv")

# Drop and recreate the Iceberg table so re-running this cell starts from an empty table.
spark.sql("""DROP TABLE IF EXISTS abbad.match_details_bucketed""")
bucketedDDL = """
 CREATE TABLE IF NOT EXISTS abbad.match_details_bucketed (
    match_id STRING,
    player_gamertag STRING,
    player_total_kills INTEGER
 )
 USING iceberg
 PARTITIONED BY (player_total_kills, bucket(16, match_id));
 """
spark.sql(bucketedDDL)

# Select exactly the three columns the table declares. `playlist_id` is not part of this
# table, so including it makes the append fail.
matchDetailsBucketed.select(
     col("match_id"), col("player_gamertag"), col("player_total_kills") \
     ) \
     .write.mode("append")  \
     .partitionBy("player_total_kills") \
     .bucketBy(16, "match_id").saveAsTable("abbad.match_details_bucketed")

In [None]:
# Persist the three DataFrames as parquet tables bucketed and sorted on match_id, using the
# same bucket count (16) for all of them so they can be joined bucket-to-bucket.
# mode("overwrite") keeps the cell re-runnable: the default save mode raises an error when
# the table already exists.
matchDetailsBucketed.write.format("parquet").mode("overwrite").bucketBy(16, "match_id").sortBy("match_id") \
            .saveAsTable("match_details_bucketed")

matchesBucketed.write.format("parquet").mode("overwrite").bucketBy(16, "match_id").sortBy("match_id") \
            .saveAsTable("matches_bucketed")

medalsMatchesPlayersBucketed.write.format("parquet").mode("overwrite").bucketBy(16, "match_id").sortBy("match_id") \
            .saveAsTable("medals_matches_players_bucketed")

In [None]:
# Read the three match_id-bucketed tables back as DataFrames, in one unpacking assignment.
bucketedMatchDetails, bucketedMatches, bucketedMedalsMatchesPlayers = (
    spark.table(tableName)
    for tableName in (
        "match_details_bucketed",
        "matches_bucketed",
        "medals_matches_players_bucketed",
    )
)

In [None]:
# Join player-level match details to match-level attributes on match_id, then attach each
# player's own medal rows.
# The medals join must also key on player_gamertag: both sides carry that column, so joining
# on match_id alone pairs every player with every other player's medals and leaves two
# `player_gamertag` columns, which makes later references to that name ambiguous.
# Both joins are inner joins, so player-games without any medal rows are not in the result.
joinedDF = bucketedMatchDetails.join(bucketedMatches, "match_id") \
                               .join(bucketedMedalsMatchesPlayers, ["match_id", "player_gamertag"])

# 4. Queries

In [None]:
from pyspark.sql import functions as F

## Query 4a

which player has the highest average kills per game?

In [None]:
# Average kills per game, computed from match_details rather than from joinedDF.
# joinedDF repeats a player's kill total once per joined medal row and (being an inner join)
# omits games with no medal rows, so averaging over it weights and filters games incorrectly.
# NOTE(review): assumes match_details holds one row per (match_id, player_gamertag) - confirm.
avgKillsPerPlayer = bucketedMatchDetails.groupBy("player_gamertag") \
                            .agg(F.avg("player_total_kills").alias("average_kills")) \
                            .orderBy(F.desc("average_kills")) \
                            .limit(1)

avgKillsPerPlayer.show()

## Query 4b

which playlist has received the most plays?

In [None]:
# A "play" is one match. joinedDF has many rows per match (one per player and medal row),
# so count distinct match_id values instead of rows.
mostPlayedPlaylist = joinedDF.groupBy("playlist_id") \
                             .agg(F.countDistinct("match_id").alias("number_of_plays")) \
                             .orderBy(F.desc("number_of_plays")) \
                             .limit(1)

mostPlayedPlaylist.show()

## Query 4c

which map was played the most?

In [None]:
# A "play" is one match. joinedDF has many rows per match (one per player and medal row),
# so count distinct match_id values instead of rows.
mostPlayedMap = joinedDF.groupBy("mapid") \
                        .agg(F.countDistinct("match_id").alias("number_of_plays")) \
                        .orderBy(F.desc("number_of_plays")) \
                        .limit(1)

mostPlayedMap.show()

## Query 4d

on which map do players receive the highest number of Killing Spree medals?

In [None]:
killingSpreeMedalId = "2430242797"

# Start from the medal rows themselves so each one is used exactly once, and attach the map
# through a match_id join against matches (both tables are bucketed on match_id).
# NOTE(review): assumes matches has one row per match_id, and that `count` is how many times
# the player earned the medal in that match - confirm against the data.
killingSpreeByMatch = bucketedMedalsMatchesPlayers \
                                    .filter(F.col("medal_id") == killingSpreeMedalId) \
                                    .join(bucketedMatches.select("match_id", "mapid"), "match_id")

# Total medals earned is the sum of `count`, not the number of rows.
mostKillingSpreeMedalsMap = killingSpreeByMatch.groupBy("mapid") \
                                    .agg(F.sum("count").alias("number_of_killing_spree_medals")) \
                                    .orderBy(F.desc("number_of_killing_spree_medals")) \
                                    .limit(1)

mostKillingSpreeMedalsMap.show()

# Queries 5

try at least 3 different versions of partitioned tables, and use .sortWithinPartitions to get the smallest footprint possible (hint: playlists and maps are both very low cardinality)

In [None]:
# Repartition the joined rows on playlist_id, then order rows inside each partition by match_id.
partitionedByPlaylist = (
    joinedDF
    .repartition("playlist_id")
    .sortWithinPartitions("match_id")
)

# Print the query plan for inspection.
partitionedByPlaylist.explain()

In [None]:
# Repartition the joined rows on mapid, then order rows inside each partition by match_id.
partitionedByMap = (
    joinedDF
    .repartition("mapid")
    .sortWithinPartitions("match_id")
)

# Print the query plan for inspection.
partitionedByMap.explain()

In [None]:
# Repartition the joined rows on (playlist_id, mapid), then order rows inside each partition
# by match_id.
partitionedByPlaylistAndMap = (
    joinedDF
    .repartition("playlist_id", "mapid")
    .sortWithinPartitions("match_id")
)

# Print the query plan for inspection.
partitionedByPlaylistAndMap.explain()

In [None]:
# Repartition the joined rows on completion_date, then order rows inside each partition
# by match_id.
partitionedByCompletionDate = (
    joinedDF
    .repartition("completion_date")
    .sortWithinPartitions("match_id")
)

# Print the query plan for inspection.
partitionedByCompletionDate.explain()

In [None]:
# Repartition the joined rows on player_total_kills, then order rows inside each partition
# by match_id.
partitionedByPlayerKills = (
    joinedDF
    .repartition("player_total_kills")
    .sortWithinPartitions("match_id")
)

# Print the query plan for inspection.
partitionedByPlayerKills.explain()