In [2]:
from pyspark.sql.functions import broadcast, split, lit
from pyspark.sql.functions import col
from pyspark import StorageLevel
from pyspark.sql import SparkSession

**Question 1:** Disable automatic broadcast join

In [2]:
spark = SparkSession.builder \
    .appName("IcebergTableManagement") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "200") \
    .config("spark.sql.files.maxPartitionBytes", "134217728") \
    .config("spark.sql.autoBroadcastJoinThreshold", "-1") \
    .config("spark.dynamicAllocation.enabled", "true") \
    .config("spark.dynamicAllocation.minExecutors", "1") \
    .config("spark.dynamicAllocation.maxExecutors", "50") \
    .getOrCreate()

24/12/10 13:23:11 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


**Question 2:** Explicitly broadcast JOINs `medals` and `maps`

In [41]:
df_medals = spark.read.option("header", "true").option("inferSchema", "true").csv("/home/iceberg/data/medals.csv")
df_maps = spark.read.option("header", "true").option("inferSchema", "true").csv("/home/iceberg/data/maps.csv")

In [4]:
df_medals.show(5)

+----------+--------------------+-----------+----------+------------------+-------------------+------------+-------------+--------------+--------------------+--------------+----------+
|  medal_id|          sprite_uri|sprite_left|sprite_top|sprite_sheet_width|sprite_sheet_height|sprite_width|sprite_height|classification|         description|          name|difficulty|
+----------+--------------------+-----------+----------+------------------+-------------------+------------+-------------+--------------+--------------------+--------------+----------+
|2315448068|                NULL|       NULL|      NULL|              NULL|               NULL|        NULL|         NULL|          NULL|                NULL|          NULL|      NULL|
|3565441934|                NULL|       NULL|      NULL|              NULL|               NULL|        NULL|         NULL|          NULL|                NULL|          NULL|      NULL|
|4162659350|https://content.h...|        750|       750|                74|

In [5]:
df_medals.printSchema()

root
 |-- medal_id: long (nullable = true)
 |-- sprite_uri: string (nullable = true)
 |-- sprite_left: integer (nullable = true)
 |-- sprite_top: integer (nullable = true)
 |-- sprite_sheet_width: integer (nullable = true)
 |-- sprite_sheet_height: integer (nullable = true)
 |-- sprite_width: integer (nullable = true)
 |-- sprite_height: integer (nullable = true)
 |-- classification: string (nullable = true)
 |-- description: string (nullable = true)
 |-- name: string (nullable = true)
 |-- difficulty: integer (nullable = true)



In [6]:
df_medals.count()

183

In [7]:
df_maps.show(5)

+--------------------+-------------------+--------------------+
|               mapid|               name|         description|
+--------------------+-------------------+--------------------+
|c93d708f-f206-11e...|              Urban|Andesia was the c...|
|cb251c51-f206-11e...|     Raid on Apex 7|This unbroken rin...|
|c854e54f-f206-11e...|March on Stormbreak|                NULL|
|c8d69870-f206-11e...| Escape from A.R.C.|Scientists flocke...|
|73ed1fd0-45e5-4bb...|             Osiris|                NULL|
+--------------------+-------------------+--------------------+
only showing top 5 rows



In [8]:
df_maps.printSchema()

root
 |-- mapid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)



In [9]:
df_maps.count()

40

In [10]:
# df_maps.select("name").distinct().show()
# df_medals.select("name").distinct().show()
# common_names = df_maps.select("name").distinct().intersect(df_medals.select("name").distinct())
# common_names.show()
# common_descriptions = df_maps.select("description").distinct().intersect(df_medals.select("description").distinct())
# common_descriptions.show()

In [11]:
df_1 = df_medals.join(broadcast(df_maps), on="name", how="outer")
df_1.show(5)

24/12/09 16:57:19 WARN HintErrorLogger: Hint (strategy=broadcast) is not supported in the query: build right for full outer join.
24/12/09 16:57:19 WARN HintErrorLogger: Hint (strategy=broadcast) is not supported in the query: build right for full outer join.
24/12/09 16:57:19 WARN HintErrorLogger: Hint (strategy=broadcast) is not supported in the query: build right for full outer join.


+----+----------+----------+-----------+----------+------------------+-------------------+------------+-------------+--------------+-----------+----------+--------------------+-----------+
|name|  medal_id|sprite_uri|sprite_left|sprite_top|sprite_sheet_width|sprite_sheet_height|sprite_width|sprite_height|classification|description|difficulty|               mapid|description|
+----+----------+----------+-----------+----------+------------------+-------------------+------------+-------------+--------------+-----------+----------+--------------------+-----------+
|NULL|2315448068|      NULL|       NULL|      NULL|              NULL|               NULL|        NULL|         NULL|          NULL|       NULL|      NULL|                NULL|       NULL|
|NULL|3565441934|      NULL|       NULL|      NULL|              NULL|               NULL|        NULL|         NULL|          NULL|       NULL|      NULL|                NULL|       NULL|
|NULL|      NULL|      NULL|       NULL|      NULL|    

**Question 3:** Bucket join `match_details`, `matches`, and `medal_matches_players` on `match_id` with `16` buckets

In [3]:
match_details_df = spark.read.option("header", "true").option("inferSchema", "true").csv("/home/iceberg/data/match_details.csv")
matches_df = spark.read.option("header", "true").option("inferSchema", "true").csv("/home/iceberg/data/matches.csv")
medals_matches_players_df = spark.read.option("header", "true").option("inferSchema", "true").csv("/home/iceberg/data/medals_matches_players.csv")

                                                                                

In [2]:
match_details_df.printSchema()

root
 |-- match_id: string (nullable = true)
 |-- player_gamertag: string (nullable = true)
 |-- previous_spartan_rank: integer (nullable = true)
 |-- spartan_rank: integer (nullable = true)
 |-- previous_total_xp: integer (nullable = true)
 |-- total_xp: integer (nullable = true)
 |-- previous_csr_tier: integer (nullable = true)
 |-- previous_csr_designation: integer (nullable = true)
 |-- previous_csr: integer (nullable = true)
 |-- previous_csr_percent_to_next_tier: integer (nullable = true)
 |-- previous_csr_rank: integer (nullable = true)
 |-- current_csr_tier: integer (nullable = true)
 |-- current_csr_designation: integer (nullable = true)
 |-- current_csr: integer (nullable = true)
 |-- current_csr_percent_to_next_tier: integer (nullable = true)
 |-- current_csr_rank: integer (nullable = true)
 |-- player_rank_on_team: integer (nullable = true)
 |-- player_finished: boolean (nullable = true)
 |-- player_average_life: string (nullable = true)
 |-- player_total_kills: integer (nu

In [3]:
matches_df.printSchema()

root
 |-- match_id: string (nullable = true)
 |-- mapid: string (nullable = true)
 |-- is_team_game: boolean (nullable = true)
 |-- playlist_id: string (nullable = true)
 |-- game_variant_id: string (nullable = true)
 |-- is_match_over: boolean (nullable = true)
 |-- completion_date: timestamp (nullable = true)
 |-- match_duration: string (nullable = true)
 |-- game_mode: string (nullable = true)
 |-- map_variant_id: string (nullable = true)



In [4]:
medals_matches_players_df.printSchema()

root
 |-- match_id: string (nullable = true)
 |-- player_gamertag: string (nullable = true)
 |-- medal_id: long (nullable = true)
 |-- count: integer (nullable = true)



In [4]:
spark.sql("""DROP TABLE IF EXISTS bootcamp.hw3_matches""")
matches_ddl = """
CREATE TABLE IF NOT EXISTS bootcamp.hw3_matches(
    match_id STRING,
    mapid STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    game_variant_id STRING,
    is_match_over BOOLEAN,
    completion_date TIMESTAMP,
    match_duration STRING,
    game_mode STRING,
    map_variant_id STRING
)
USING iceberg
CLUSTERED BY (match_id) INTO 16 BUCKETS
"""
spark.sql(matches_ddl)
matches_df.select("*") \
    .write \
    .format("iceberg") \
    .mode("append") \
    .bucketBy(16, "match_id") \
    .saveAsTable("bootcamp.hw3_matches")

                                                                                

In [5]:
spark.sql("""DROP TABLE IF EXISTS bootcamp.hw3_medals_matches_players""")
medals_matches_players_ddl = """
CREATE TABLE IF NOT EXISTS bootcamp.hw3_medals_matches_players(
    match_id STRING,
    player_gamertag STRING,
    medal_id BIGINT,
    count INTEGER
)
USING iceberg
CLUSTERED BY (match_id) INTO 16 BUCKETS
"""
spark.sql(medals_matches_players_ddl)
medals_matches_players_df.select("*") \
    .write \
    .format("iceberg") \
    .mode("append") \
    .bucketBy(16, "match_id") \
    .saveAsTable("bootcamp.hw3_medals_matches_players")

                                                                                

In [6]:
spark.sql("""DROP TABLE IF EXISTS bootcamp.hw3_match_details""")
match_details_ddl = """
CREATE TABLE IF NOT EXISTS bootcamp.hw3_matche_details(
    match_id STRING,
    player_gamertag STRING,
    spartan_rank INTEGER,
    player_total_kills INTEGER
)
USING iceberg
CLUSTERED BY (match_id) INTO 16 BUCKETS
"""
spark.sql(match_details_ddl)
match_details_df.select("match_id", "player_gamertag", "spartan_rank", "player_total_kills") \
    .write \
    .format("iceberg") \
    .mode("append") \
    .bucketBy(16, "match_id") \
    .saveAsTable("bootcamp.hw3_match_details")

                                                                                

In [17]:
spark.sql("SELECT * FROM bootcamp.hw3_medals_matches_players").show(10)

+--------------------+---------------+----------+-----+
|            match_id|player_gamertag|  medal_id|count|
+--------------------+---------------+----------+-----+
|27d7c16b-b780-4f8...|       EcZachly| 824733727|    1|
|27d7c16b-b780-4f8...|       EcZachly|3261908037|    5|
|27d7c16b-b780-4f8...|       EcZachly|2078758684|    1|
|27d7c16b-b780-4f8...|       EcZachly|1573153198|    1|
|27d7c16b-b780-4f8...|       EcZachly|2782465081|    1|
|27d7c16b-b780-4f8...|       EcZachly|2287626681|    1|
|e39c1eac-a39b-4e0...|       EcZachly| 250435527|    1|
|e39c1eac-a39b-4e0...|       EcZachly|3261908037|    2|
|e39c1eac-a39b-4e0...|       EcZachly|3400287617|    1|
|6128f58a-e42e-472...|       EcZachly|3261908037|    8|
+--------------------+---------------+----------+-----+
only showing top 10 rows



In [20]:
spark.sql("SELECT * FROM bootcamp.hw3_matches").show(5)

+--------------------+--------------------+------------+--------------------+--------------------+-------------+-------------------+--------------+---------+--------------------+
|            match_id|               mapid|is_team_game|         playlist_id|     game_variant_id|is_match_over|    completion_date|match_duration|game_mode|      map_variant_id|
+--------------------+--------------------+------------+--------------------+--------------------+-------------+-------------------+--------------+---------+--------------------+
|f44c9997-eb6f-4d6...|ce1dc2de-f206-11e...|        true|0504ca3c-de41-48f...|b0df8938-0fb6-42e...|         true|2016-02-28 00:00:00|          NULL|     NULL|d5a6277a-96d5-499...|
|f0f2daf2-52f3-4ff...|cbcea2c0-f206-11e...|        NULL|2323b76a-db98-4e0...|257a305e-4dd3-41f...|         NULL|2016-02-04 00:00:00|          NULL|     NULL|7108c409-6d1e-41d...|
|8aec419e-2bfa-4fc...|c7edbf0f-f206-11e...|        true|f72e0ef0-7c4a-430...|1e473914-46e4-408...|       

In [21]:
spark.sql("SELECT * FROM bootcamp.hw3_match_details").show(5)

+--------------------+---------------+------------+------------------+
|            match_id|player_gamertag|spartan_rank|player_total_kills|
+--------------------+---------------+------------+------------------+
|f8852913-2ccf-46f...|    OneWingKing|         122|                 7|
|155cfd23-4f97-4f1...|   BigChubSmith|           8|                15|
|155cfd23-4f97-4f1...|  JakeWilson801|          18|                18|
|155cfd23-4f97-4f1...|      taterbase|           5|                 1|
|155cfd23-4f97-4f1...| BeyondHumanx39|          24|                13|
+--------------------+---------------+------------+------------------+
only showing top 5 rows



In [7]:
match_details_df.select("match_id").distinct().count()

                                                                                

19050

In [8]:
matches_df.select("match_id").distinct().count()

24025

In [9]:
medals_matches_players_df.select("match_id").distinct().count()

                                                                                

18942

In [11]:
spark.sql("SELECT count( distinct match_id) FROM bootcamp.hw3_medals_matches_players").show()



+------------------------+
|count(DISTINCT match_id)|
+------------------------+
|                   18942|
+------------------------+



                                                                                

In [4]:
def get_columns_without_match_id(table_name, alias):
    all_columns = spark.table(table_name).columns
    return [f"{alias}.{col}" for col in all_columns if col != "match_id"]
matches_columns = get_columns_without_match_id("bootcamp.hw3_matches", "m")
match_details_columns = get_columns_without_match_id("bootcamp.hw3_match_details", "md")
medals_columns = get_columns_without_match_id("bootcamp.hw3_medals_matches_players", "mp")    

In [5]:
joining_query = f"""
SELECT
    COALESCE(m.match_id, md.match_id) AS match_id,
    {', '.join(matches_columns)}, 
    {', '.join(match_details_columns)}, 
    mp.player_gamertag AS medal_player_gamertag,
    mp.medal_id,
    mp.count
FROM bootcamp.hw3_matches AS m
FULL OUTER JOIN bootcamp.hw3_match_details AS md
    ON m.match_id = md.match_id
FULL OUTER JOIN bootcamp.hw3_medals_matches_players AS mp
    ON COALESCE(m.match_id, md.match_id) = mp.match_id
"""
joined_table_df = spark.sql(joining_query)

In [27]:
subset_df = joined_table_df.limit(100) 
subset_pdf = subset_df.toPandas()
print(subset_pdf)

                                                                                

                                match_id  \
0   0012f42d-dfbc-44e8-ab2b-49b854a015d1   
1   00169217-cca6-4b47-8df0-559ee424143f   
2   00169217-cca6-4b47-8df0-559ee424143f   
3   00169217-cca6-4b47-8df0-559ee424143f   
4   00169217-cca6-4b47-8df0-559ee424143f   
..                                   ...   
95  00169217-cca6-4b47-8df0-559ee424143f   
96  00169217-cca6-4b47-8df0-559ee424143f   
97  00169217-cca6-4b47-8df0-559ee424143f   
98  00169217-cca6-4b47-8df0-559ee424143f   
99  00169217-cca6-4b47-8df0-559ee424143f   

                                   mapid  is_team_game  \
0   cebd854f-f206-11e4-b46e-24be05e24f7e          True   
1   cc040aa1-f206-11e4-a3e0-24be05e24f7e          True   
2   cc040aa1-f206-11e4-a3e0-24be05e24f7e          True   
3   cc040aa1-f206-11e4-a3e0-24be05e24f7e          True   
4   cc040aa1-f206-11e4-a3e0-24be05e24f7e          True   
..                                   ...           ...   
95  cc040aa1-f206-11e4-a3e0-24be05e24f7e          True   
96  cc0

**Question4** Aggregate the joined data frame. </br>
**Q4.1** Which player averages the most kills per game?

In [6]:
joined_table_df.createOrReplaceTempView("joined_table")

In [29]:
q1 = """ 
WITH deduplicated_table AS (
    SELECT DISTINCT 
        match_id, 
        player_gamertag, 
        player_total_kills
    FROM 
        joined_table
),
player_match_kills AS (
    SELECT 
        player_gamertag,
        match_id,
        SUM(player_total_kills) AS total_kills_per_match
    FROM 
        deduplicated_table
    GROUP BY 
        player_gamertag, match_id
),
player_avg_kills AS (
    SELECT 
        player_gamertag,
        AVG(total_kills_per_match) AS avg_kills_per_match
    FROM 
        player_match_kills
    GROUP BY 
        player_gamertag
)
SELECT 
    player_gamertag,
    avg_kills_per_match
FROM 
    player_avg_kills
ORDER BY 
    avg_kills_per_match DESC
LIMIT 1;
"""
spark.sql(q1).show()



+---------------+-------------------+
|player_gamertag|avg_kills_per_match|
+---------------+-------------------+
|   gimpinator14|              109.0|
+---------------+-------------------+



                                                                                

In [35]:
# spark.sql("""select match_id, player_total_kills, medal_player_gamertag, medal_id,count from joined_table where player_gamertag == 'gimpinator14' """).show(truncate=False)

**Q4.2** Which playlist gets played the most?

In [48]:
q2 = """
WITH deduplicated_table AS (
    SELECT DISTINCT 
        match_id, 
        playlist_id
    FROM 
        joined_table
)
SELECT 
    playlist_id,
    COUNT(*) AS times_played
FROM 
    deduplicated_table
GROUP BY 
    playlist_id
ORDER BY 
    times_played DESC
LIMIT 1;
"""
spark.sql(q2).show(truncate=False)



+------------------------------------+------------+
|playlist_id                         |times_played|
+------------------------------------+------------+
|f72e0ef0-7c4a-4307-af78-8e38dac3fdba|9350        |
+------------------------------------+------------+



                                                                                

**Q4.3** Which map gets played the most?

In [49]:
q3 = """
WITH deduplicated_table AS (
    SELECT DISTINCT 
        match_id, 
        mapid
    FROM 
        joined_table
)
SELECT 
    mapid,
    COUNT(*) AS times_played
FROM 
    deduplicated_table
GROUP BY 
    mapid
ORDER BY 
    times_played DESC
LIMIT 1;
"""
spark.sql(q3).show(truncate=False)



+------------------------------------+------------+
|mapid                               |times_played|
+------------------------------------+------------+
|c7edbf0f-f206-11e4-aa52-24be05e24f7e|8587        |
+------------------------------------+------------+



                                                                                

**Q4.4**Which map do players get the most Killing Spree medals on?

In [46]:
df_medals.createOrReplaceTempView("medals")

In [47]:
q4="""
    WITH deduplicated_table AS (
        SELECT DISTINCT 
            jt.match_id, 
            jt.mapid,
            jt.medal_player_gamertag,
            jt.medal_id,
            jt.count
        FROM 
            joined_table jt
        JOIN
            medals m
        ON 
            jt.medal_id = m.medal_id
        WHERE 
            m.classification = 'KillingSpree'
    )
        SELECT 
        mapid, 
        SUM(count) AS total_medals
    FROM 
        deduplicated_table
    GROUP BY 
        mapid
    ORDER BY 
        total_medals DESC
    LIMIT 1
"""
spark.sql(q4).show(truncate=False)

24/12/10 16:57:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/10 16:57:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/10 16:57:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/10 16:57:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/10 16:57:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/10 16:57:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/10 16:57:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/10 16:57:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/10 16:57:28 WARN RowBasedKeyValueBatch: Calling spill() on

+------------------------------------+------------+
|mapid                               |total_medals|
+------------------------------------+------------+
|c7edbf0f-f206-11e4-aa52-24be05e24f7e|6925        |
+------------------------------------+------------+



                                                                                

**Q5** Try different `.sortWithinPartitions`

In [7]:
spark.sql(
"""CREATE TABLE IF NOT EXISTS bootcamp.hw3_joined_table (
    match_id STRING,
    mapid STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    game_variant_id STRING,
    is_match_over BOOLEAN,
    completion_date TIMESTAMP,
    match_duration STRING,
    game_mode STRING,
    map_variant_id STRING,
    player_gamertag STRING,
    spartan_rank INTEGER,
    player_total_kills INTEGER,
    medal_player_gamertag STRING,
    medal_id BIGINT,
    count INTEGER
)
USING iceberg
    PARTITIONED BY (match_id);
    """
)

DataFrame[]

In [8]:
spark.sql(
"""CREATE TABLE IF NOT EXISTS bootcamp.hw3_sorted_1 (
    match_id STRING,
    mapid STRING,
    is_team_game BOOLEAN,
    playlist_id STRING,
    game_variant_id STRING,
    is_match_over BOOLEAN,
    completion_date TIMESTAMP,
    match_duration STRING,
    game_mode STRING,
    map_variant_id STRING,
    player_gamertag STRING,
    spartan_rank INTEGER,
    player_total_kills INTEGER,
    medal_player_gamertag STRING,
    medal_id BIGINT,
    count INTEGER
)
USING iceberg
    PARTITIONED BY (match_id);
    """
)

DataFrame[]

In [53]:
start_df = joined_table_df.repartition(4, col("match_id"))
sorted_df_1 = start_df.sortWithinPartitions(col("match_id"), col("player_gamertag"), col("mapid")) 

start_df.write.mode("overwrite").saveAsTable("bootcamp.hw3_joined_table")
sorted_df_1.write.mode("overwrite").saveAsTable("bootcamp.hw3_sorted_1")

                                                                                

In [55]:
%%sql
select * from bootcamp.hw3_joined_table.files

24/12/10 18:08:10 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


content,file_path,file_format,spec_id,partition,record_count,file_size_in_bytes,column_sizes,value_counts,null_value_counts,nan_value_counts,lower_bounds,upper_bounds,key_metadata,split_offsets,equality_ids,sort_order_id,readable_metrics
0,s3://warehouse/bootcamp/hw3_joined_table/data/00000-445-32a73932-45f8-45e2-92be-b573265aac4a-0-00001.parquet,PARQUET,1,Row(match_id=None),1690168,2147520,"{1: 153280, 2: 20332, 3: 8172, 4: 19550, 5: 15729, 6: 6488, 7: 24376, 8: 3570, 9: 3570, 10: 30346, 11: 315229, 12: 80511, 13: 67884, 14: 745234, 15: 244357, 16: 342208}","{1: 1690168, 2: 1690168, 3: 1690168, 4: 1690168, 5: 1690168, 6: 1690168, 7: 1690168, 8: 1690168, 9: 1690168, 10: 1690168, 11: 1690168, 12: 1690168, 13: 1690168, 14: 1690168, 15: 1690168, 16: 1690168}","{1: 0, 2: 0, 3: 113155, 4: 0, 5: 0, 6: 113155, 7: 0, 8: 1690168, 9: 1690168, 10: 886154, 11: 1204, 12: 1204, 13: 1204, 14: 1477, 15: 1477, 16: 1477}",{},"{1: bytearray(b'0000e3cf-727c-49'), 2: bytearray(b'5e130537-2275-40'), 3: bytearray(b'\x00'), 4: bytearray(b'0504ca3c-de41-48'), 5: bytearray(b'1571fdac-e0b4-4e'), 6: bytearray(b'\x01'), 7: bytearray(b'\x00\xa0L\xc2\n#\x05\x00'), 10: bytearray(b'00e24014-e0c7-44'), 11: bytearray(b'A 29 Delivery'), 12: bytearray(b'\x01\x00\x00\x00'), 13: bytearray(b'\x00\x00\x00\x00'), 14: bytearray(b'A 29 Delivery'), 15: bytearray(b'Uc\x1e\x02\x00\x00\x00\x00'), 16: bytearray(b'\x01\x00\x00\x00')}","{1: bytearray(b'fff72374-8977-49'), 2: bytearray(b'cebd854f-f206-12'), 3: bytearray(b'\x01'), 4: bytearray(b'fe2ad4e1-3def-47'), 5: bytearray(b'f6de5351-3797-42'), 6: bytearray(b'\x01'), 7: bytearray(b'\x00\xa0\xfa\xecq=\x05\x00'), 10: bytearray(b'fffc437b-2fd9-47'), 11: bytearray(b'zztonii'), 12: bytearray(b'\x97\x00\x00\x00'), 13: bytearray(b'm\x00\x00\x00'), 14: bytearray(b'zztonii'), 15: bytearray(b'*Sx\xfd\x00\x00\x00\x00'), 16: bytearray(b""\'\x00\x00\x00"")}",,[4],,0,"Row(completion_date=Row(column_size=24376, value_count=1690168, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2015, 10, 27, 0, 0), upper_bound=datetime.datetime(2016, 9, 27, 0, 0)), count=Row(column_size=342208, value_count=1690168, null_value_count=1477, nan_value_count=None, lower_bound=1, upper_bound=39), game_mode=Row(column_size=3570, value_count=1690168, null_value_count=1690168, nan_value_count=None, lower_bound=None, upper_bound=None), game_variant_id=Row(column_size=15729, value_count=1690168, null_value_count=0, nan_value_count=None, lower_bound='1571fdac-e0b4-4e', upper_bound='f6de5351-3797-42'), is_match_over=Row(column_size=6488, value_count=1690168, null_value_count=113155, nan_value_count=None, lower_bound=True, upper_bound=True), is_team_game=Row(column_size=8172, value_count=1690168, null_value_count=113155, nan_value_count=None, lower_bound=False, upper_bound=True), map_variant_id=Row(column_size=30346, value_count=1690168, null_value_count=886154, nan_value_count=None, lower_bound='00e24014-e0c7-44', upper_bound='fffc437b-2fd9-47'), mapid=Row(column_size=20332, value_count=1690168, null_value_count=0, nan_value_count=None, lower_bound='5e130537-2275-40', upper_bound='cebd854f-f206-12'), match_duration=Row(column_size=3570, value_count=1690168, null_value_count=1690168, nan_value_count=None, lower_bound=None, upper_bound=None), match_id=Row(column_size=153280, value_count=1690168, null_value_count=0, nan_value_count=None, lower_bound='0000e3cf-727c-49', upper_bound='fff72374-8977-49'), medal_id=Row(column_size=244357, value_count=1690168, null_value_count=1477, nan_value_count=None, lower_bound=35545941, upper_bound=4252521258), medal_player_gamertag=Row(column_size=745234, value_count=1690168, null_value_count=1477, nan_value_count=None, lower_bound='A 29 Delivery', upper_bound='zztonii'), player_gamertag=Row(column_size=315229, value_count=1690168, null_value_count=1204, nan_value_count=None, lower_bound='A 29 Delivery', upper_bound='zztonii'), player_total_kills=Row(column_size=67884, value_count=1690168, null_value_count=1204, nan_value_count=None, lower_bound=0, upper_bound=109), playlist_id=Row(column_size=19550, value_count=1690168, null_value_count=0, nan_value_count=None, lower_bound='0504ca3c-de41-48', upper_bound='fe2ad4e1-3def-47'), spartan_rank=Row(column_size=80511, value_count=1690168, null_value_count=1204, nan_value_count=None, lower_bound=1, upper_bound=151))"
0,s3://warehouse/bootcamp/hw3_joined_table/data/00001-446-32a73932-45f8-45e2-92be-b573265aac4a-0-00001.parquet,PARQUET,1,Row(match_id=None),1738116,2163731,"{1: 156735, 2: 20094, 3: 8403, 4: 19429, 5: 16002, 6: 6710, 7: 25383, 8: 3654, 9: 3654, 10: 31007, 11: 321218, 12: 82069, 13: 69397, 14: 757804, 15: 255876, 16: 318294}","{1: 1738116, 2: 1738116, 3: 1738116, 4: 1738116, 5: 1738116, 6: 1738116, 7: 1738116, 8: 1738116, 9: 1738116, 10: 1738116, 11: 1738116, 12: 1738116, 13: 1738116, 14: 1738116, 15: 1738116, 16: 1738116}","{1: 0, 2: 0, 3: 111426, 4: 0, 5: 0, 6: 111426, 7: 0, 8: 1738116, 9: 1738116, 10: 932732, 11: 1236, 12: 1236, 13: 1236, 14: 1566, 15: 1566, 16: 1566}",{},"{1: bytearray(b'000e3254-27b0-4c'), 2: bytearray(b'5e130537-2275-40'), 3: bytearray(b'\x00'), 4: bytearray(b'0504ca3c-de41-48'), 5: bytearray(b'1571fdac-e0b4-4e'), 6: bytearray(b'\x01'), 7: bytearray(b'\x00\xa0L\xc2\n#\x05\x00'), 10: bytearray(b'011eaa34-3f05-48'), 11: bytearray(b'A 2 tailed fox'), 12: bytearray(b'\x01\x00\x00\x00'), 13: bytearray(b'\x00\x00\x00\x00'), 14: bytearray(b'A 2 tailed fox'), 15: bytearray(b'Uc\x1e\x02\x00\x00\x00\x00'), 16: bytearray(b'\x01\x00\x00\x00')}","{1: bytearray(b'fff59f47-2a37-4c'), 2: bytearray(b'cebd854f-f206-12'), 3: bytearray(b'\x01'), 4: bytearray(b'f72e0ef0-7c4a-44'), 5: bytearray(b'f6de5351-3797-42'), 6: bytearray(b'\x01'), 7: bytearray(b'\x00\x00\xd2\n\x86=\x05\x00'), 10: bytearray(b'fffc437b-2fd9-47'), 11: bytearray(b'zzSOzz'), 12: bytearray(b'\x97\x00\x00\x00'), 13: bytearray(b'S\x00\x00\x00'), 14: bytearray(b'zzSOzz'), 15: bytearray(b'*Sx\xfd\x00\x00\x00\x00'), 16: bytearray(b':\x00\x00\x00')}",,[4],,0,"Row(completion_date=Row(column_size=25383, value_count=1738116, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2015, 10, 27, 0, 0), upper_bound=datetime.datetime(2016, 9, 28, 0, 0)), count=Row(column_size=318294, value_count=1738116, null_value_count=1566, nan_value_count=None, lower_bound=1, upper_bound=58), game_mode=Row(column_size=3654, value_count=1738116, null_value_count=1738116, nan_value_count=None, lower_bound=None, upper_bound=None), game_variant_id=Row(column_size=16002, value_count=1738116, null_value_count=0, nan_value_count=None, lower_bound='1571fdac-e0b4-4e', upper_bound='f6de5351-3797-42'), is_match_over=Row(column_size=6710, value_count=1738116, null_value_count=111426, nan_value_count=None, lower_bound=True, upper_bound=True), is_team_game=Row(column_size=8403, value_count=1738116, null_value_count=111426, nan_value_count=None, lower_bound=False, upper_bound=True), map_variant_id=Row(column_size=31007, value_count=1738116, null_value_count=932732, nan_value_count=None, lower_bound='011eaa34-3f05-48', upper_bound='fffc437b-2fd9-47'), mapid=Row(column_size=20094, value_count=1738116, null_value_count=0, nan_value_count=None, lower_bound='5e130537-2275-40', upper_bound='cebd854f-f206-12'), match_duration=Row(column_size=3654, value_count=1738116, null_value_count=1738116, nan_value_count=None, lower_bound=None, upper_bound=None), match_id=Row(column_size=156735, value_count=1738116, null_value_count=0, nan_value_count=None, lower_bound='000e3254-27b0-4c', upper_bound='fff59f47-2a37-4c'), medal_id=Row(column_size=255876, value_count=1738116, null_value_count=1566, nan_value_count=None, lower_bound=35545941, upper_bound=4252521258), medal_player_gamertag=Row(column_size=757804, value_count=1738116, null_value_count=1566, nan_value_count=None, lower_bound='A 2 tailed fox', upper_bound='zzSOzz'), player_gamertag=Row(column_size=321218, value_count=1738116, null_value_count=1236, nan_value_count=None, lower_bound='A 2 tailed fox', upper_bound='zzSOzz'), player_total_kills=Row(column_size=69397, value_count=1738116, null_value_count=1236, nan_value_count=None, lower_bound=0, upper_bound=83), playlist_id=Row(column_size=19429, value_count=1738116, null_value_count=0, nan_value_count=None, lower_bound='0504ca3c-de41-48', upper_bound='f72e0ef0-7c4a-44'), spartan_rank=Row(column_size=82069, value_count=1738116, null_value_count=1236, nan_value_count=None, lower_bound=1, upper_bound=151))"
0,s3://warehouse/bootcamp/hw3_joined_table/data/00002-447-32a73932-45f8-45e2-92be-b573265aac4a-0-00001.parquet,PARQUET,1,Row(match_id=None),1724725,2184579,"{1: 155790, 2: 20750, 3: 8180, 4: 19687, 5: 15953, 6: 6716, 7: 26075, 8: 3652, 9: 3652, 10: 30882, 11: 317920, 12: 81985, 13: 68427, 14: 760188, 15: 235625, 16: 361001}","{1: 1724725, 2: 1724725, 3: 1724725, 4: 1724725, 5: 1724725, 6: 1724725, 7: 1724725, 8: 1724725, 9: 1724725, 10: 1724725, 11: 1724725, 12: 1724725, 13: 1724725, 14: 1724725, 15: 1724725, 16: 1724725}","{1: 0, 2: 0, 3: 136428, 4: 0, 5: 0, 6: 136428, 7: 0, 8: 1724725, 9: 1724725, 10: 896680, 11: 1265, 12: 1265, 13: 1265, 14: 1455, 15: 1455, 16: 1455}",{},"{1: bytearray(b'0001a1c4-83dc-4f'), 2: bytearray(b'5e130537-2275-40'), 3: bytearray(b'\x00'), 4: bytearray(b'0504ca3c-de41-48'), 5: bytearray(b'1571fdac-e0b4-4e'), 6: bytearray(b'\x01'), 7: bytearray(b'\x00\xa0L\xc2\n#\x05\x00'), 10: bytearray(b'011eaa34-3f05-48'), 11: bytearray(b'A 0 N Eclipse'), 12: bytearray(b'\x01\x00\x00\x00'), 13: bytearray(b'\x00\x00\x00\x00'), 14: bytearray(b'A 0 N Eclipse'), 15: bytearray(b'Uc\x1e\x02\x00\x00\x00\x00'), 16: bytearray(b'\x01\x00\x00\x00')}","{1: bytearray(b'fffc65f4-bc88-41'), 2: bytearray(b'cebd854f-f206-12'), 3: bytearray(b'\x01'), 4: bytearray(b'fe2ad4e1-3def-47'), 5: bytearray(b'f6de5351-3797-42'), 6: bytearray(b'\x01'), 7: bytearray(b'\x00\x00\xd2\n\x86=\x05\x00'), 10: bytearray(b'fffc437b-2fd9-47'), 11: bytearray(b'zzzgameszzz'), 12: bytearray(b'\x96\x00\x00\x00'), 13: bytearray(b'L\x00\x00\x00'), 14: bytearray(b'zzzgameszzz'), 15: bytearray(b'*Sx\xfd\x00\x00\x00\x00'), 16: bytearray(b'\x1f\x00\x00\x00')}",,[4],,0,"Row(completion_date=Row(column_size=26075, value_count=1724725, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2015, 10, 27, 0, 0), upper_bound=datetime.datetime(2016, 9, 28, 0, 0)), count=Row(column_size=361001, value_count=1724725, null_value_count=1455, nan_value_count=None, lower_bound=1, upper_bound=31), game_mode=Row(column_size=3652, value_count=1724725, null_value_count=1724725, nan_value_count=None, lower_bound=None, upper_bound=None), game_variant_id=Row(column_size=15953, value_count=1724725, null_value_count=0, nan_value_count=None, lower_bound='1571fdac-e0b4-4e', upper_bound='f6de5351-3797-42'), is_match_over=Row(column_size=6716, value_count=1724725, null_value_count=136428, nan_value_count=None, lower_bound=True, upper_bound=True), is_team_game=Row(column_size=8180, value_count=1724725, null_value_count=136428, nan_value_count=None, lower_bound=False, upper_bound=True), map_variant_id=Row(column_size=30882, value_count=1724725, null_value_count=896680, nan_value_count=None, lower_bound='011eaa34-3f05-48', upper_bound='fffc437b-2fd9-47'), mapid=Row(column_size=20750, value_count=1724725, null_value_count=0, nan_value_count=None, lower_bound='5e130537-2275-40', upper_bound='cebd854f-f206-12'), match_duration=Row(column_size=3652, value_count=1724725, null_value_count=1724725, nan_value_count=None, lower_bound=None, upper_bound=None), match_id=Row(column_size=155790, value_count=1724725, null_value_count=0, nan_value_count=None, lower_bound='0001a1c4-83dc-4f', upper_bound='fffc65f4-bc88-41'), medal_id=Row(column_size=235625, value_count=1724725, null_value_count=1455, nan_value_count=None, lower_bound=35545941, upper_bound=4252521258), medal_player_gamertag=Row(column_size=760188, value_count=1724725, null_value_count=1455, nan_value_count=None, lower_bound='A 0 N Eclipse', upper_bound='zzzgameszzz'), player_gamertag=Row(column_size=317920, value_count=1724725, null_value_count=1265, nan_value_count=None, lower_bound='A 0 N Eclipse', upper_bound='zzzgameszzz'), player_total_kills=Row(column_size=68427, value_count=1724725, null_value_count=1265, nan_value_count=None, lower_bound=0, upper_bound=76), playlist_id=Row(column_size=19687, value_count=1724725, null_value_count=0, nan_value_count=None, lower_bound='0504ca3c-de41-48', upper_bound='fe2ad4e1-3def-47'), spartan_rank=Row(column_size=81985, value_count=1724725, null_value_count=1265, nan_value_count=None, lower_bound=1, upper_bound=150))"
0,s3://warehouse/bootcamp/hw3_joined_table/data/00003-448-32a73932-45f8-45e2-92be-b573265aac4a-0-00001.parquet,PARQUET,1,Row(match_id=None),1738952,2200768,"{1: 156593, 2: 20507, 3: 8221, 4: 19717, 5: 16131, 6: 6663, 7: 24925, 8: 3654, 9: 3654, 10: 31160, 11: 321279, 12: 81921, 13: 69116, 14: 764244, 15: 258054, 16: 346814}","{1: 1738952, 2: 1738952, 3: 1738952, 4: 1738952, 5: 1738952, 6: 1738952, 7: 1738952, 8: 1738952, 9: 1738952, 10: 1738952, 11: 1738952, 12: 1738952, 13: 1738952, 14: 1738952, 15: 1738952, 16: 1738952}","{1: 0, 2: 0, 3: 115372, 4: 0, 5: 0, 6: 115372, 7: 0, 8: 1738952, 9: 1738952, 10: 925597, 11: 1270, 12: 1270, 13: 1270, 14: 1605, 15: 1605, 16: 1605}",{},"{1: bytearray(b'0000e589-e3a9-40'), 2: bytearray(b'5e130537-2275-40'), 3: bytearray(b'\x00'), 4: bytearray(b'0504ca3c-de41-48'), 5: bytearray(b'1571fdac-e0b4-4e'), 6: bytearray(b'\x01'), 7: bytearray(b'\x00\xa0L\xc2\n#\x05\x00'), 10: bytearray(b'011eaa34-3f05-48'), 11: bytearray(b'A 0 N Eclipse'), 12: bytearray(b'\x01\x00\x00\x00'), 13: bytearray(b'\x00\x00\x00\x00'), 14: bytearray(b'A 0 N Eclipse'), 15: bytearray(b'Uc\x1e\x02\x00\x00\x00\x00'), 16: bytearray(b'\x01\x00\x00\x00')}","{1: bytearray(b'fffa2980-342d-46'), 2: bytearray(b'cebd854f-f206-12'), 3: bytearray(b'\x01'), 4: bytearray(b'fe2ad4e1-3def-47'), 5: bytearray(b'f6de5351-3797-42'), 6: bytearray(b'\x01'), 7: bytearray(b'\x00\x00\xd2\n\x86=\x05\x00'), 10: bytearray(b'fffc437b-2fd9-47'), 11: bytearray(b'zzzKusohakokids'), 12: bytearray(b'\x97\x00\x00\x00'), 13: bytearray(b'Z\x00\x00\x00'), 14: bytearray(b'zzzKusohakokids'), 15: bytearray(b'*Sx\xfd\x00\x00\x00\x00'), 16: bytearray(b'$\x00\x00\x00')}",,[4],,0,"Row(completion_date=Row(column_size=24925, value_count=1738952, null_value_count=0, nan_value_count=None, lower_bound=datetime.datetime(2015, 10, 27, 0, 0), upper_bound=datetime.datetime(2016, 9, 28, 0, 0)), count=Row(column_size=346814, value_count=1738952, null_value_count=1605, nan_value_count=None, lower_bound=1, upper_bound=36), game_mode=Row(column_size=3654, value_count=1738952, null_value_count=1738952, nan_value_count=None, lower_bound=None, upper_bound=None), game_variant_id=Row(column_size=16131, value_count=1738952, null_value_count=0, nan_value_count=None, lower_bound='1571fdac-e0b4-4e', upper_bound='f6de5351-3797-42'), is_match_over=Row(column_size=6663, value_count=1738952, null_value_count=115372, nan_value_count=None, lower_bound=True, upper_bound=True), is_team_game=Row(column_size=8221, value_count=1738952, null_value_count=115372, nan_value_count=None, lower_bound=False, upper_bound=True), map_variant_id=Row(column_size=31160, value_count=1738952, null_value_count=925597, nan_value_count=None, lower_bound='011eaa34-3f05-48', upper_bound='fffc437b-2fd9-47'), mapid=Row(column_size=20507, value_count=1738952, null_value_count=0, nan_value_count=None, lower_bound='5e130537-2275-40', upper_bound='cebd854f-f206-12'), match_duration=Row(column_size=3654, value_count=1738952, null_value_count=1738952, nan_value_count=None, lower_bound=None, upper_bound=None), match_id=Row(column_size=156593, value_count=1738952, null_value_count=0, nan_value_count=None, lower_bound='0000e589-e3a9-40', upper_bound='fffa2980-342d-46'), medal_id=Row(column_size=258054, value_count=1738952, null_value_count=1605, nan_value_count=None, lower_bound=35545941, upper_bound=4252521258), medal_player_gamertag=Row(column_size=764244, value_count=1738952, null_value_count=1605, nan_value_count=None, lower_bound='A 0 N Eclipse', upper_bound='zzzKusohakokids'), player_gamertag=Row(column_size=321279, value_count=1738952, null_value_count=1270, nan_value_count=None, lower_bound='A 0 N Eclipse', upper_bound='zzzKusohakokids'), player_total_kills=Row(column_size=69116, value_count=1738952, null_value_count=1270, nan_value_count=None, lower_bound=0, upper_bound=90), playlist_id=Row(column_size=19717, value_count=1738952, null_value_count=0, nan_value_count=None, lower_bound='0504ca3c-de41-48', upper_bound='fe2ad4e1-3def-47'), spartan_rank=Row(column_size=81921, value_count=1738952, null_value_count=1270, nan_value_count=None, lower_bound=1, upper_bound=151))"


In [11]:
sorted_df_2 = start_df.sortWithinPartitions(col("match_id"), col("playlist_id"), col("mapid")) 
sorted_df_3 = start_df.sortWithinPartitions(col("match_id"), col("mapid"), col("medal_id")) 
sorted_df_2.write.mode("overwrite").saveAsTable("bootcamp.hw3_sorted_2")
sorted_df_3.write.mode("overwrite").saveAsTable("bootcamp.hw3_sorted_3")

                                                                                

In [12]:
%%sql

SELECT SUM(file_size_in_bytes) as size, COUNT(1) as num_files, 'unsorted'  
FROM bootcamp.hw3_joined_table.files

UNION ALL
SELECT SUM(file_size_in_bytes) as size, COUNT(1) as num_files, 'sorted_1'
FROM bootcamp.hw3_sorted_1.files
UNION ALL
SELECT SUM(file_size_in_bytes) as size, COUNT(1) as num_files, 'sorted_2'
FROM bootcamp.hw3_sorted_2.files
UNION ALL
SELECT SUM(file_size_in_bytes) as size, COUNT(1) as num_files, 'sorted_3'
FROM bootcamp.hw3_sorted_3.files

24/12/11 12:49:43 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


size,num_files,unsorted
8696598,4,unsorted
8706875,4,sorted_1
8711281,4,sorted_2
16967862,4,sorted_3
