In [0]:
# prelude

from pyspark.sql.types import StructType, StructField
from typing import List, Dict, Generator, Union, Callable
from pyspark.sql import functions as F
# !!!! see https://stackoverflow.com/questions/36719039/sum-operation-on-pyspark-dataframe-giving-typeerror-when-type-is-fine
from pyspark.sql.functions import sum as _sum     



# S3 location of the raw Steam games dataset (a JSON file) analysed in this notebook.
kInputFile = 's3://full-stack-bigdata-datasets/Big_Data/Project_Steam/steam_game_output.json'


In [0]:
# Load the raw Steam dataset into a Spark DataFrame; the schema is inferred from the JSON.
df = spark.read.json(kInputFile)

In [0]:
# Sanity check: count the rows that were loaded.
# count() is an action, so this is the first point where the data is actually read.
df.count()

Out[64]: 55691

In [0]:
# Quick look at the first 5 rows, converted to a pandas DataFrame for display.
# NOTE: `data` is a nested struct, which the Arrow-based conversion does not
# support, so Spark falls back to the non-optimised path (see the warning below).
df.limit(5).toPandas()

  Unable to convert the field data. If this column is not necessary, you may consider dropping it or converting to primitive type before the conversion.
Direct cause: Nested StructType not supported in conversion to Arrow
Attempting non-optimization as 'spark.sql.execution.arrow.pyspark.fallback.enabled' is set to true.
  warn(msg)


Unnamed: 0,data,id
0,"(10, [Multi-player, Valve Anti-Cheat enabled, ...",10
1,"(1000000, [Single-player, Partial Controller S...",1000000
2,"(1000010, [Single-player, Partial Controller S...",1000010
3,"(1000030, [Multi-player, Single-player, Co-op,...",1000030
4,"(1000040, [Single-player], 0, DoubleC Games, 0...",1000040


In [0]:
# Print the inferred schema to confirm what is nested inside the `data` struct.
df.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- appid: long (nullable = true)
 |    |-- categories: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- ccu: long (nullable = true)
 |    |-- developer: string (nullable = true)
 |    |-- discount: string (nullable = true)
 |    |-- genre: string (nullable = true)
 |    |-- header_image: string (nullable = true)
 |    |-- initialprice: string (nullable = true)
 |    |-- languages: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- negative: long (nullable = true)
 |    |-- owners: string (nullable = true)
 |    |-- platforms: struct (nullable = true)
 |    |    |-- linux: boolean (nullable = true)
 |    |    |-- mac: boolean (nullable = true)
 |    |    |-- windows: boolean (nullable = true)
 |    |-- positive: long (nullable = true)
 |    |-- price: string (nullable = true)
 |    |-- publisher: string (nullable = true)
 |    |-- release_date: string (nullable = true)
 |    |-


## 3. Platform analysis
1. Are most games available on Windows, Mac, or Linux?
1. Do certain genres tend to be preferentially available on certain platforms?
1. You're free to follow these guidelines, or to choose a different angle of analysis, as long as your analysis reveals relevant and useful information.

### 1. Are most games available on Windows, Mac, or Linux?

In [0]:
# Keep only the game id and the nested `platforms` struct (exposed as `platform`).
df_tmp = df.select(
    F.col("id"),
    F.col("data.platforms").alias("platform"),
)
display(df_tmp)

id,platform
10,"List(true, true, true)"
1000000,"List(false, false, true)"
1000010,"List(false, false, true)"
1000030,"List(false, true, true)"
1000040,"List(false, false, true)"
1000080,"List(false, true, true)"
1000100,"List(false, false, true)"
1000110,"List(false, false, true)"
1000130,"List(false, true, true)"
1000280,"List(false, false, true)"


In [0]:

# Promote each boolean field of the `platform` struct to its own top-level column.
df_flattened = df_tmp.select(
    F.col("id"),
    F.col("platform.linux").alias("linux"),
    F.col("platform.mac").alias("mac"),
    F.col("platform.windows").alias("windows"),
)
df_flattened.show(10)


+-------+-----+-----+-------+
|     id|linux|  mac|windows|
+-------+-----+-----+-------+
|     10| true| true|   true|
|1000000|false|false|   true|
|1000010|false|false|   true|
|1000030|false| true|   true|
|1000040|false|false|   true|
|1000080|false| true|   true|
|1000100|false|false|   true|
|1000110|false|false|   true|
|1000130|false| true|   true|
|1000280|false|false|   true|
+-------+-----+-----+-------+
only showing top 10 rows



Not easy to grasp

In [0]:
# Count the games for every (linux, mac, windows) availability combination.
result = (
    df_flattened
    .groupBy(["linux", "mac", "windows"])
    .agg(F.count(F.col("id")).alias("total_ids"))
)
result.show()

+-----+-----+-------+---------+
|linux|  mac|windows|total_ids|
+-----+-----+-------+---------+
|false| true|   true|     5951|
| true|false|   true|     1647|
| true| true|   true|     6807|
| true|false|  false|        3|
| true| true|  false|        1|
|false| true|  false|       11|
|false|false|   true|    41271|
+-----+-----+-------+---------+



Much better

In [0]:
# Number of games available on each platform over the whole catalogue.
# A game available on several platforms is counted once per platform.
# DataFrame.agg(...) is shorthand for groupBy().agg(...): a single global group.
result = df_flattened.agg(
    F.sum(F.when(F.col("linux"), 1).otherwise(0)).alias("on_linux"),
    F.sum(F.when(F.col("windows"), 1).otherwise(0)).alias("on_windows"),
    F.sum(F.when(F.col("mac"), 1).otherwise(0)).alias("on_mac"),
)
result.show()

+--------+----------+------+
|on_linux|on_windows|on_mac|
+--------+----------+------+
|    8458|     55676| 12770|
+--------+----------+------+



* One could have expected a much better result for Linux since Android is based on a Linux kernel and Steam is available on Android.  
* **Answer to the question :** according to the data from Steam, most of the games run on Windows

### 2. Do certain genres tend to be preferentially available on certain platforms?

In [0]:
# Keep the game id, the nested `platforms` struct (as `platform`) and the genre string.
df_tmp = df.select(
    F.col("id"),
    F.col("data.platforms").alias("platform"),
    F.col("data.genre").alias("genre"),
)
df_tmp.show(10)

+-------+--------------------+--------------------+
|     id|            platform|               genre|
+-------+--------------------+--------------------+
|     10|  {true, true, true}|              Action|
|1000000|{false, false, true}|Action, Adventure...|
|1000010|{false, false, true}|Adventure, Indie,...|
|1000030| {false, true, true}|Action, Indie, Si...|
|1000040|{false, false, true}|Action, Casual, I...|
|1000080| {false, true, true}|Action, Adventure...|
|1000100|{false, false, true}|Adventure, Indie,...|
|1000110|{false, false, true}|Action, Adventure...|
|1000130| {false, true, true}|       Casual, Indie|
|1000280|{false, false, true}|          Indie, RPG|
+-------+--------------------+--------------------+
only showing top 10 rows



In [0]:
# `genre` holds several genres in one ", "-separated string:
# split it and emit one row per (game, genre) pair.
df_tmp_exploded = df_tmp.withColumn(
    "genre",
    F.explode(F.split("genre", ", ")),
)
df_tmp_exploded.show(10)

+-------+--------------------+---------+
|     id|            platform|    genre|
+-------+--------------------+---------+
|     10|  {true, true, true}|   Action|
|1000000|{false, false, true}|   Action|
|1000000|{false, false, true}|Adventure|
|1000000|{false, false, true}|    Indie|
|1000010|{false, false, true}|Adventure|
|1000010|{false, false, true}|    Indie|
|1000010|{false, false, true}|      RPG|
|1000010|{false, false, true}| Strategy|
|1000030| {false, true, true}|   Action|
|1000030| {false, true, true}|    Indie|
+-------+--------------------+---------+
only showing top 10 rows



In [0]:

# Promote the boolean platform flags to top-level columns, keeping the exploded genre.
df_flattened = df_tmp_exploded.select(
    F.col("id"),
    F.col("platform.linux").alias("linux"),
    F.col("platform.mac").alias("mac"),
    F.col("platform.windows").alias("windows"),
    F.col("genre"),
)
df_flattened.show(10)

+-------+-----+-----+-------+---------+
|     id|linux|  mac|windows|    genre|
+-------+-----+-----+-------+---------+
|     10| true| true|   true|   Action|
|1000000|false|false|   true|   Action|
|1000000|false|false|   true|Adventure|
|1000000|false|false|   true|    Indie|
|1000010|false|false|   true|Adventure|
|1000010|false|false|   true|    Indie|
|1000010|false|false|   true|      RPG|
|1000010|false|false|   true| Strategy|
|1000030|false| true|   true|   Action|
|1000030|false| true|   true|    Indie|
+-------+-----+-----+-------+---------+
only showing top 10 rows



In [0]:
# For each genre, count how many of its games are available on each platform.
genre_per_platform = (
    df_flattened
    .groupBy("genre")
    .agg(
        F.sum(F.when(F.col("linux"), 1).otherwise(0)).alias("on_linux"),
        F.sum(F.when(F.col("windows"), 1).otherwise(0)).alias("on_windows"),
        F.sum(F.when(F.col("mac"), 1).otherwise(0)).alias("on_mac"),
    )
)
display(genre_per_platform.head(10))

genre,on_linux,on_windows,on_mac
Education,19,317,56
Massively Multiplayer,164,1459,270
Sexual Content,7,54,13
Adventure,3302,21427,5039
Sports,287,2665,506
Accounting,0,16,4
Audio Production,7,193,41
Video Production,6,247,29
Animation & Modeling,38,322,74
Racing,304,2154,424


Condensed version with one display at the very end 

In [0]:
# Same pipeline as the step-by-step cells, run in one go:
# select -> explode genres -> flatten platform flags -> count per genre.
df_tmp = df.select(
    F.col("id"),
    F.col("data.platforms").alias("platform"),
    F.col("data.genre").alias("genre"),
)

# One row per (game, genre) pair.
df_tmp_exploded = df_tmp.withColumn(
    "genre",
    F.explode(F.split("genre", ", ")),
)

df_flattened = df_tmp_exploded.select(
    F.col("id"),
    F.col("platform.linux").alias("linux"),
    F.col("platform.mac").alias("mac"),
    F.col("platform.windows").alias("windows"),
    F.col("genre"),
)

# Per-genre number of games available on each platform.
genre_per_platform = (
    df_flattened
    .groupBy("genre")
    .agg(
        F.sum(F.when(F.col("linux"), 1).otherwise(0)).alias("on_linux"),
        F.sum(F.when(F.col("windows"), 1).otherwise(0)).alias("on_windows"),
        F.sum(F.when(F.col("mac"), 1).otherwise(0)).alias("on_mac"),
    )
)
display(genre_per_platform.head(10))

genre,on_linux,on_windows,on_mac
Education,19,317,56
Massively Multiplayer,164,1459,270
Sexual Content,7,54,13
Adventure,3302,21427,5039
Sports,287,2665,506
Accounting,0,16,4
Audio Production,7,193,41
Video Production,6,247,29
Animation & Modeling,38,322,74
Racing,304,2154,424


Condensed version (one display at the very end) with percentage per line

In [0]:
from pyspark.sql.functions import col

# Total number of platform listings for each genre (Linux + Windows + Mac).
genre_totals = genre_per_platform.withColumn(
    "total",
    col("on_linux") + col("on_windows") + col("on_mac"),
)

# Share of each platform within a genre's listings, as a percentage;
# the raw counts and the total are dropped once the shares are computed.
genre_percentages = (
    genre_totals
    .withColumn("percentage_linux", col("on_linux") / col("total") * 100)
    .withColumn("percentage_windows", col("on_windows") / col("total") * 100)
    .withColumn("percentage_mac", col("on_mac") / col("total") * 100)
    .drop("on_linux", "on_windows", "on_mac", "total")
)

# Show the result.
display(genre_percentages)

genre,percentage_linux,percentage_windows,percentage_mac
Education,4.846938775510204,80.86734693877551,14.285714285714285
Massively Multiplayer,8.6634970945589,77.07342842049657,14.263074484944532
Sexual Content,9.45945945945946,72.97297297297297,17.56756756756757
Adventure,11.092448266595,71.97997850040312,16.92757323300188
Sports,8.299595141700406,77.06766917293233,14.632735685367264
Accounting,0.0,80.0,20.0
Audio Production,2.904564315352697,80.08298755186722,17.012448132780083
Video Production,2.127659574468085,87.58865248226951,10.28368794326241
Animation & Modeling,8.755760368663594,74.19354838709677,17.050691244239633
Racing,10.54823039555864,74.73976405274115,14.712005551700209


Condensed version with percentages rounded to two digits after the decimal point

In [0]:
# Rebuild the per-genre platform counts from the raw data:
# select -> explode genres -> flatten platform flags -> count per genre.
df_tmp = df.select(
    F.col("id"),
    F.col("data.platforms").alias("platform"),
    F.col("data.genre").alias("genre"),
)
df_tmp_exploded = df_tmp.withColumn(
    "genre",
    F.explode(F.split("genre", ", ")),
)
df_flattened = df_tmp_exploded.select(
    F.col("id"),
    F.col("platform.linux").alias("linux"),
    F.col("platform.mac").alias("mac"),
    F.col("platform.windows").alias("windows"),
    F.col("genre"),
)
genre_per_platform = (
    df_flattened
    .groupBy("genre")
    .agg(
        F.sum(F.when(F.col("linux"), 1).otherwise(0)).alias("on_linux"),
        F.sum(F.when(F.col("windows"), 1).otherwise(0)).alias("on_windows"),
        F.sum(F.when(F.col("mac"), 1).otherwise(0)).alias("on_mac"),
    )
)

# Total number of platform listings for each genre (Linux + Windows + Mac).
genre_totals = genre_per_platform.withColumn(
    "total",
    F.col("on_linux") + F.col("on_windows") + F.col("on_mac"),
)

# Share of each platform within a genre's listings, rounded to 2 decimals.
genre_percentages = (
    genre_totals
    .withColumn("percentage_linux", F.round(F.col("on_linux") / F.col("total") * 100, 2))
    .withColumn("percentage_windows", F.round(F.col("on_windows") / F.col("total") * 100, 2))
    .withColumn("percentage_mac", F.round(F.col("on_mac") / F.col("total") * 100, 2))
    .drop("on_linux", "on_windows", "on_mac", "total")
)

# Format the percentage columns with two digits after the decimal point.
# NOTE: format_number returns strings, so these columns are no longer numeric.
genre_percentages = (
    genre_percentages
    .withColumn("percentage_linux", F.format_number("percentage_linux", 2))
    .withColumn("percentage_windows", F.format_number("percentage_windows", 2))
    .withColumn("percentage_mac", F.format_number("percentage_mac", 2))
)

# Show the result.
display(genre_percentages)

genre,percentage_linux,percentage_windows,percentage_mac
Education,4.85,80.87,14.29
Massively Multiplayer,8.66,77.07,14.26
Sexual Content,9.46,72.97,17.57
Adventure,11.09,71.98,16.93
Sports,8.3,77.07,14.63
Accounting,0.0,80.0,20.0
Audio Production,2.9,80.08,17.01
Video Production,2.13,87.59,10.28
Animation & Modeling,8.76,74.19,17.05
Racing,10.55,74.74,14.71


* **Answer to the question :** Most of the market is on Windows. Adventure, Sports and Racing are the top 3 on the 3 platforms.

### 3. Is there any publisher dedicated to one platform ?

In [0]:
# Same analysis as for genres, grouped by publisher instead.
# A publisher is a single value per game, so no explode step is needed here.
df_tmp = df.select(
    F.col("id"),
    F.col("data.platforms").alias("platform"),
    F.col("data.publisher").alias("publisher"),
)
df_flattened = df_tmp.select(
    F.col("id"),
    F.col("platform.linux").alias("linux"),
    F.col("platform.mac").alias("mac"),
    F.col("platform.windows").alias("windows"),
    F.col("publisher"),
)

# Per-publisher number of games available on each platform.
publisher_per_platform = (
    df_flattened
    .groupBy("publisher")
    .agg(
        F.sum(F.when(F.col("linux"), 1).otherwise(0)).alias("on_linux"),
        F.sum(F.when(F.col("windows"), 1).otherwise(0)).alias("on_windows"),
        F.sum(F.when(F.col("mac"), 1).otherwise(0)).alias("on_mac"),
    )
)

# Total number of platform listings for each publisher (Linux + Windows + Mac).
publisher_totals = publisher_per_platform.withColumn(
    "total",
    F.col("on_linux") + F.col("on_windows") + F.col("on_mac"),
)

# Share of each platform within a publisher's listings, rounded to 2 decimals.
publisher_percentages = (
    publisher_totals
    .withColumn("percentage_linux", F.round(F.col("on_linux") / F.col("total") * 100, 2))
    .withColumn("percentage_windows", F.round(F.col("on_windows") / F.col("total") * 100, 2))
    .withColumn("percentage_mac", F.round(F.col("on_mac") / F.col("total") * 100, 2))
    .drop("on_linux", "on_windows", "on_mac", "total")
)

# Format the percentage columns with two digits after the decimal point.
# NOTE: format_number returns strings, so these columns are no longer numeric.
publisher_percentages = (
    publisher_percentages
    .withColumn("percentage_linux", F.format_number("percentage_linux", 2))
    .withColumn("percentage_windows", F.format_number("percentage_windows", 2))
    .withColumn("percentage_mac", F.format_number("percentage_mac", 2))
)

# Show the result.
display(publisher_percentages)

publisher,percentage_linux,percentage_windows,percentage_mac
Mykhail Konokh,0.0,100.0,0.0
TAKS,0.0,100.0,0.0
IR Studio,0.0,100.0,0.0
BBB Games,0.0,100.0,0.0
Schmidt Workshops,0.0,100.0,0.0
Iceberg Interactive,13.43,68.66,17.91
Navila Software Japan,0.0,100.0,0.0
Virtual Human Interaction Lab,0.0,100.0,0.0
Sword Garden Studios,0.0,100.0,0.0
Decumanus Games,0.0,100.0,0.0


* **Answer to the question :** Yes but only on Windows and Mac. 