In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, isnull
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Creating Spark session
spark = SparkSession.builder.appName("SteamGamesSuccess").getOrCreate()

data = "games_march2025_full.csv"
df = spark.read.csv("games_march2025_full.csv", header=True, inferSchema=True)
df.show(truncate=False)



+-------+-------------------------------+------------+------------+-----+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [2]:
used_columns = [
    "appid", "name", "release_date", "genres", "tags", "developers", "publishers",
    "price", "discount", "recommendations", "positive", "negative", "peak_ccu", "dlc_count"
]

df1 = df.select(*used_columns)

df1.show(20)

+-------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+--------+---------------+--------+--------+--------+---------+
|  appid|                name|release_date|              genres|                tags|          developers|          publishers|price|discount|recommendations|positive|negative|peak_ccu|dlc_count|
+-------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+--------+---------------+--------+--------+--------+---------+
|    730|    Counter-Strike 2|  21/08/2012|['Action', 'Free ...|{'FPS': 90857, 'S...|           ['Valve']|           ['Valve']|    0|       0|        4401572| 7480813| 1135108| 1212356|        1|
| 578080| PUBG: BATTLEGROUNDS|  21/12/2017|['Action', 'Adven...|{'Survival': 1483...|['PUBG Corporation']|   ['KRAFTON, Inc.']|    0|       0|        1732007| 1487960| 1024436|  616738|        0|
|    570|           

In [3]:
df1 = df1.withColumnRenamed("discount", "discount_percentage")

In [4]:
df1 = df1.withColumnRenamed("peak_ccu", "peak_playernum")

In [5]:
df1.printSchema()

root
 |-- appid: string (nullable = true)
 |-- name: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- developers: string (nullable = true)
 |-- publishers: string (nullable = true)
 |-- price: string (nullable = true)
 |-- discount_percentage: string (nullable = true)
 |-- recommendations: string (nullable = true)
 |-- positive: string (nullable = true)
 |-- negative: string (nullable = true)
 |-- peak_playernum: string (nullable = true)
 |-- dlc_count: string (nullable = true)



In [6]:
print(f"Total Records: {df1.count()}")

df1.show()

Total Records: 94954
+-------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+-------------------+---------------+--------+--------+--------------+---------+
|  appid|                name|release_date|              genres|                tags|          developers|          publishers|price|discount_percentage|recommendations|positive|negative|peak_playernum|dlc_count|
+-------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+-------------------+---------------+--------+--------+--------------+---------+
|    730|    Counter-Strike 2|  21/08/2012|['Action', 'Free ...|{'FPS': 90857, 'S...|           ['Valve']|           ['Valve']|    0|                  0|        4401572| 7480813| 1135108|       1212356|        1|
| 578080| PUBG: BATTLEGROUNDS|  21/12/2017|['Action', 'Adven...|{'Survival': 1483...|['PUBG Corporation']|   ['KRAFTON, Inc.']|

# Data Preprocessing
- Handling Missing Values

In [7]:
from pyspark.sql.functions import col, sum

null_counts = df1.select([sum(col(c).isNull().cast("int")).alias(c) for c in df1.columns])
null_counts.show()

+-----+----+------------+------+----+----------+----------+-----+-------------------+---------------+--------+--------+--------------+---------+
|appid|name|release_date|genres|tags|developers|publishers|price|discount_percentage|recommendations|positive|negative|peak_playernum|dlc_count|
+-----+----+------------+------+----+----------+----------+-----+-------------------+---------------+--------+--------+--------------+---------+
|    0|   2|           0|     5|  10|         9|         9|    0|                 11|             10|     110|      75|             9|        0|
+-----+----+------------+------+----+----------+----------+-----+-------------------+---------------+--------+--------+--------------+---------+



In [8]:
# We dropped null name rows, because a game without a name is invalid.
# We dropped null tag rows, because there were quite a few of them and we do not get much out of them.
# We dropped null peak_playernum rows, because if it has had no peak it means it it does not have a playerbase, so we do not need it.
df_cleaned = df1.dropna(subset=["name", "tags", "peak_playernum"])

# We changed null genres into "unknown", because there were only two of them. It won't impact the analyzis much.
df_filled = df_cleaned.fillna({"genres": "unknown", "developers": "unknown", "publishers": "unknown", "discount_percentage": "0", "recommendations": "0"
                               ,"positive": "0", "negative": "0"})




In [9]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, regexp_replace, trim, when, round

In [10]:
df = df_filled.withColumn(
    "positive_clean",
    when(
        trim(col("positive")).rlike("^\d+$"),
        trim(col("positive")).cast("double")
    ).otherwise(0)
).withColumn(
    "negative_clean",
    when(
        trim(col("negative")).rlike("^\d+$"),
        trim(col("negative")).cast("double")
    ).otherwise(0)
).withColumn(
    "all_reviews",
    round(col("positive_clean") + col("negative_clean"), 2)
)


In [None]:
#creating the percentage of reviews columns
df_with_percentages = df.withColumn(
    "positive_percentage", 
    (F.col("positive") / F.col("all_reviews") * 100).cast("double")
).withColumn(
    "negative_percentage", 
    (F.col("negative") / F.col("all_reviews") * 100).cast("double")
)

df_with_percentages = df_with_percentages.withColumn(
    "positive_percentage", 
    F.round(F.col("positive_percentage"), 2)
).withColumn(
    "negative_percentage", 
    F.round(F.col("negative_percentage"), 2)
)

df_with_percentages.show()


+-------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+-------------------+---------------+--------+--------+--------------+---------+--------------+--------------+-----------+-------------------+-------------------+
|  appid|                name|release_date|              genres|                tags|          developers|          publishers|price|discount_percentage|recommendations|positive|negative|peak_playernum|dlc_count|positive_clean|negative_clean|all_reviews|positive_percentage|negative_percentage|
+-------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+-------------------+---------------+--------+--------+--------------+---------+--------------+--------------+-----------+-------------------+-------------------+
|    730|    Counter-Strike 2|  21/08/2012|['Action', 'Free ...|{'FPS': 90857, 'S...|           ['Valve']|         

In [None]:
df = df_with_percentages.fillna({"positive_percentage": "0", "negative_percentage": "0"})

In [26]:
null_counts = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
null_counts.show()

+-----+----+------------+------+----+----------+----------+-----+-------------------+---------------+--------+--------+--------------+---------+--------------+--------------+-----------+-------------------+-------------------+
|appid|name|release_date|genres|tags|developers|publishers|price|discount_percentage|recommendations|positive|negative|peak_playernum|dlc_count|positive_clean|negative_clean|all_reviews|positive_percentage|negative_percentage|
+-----+----+------------+------+----+----------+----------+-----+-------------------+---------------+--------+--------+--------------+---------+--------------+--------------+-----------+-------------------+-------------------+
|    0|   0|           0|     0|   0|         0|         0|    0|                  0|              0|       0|       0|             0|        0|             0|             0|          0|                  0|                  0|
+-----+----+------------+------+----+----------+----------+-----+-------------------+-------

In [27]:
df.show()

+-------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+-------------------+---------------+--------+--------+--------------+---------+--------------+--------------+-----------+-------------------+-------------------+
|  appid|                name|release_date|              genres|                tags|          developers|          publishers|price|discount_percentage|recommendations|positive|negative|peak_playernum|dlc_count|positive_clean|negative_clean|all_reviews|positive_percentage|negative_percentage|
+-------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+-------------------+---------------+--------+--------+--------------+---------+--------------+--------------+-----------+-------------------+-------------------+
|    730|    Counter-Strike 2|  21/08/2012|['Action', 'Free ...|{'FPS': 90857, 'S...|           ['Valve']|         

**Duplicates**

In [36]:
id_duplicates = df.groupBy("appid").count().filter("count > 1")
name_duplicates = df.groupBy("name").count().filter("count > 1")

id_duplicates.show()
name_duplicates.show()

+--------------------+-----+
|               appid|count|
+--------------------+-----+
|nd deciding the s...|    2|
|   ahead of schedule|    2|
+--------------------+-----+

+--------------------+-----+
|                name|count|
+--------------------+-----+
|      Eternal Return|    2|
|               Nomad|    2|
|Loading Screen Si...|    2|
|The Lord of the R...|    2|
|             Journey|    2|
|Call of Duty®: Bl...|    2|
|        Blood Strike|    2|
|Call of Duty®: Bl...|    2|
|    EA SPORTS FC™ 24|    4|
|              ISLAND|    2|
|Romance of the Th...|    2|
|            Downfall|    2|
|     torpedo strikes|    2|
|                Home|    2|
|Ys I & II Chronic...|    2|
| Monday Night Combat|    2|
|               Chasm|    2|
|       Second Chance|    2|
|      Hero's Journey|    3|
|          The Bunker|    3|
+--------------------+-----+
only showing top 20 rows



In [37]:
df_no_duplicates = df.dropDuplicates(["name"])

name_duplicates = df_no_duplicates.groupBy("name").count().filter("count > 1")
name_duplicates.show()

+----+-----+
|name|count|
+----+-----+
+----+-----+



In [38]:
used_columns = [
    "appid", "name", "release_date", "genres", "tags", "developers", "publishers",
    "price", "discount_percentage", "recommendations", "peak_playernum", "dlc_count", "all_reviews", "positive_percentage", "negative_percentage"
]

df_final = df_no_duplicates.select(*used_columns)

In [39]:
df_final.show(10)

+-------+------------------------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+-------------------+---------------+--------------+---------+-----------+-------------------+-------------------+
|  appid|                                name|release_date|              genres|                tags|          developers|          publishers|price|discount_percentage|recommendations|peak_playernum|dlc_count|all_reviews|positive_percentage|negative_percentage|
+-------+------------------------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+-------------------+---------------+--------------+---------+-----------+-------------------+-------------------+
|3066390|"軍艦島探訪記　ある写真家の記録　...|  01/08/2024|      ['Simulation']|{'Simulation': 53...|         ['XYimage']|         ['XYimage']|19.99|                  0|              0|             0|        0|        4.0|     

# Spark optimization


In [40]:
df_final.createOrReplaceTempView("games")

In [41]:
spark.sql("SELECT * FROM games").show(10)

+-------+------------------------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+-------------------+---------------+--------------+---------+-----------+-------------------+-------------------+
|  appid|                                name|release_date|              genres|                tags|          developers|          publishers|price|discount_percentage|recommendations|peak_playernum|dlc_count|all_reviews|positive_percentage|negative_percentage|
+-------+------------------------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+-------------------+---------------+--------------+---------+-----------+-------------------+-------------------+
|3066390|"軍艦島探訪記　ある写真家の記録　...|  01/08/2024|      ['Simulation']|{'Simulation': 53...|         ['XYimage']|         ['XYimage']|19.99|                  0|              0|             0|        0|        4.0|     