In [125]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, isnull
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Creating Spark session
spark = SparkSession.builder.appName("SteamGamesSuccess").getOrCreate()

data = "C:/Users/jansu/Desktop/archive/games_march2025_full.csv"
df = spark.read.csv("C:/Users/jansu/Desktop/archive/games_march2025_full.csv", header=True, inferSchema=True)

In [126]:
df.show(5)

+------+--------------------+------------+------------+-----+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+-------+-----+-----+----------------+--------------------+------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+------------------+----------------+--------------------+------------------------+-----------------------+-----------------------+----------------------+--------------------+----------+--------------------+--------------------+-----------------+--------------+------------------+
| appid|                name|release_date|required_age|price|dlc_count|detailed_description|      about_the_game|   short_description|             review

In [127]:
used_columns = [
    "appid", "name", "release_date", "genres", "tags", "developers", "publishers",
    "price", "discount", "recommendations", "positive", "negative",
    "num_reviews_total", "peak_ccu", "dlc_count"
]

# Keep only these columns
df1 = df.select(*used_columns)

df1.show(5)

+------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+---------------+------------------+----------------+-----------------+----------+---------+
| appid|                name|release_date|              genres|                tags|          developers|          publishers|price|            discount|recommendations|          positive|        negative|num_reviews_total|  peak_ccu|dlc_count|
+------+--------------------+------------+--------------------+--------------------+--------------------+--------------------+-----+--------------------+---------------+------------------+----------------+-----------------+----------+---------+
|   730|    Counter-Strike 2|  2012-08-21|['Action', 'Free ...|{'FPS': 90857, 'S...|           ['Valve']|           ['Valve']|  0.0|                   0|        4401572|           7480813|         1135108|          8632939|   1212356|      1.0|
|578080| PUBG: BATTL

In [128]:
df1 = df1.withColumnRenamed("discount", "discount_percentage")

In [129]:
df1 = df1.withColumnRenamed("peak_ccu", "peak_playernum")

In [130]:
df1.printSchema()

root
 |-- appid: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- developers: string (nullable = true)
 |-- publishers: string (nullable = true)
 |-- price: string (nullable = true)
 |-- discount_percentage: string (nullable = true)
 |-- recommendations: string (nullable = true)
 |-- positive: string (nullable = true)
 |-- negative: string (nullable = true)
 |-- num_reviews_total: string (nullable = true)
 |-- peak_playernum: string (nullable = true)
 |-- dlc_count: double (nullable = true)



In [131]:
print(f"Total Records: {df1.count()}")

df1.describe().show()

Total Records: 94948
+-------+------------------+-------------+--------------------+------------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|summary|             appid|         name|        release_date|                  genres|                tags|          developers|          publishers|             price| discount_percentage|     recommendations|            positive|            negative|   num_reviews_total|      peak_playernum|         dlc_count|
+-------+------------------+-------------+--------------------+------------------------+--------------------+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|  count|             94948|   

# Data Preprocessing
- Handling Missing Values

In [132]:
from pyspark.sql.functions import col, sum

null_counts = df1.select([sum(col(c).isNull().cast("int")).alias(c) for c in df1.columns])
null_counts.show()

+-----+----+------------+------+----+----------+----------+-----+-------------------+---------------+--------+--------+-----------------+--------------+---------+
|appid|name|release_date|genres|tags|developers|publishers|price|discount_percentage|recommendations|positive|negative|num_reviews_total|peak_playernum|dlc_count|
+-----+----+------------+------+----+----------+----------+-----+-------------------+---------------+--------+--------+-----------------+--------------+---------+
|    0|   2|           0|     2|  55|         5|         3|    0|                 14|              6|     104|     793|               10|            10|        0|
+-----+----+------------+------+----+----------+----------+-----+-------------------+---------------+--------+--------+-----------------+--------------+---------+



In [None]:
# We dropped null name rows, because a game without a name is invalid.
# We dropped null tag rows, because there were quite a few of them and we do not get much out of them.
# We dropped null peak_playernum rows, because if it has had no peak it means it it does not have a playerbase, so we do not need it.
df_cleaned = df1.dropna(subset=["name", "tags", "peak_playernum"])

# We changed null genres into "unknown", because there were only two of them. It won't impact the analyzis much.
df_filled = df_cleaned.fillna({"genres": "unknown", "developers": "unknown", "publishers": "unknown", "discount_percentage": "0", "recommendations": "0"
                               ,"positive": "0", "negative": "0"})

# df_filled2 = df_filled.withColumn(
#     "num_reviews_total",
#     F.when(F.col("num_reviews_total").isNull(), F.col("positive") + F.col("negative"))
#     .otherwise(F.col("num_reviews_total"))
# )

In [134]:
null_counts = df_filled.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_filled.columns])
null_counts.show()

+-----+----+------------+------+----+----------+----------+-----+-------------------+---------------+--------+--------+-----------------+--------------+---------+
|appid|name|release_date|genres|tags|developers|publishers|price|discount_percentage|recommendations|positive|negative|num_reviews_total|peak_playernum|dlc_count|
+-----+----+------------+------+----+----------+----------+-----+-------------------+---------------+--------+--------+-----------------+--------------+---------+
|    0|   0|           0|     0|   0|         0|         0|    0|                  0|              0|       0|       0|               10|             0|        0|
+-----+----+------------+------+----+----------+----------+-----+-------------------+---------------+--------+--------+-----------------+--------------+---------+



**Duplicates**

In [139]:
id_duplicates = df_filled.groupBy("appid").count().filter("count > 1")
name_duplicates = df_filled.groupBy("name").count().filter("count > 1")

id_duplicates.show()
name_duplicates.show()

+-----+-----+
|appid|count|
+-----+-----+
+-----+-----+

+--------------------+-----+
|                name|count|
+--------------------+-----+
|              ISLAND|    2|
|Romance of the Th...|    2|
|      Eternal Return|    2|
|Ys I & II Chronic...|    2|
|               Nomad|    2|
|Loading Screen Si...|    2|
|      Hero's Journey|    3|
|The Lord of the R...|    2|
|             Journey|    2|
|Call of Duty®: Bl...|    2|
|        Blood Strike|    2|
|Call of Duty®: Bl...|    2|
|          The Bunker|    3|
|    EA SPORTS FC™ 24|    4|
|              WASTED|    2|
|            Downfall|    2|
|         Battle Ball|    2|
|            Paradise|    2|
|     Graveyard Shift|    2|
|           The Guest|    2|
+--------------------+-----+
only showing top 20 rows



In [141]:
df_no_duplicates = df_filled.dropDuplicates(["name"])

name_duplicates = df_no_duplicates.groupBy("name").count().filter("count > 1")
name_duplicates.show()

+----+-----+
|name|count|
+----+-----+
+----+-----+

