In [0]:
#Remove directory

#dbutils.fs.rm("dbfs:/FileStore/tables/steam_200k.csv/", recurse=True)

In [0]:
# import mlflow and autolog machine learning runs

import mlflow

mlflow.pyspark.ml.autolog()

In [0]:
dbutils.fs.ls("/FileStore/tables/")

Out[4]: [FileInfo(path='dbfs:/FileStore/tables/account-models/', name='account-models/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/accounts/', name='accounts/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/accounts.zip', name='accounts.zip', size=5297592, modificationTime=1706721501000),
 FileInfo(path='dbfs:/FileStore/tables/accounts2/', name='accounts2/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/activations/', name='activations/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/activations.zip', name='activations.zip', size=8411369, modificationTime=1706718610000),
 FileInfo(path='dbfs:/FileStore/tables/clinicaltrial_2023/', name='clinicaltrial_2023/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/clinicaltrial_2023-1.zip', name='clinicaltrial_2023-1.zip', size=57166668, modificationTime=1714296754000),
 FileInfo(path='dbfs:/FileStore/tables/clinicaltrial_2023.zip', n

In [0]:

steam_200k = spark.read.csv("/FileStore/tables/steam_200k.csv",
                    header = "False",
                    inferSchema="true")

steam_200k.show(5)

+---------+--------------------+--------+-----+
|      _c0|                 _c1|     _c2|  _c3|
+---------+--------------------+--------+-----+
|151603712|The Elder Scrolls...|purchase|  1.0|
|151603712|The Elder Scrolls...|    play|273.0|
|151603712|           Fallout 4|purchase|  1.0|
|151603712|           Fallout 4|    play| 87.0|
|151603712|               Spore|purchase|  1.0|
+---------+--------------------+--------+-----+
only showing top 5 rows



In [0]:
steam_200k = steam_200k.toDF("userId", "game", "behaviour", "value")

steam_200k.show(5)

+---------+--------------------+---------+-----+
|   userId|                game|behaviour|value|
+---------+--------------------+---------+-----+
|151603712|The Elder Scrolls...| purchase|  1.0|
|151603712|The Elder Scrolls...|     play|273.0|
|151603712|           Fallout 4| purchase|  1.0|
|151603712|           Fallout 4|     play| 87.0|
|151603712|               Spore| purchase|  1.0|
+---------+--------------------+---------+-----+
only showing top 5 rows



## Exploratory Data Analysis

In [0]:
steam_200k.describe().show()

+-------+-------------------+----------------+---------+------------------+
|summary|             userId|            game|behaviour|             value|
+-------+-------------------+----------------+---------+------------------+
|  count|             200000|          200000|   200000|            200000|
|   mean|  1.0365586594664E8|           140.0|     null|17.874383999999914|
| stddev|7.208073512913968E7|             0.0|     null|138.05695165086792|
|    min|               5250|     007 Legends|     play|               0.1|
|    max|          309903146|theHunter Primal| purchase|           11754.0|
+-------+-------------------+----------------+---------+------------------+



In [0]:
#Creating a dataframe to check null values counts
from pyspark.sql.functions import count, col, when, isnan

null_df = steam_200k.select([count(when(col(c).contains('None') | \
                                        col(c).contains('NULL') | \
                                        (col(c) == '') | \
                                        col(c).isNull() | \
                                        isnan(c), c
                                        )).alias(c)
                            for c in steam_200k.columns])

null_df.show()

# There is no null values

+------+----+---------+-----+
|userId|game|behaviour|value|
+------+----+---------+-----+
|     0|   0|        0|    0|
+------+----+---------+-----+



In [0]:
from pyspark.ml.feature import StringIndexer

stringIndexer = StringIndexer(inputCol="game", outputCol="game_id")

model = stringIndexer.fit(steam_200k)
steam_200k_indexed = model.transform(steam_200k)
steam_200k_indexed = steam_200k_indexed.withColumn('game_id', steam_200k_indexed['game_id'].cast('integer'))

steam_200k_indexed.show(10)


2024/05/02 10:41:48 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6077ebca7ff94baa8d9cb97646d46d67', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


+---------+--------------------+---------+-----+-------+
|   userId|                game|behaviour|value|game_id|
+---------+--------------------+---------+-----+-------+
|151603712|The Elder Scrolls...| purchase|  1.0|      8|
|151603712|The Elder Scrolls...|     play|273.0|      8|
|151603712|           Fallout 4| purchase|  1.0|    100|
|151603712|           Fallout 4|     play| 87.0|    100|
|151603712|               Spore| purchase|  1.0|    332|
|151603712|               Spore|     play| 14.9|    332|
|151603712|   Fallout New Vegas| purchase|  1.0|     29|
|151603712|   Fallout New Vegas|     play| 12.1|     29|
|151603712|       Left 4 Dead 2| purchase|  1.0|      4|
|151603712|       Left 4 Dead 2|     play|  8.9|      4|
+---------+--------------------+---------+-----+-------+
only showing top 10 rows



In [0]:
from pyspark.sql.types import IntegerType

steam_200k_indexed = steam_200k_indexed.withColumn('value', steam_200k_indexed['value'].cast(IntegerType()))


In [0]:
steam_200k_indexed.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- game: string (nullable = true)
 |-- behaviour: string (nullable = true)
 |-- value: integer (nullable = true)
 |-- game_id: integer (nullable = true)



In [0]:
#Movies with highest number of members

steam_200k_indexed.groupBy("game").count().orderBy("count", ascending=False).limit(10).display()

game,count
Dota 2,9682
Team Fortress 2,4646
Counter-Strike Global Offensive,2789
Unturned,2632
Left 4 Dead 2,1752
Counter-Strike Source,1693
Counter-Strike,1424
Garry's Mod,1397
The Elder Scrolls V Skyrim,1394
Warframe,1271


Databricks visualization. Run in Databricks to view.

In [0]:
steam_200k.createOrReplaceTempView("steam_200kView")

In [0]:
%sql

select steam_200kView.game, count(*) as count from steam_200kView group by 1 order by count(*) desc limit 10

game,count
Dota 2,9682
Team Fortress 2,4646
Counter-Strike Global Offensive,2789
Unturned,2632
Left 4 Dead 2,1752
Counter-Strike Source,1693
Counter-Strike,1424
Garry's Mod,1397
The Elder Scrolls V Skyrim,1394
Warframe,1271


In [0]:
#Split the data based on the behaviour and then train the model seperately
from pyspark.sql.functions import col

game_play = steam_200k_indexed.filter(steam_200k.behaviour == "play")
game_purchase = steam_200k_indexed.filter(steam_200k.behaviour == "purchase")

game_play.show(5)
game_purchase.show(5)

+---------+--------------------+---------+-----+-------+
|   userId|                game|behaviour|value|game_id|
+---------+--------------------+---------+-----+-------+
|151603712|The Elder Scrolls...|     play|  273|      8|
|151603712|           Fallout 4|     play|   87|    100|
|151603712|               Spore|     play|   14|    332|
|151603712|   Fallout New Vegas|     play|   12|     29|
|151603712|       Left 4 Dead 2|     play|    8|      4|
+---------+--------------------+---------+-----+-------+
only showing top 5 rows

+---------+--------------------+---------+-----+-------+
|   userId|                game|behaviour|value|game_id|
+---------+--------------------+---------+-----+-------+
|151603712|The Elder Scrolls...| purchase|    1|      8|
|151603712|           Fallout 4| purchase|    1|    100|
|151603712|               Spore| purchase|    1|    332|
|151603712|   Fallout New Vegas| purchase|    1|     29|
|151603712|       Left 4 Dead 2| purchase|    1|      4|
+-----

In [0]:
# Are there any instances of game not purchased

game_purchase.groupBy("behaviour").count().orderBy("count", ascending=False).display()
game_play.groupBy("behaviour").count().orderBy("count", ascending=False).display()

#There are more games purchased than games played. Decided to train the model on the two implicit feedback

behaviour,count
purchase,129511


behaviour,count
play,70489


In [0]:
# Train on played game
# Train, test, splitting

(training, test) = game_purchase.randomSplit([0.7, 0.3])

In [0]:
training.show()

+------+--------------------+---------+-----+-------+
|userId|                game|behaviour|value|game_id|
+------+--------------------+---------+-----+-------+
|  5250|         Alien Swarm| purchase|    1|     32|
|  5250|     Cities Skylines| purchase|    1|    158|
|  5250|Counter-Strike So...| purchase|    1|      5|
|  5250|       Day of Defeat| purchase|    1|     28|
|  5250|  Deathmatch Classic| purchase|    1|     34|
|  5250|Deus Ex Human Rev...| purchase|    1|    118|
|  5250|              Dota 2| purchase|    1|      0|
|  5250|           Half-Life| purchase|    1|     45|
|  5250|         Half-Life 2| purchase|    1|     16|
|  5250|Half-Life 2 Death...| purchase|    1|     13|
|  5250|Half-Life 2 Episo...| purchase|    1|     37|
|  5250|Half-Life 2 Episo...| purchase|    1|     44|
|  5250|Half-Life Opposin...| purchase|    1|     75|
|  5250|              Portal| purchase|    1|     14|
|  5250|            Ricochet| purchase|    1|     35|
|  5250|     Team Fortress 2

In [0]:
from pyspark.ml.recommendation import ALS

recommender = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="game_id", ratingCol="value")

model = recommender.fit(training)

2024/05/02 10:42:01 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '4b79f6e56e054f899660bfe385667e60', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


In [0]:
predictions = model.transform(test).dropna()

In [0]:
predictions.show()

+--------+--------------------+---------+-----+-------+----------+
|  userId|                game|behaviour|value|game_id|prediction|
+--------+--------------------+---------+-----+-------+----------+
|16167221|    Anno 1404 Venice| purchase|    1|   2102| 0.8999708|
|16167221|Dark Messiah of M...| purchase|    1|    638| 0.9913661|
|16167221|         Half-Life 2| purchase|    1|     16| 0.9942625|
|16167221|  Hitman Blood Money| purchase|    1|    413|0.99875677|
|16167221|Oddworld Abe's Ex...| purchase|    1|   1106| 1.0095332|
|16167221|              Portal| purchase|    1|     14|  0.992677|
|16167221|            Warframe| purchase|    1|      9|  0.943786|
|35264447|  Bejeweled 2 Deluxe| purchase|    1|   2637| 0.5058149|
|35264447|     BookWorm Deluxe| purchase|    1|   2107| 1.0739595|
|35264447|Bookworm Adventur...| purchase|    1|   2215| 1.0524024|
|35264447|Insaniquarium! De...| purchase|    1|   2066| 1.0379715|
|35264447|       Peggle Deluxe| purchase|    1|   1061| 1.0513

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(metricName="rmse", labelCol="value", predictionCol="prediction")

rmse = evaluator.evaluate(predictions)

print('Root Mean Square Error is %g' %rmse)

Root Mean Square Error is 0.0889128


In [0]:
from pyspark.ml.tuning import ParamGridBuilder
 
als = ALS(maxIter=10, userCol="userId", itemCol="game_id", ratingCol="value", coldStartStrategy="drop", seed=100)
 
# Create a parameter grid
 
parameters = ParamGridBuilder()\
.addGrid(als.rank, [5, 10, 15,20])\
.addGrid(als.regParam, [0.001, 0.005, 0.01, 0.05, 0.1])\
.build()

In [0]:
from pyspark.ml.tuning import TrainValidationSplit
 
# Define TrainValidationSplit
 
tvs = TrainValidationSplit()\
.setSeed(100)\
.setTrainRatio(0.75)\
.setEstimatorParamMaps(parameters)\
.setEstimator(recommender)\
.setEvaluator(evaluator)

In [0]:
# Train model using grid search
 
gridsearchModel = tvs.fit(training)

2024/05/02 10:42:29 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '7ab41f77097d4e3e8836b10f9c061d8d', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


In [0]:
# Select best model and identify the parameters
 
bestModel = gridsearchModel.bestModel
 
print("Parameters for the best model:")
print("Rank Parameter: %g" %bestModel.rank)
print("RegParam Parameter: %g" %bestModel._java_obj.parent().getRegParam())

Parameters for the best model:
Rank Parameter: 10
RegParam Parameter: 0.01


In [0]:
evaluator.evaluate(bestModel.transform(test).dropna())

Out[27]: 0.08891277362182896

In [0]:
# Recommended to all users

userRecs = model.recommendForAllUsers(10)

In [0]:
userRecs.show()

+--------+--------------------+
|  userId|     recommendations|
+--------+--------------------+
|   76767|[{1909, 1.4789294...|
|  144736|[{1909, 1.4735514...|
|  229911|[{1909, 1.4789824...|
|  835015|[{1909, 1.4730397...|
|  948368|[{1909, 1.4958116...|
|  975449|[{1222, 1.2975358...|
| 1268792|[{1909, 1.4785532...|
| 2531540|[{1457, 1.4706628...|
| 2753525|[{1352, 1.5829908...|
| 3450426|[{1909, 1.448066}...|
| 7923954|[{1457, 1.4623661...|
| 7987640|[{1909, 1.5123131...|
| 8259307|[{1457, 1.4236703...|
| 8567888|[{1909, 1.4315543...|
| 8585433|[{1457, 1.3505102...|
| 8784496|[{1909, 1.398669}...|
| 8795607|[{1909, 1.4337885...|
|10144413|[{1457, 1.3930854...|
|10595342|[{1457, 1.4211575...|
|10599862|[{1352, 1.646375}...|
+--------+--------------------+
only showing top 20 rows

