In [0]:
# import mlflow and autolog machine learning runs

import mlflow

mlflow.pyspark.ml.autolog()

In [0]:
dbutils.fs.ls("/FileStore/tables/")

Out[81]: [FileInfo(path='dbfs:/FileStore/tables/account-models/', name='account-models/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/accounts/', name='accounts/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/accounts.zip', name='accounts.zip', size=5297592, modificationTime=1706721501000),
 FileInfo(path='dbfs:/FileStore/tables/accounts2/', name='accounts2/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/activations/', name='activations/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/activations.zip', name='activations.zip', size=8411369, modificationTime=1706718610000),
 FileInfo(path='dbfs:/FileStore/tables/clinicaltrial_2023/', name='clinicaltrial_2023/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/FileStore/tables/clinicaltrial_2023-1.zip', name='clinicaltrial_2023-1.zip', size=57166668, modificationTime=1714296754000),
 FileInfo(path='dbfs:/FileStore/tables/clinicaltrial_2023.zip', 

In [0]:

steam_200k = spark.read.csv("/FileStore/tables/steam_200k.csv",
                    header = "False",
                    inferSchema="true")

steam_200k.show(5)

+---------+--------------------+--------+-----+
|      _c0|                 _c1|     _c2|  _c3|
+---------+--------------------+--------+-----+
|151603712|The Elder Scrolls...|purchase|  1.0|
|151603712|The Elder Scrolls...|    play|273.0|
|151603712|           Fallout 4|purchase|  1.0|
|151603712|           Fallout 4|    play| 87.0|
|151603712|               Spore|purchase|  1.0|
+---------+--------------------+--------+-----+
only showing top 5 rows



In [0]:
steam_200k = steam_200k.toDF("userId", "game", "behaviour", "value")

steam_200k.show(5)

+---------+--------------------+---------+-----+
|   userId|                game|behaviour|value|
+---------+--------------------+---------+-----+
|151603712|The Elder Scrolls...| purchase|  1.0|
|151603712|The Elder Scrolls...|     play|273.0|
|151603712|           Fallout 4| purchase|  1.0|
|151603712|           Fallout 4|     play| 87.0|
|151603712|               Spore| purchase|  1.0|
+---------+--------------------+---------+-----+
only showing top 5 rows



In [0]:
# Add Unique id to each game

from pyspark.sql.functions import monotonically_increasing_id

game_id = steam_200k.groupBy("game").agg(monotonically_increasing_id().alias("game_id"))

game_id = game_id.withColumn("game_id", col("game_id").cast("integer"))

game_id.show(10)

+--------------------+-------+
|                game|game_id|
+--------------------+-------+
|              Dota 2|      0|
|METAL GEAR SOLID ...|      1|
|LEGO Batman The V...|      2|
|                RIFT|      3|
|             Anodyne|      4|
|  Legend of Grimrock|      5|
|Divinity Original...|      6|
|            Meltdown|      7|
|SanctuaryRPG Blac...|      8|
|       Snuggle Truck|      9|
+--------------------+-------+
only showing top 10 rows



In [0]:
steam_200k_id = steam_200k.join(game_id, ["game"], "left")

steam_200k_id.show(5)

+--------------------+---------+---------+-----+-------+
|                game|   userId|behaviour|value|game_id|
+--------------------+---------+---------+-----+-------+
|The Elder Scrolls...|151603712| purchase|  1.0|   2609|
|The Elder Scrolls...|151603712|     play|273.0|   2609|
|           Fallout 4|151603712| purchase|  1.0|    410|
|           Fallout 4|151603712|     play| 87.0|    410|
|               Spore|151603712| purchase|  1.0|   3868|
+--------------------+---------+---------+-----+-------+
only showing top 5 rows



In [0]:
steam_200k_id.printSchema()

root
 |-- game: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- behaviour: string (nullable = true)
 |-- value: double (nullable = true)
 |-- game_id: integer (nullable = true)



In [0]:
#Movies with highest number of members
steam_200k.groupBy("game").count().orderBy("count", ascending=False).limit(10).display()

game,count
Dota 2,9682
Team Fortress 2,4646
Counter-Strike Global Offensive,2789
Unturned,2632
Left 4 Dead 2,1752
Counter-Strike Source,1693
Counter-Strike,1424
Garry's Mod,1397
The Elder Scrolls V Skyrim,1394
Warframe,1271


In [0]:
steam_200k.createOrReplaceTempView("steam_200kView")

In [0]:
%sql

select steam_200kView.game, count(*) as count from steam_200kView group by 1 order by count(*) desc limit 10

game,count
Dota 2,9682
Team Fortress 2,4646
Counter-Strike Global Offensive,2789
Unturned,2632
Left 4 Dead 2,1752
Counter-Strike Source,1693
Counter-Strike,1424
Garry's Mod,1397
The Elder Scrolls V Skyrim,1394
Warframe,1271


In [0]:
#Split the data based on the behaviour and then train the model seperately
from pyspark.sql.functions import col

game_play = steam_200k_id.filter(steam_200k.behaviour == "play")
game_purchase = steam_200k_id.filter(steam_200k.behaviour == "purchase")

game_play.show(5)
game_purchase.show(5)

+--------------------+---------+---------+-----+-------+
|                game|   userId|behaviour|value|game_id|
+--------------------+---------+---------+-----+-------+
|The Elder Scrolls...|151603712|     play|273.0|   2609|
|           Fallout 4|151603712|     play| 87.0|    410|
|               Spore|151603712|     play| 14.9|   3868|
|   Fallout New Vegas|151603712|     play| 12.1|   3820|
|       Left 4 Dead 2|151603712|     play|  8.9|     69|
+--------------------+---------+---------+-----+-------+
only showing top 5 rows

+--------------------+---------+---------+-----+-------+
|                game|   userId|behaviour|value|game_id|
+--------------------+---------+---------+-----+-------+
|The Elder Scrolls...|151603712| purchase|  1.0|   2609|
|           Fallout 4|151603712| purchase|  1.0|    410|
|               Spore|151603712| purchase|  1.0|   3868|
|   Fallout New Vegas|151603712| purchase|  1.0|   3820|
|       Left 4 Dead 2|151603712| purchase|  1.0|     69|
+-----

In [0]:
# Are there any instances of game not purchased

game_purchase.groupBy("behaviour").count().orderBy("count", ascending=False).display()
game_play.groupBy("behaviour").count().orderBy("count", ascending=False).display()

#There are more games purchased than games played.

behaviour,count
purchase,129511


behaviour,count
play,70489


In [0]:
#Drop behaviour column

game_play = game_play.drop("game", "behaviour")
game_purchase = game_purchase.drop("game", "behaviour")

game_play.show(5)
game_purchase.show(5)

+---------+-----+-------+
|   userId|value|game_id|
+---------+-----+-------+
|151603712|273.0|   2609|
|151603712| 87.0|    410|
|151603712| 14.9|   3868|
|151603712| 12.1|   3820|
|151603712|  8.9|     69|
+---------+-----+-------+
only showing top 5 rows

+---------+-----+-------+
|   userId|value|game_id|
+---------+-----+-------+
|151603712|  1.0|   2609|
|151603712|  1.0|    410|
|151603712|  1.0|   3868|
|151603712|  1.0|   3820|
|151603712|  1.0|     69|
+---------+-----+-------+
only showing top 5 rows



In [0]:
# Train on played game
# Train, test, splitting

(training, test) = game_play.randomSplit([0.8, 0.2], seed=10)

In [0]:
from pyspark.ml.recommendation import ALS

recommender_1 = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="game_id", ratingCol="value",seed=12)

model_1 = recommender_1.fit(training)

2024/04/30 00:47:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '880ba9d74df842cfbcc5b940a7e3c070', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pyspark.ml workflow


In [0]:
predictions = model_1.transform(test).dropna()

In [0]:
predictions.show()

+--------+------+-------+----------+
|  userId| value|game_id|prediction|
+--------+------+-------+----------+
|  298950|  65.0|      6| 13.990758|
| 1936551|  12.1|      5| 11.421807|
| 1950243|   1.1|      0|  35.72331|
| 2083767|   3.1|      0|  12.77536|
| 4834220| 175.0|     17| 26.099392|
|11161178|   1.0|      6|  9.195946|
|11373749|   2.2|      0|  75.48928|
|11373749|   6.7|      4| 7.9728937|
|11403772|  13.7|      6|  -4.05429|
|11403772|2443.0|      0| 27.728891|
|11813637|2341.0|      0| 17.134727|
|12529679|   0.5|      0| 25.701632|
|13565651| 201.0|     17| 1.6733923|
|14417857|  57.0|      0| 163.61313|
|17995238| 462.0|      3| 49.716846|
|20464587|   0.8|      3| -8.597855|
|21061921|   0.1|     37| 11.374548|
|24366790| 190.0|      3|  71.13456|
|24469287|   2.2|      0| 12.043607|
|24469287|  54.0|      1| 12.562393|
+--------+------+-------+----------+
only showing top 20 rows



In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator_1 = RegressionEvaluator(metricName="rmse", labelCol="value", predictionCol="prediction")

rmse = evaluator_1.evaluate(predictions)

print('Root Mean Square Error is %g' %rmse)

Root Mean Square Error is 715.521
