In [0]:
# Install MLflow into the notebook environment with the %pip magic.
# Per the original note, this is intended for Databricks Runtime 7.1 or above.
# The recorded output shows the Python interpreter being restarted after the
# install, so run this cell first, before any imports or variable definitions.
# NOTE(review): the package version is not pinned; consider `mlflow==<version>`
# so that re-runs resolve the same release.
%pip install mlflow

Python interpreter will be restarted.
You should consider upgrading via the '/local_disk0/.ephemeral_nfs/envs/pythonEnv-0f945cec-d43d-4747-9cb2-03c4ce238594/bin/python -m pip install --upgrade pip' command.
Python interpreter will be restarted.


In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.tuning import CrossValidator ,ParamGridBuilder 
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import mlflow
import mlflow.sklearn

In [0]:
# Step 1: obtain a Spark session and read the ratings data.
# `getOrCreate` has to be *called* (the original bound the method object itself
# to a differently-cased name), and the session is stored as lowercase `spark`
# because that is the name the read below uses. On Databricks this returns the
# already-running session; elsewhere it creates one.
spark = SparkSession.builder.appName('recommendation').getOrCreate()

# The CSV has a header row; column types are inferred from the data.
sparkDF = spark.read.csv(
    "/FileStore/tables/ratings.csv",
    header=True,
    inferSchema=True,
)
sparkDF.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- MovieID: integer (nullable = true)
 |-- Rating: integer (nullable = true)
 |-- Timestamp: integer (nullable = true)



In [0]:
# Preview the first 20 rows of the loaded ratings (the defaults, spelled out).
sparkDF.show(n=20, truncate=True)

+------+-------+------+---------+
|UserID|MovieID|Rating|Timestamp|
+------+-------+------+---------+
|     1|   1193|     5|978300760|
|     1|    661|     3|978302109|
|     1|    914|     3|978301968|
|     1|   3408|     4|978300275|
|     1|   2355|     5|978824291|
|     1|   1197|     3|978302268|
|     1|   1287|     5|978302039|
|     1|   2804|     5|978300719|
|     1|    594|     4|978302268|
|     1|    919|     4|978301368|
|     1|    595|     5|978824268|
|     1|    938|     4|978301752|
|     1|   2398|     4|978302281|
|     1|   2918|     4|978302124|
|     1|   1035|     5|978301753|
|     1|   2791|     4|978302188|
|     1|   2687|     3|978824268|
|     1|   2018|     4|978301777|
|     1|   3105|     5|978301713|
|     1|   2797|     4|978302039|
+------+-------+------+---------+
only showing top 20 rows



In [0]:
# Step 2: split the data into train (70%) and test (30%) sets.
# A fixed seed makes the split, and therefore every downstream metric,
# reproducible across re-runs of the notebook.
train, test = sparkDF.randomSplit([0.7, 0.3], seed=42)

In [0]:
# Step 3: create the ALS estimator.
# coldStartStrategy="drop": with the default ("nan"), users or movies that do
# not appear in a training fold receive NaN predictions, which turns the RMSE
# computed during cross-validation into NaN and makes model selection
# meaningless. Dropping those rows keeps the metric well-defined.
# seed=42: ALS factor initialisation is random; fixing it makes runs repeatable.
als = ALS(
    userCol='UserID',
    itemCol='MovieID',
    ratingCol='Rating',
    nonnegative=False,
    coldStartStrategy='drop',
    seed=42,
)

In [0]:
# Step 4: hyper-parameter grid, 2 ranks x 2 iteration counts = 4 candidates.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(als.rank, [10, 20])
grid_builder = grid_builder.addGrid(als.maxIter, [20, 30])
param_grid = grid_builder.build()

In [0]:
# Step 5: evaluator comparing the true "Rating" with the model's "prediction".
# RMSE is RegressionEvaluator's default metric; it is only made explicit here.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="Rating",
    predictionCol="prediction",
)

In [0]:
# Step 6: 3-fold cross-validation over the parameter grid, scored by `evaluator`.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
)

In [0]:
# Step 7: train the cross-validator on the training split.
# This fits every grid candidate on every fold, so it is the slow cell.
model = cv.fit(dataset=train)

MLlib will automatically track trials in MLflow. After your tuning fit() call has completed, view the MLflow UI to see logged runs.


In [0]:
# Step 8: score the held-out test split with the fitted cross-validator model
# and preview the first 20 predicted ratings.
predictions = model.transform(dataset=test)
predictions.show(n=20)

+------+-------+------+----------+----------+
|UserID|MovieID|Rating| Timestamp|prediction|
+------+-------+------+----------+----------+
|   148|   1580|     4| 977333525|   4.19132|
|   148|   2122|     4| 979576798| 2.5385976|
|   148|   2142|     4| 979578765| 3.4822428|
|   148|   2366|     5| 977333254|  3.536327|
|   463|   1580|     1| 976227202|  2.823757|
|   471|   3175|     5| 976224710| 3.6693158|
|  1088|   1591|     1| 974936438| 2.3341706|
|   392|   1645|     4| 976550402|  3.134005|
|   392|   1959|     4| 976552172| 3.2426915|
|   858|   1580|     3| 975345152| 3.9262373|
|   897|   1645|     3| 975222773| 3.2163181|
|    31|   2366|     5| 978120364| 3.6254513|
|   516|   1580|     3| 976280139| 3.0079026|
|  1139|   1645|     3| 974877900| 3.4581625|
|   137|   1645|     4|1000868305| 3.5603244|
|   251|   1580|     4| 976715892| 3.5030975|
|   808|   2122|     3| 975393329| 2.6154156|
|    53|   3175|     5| 977949217|  4.472388|
|   970|    463|     3| 980625253|

In [0]:
# Remove rows containing null/NaN values (e.g. predictions that could not be
# computed) so the evaluation below is not poisoned by NaN.
predictions = predictions.na.drop()

In [0]:
# Render the cleaned predictions as a table using the notebook's display helper.
display(predictions)

UserID,MovieID,Rating,Timestamp,prediction
148,1580,4,977333525,4.19132
148,2122,4,979576798,2.5385976
148,2142,4,979578765,3.4822428
148,2366,5,977333254,3.536327
463,1580,1,976227202,2.823757
471,3175,5,976224710,3.6693158
1088,1591,1,974936438,2.3341706
392,1645,4,976550402,3.134005
392,1959,4,976552172,3.2426915
858,1580,3,975345152,3.9262373


In [0]:
# Step 9: evaluation.
# The evaluator is constructed without an explicit metricName, so Spark's
# default for RegressionEvaluator (root-mean-square error) is what is computed.
rmse = evaluator.evaluate(predictions)
rsme = rmse  # misspelled original name, kept so any later reference still works
rmse

Out[28]: 0.8633819197606695