In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import findspark
findspark.init()
spark = SparkSession.builder.getOrCreate()

ratings = spark.read.json("movies1.json").select("user_id","product_id","score").cache()
ratings = ratings.head(10000)
ratings = spark.createDataFrame(ratings)

ratings.show(5)

+--------------+----------+-----+
|       user_id|product_id|score|
+--------------+----------+-----+
|A141HP4LYPWMSR|B003AI2VGA|  3.0|
|A328S9RN3U5M68|B003AI2VGA|  3.0|
|A1I7QGUDP043DG|B003AI2VGA|  5.0|
|A1M5405JH9THP9|B003AI2VGA|  3.0|
| ATXL536YX71TR|B003AI2VGA|  3.0|
+--------------+----------+-----+
only showing top 5 rows



In [6]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

indexers = [
    StringIndexer(inputCol=column, outputCol=column+"_index").fit(ratings)
    for column in ["user_id", "product_id"]
]

pipeline = Pipeline(stages=indexers)
ratings_indexed = pipeline.fit(ratings).transform(ratings)

training_data,validation_data = ratings_indexed.randomSplit([8.0,2.0])

als = ALS(userCol="user_id_index",itemCol="product_id_index",ratingCol="score",rank=10,maxIter=5,regParam=0.01,coldStartStrategy="drop")
evaluator = RegressionEvaluator(metricName="rmse",labelCol="score",predictionCol="prediction")

model = als.fit(training_data)
predictions=model.transform(validation_data)
predictions.show(10,False)

+--------------+----------+-----+-------------+----------------+-----------+
|user_id       |product_id|score|user_id_index|product_id_index|prediction |
+--------------+----------+-----+-------------+----------------+-----------+
|A1HX00DL0SZM1G|B000063W1R|4.0  |126.0        |7.0             |0.44221467 |
|AQZH7YTWQPOBE |B000063W1R|3.0  |101.0        |7.0             |1.7895193  |
|ANCOMAI0I7LVG |B000063W1R|5.0  |1.0          |7.0             |9.392696   |
|A328S9RN3U5M68|B000063W1R|5.0  |6.0          |7.0             |-1.3334583 |
|A13TO1ZFAH9SVN|B000063W1R|5.0  |235.0        |7.0             |-2.0930994 |
|A3L2U581LL17EB|B000063W1R|4.0  |511.0        |7.0             |-0.02595526|
|A3OI841P5R6FCH|B000063W1R|4.0  |523.0        |7.0             |-0.20683861|
|A1I7QGUDP043DG|B003AI2VGA|5.0  |286.0        |144.0           |1.4592302  |
|A1TK6WNUIAEQRU|B000NDFLWG|4.0  |326.0        |91.0            |0.6543808  |
|A2AOZQ3WTNVVOK|B008FPU7AA|3.0  |23.0         |112.0           |-8.577151  |

In [7]:
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) = {rmse}")

# Additional Evaluation Metric: Mean Absolute Error (MAE)
evaluator_mae = RegressionEvaluator(
    metricName="mae",
    labelCol="score",
    predictionCol="prediction"
)

mae = evaluator_mae.evaluate(predictions)
print(f"Mean Absolute Error (MAE) = {mae}")


Root Mean Squared Error (RMSE) = 5.978962508599982
Mean Absolute Error (MAE) = 4.419619387324336
