In [1]:
# import pyspark
# import sys 
# from pyspark import SparkContext
# from pyspark import SparkConf
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import col 
# from pyspark.sql.functions import expr


from pyspark.mllib.recommendation import ALS
from numpy import array
import hashlib
import math

In [2]:
#QUESTION 1

# findspark has to run BEFORE the first pyspark import: its job is to put the
# local Spark installation on sys.path so that `import pyspark` can succeed.
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Tunable inputs for this notebook.
RATINGS_PATH = "movies 1.json"
SAMPLE_SIZE = 10000

spark = SparkSession.builder.getOrCreate()

# Take the sample with limit() so the rows stay inside Spark instead of being
# collected to the driver with head() and re-parallelised with createDataFrame().
# Cache only the sampled frame: it is the one every later cell reuses, and
# caching it pins the sampled rows so repeated actions see the same data.
ratings = (
    spark.read.json(RATINGS_PATH)
    .select("user_id", "product_id", "score")
    .limit(SAMPLE_SIZE)
    .cache()
)

ratings.show(5)



+--------------+----------+-----+
|       user_id|product_id|score|
+--------------+----------+-----+
|A141HP4LYPWMSR|B003AI2VGA|  3.0|
|A328S9RN3U5M68|B003AI2VGA|  3.0|
|A1I7QGUDP043DG|B003AI2VGA|  5.0|
|A1M5405JH9THP9|B003AI2VGA|  3.0|
| ATXL536YX71TR|B003AI2VGA|  3.0|
+--------------+----------+-----+
only showing top 5 rows



In [3]:
#QUESTION 2


# NOTE: this ALS is the DataFrame-based pyspark.ml estimator; it replaces any
# previously imported name `ALS` in the kernel namespace.
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Single seed shared by the train/validation split and the ALS initialisation,
# so the model and the metrics reported below are reproducible across runs.
SEED = 42

# ALS needs numeric user/item columns, so map each string id to a numeric index.
# Each indexer is fitted on the full ratings frame so that every id seen in
# either split has an index.
indexers = [
    StringIndexer(inputCol=column, outputCol=column+"_index").fit(ratings)
    for column in ["user_id", "product_id"]
]

# The stages are already-fitted models; the pipeline just applies them in order.
pipeline = Pipeline(stages=indexers)
ratings_indexed = pipeline.fit(ratings).transform(ratings)

# Weights are normalised by Spark, so [8.0, 2.0] is an 80% / 20% split.
training_data,validation_data = ratings_indexed.randomSplit([8.0,2.0], seed=SEED)

# coldStartStrategy="drop" removes validation rows whose user or item was not
# seen in training (their prediction would be NaN and poison the metrics).
# NOTE(review): with these hyperparameters the recorded predictions fall well
# outside the range of the scores shown in the data — consider tuning
# rank / regParam / maxIter (TODO confirm with a parameter search).
als = ALS(userCol="user_id_index",itemCol="product_id_index",ratingCol="score",rank=10,maxIter=5,regParam=0.01,coldStartStrategy="drop",seed=SEED)
evaluator = RegressionEvaluator(metricName="rmse",labelCol="score",predictionCol="prediction")

model = als.fit(training_data)
predictions=model.transform(validation_data)
predictions.show(10,False)

+--------------+----------+-----+-------------+----------------+-----------+
|user_id       |product_id|score|user_id_index|product_id_index|prediction |
+--------------+----------+-----+-------------+----------------+-----------+
|A15Q7ABIU9O9YZ|B000063W1R|5.0  |243.0        |7.0             |7.3311815  |
|A3MV1KKHX51FYT|B008FPU7AA|3.0  |85.0         |112.0           |-0.31129515|
|ADX5JX5LKLC22 |B000063W1R|5.0  |580.0        |7.0             |4.994489   |
|A13TO1ZFAH9SVN|B000063W1R|5.0  |235.0        |7.0             |7.3311815  |
|A39CX0EE4BZCZC|B000063W1R|5.0  |9.0          |7.0             |0.29798898 |
|A32AK8FOAZEPE2|B000063W1R|3.0  |453.0        |7.0             |1.506263   |
|A87RT63V7SMD3 |B000063W1R|4.0  |565.0        |7.0             |0.14513317 |
|AQ01Q3070LT29 |B000063W1R|1.0  |38.0         |7.0             |3.1729112  |
|A1XY417YALQB6G|B000063W1R|5.0  |335.0        |7.0             |-0.32873374|
|A1T0Z4J5PPLTC7|0790747324|4.0  |321.0        |12.0            |-3.8623018 |

In [4]:
#QUESTION 3

# Validation rows for the user whose indexed id is 1.0, reduced to the raw and
# indexed identifier columns (the score column is dropped).
user1 = (
    validation_data
    .where(validation_data['user_id_index'] == 1.0)
    .select(['product_id', 'user_id', 'user_id_index', 'product_id_index'])
)
user1.show()

# Score exactly those (user, product) pairs with the fitted model and list them
# from the highest predicted score to the lowest.
# NOTE(review): this ranks only products present in this user's validation
# rows, not the whole catalogue — confirm that is what the question asks for.
recommendations = model.transform(user1)
recommendations.sort('prediction', ascending=False).show()


+----------+-------------+-------------+----------------+
|product_id|      user_id|user_id_index|product_id_index|
+----------+-------------+-------------+----------------+
|B0001EYSQC|ANCOMAI0I7LVG|          1.0|            31.0|
|B001OKUREO|ANCOMAI0I7LVG|          1.0|            10.0|
+----------+-------------+-------------+----------------+

+----------+-------------+-------------+----------------+----------+
|product_id|      user_id|user_id_index|product_id_index|prediction|
+----------+-------------+-------------+----------------+----------+
|B0001EYSQC|ANCOMAI0I7LVG|          1.0|            31.0| 3.7107296|
|B001OKUREO|ANCOMAI0I7LVG|          1.0|            10.0|-1.3654118|
+----------+-------------+-------------+----------------+----------+



In [6]:
#QUESTION 4

# RMSE of the validation predictions, using the evaluator configured when the
# model was built.
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) = {rmse}")

# Additional Evaluation Metric: Mean Absolute Error (MAE)
# Same label and prediction columns as the RMSE evaluator; only the metric differs.
evaluator_mae = RegressionEvaluator(labelCol="score", predictionCol="prediction", metricName="mae")

mae = evaluator_mae.evaluate(predictions)
print(f"Mean Absolute Error (MAE) = {mae}")


Root Mean Squared Error (RMSE) = 5.387574248149598
Mean Absolute Error (MAE) = 4.120510554215715
