#### Q1) Demonstrate how to load a dataset suitable for recommendation systems into a PySpark DataFrame.

In [1]:
# findspark has to run before the first pyspark import: its job is to put the
# local Spark installation's Python bindings on sys.path so that the imports
# below can be resolved.
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Reuse the active session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Keep only the three columns used for rating-based recommendation and cache
# the result, because the frame is read again by the indexing step.
ratings = spark.read.json("movies_1.json").select("user_id","product_id","score").cache()
# For quicker experiments on a subset, ratings.limit(10000) keeps a DataFrame.

ratings.show(5)



+--------------+----------+-----+
|       user_id|product_id|score|
+--------------+----------+-----+
|A141HP4LYPWMSR|B003AI2VGA|  3.0|
|A328S9RN3U5M68|B003AI2VGA|  3.0|
|A1I7QGUDP043DG|B003AI2VGA|  5.0|
|A1M5405JH9THP9|B003AI2VGA|  3.0|
| ATXL536YX71TR|B003AI2VGA|  3.0|
+--------------+----------+-----+
only showing top 5 rows



#### Q2) Implement a PySpark script that splits the data and trains a recommendation model.

In [2]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Fixed seed shared by the train/validation split and the ALS factor
# initialisation, so that re-running the notebook reproduces the same model
# and the same metrics.
SEED = 42

# ALS needs numeric user/item ids, so each string id column gets a
# "<column>_index" companion. Each indexer is fitted here, so `indexers`
# holds fitted models and the Pipeline below only chains them together.
indexers = [
    StringIndexer(inputCol=column, outputCol=column+"_index").fit(ratings)
    for column in ["user_id", "product_id"]
]

pipeline = Pipeline(stages=indexers)
ratings_indexed = pipeline.fit(ratings).transform(ratings)

# randomSplit normalises its weights, so [8.0, 2.0] is an 80/20 split.
training_data,validation_data = ratings_indexed.randomSplit([8.0,2.0], seed=SEED)

# coldStartStrategy="drop" removes validation rows whose user or item was not
# seen during training, so the evaluator never receives NaN predictions.
# NOTE(review): the recorded run shows negative predictions for 1-5 scores;
# a larger regParam or nonnegative=True may be worth trying - confirm by tuning.
als = ALS(
    userCol="user_id_index",
    itemCol="product_id_index",
    ratingCol="score",
    rank=10,
    maxIter=5,
    regParam=0.01,
    coldStartStrategy="drop",
    seed=SEED,
)
evaluator = RegressionEvaluator(metricName="rmse",labelCol="score",predictionCol="prediction")

model = als.fit(training_data)
predictions=model.transform(validation_data)
predictions.show(10,True)



+--------------+----------+-----+-------------+----------------+-----------+
|       user_id|product_id|score|user_id_index|product_id_index| prediction|
+--------------+----------+-----+-------------+----------------+-----------+
| A5KLP39OKXD5P|B0001G6PZC|  5.0|        540.0|             7.0|  4.6193957|
| ADEETJWRKD6OA|B000063W82|  5.0|       5287.0|             6.0| -0.8339311|
|A16QODENBJVUI1|B0012EM5GK|  5.0|         65.0|           303.0|-0.74546933|
|A2SPL0BGAV9DMJ|B002OHDRF2|  5.0|       3834.0|            21.0|  3.2403333|
|A3OIZEXS8CGBOD|B0001G6PZC|  1.0|       1005.0|             7.0|  0.9981969|
|A3OIZEXS8CGBOD|B0001G6PZC|  1.0|       1005.0|             7.0|  0.9981969|
|A2EIR50X0I6HHA|B0006FFRDE|  2.0|       3441.0|          1117.0|  1.9378917|
|A34KBX6VF28QYN|6304286961|  3.0|       4182.0|            85.0| -1.0662425|
|A2SY0QKQO1N34F|B002OHDRF2|  5.0|       3844.0|            21.0|  3.2403333|
|A371J1XULN9E9W|B0001G6PZC|  2.0|       4263.0|             7.0| 0.25101998|

#### Q3) Implement a PySpark script using the ALS algorithm for collaborative filtering.

In [3]:
# Validation rows that belong to the user whose index is 1.0, reduced to the
# id columns and their numeric indexes.
user1_columns = ['product_id', 'user_id', 'user_id_index', 'product_id_index']
user1 = (
    validation_data
    .where(validation_data.user_id_index == 1.0)
    .select(*user1_columns)
)
user1.show()

# Score those (user, product) pairs with the trained model and list them from
# the highest to the lowest predicted score.
recommendations = model.transform(user1)
ranked = recommendations.sort(recommendations.prediction.desc())
ranked.show()

+----------+--------------+-------------+----------------+
|product_id|       user_id|user_id_index|product_id_index|
+----------+--------------+-------------+----------------+
|B000UGBOT0|A2NJO6YE954DBH|          1.0|            78.0|
|B000AMWIVM|A2NJO6YE954DBH|          1.0|            11.0|
|B003NEQ74S|A2NJO6YE954DBH|          1.0|           239.0|
|B0000DK4QK|A2NJO6YE954DBH|          1.0|            51.0|
|B0072V6PPE|A2NJO6YE954DBH|          1.0|            53.0|
|B00005JMZK|A2NJO6YE954DBH|          1.0|            10.0|
|B000OYTPJO|A2NJO6YE954DBH|          1.0|           112.0|
|B00004CZR2|A2NJO6YE954DBH|          1.0|            55.0|
|B00005Y6YM|A2NJO6YE954DBH|          1.0|           366.0|
|B00005Y6YQ|A2NJO6YE954DBH|          1.0|           376.0|
+----------+--------------+-------------+----------------+

+----------+--------------+-------------+----------------+-----------+
|product_id|       user_id|user_id_index|product_id_index| prediction|
+----------+--------------+----

#### Q4) Implement code to evaluate the performance of the recommendation model using appropriate metrics.

In [4]:
# `evaluator` is the RMSE evaluator defined earlier in the notebook; a second
# evaluator over the same label/prediction columns reports MAE.
evaluator_mae = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="score",
).setMetricName("mae")

# Both metrics are computed on the validation-set predictions.
rmse = evaluator.evaluate(predictions)
mae = evaluator_mae.evaluate(predictions)

print(f"Root Mean Squared Error (RMSE) = {rmse}")
print(f"Mean Absolute Error (MAE) = {mae}")

Root Mean Squared Error (RMSE) = 4.613334467627979
Mean Absolute Error (MAE) = 3.088277224188922
