#### Q1: Demonstrate how to load a dataset suitable for recommendation systems into a PySpark DataFrame.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import findspark
findspark.init()
spark = SparkSession.builder.getOrCreate()

ratings = spark.read.json("movies 1.json").select("user_id","product_id","score").cache()
ratings = ratings.head(10000)
ratings = spark.createDataFrame(ratings)

ratings.show(5)



+--------------+----------+-----+
|       user_id|product_id|score|
+--------------+----------+-----+
|A141HP4LYPWMSR|B003AI2VGA|  3.0|
|A328S9RN3U5M68|B003AI2VGA|  3.0|
|A1I7QGUDP043DG|B003AI2VGA|  5.0|
|A1M5405JH9THP9|B003AI2VGA|  3.0|
| ATXL536YX71TR|B003AI2VGA|  3.0|
+--------------+----------+-----+
only showing top 5 rows



#### Q2: Implement a PySpark script that splits the data and trains a recommendation model.

In [2]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

indexers = [
    StringIndexer(inputCol=column, outputCol=column+"_index").fit(ratings)
    for column in ["user_id", "product_id"]
]

pipeline = Pipeline(stages=indexers)
ratings_indexed = pipeline.fit(ratings).transform(ratings)

training_data,validation_data = ratings_indexed.randomSplit([8.0,2.0])

als = ALS(userCol="user_id_index",itemCol="product_id_index",ratingCol="score",rank=10,maxIter=5,regParam=0.01,coldStartStrategy="drop")
evaluator = RegressionEvaluator(metricName="rmse",labelCol="score",predictionCol="prediction")

model = als.fit(training_data)
predictions=model.transform(validation_data)
predictions.show(10,False)

+--------------+----------+-----+-------------+----------------+-----------+
|user_id       |product_id|score|user_id_index|product_id_index|prediction |
+--------------+----------+-----+-------------+----------------+-----------+
|A3MV1KKHX51FYT|B008FPU7AA|3.0  |85.0         |112.0           |0.17213519 |
|ADX5JX5LKLC22 |B000063W1R|5.0  |580.0        |7.0             |4.9947615  |
|A3ENN12GLNTUAF|B0071AD95K|5.0  |493.0        |128.0           |-1.9853785 |
|A2NUHWMHA9XNKV|B000063W1R|5.0  |157.0        |7.0             |-3.2191253 |
|A3N2MVBI1A2I9Y|B000063W1R|5.0  |519.0        |7.0             |-0.7700126 |
|A28ILXH590CMRJ|B000063W1R|4.0  |355.0        |7.0             |1.1505669  |
|A13TO1ZFAH9SVN|B000063W1R|5.0  |235.0        |7.0             |-2.1389847 |
|A1TK6WNUIAEQRU|B000NDFLWG|4.0  |326.0        |91.0            |-0.26183853|
|A2NJO6YE954DBH|B000063W1R|4.0  |8.0          |7.0             |-3.544883  |
|A41I67QYRAOSQ |B000063W1R|4.0  |550.0        |7.0             |-0.7700126 |

#### Q3: Implement a PySpark script using the ALS algorithm for collaborative filtering.

In [3]:
user1 = validation_data.filter(validation_data['user_id_index']==1.0).select(['product_id','user_id','user_id_index','product_id_index'])
user1.show()
recommendations = model.transform(user1) 
recommendations.orderBy('prediction',ascending=False).show()

+----------+-------------+-------------+----------------+
|product_id|      user_id|user_id_index|product_id_index|
+----------+-------------+-------------+----------------+
|0800103688|ANCOMAI0I7LVG|          1.0|            30.0|
|B004EPYZQ2|ANCOMAI0I7LVG|          1.0|             3.0|
|B0001EYSQC|ANCOMAI0I7LVG|          1.0|            31.0|
+----------+-------------+-------------+----------------+

+----------+-------------+-------------+----------------+----------+
|product_id|      user_id|user_id_index|product_id_index|prediction|
+----------+-------------+-------------+----------------+----------+
|B0001EYSQC|ANCOMAI0I7LVG|          1.0|            31.0|-0.8538896|
|B004EPYZQ2|ANCOMAI0I7LVG|          1.0|             3.0|-3.5181665|
|0800103688|ANCOMAI0I7LVG|          1.0|            30.0|-13.863456|
+----------+-------------+-------------+----------------+----------+



#### Q4: Implement code to evaluate the performance of the recommendation model using appropriate metrics.

In [4]:
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE) = {rmse}")

# Additional Evaluation Metric: Mean Absolute Error (MAE)
evaluator_mae = RegressionEvaluator(
    metricName="mae",
    labelCol="score",
    predictionCol="prediction"
)

mae = evaluator_mae.evaluate(predictions)
print(f"Mean Absolute Error (MAE) = {mae}")

Root Mean Squared Error (RMSE) = 5.40235713605181
Mean Absolute Error (MAE) = 4.209195835054521
