# Recommendation System using PySpark

### Lab Exercises:

1) Demonstrate how to load a dataset suitable for recommendation systems into a PySpark DataFrame.

2) Implement a PySpark script that splits the data and trains a recommendation model.

3) Implement a PySpark script using the ALS algorithm for collaborative filtering.

4) Implement code to evaluate the performance of the recommendation model using appropriate metrics.

In [12]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.appName('MovieRecommendation').getOrCreate()



In [13]:
json_file_path = "movies 1.json"

In [14]:
df = spark.read.json(json_file_path)

In [15]:
df.printSchema()
df.show()

root
 |-- helpfulness: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review: string (nullable = true)
 |-- score: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- time: long (nullable = true)
 |-- user_id: string (nullable = true)

+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|helpfulness|product_id|        profile_name|              review|score|             summary|      time|       user_id|
+-----------+----------+--------------------+--------------------+-----+--------------------+----------+--------------+
|        7/7|B003AI2VGA|Brian E. Erland "...|Synopsis: On the ...|  3.0|"There Is So Much...|1182729600|A141HP4LYPWMSR|
|        4/4|B003AI2VGA|          Grady Harp|THE VIRGIN OF JUA...|  3.0|Worthwhile and Im...|1181952000|A328S9RN3U5M68|
|       8/10|B003AI2VGA|Chrissy K. McVay ...|The scenes in thi...|  5.0|This m

In [19]:
#q2 & q3

# Extract relevant columns for recommendation
df = df.select("user_id", "product_id", "score")

# Check for missing or NaN values in the 'score' column
df = df.dropna(subset=["score"])

# Convert user_id and product_id to numeric indices
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index", handleInvalid="keep")
product_indexer = StringIndexer(inputCol="product_id", outputCol="product_index", handleInvalid="keep")

# Create an ALS (Alternating Least Squares) recommendation model
als = ALS(maxIter=5, regParam=0.01, userCol="user_index", itemCol="product_index", ratingCol="score")

# Create a pipeline to execute the indexers and ALS model
pipeline = Pipeline(stages=[user_indexer, product_indexer, als])

# Split the data into training and test sets
(training_data, test_data) = df.randomSplit([0.8, 0.2])

# Train the recommendation model
model = pipeline.fit(training_data)

# Make predictions on the test set
predictions = model.transform(test_data)

# Check for NaN values in the 'prediction' column
predictions = predictions.dropna(subset=["prediction"])

# Show the predictions
predictions.select("user_id", "product_id", "prediction").show()


+--------------+----------+-----------+
|       user_id|product_id| prediction|
+--------------+----------+-----------+
|A1HMNDO4IUFB0P|B002OHDRF2|  2.9723883|
|A1NLJT22OOPC9H|B002OHDRF2|  2.9723883|
|A3F2JUHM6C9RZ9|B0001G6PZC|  0.9354527|
|A3NQI2JLNPFOF8|6303257933| -2.8448637|
|A1NIEIENEWXCCQ|B0001G6PZC|0.070017986|
|A22RY8N8CNDF3A|B000UGBOT0|   8.940226|
|A2VE83MZF98ITY|0790747324| -2.1918008|
|A1DUWRMZVLRKAJ|B002OHDRF2|   2.894468|
|A2AVV9LV9UXT6F|B000063W1R| 0.43413496|
|A32AK8FOAZEPE2|B000063W1R| 0.25041533|
|A32AK8FOAZEPE2|B0016OLXN2|  0.7743087|
| AW8G5VS3T7IB3|B002OHDRF2|  3.9094448|
|A1VW4ZBXKEYTZ9|B002OHDRF2| 0.99079615|
| AIOXMENE1HBR7|0790747324|  2.2108438|
|A1YQ6QB2127AJ4|B0001G6PZC| -1.0496899|
|A16837CKPTA07O|B000063W1R|  -2.073543|
|A1CLHLW9PFKG9Q|B002OHDRF2|  4.9141936|
|A1SX3Z5KAAY9M7|B0012EM5GK|  4.7644606|
|A356RFKNIG043B|B00096S43U| -10.630995|
|A25ZVI6RH1KA5L|0790747324|  1.5432976|
+--------------+----------+-----------+
only showing top 20 rows



In [54]:
first_id = df.head()
first_id[0]

'A141HP4LYPWMSR'

In [59]:
user1 = test_data.filter(test_data['user_id']==first_id[0]).select(['product_id','user_id'])
user1.show()
recommendations = model.transform(user1) 
recommendations.orderBy('prediction',ascending=False).show()

+----------+--------------+
|product_id|       user_id|
+----------+--------------+
|B000ZLFALI|A141HP4LYPWMSR|
|B0002HOEPI|A141HP4LYPWMSR|
+----------+--------------+

+----------+--------------+----------+-------------+----------+
|product_id|       user_id|user_index|product_index|prediction|
+----------+--------------+----------+-------------+----------+
|B0002HOEPI|A141HP4LYPWMSR|      24.0|        267.0| 3.6757596|
|B000ZLFALI|A141HP4LYPWMSR|      24.0|          4.0| 2.7316642|
+----------+--------------+----------+-------------+----------+



In [58]:
#q4

# Evaluate the model using RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="score", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 4.546954626466892


In [7]:
# Stop the Spark session
spark.stop()