# Lab 04: Recommendation System using PySpark

# Q1
Demonstrate how to load a dataset suitable for recommendation systems into a PySpark
DataFrame.

In [1]:
from pyspark.sql import SparkSession 
from pyspark.ml.evaluation import RegressionEvaluator 
from pyspark.ml.recommendation import ALS 

spark = SparkSession.builder.appName('Recommender').getOrCreate() 



In [2]:
data = spark.read.csv('book_ratings.csv', 
                    inferSchema=True,header=True) 

data.show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



In [3]:
data.describe().show()

+-------+-----------------+------------------+------------------+
|summary|          book_id|           user_id|            rating|
+-------+-----------------+------------------+------------------+
|  count|           981756|            981756|            981756|
|   mean|4943.275635697668|25616.759933221696|3.8565335989797873|
| stddev|2873.207414896143|15228.338825882149|0.9839408559619973|
|    min|                1|                 1|                 1|
|    max|            10000|             53424|                 5|
+-------+-----------------+------------------+------------------+



# Q2
Implement a PySpark script that splits the data and trains a recommendation model.

In [4]:
train_data, test_data = data.randomSplit([0.8, 0.2])

# Q3
Implement a PySpark script using the ALS algorithm for collaborative filtering.

In [5]:
als = ALS(maxIter=5, 
        regParam=0.01, 
        userCol="user_id", 
        itemCol="book_id", 
        ratingCol="rating") 

model = als.fit(train_data)

# Q4
Implement code to evaluate the performance of the recommendation model using
appropriate metrics

In [6]:
predictions = model.transform(test_data) 

predictions.show()

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|      1|  48482|     3| 3.8724136|
|      1|  26629|     4| 4.5914617|
|      2|   3922|     5|   4.89165|
|      1|  32055|     4| 4.1461143|
|      1|  33697|     4| 4.4002333|
|      1|  11854|     4| 3.7051845|
|      1|  51480|     1| 2.0448668|
|      2|   3022|     4| 3.0778546|
|      2|  10610|     5| 4.6511855|
|      1|  18313|     5|  6.553751|
|      1|  25214|     4| 4.1055894|
|      1|  31001|     4|  5.369414|
|      1|  50104|     5| 5.1758285|
|      1|   5885|     5| 4.4334064|
|      2|   5885|     4|  4.411032|
|      1|  10335|     4| 3.8800526|
|      1|  21713|     5|  4.197895|
|      1|  51460|     3| 3.8316765|
|      1|  30681|     5|  4.552037|
|      1|  10944|     5| 4.6214848|
+-------+-------+------+----------+
only showing top 20 rows



In [7]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",predictionCol="prediction") 
rmse = evaluator.evaluate(predictions) 
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = nan


In [8]:
user1 = test_data.filter(test_data['user_id']==5461).select(['book_id','user_id']) 

user1.show()

+-------+-------+
|book_id|user_id|
+-------+-------+
|     46|   5461|
|     60|   5461|
|     65|   5461|
|     66|   5461|
|     72|   5461|
|    129|   5461|
|    142|   5461|
|    148|   5461|
|    194|   5461|
|    264|   5461|
|    386|   5461|
|    395|   5461|
|    416|   5461|
|    444|   5461|
|    478|   5461|
|    531|   5461|
|    588|   5461|
|    595|   5461|
|    639|   5461|
|    646|   5461|
+-------+-------+
only showing top 20 rows



In [9]:
recommendations = model.transform(user1) 

recommendations.orderBy('prediction',ascending=False).show()

+-------+-------+----------+
|book_id|user_id|prediction|
+-------+-------+----------+
|    129|   5461|  5.245639|
|    444|   5461|  4.895541|
|     65|   5461|  4.848259|
|   2854|   5461|  4.699825|
|    869|   5461| 4.6996374|
|   1597|   5461| 4.6787944|
|   2128|   5461|  4.619902|
|    478|   5461| 4.4679513|
|     66|   5461| 4.4638157|
|    142|   5461|  4.458869|
|     72|   5461| 4.4545608|
|    416|   5461| 4.4195013|
|    588|   5461|  4.399512|
|     60|   5461|  4.353798|
|   3692|   5461|  4.306996|
|    966|   5461| 4.2808766|
|   1402|   5461| 4.2368226|
|    386|   5461|  4.197394|
|    531|   5461|  4.106376|
|    639|   5461| 4.0107675|
+-------+-------+----------+
only showing top 20 rows



In [10]:
spark.stop()
