In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://mirror.its.dal.ca/apache/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xvf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark
#===============
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
!update-alternatives --config java
!java -version

In [0]:
# Make the pyspark package importable in this kernel.
# presumably findspark locates the install through SPARK_HOME set earlier - verify.
import findspark
findspark.init()

In [0]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Create the local session (one worker thread per core) and name the
# application here, at first creation. Any later
# `SparkSession.builder.appName(...).getOrCreate()` call returns this running
# session, which is too late to name the underlying SparkContext.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("recommender")
    .getOrCreate()
)
# Handle to the SparkContext that backs the session created above.
sc = SparkContext.getOrCreate()

In [0]:
#===============

In [0]:
# ALS estimator and the regression evaluator used to score its predictions.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession

# Obtain the session handle, requesting the 'recommender' application name.
spark = (
    SparkSession.builder
    .appName('recommender')
    .getOrCreate()
)

In [0]:
#Data Set
!wget -q http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!unzip ml-latest-small.zip

In [0]:
# Load the ratings file: the first line is the header, and column types are
# inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('ml-latest-small/ratings.csv')
)

In [0]:
# Print each column's name, type and nullability to confirm how the CSV was parsed.
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [0]:
# List the column names of the loaded ratings DataFrame.
df.columns

['userId', 'movieId', 'rating', 'timestamp']

In [0]:
# Keep only the three columns used by the recommender.
df = df.select('userId', 'movieId', 'rating')

In [0]:
# Peek at the first five rows as Row objects.
df.head(5)

[Row(userId=1, movieId=1, rating=4.0),
 Row(userId=1, movieId=3, rating=4.0),
 Row(userId=1, movieId=6, rating=4.0),
 Row(userId=1, movieId=47, rating=5.0),
 Row(userId=1, movieId=50, rating=5.0)]

In [0]:
# Render the first rows as a text table (the output is capped at the top 20 rows).
df.show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
|     1|     70|   3.0|
|     1|    101|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
|     1|    157|   5.0|
|     1|    163|   5.0|
|     1|    216|   5.0|
|     1|    223|   3.0|
|     1|    231|   5.0|
|     1|    235|   4.0|
|     1|    260|   5.0|
|     1|    296|   3.0|
|     1|    316|   3.0|
|     1|    333|   5.0|
|     1|    349|   4.0|
+------+-------+------+
only showing top 20 rows



In [0]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df.describe().show()

+-------+------------------+----------------+------------------+
|summary|            userId|         movieId|            rating|
+-------+------------------+----------------+------------------+
|  count|            100836|          100836|            100836|
|   mean|326.12756356856676|19435.2957177992| 3.501556983616962|
| stddev| 182.6184914635004|35530.9871987003|1.0425292390606342|
|    min|                 1|               1|               0.5|
|    max|               610|          193609|               5.0|
+-------+------------------+----------------+------------------+



In [0]:
# 85% / 15% random split. The seed is fixed so the same rows land in each part
# on every run, which keeps the metrics computed from this split reproducible.
training, test = df.randomSplit([0.85, 0.15], seed=42)

In [0]:
# Configure ALS matrix factorisation on (userId, movieId, rating).
# coldStartStrategy="drop" removes test rows the model cannot score, so the
# evaluation metric does not end up as NaN.
# seed is set explicitly so the factor initialisation is repeatable across runs.
# NOTE(review): regParam=0.5 looks strong - the recorded run shows predictions
# averaging ~2.78 against a mean rating of ~3.50; consider tuning it.
als = ALS(
    maxIter=20,
    regParam=0.5,
    rank=20,
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    coldStartStrategy="drop",
    seed=42,
)
# Learn the factors on the training split, then score the held-out split;
# transform() appends a 'prediction' column.
model = als.fit(training)
predictions = model.transform(test)

In [0]:
# Summary statistics of the scored test rows, including the 'prediction' column.
predictions.describe().show()

+-------+------------------+-----------------+------------------+------------------+
|summary|            userId|          movieId|            rating|        prediction|
+-------+------------------+-----------------+------------------+------------------+
|  count|             14434|            14434|             14434|             14434|
|   mean|322.35215463488987| 17582.9490785645| 3.500588887349314|2.7752505001808543|
| stddev|182.23979256436348|33019.33774505243|1.0342216517223937|0.5372386701543078|
|    min|                 1|                1|               0.5|        0.29850447|
|    max|               610|           187593|               5.0|          5.061314|
+-------+------------------+-----------------+------------------+------------------+



In [0]:
# Root-mean-squared error between the true 'rating' and the model's 'prediction'.
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(predictions)
rmse

1.1304201790353219

In [0]:
# Inspect three scored rows: the actual rating next to the model's prediction.
predictions.take(3)

[Row(userId=597, movieId=471, rating=2.0, prediction=3.1950230598449707),
 Row(userId=387, movieId=471, rating=3.0, prediction=2.6318652629852295),
 Row(userId=273, movieId=471, rating=5.0, prediction=3.2172458171844482)]

In [0]:
# Build top-10 recommendation lists in both directions from the fitted model.
top_k = 10
# Ten highest-scoring movies for every user.
userRecs = model.recommendForAllUsers(top_k)
# Ten highest-scoring users for every movie.
movieRecs = model.recommendForAllItems(top_k)

In [0]:
# Inspect the recommendation lists (movieId, predicted rating) for three users.
userRecs.take(3)

[Row(userId=471, recommendations=[Row(movieId=8477, rating=4.876676559448242), Row(movieId=40491, rating=4.876676559448242), Row(movieId=6818, rating=4.876676559448242), Row(movieId=3567, rating=4.610818862915039), Row(movieId=2314, rating=4.542365074157715), Row(movieId=112804, rating=4.494029521942139), Row(movieId=25947, rating=4.454928398132324), Row(movieId=132333, rating=4.418132781982422), Row(movieId=156605, rating=4.389008045196533), Row(movieId=67618, rating=4.316531658172607)]),
 Row(userId=463, recommendations=[Row(movieId=8477, rating=4.8488078117370605), Row(movieId=40491, rating=4.8488078117370605), Row(movieId=6818, rating=4.8488078117370605), Row(movieId=3567, rating=4.584465980529785), Row(movieId=2314, rating=4.516409873962402), Row(movieId=112804, rating=4.468349933624268), Row(movieId=25947, rating=4.429471492767334), Row(movieId=132333, rating=4.392887592315674), Row(movieId=156605, rating=4.363926887512207), Row(movieId=67618, rating=4.291866302490234)]),
 Row(us

In [0]:
# Inspect the recommendation lists (userId, predicted rating) for three movies.
movieRecs.take(3)

[Row(movieId=1580, recommendations=[Row(userId=53, rating=3.8588218688964844), Row(userId=43, rating=3.653885841369629), Row(userId=515, rating=3.5200746059417725), Row(userId=251, rating=3.5150742530822754), Row(userId=452, rating=3.497750997543335), Row(userId=12, rating=3.4844002723693848), Row(userId=25, rating=3.48211932182312), Row(userId=276, rating=3.4755635261535645), Row(userId=523, rating=3.4570260047912598), Row(userId=30, rating=3.453098773956299)]),
 Row(movieId=4900, recommendations=[Row(userId=53, rating=3.957070827484131), Row(userId=43, rating=3.746924877166748), Row(userId=515, rating=3.609710931777954), Row(userId=251, rating=3.6045775413513184), Row(userId=452, rating=3.586813449859619), Row(userId=12, rating=3.5731303691864014), Row(userId=25, rating=3.5707879066467285), Row(userId=276, rating=3.564059019088745), Row(userId=523, rating=3.5450496673583984), Row(userId=30, rating=3.541032314300537)]),
 Row(movieId=5300, recommendations=[Row(userId=53, rating=4.07978