In [36]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:

from pyspark.sql import SparkSession


spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Hello Pyspark") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

In [38]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [39]:
#!pip install wget

In [40]:
#import wget

In [41]:
#file = wget.download('https://raw.githubusercontent.com/apache/spark/master/data/mllib/als/sample_movielens_ratings.txt')

In [42]:
# lines = spark.read.text('/content/sample_movielens_ratings.txt').rdd
lines = spark.read.text('sample_movielens_ratings.txt').rdd

In [43]:
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
(training, test) = ratings.randomSplit([0.8, 0.2], seed = 99)

In [44]:
ratings.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|   3.0|1424380312|
|     0|      3|   1.0|1424380312|
|     0|      5|   2.0|1424380312|
|     0|      9|   4.0|1424380312|
|     0|     11|   1.0|1424380312|
|     0|     12|   2.0|1424380312|
|     0|     15|   1.0|1424380312|
|     0|     17|   1.0|1424380312|
|     0|     19|   1.0|1424380312|
|     0|     21|   1.0|1424380312|
|     0|     23|   1.0|1424380312|
|     0|     26|   3.0|1424380312|
|     0|     27|   1.0|1424380312|
|     0|     28|   1.0|1424380312|
|     0|     29|   1.0|1424380312|
|     0|     30|   1.0|1424380312|
|     0|     31|   1.0|1424380312|
|     0|     34|   1.0|1424380312|
|     0|     37|   1.0|1424380312|
|     0|     41|   2.0|1424380312|
+------+-------+------+----------+
only showing top 20 rows



# MaxIter = 5, regParam = 0.01

ALS ditambahkan seed 99 agar hasilnya lebih konkrit pada tiap eksekusi

RMSE = 2.063

Makin kecil RMSE, makin baik model yang digunakan

In [45]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop", seed = 99)
model = als.fit(training)

In [46]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 2.0626911738577944


# MaxIter = 10, regParam = 0.01

RMSE = 2.051

In [47]:
als = ALS(maxIter=10, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop", seed = 99)
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 2.0513665951162934


# MaxIter = 15, regParam = 0.01

Batas MaxIter adalah 15. Jika MaxIter adalah 20, maka koneksi terputus

RMSE = 1.969

In [48]:
als = ALS(maxIter=15, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop", seed = 99)
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.969322142542551


# MaxIter = 15, refParam = 0.05

RMSE = 1.073

In [49]:
als = ALS(maxIter=15, regParam=0.05, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop", seed = 99)
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.072580752954934


# MaxIter = 15, refParam = 0.1

RMSE = 0.9995

In [55]:
als = ALS(maxIter=15, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop", seed = 99)
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.999509719159825


# MaxIter = 15, refParam = 0.5

RMSE = 1.207

In [51]:
als = ALS(maxIter=15, regParam=0.5, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop", seed = 99)
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.206610765278927


# MaxIter = 15, regParam = 0.25

RMSE = 1.027

In [52]:
als = ALS(maxIter=15, regParam=0.25, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop", seed = 99)
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.0268995626538635


# regParam antara 0.1 sampai 0.25

Dicari regParam antara 0.14 sampai 0.15 selanjutnya

In [53]:
print('regParam\tRMSE')
for i in range(1,15):
  als = ALS(maxIter=15, regParam=0.1 + (i * 0.01), userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop", seed = 99)
  model = als.fit(training)
  predictions = model.transform(test)
  evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
  rmse = evaluator.evaluate(predictions)
  print(str(0.1 + (i * 0.01)), '\t', rmse)

regParam	RMSE
0.11 	 0.9955326587019249
0.12000000000000001 	 0.9929725417544433
0.13 	 0.9915492236967024
0.14 	 0.9911746886903878
0.15000000000000002 	 0.991962484546821
0.16 	 0.9949905116947975
0.17 	 1.0018878325897993
0.18 	 1.0006318112217538
0.19 	 1.0012860686330511
0.2 	 1.0038882304582124
0.21000000000000002 	 1.0075676409167806
0.22 	 1.0119116628913933
0.23 	 1.016656108557533
0.24000000000000002 	 1.0216635698157095


# regParam antara 0.14 sampai 0.15

Selanjutnya akan dicari regParam antara 0.14 sampai 0.141

In [56]:
print('regParam\tRMSE')
for i in range(1,11):
  als = ALS(maxIter=15, regParam=0.14 + (i * 0.001), userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop", seed = 99)
  model = als.fit(training)
  predictions = model.transform(test)
  evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
  rmse = evaluator.evaluate(predictions)
  print(str(0.14 + (i * 0.001)), '\t', rmse)

regParam	RMSE
0.14100000000000001 	 0.9911959392033595
0.14200000000000002 	 0.9912285603092823
0.14300000000000002 	 0.9912729442133439
0.14400000000000002 	 0.9913294884331396
0.14500000000000002 	 0.9913986733239897
0.14600000000000002 	 0.9914812102520433
0.14700000000000002 	 0.9915778000739405
0.14800000000000002 	 0.9916894316696587
0.14900000000000002 	 0.9918171911974133
0.15000000000000002 	 0.991962484546821


# regParam antara 0.14 sampai 0.141

Sehingga, regParam yang tepat adalah 0.14

In [57]:
print('regParam\tRMSE')
for i in range(1,11):
  als = ALS(maxIter=15, regParam=0.14 + (i * 0.0001), userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop", seed = 99)
  model = als.fit(training)
  predictions = model.transform(test)
  evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
  rmse = evaluator.evaluate(predictions)
  print(str(0.14 + (i * 0.0001)), '\t', rmse)

regParam	RMSE
0.1401 	 0.9911763060343723
0.14020000000000002 	 0.9911780452716271
0.1403 	 0.9911798727346361
0.14040000000000002 	 0.9911818440906244
0.1405 	 0.9911839067755341
0.1406 	 0.9911861029665376
0.14070000000000002 	 0.9911883650889476
0.1408 	 0.9911907982262541
0.14090000000000003 	 0.9911933002299144
0.14100000000000001 	 0.9911959392033598


# maxIter = 15, regParam = 0.14

RMSE = 0.9912

In [59]:
als = ALS(maxIter=15, regParam=0.14, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop", seed = 99)
model = als.fit(training)
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.9911746886903878


In [60]:
userRecs = model.recommendForAllUsers(10)
movieRecs = model.recommendForAllItems(10)

In [61]:
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)

In [66]:
userSubsetRecs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    26|[{22, 4.605868}, ...|
|    19|[{90, 3.3382707},...|
|    29|[{46, 4.193815}, ...|
+------+--------------------+



In [62]:
movies = ratings.select(als.getItemCol()).distinct().limit(3)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)

In [67]:
movieSubSetRecs.show()

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     65|[{23, 3.8746052},...|
|     26|[{0, 2.0833347}, ...|
|     29|[{8, 4.713466}, {...|
+-------+--------------------+

