<a href="https://colab.research.google.com/github/AnnisaFitry/Tugas7-BigData/blob/main/Tugas7_BD_FIX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TUGAS 7

## Slide 30 dan 48

In [1]:
from google.colab import drive

# Accessing My Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317130 sha256=e76742d91f1fad4c0ce2c68e116e5e4cb461b2666262c8cbd8197bd9b017cee0
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col

In [4]:
spark = SparkSession.builder.master("local[*]").getOrCreate()

class Rating:
    def __init__(self, userId, movieId, rating, timestamp):
        self.userId = int(userId)
        self.movieId = int(movieId)
        self.rating = float(rating)
        self.timestamp = float(timestamp)

def parseRating(str):
    fields = str.split(",")
    assert(len(fields) == 4)
    return Rating(int(fields[0]), int(fields[1]), float(fields[2]), float(fields[3]))

In [5]:
# Test it
parseRating("1,1193,5,978300760")

<__main__.Rating at 0x7ff0f82cfee0>

In [6]:
ratings = spark.read.csv("/content/drive/MyDrive/Tugas7-Bigdata/ml-latest-small/ratings.csv", header=True, inferSchema=True)
# Check if everything is ok
ratings.show(5)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
|     1|     47|   5.0|964983815|
|     1|     50|   5.0|964982931|
+------+-------+------+---------+
only showing top 5 rows



In [7]:
training, test = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Alternating Least Squares (ALS) matrix factorization.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating")

model = als.fit(training)
model.write().overwrite().save("mymodel")

# Prepare the recommendations
predictions = model.transform(test)
squaredErrors = predictions.withColumn("squaredError", (col("rating") - col("prediction")) ** 2)
mse = squaredErrors.filter(~col("squaredError").isNull()).selectExpr("sum(squaredError) as sumSquaredError").collect()[0][0] / squaredErrors.filter(~col("squaredError").isNull()).count()

In [8]:
predictions.show(10)

+------+-------+------+---------+----------+
|userId|movieId|rating|timestamp|prediction|
+------+-------+------+---------+----------+
|     1|    223|   3.0|964980985|   4.13816|
|     1|    333|   5.0|964981179| 4.8847094|
|     1|    349|   4.0|964982563|  4.005332|
|     1|    441|   4.0|964980868| 4.9221625|
|     1|    500|   3.0|964981208| 4.5089884|
|     1|    593|   4.0|964983793| 5.0435615|
|     1|    596|   5.0|964982838|  4.631361|
|     1|    648|   3.0|964982563|  4.516017|
|     1|    661|   5.0|964982838| 3.7964694|
|     1|    733|   4.0|964982400| 4.4651685|
+------+-------+------+---------+----------+
only showing top 10 rows



In [9]:
predictions.write.csv("ml2-predictions.csv", header=True)

In [10]:
import math
result = predictions.rdd.map(lambda row: row['prediction'] - row['rating']).map(lambda x: x*x).filter(lambda x: not math.isnan(x))
mse = result.reduce(lambda x,y: x+y)

In [11]:
print(mse)

22490.105130537104
