# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Code Lab 12: Recommendation System with ALS** </center>

---
**Alumnos**: David Abraham Naranjo Salgado, Benjamin Zarate y Angel Cortes

In [1]:
import findspark
findspark.init()

#### Creacion de la conexión con el cluster de spark


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MLSpark-Recommender-Systems") \
    .master("spark://2c9c6f7ab23e:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/08 01:02:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Preparación de Datos

In [3]:
from team_name.spark_utils import SparkUtils

data = "/home/jovyan/notebooks/data/sample_movielens_ratings.txt"
# Define schema for the DataFrame
schema = SparkUtils.generate_schema([("userId", "integer"), ("movieId", "integer"), ("rating", "integer"), ("timestamp", "timestamp")])

df = spark.read.schema(schema).option("header", "false").option("delimiter", "::").csv(data)

df.show(5)

                                                                                

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     0|      2|     3|     NULL|
|     0|      3|     1|     NULL|
|     0|      5|     2|     NULL|
|     0|      9|     4|     NULL|
|     0|     11|     1|     NULL|
+------+-------+------+---------+
only showing top 5 rows



In [4]:
df = df.drop("timestamp")
df.show(5)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     0|      2|     3|
|     0|      3|     1|
|     0|      5|     2|
|     0|      9|     4|
|     0|     11|     1|
+------+-------+------+
only showing top 5 rows



# Configure ALS model

In [22]:
from pyspark.ml.recommendation import ALS

als = ALS(
    userCol="userId", 
    itemCol="movieId", 
    ratingCol="rating", 
    maxIter=16, 
    regParam=0.1, 
    rank=5, # Controls the dimensionality of the latent vector space for 
            # users and items.
    coldStartStrategy="drop"  # Avoids NaN predictions
)

# TRAINNING

In [23]:
model = als.fit(df)

# PREDICTIONS

In [24]:
# Generate recommendations for each user
user_recommendations = model.recommendForAllUsers(numItems=5)

# Show recommendations
user_recommendations.show(truncate=False)



+------+-------------------------------------------------------------------------------------+
|userId|recommendations                                                                      |
+------+-------------------------------------------------------------------------------------+
|0     |[{92, 2.5936005}, {2, 2.3094485}, {62, 2.2523856}, {25, 2.1904168}, {93, 2.1489012}] |
|10    |[{92, 2.775433}, {2, 2.6526644}, {25, 2.638788}, {93, 2.610295}, {49, 2.584126}]     |
|20    |[{22, 3.5625312}, {68, 3.118233}, {94, 3.0791268}, {51, 3.0760531}, {77, 3.0344026}] |
|1     |[{22, 2.9020486}, {68, 2.611147}, {77, 2.5359411}, {62, 2.502466}, {90, 2.4866726}]  |
|11    |[{32, 5.046121}, {30, 4.6833315}, {18, 4.6645722}, {27, 4.486559}, {8, 4.1586075}]   |
|21    |[{29, 4.324222}, {52, 4.246}, {76, 3.7285264}, {63, 3.5023663}, {53, 3.4783237}]     |
|22    |[{51, 4.459827}, {75, 4.4141407}, {22, 4.110449}, {74, 4.095386}, {88, 4.073851}]    |
|2     |[{93, 4.236278}, {83, 4.1585765}, {8, 4.05

                                                                                

## Predictions for all data

In [25]:
predictions = model.transform(df)
predictions.show(truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|22    |0      |1     |0.95978665|
|22    |3      |2     |1.625176  |
|22    |5      |2     |2.035528  |
|22    |6      |2     |2.2916586 |
|22    |9      |1     |1.5355445 |
|22    |10     |1     |1.4313806 |
|22    |11     |1     |1.2968231 |
|22    |13     |1     |1.5995504 |
|22    |14     |1     |1.38723   |
|22    |16     |1     |0.70297694|
|22    |18     |3     |3.0267131 |
|22    |19     |1     |1.4562871 |
|22    |22     |5     |4.1104484 |
|22    |25     |1     |0.98768014|
|22    |26     |1     |1.1374228 |
|22    |29     |3     |3.2444606 |
|22    |30     |5     |3.9997764 |
|22    |32     |4     |3.1884887 |
|22    |33     |1     |0.9095676 |
|22    |35     |1     |0.7483205 |
+------+-------+------+----------+
only showing top 20 rows



# EVALUATE MODEL

In [26]:
from pyspark.ml.evaluation import RegressionEvaluator
# Set up evaluator to compute RMSE
evaluator = RegressionEvaluator(
    metricName="rmse", 
    labelCol="rating", 
    predictionCol="prediction"
)

# Calculate RMSE
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE) = {rmse}")

Root-mean-square error (RMSE) = 0.5693376676367434


In [27]:
sc.stop()