# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Carrera: Ing. en Sistemas Computacionales** </center>
---
### <center> **Primavera 2025** </center>
---

**Lab 12**: Recommendation System with ALS

**Fecha**: 11 de mayo del 2025

**Nombre del Estudiante**: Marco Albanese, Vicente Siloe

**Profesor**: Pablo Camarillo Ramirez

In [25]:
import findspark
findspark.init()

#### Creacion de la conexión con el cluster de spark


In [26]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MLSpark-Recommender-Systems") \
    .master("spark://2da3617855ce:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

#### Preparación de datos

In [27]:
from equipo_mcqueen.spark_utils import SparkUtils

movie_ratings_data = [
    ("userId", "IntegerType"),
    ("movieId", "IntegerType"),
    ("rating", "IntegerType"),
    ("date", "TimestampType")
]

ratings_schema = SparkUtils.generate_schema(movie_ratings_data)

ratings_df = spark.read.schema(ratings_schema).option("header", "false").option("delimiter", "::").csv("/home/jovyan/notebooks/data/sample_movielens_ratings.txt")
ratings_df = ratings_df.drop("date")

#### Configure ALS model

In [28]:
from pyspark.ml.recommendation import ALS

als = ALS(
    userCol="userId", 
    itemCol="movieId", 
    ratingCol="rating", 
    maxIter=10, 
    regParam=0.1, 
    rank=5, # Controls the dimensionality of the latent vector space for 
            # users and items.
    coldStartStrategy="drop"  # Avoids NaN predictions
)

#### Training

In [29]:
model = als.fit(ratings_df)

#### Predictions

In [30]:
# Generate recommendations for each user
user_recommendations = model.recommendForAllUsers(numItems=5)

# Show recommendations
user_recommendations.show(truncate=False)

+------+-------------------------------------------------------------------------------------+
|userId|recommendations                                                                      |
+------+-------------------------------------------------------------------------------------+
|0     |[{92, 2.5840385}, {2, 2.316802}, {62, 2.2325232}, {25, 2.157748}, {93, 2.1528697}]   |
|10    |[{92, 2.768342}, {2, 2.6728113}, {93, 2.6242015}, {25, 2.5927775}, {49, 2.5867324}]  |
|20    |[{22, 3.5597918}, {68, 3.1278815}, {94, 3.084497}, {51, 3.0827737}, {77, 3.0246763}] |
|1     |[{22, 2.9029422}, {68, 2.630123}, {77, 2.5238972}, {62, 2.501064}, {90, 2.4797387}]  |
|11    |[{32, 5.082464}, {18, 4.705235}, {30, 4.6826043}, {27, 4.5120797}, {8, 4.229401}]    |
|21    |[{29, 4.320379}, {52, 4.2401457}, {76, 3.716108}, {63, 3.5063725}, {53, 3.4859684}]  |
|22    |[{51, 4.458179}, {75, 4.418395}, {22, 4.118836}, {74, 4.1007586}, {88, 4.0829244}]   |
|2     |[{93, 4.2531066}, {83, 4.1469526}, {8, 4.0

                                                                                

#### Movie metadata

In [31]:
movies = [
    (16, "The Matrix"),
    (23, "Shawshank Redemption"),
    (41, "The Dark Knight"),
    (55, "Inception"),
    (79, "The Lord of the Rings: The Return of the King")
]

movies_schema = SparkUtils.generate_schema([
    ("movieId", "IntegerType"),
    ("title", "StringType")
])

movies_df = spark.createDataFrame(movies, schema=movies_schema)

In [32]:
from pyspark.sql.functions import explode

# Explode recommendations for easier reading
recommendations = user_recommendations.select("userId", explode("recommendations").alias("rec"))
recommendations = recommendations.join(movies_df, recommendations.rec.movieId == movies_df.movieId).select("movieId", "title", "rec.rating")

# Show user-song recommendations with titles
recommendations.show(truncate=False)

[Stage 601:>                                                        (0 + 1) / 2]

+-------+--------------------+---------+
|movieId|title               |rating   |
+-------+--------------------+---------+
|23     |Shawshank Redemption|3.1364992|
|55     |Inception           |3.4145792|
|55     |Inception           |3.9458332|
|55     |Inception           |2.5469325|
|55     |Inception           |2.670787 |
|55     |Inception           |3.322591 |
|55     |Inception           |4.7129517|
|55     |Inception           |4.7998195|
+-------+--------------------+---------+



                                                                                

#### Predictions for all data

In [33]:
predictions = model.transform(ratings_df)
predictions.show(truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|22    |0      |1     |0.96697557|
|22    |3      |2     |1.6326257 |
|22    |5      |2     |2.0366673 |
|22    |6      |2     |2.2972772 |
|22    |9      |1     |1.5513803 |
|22    |10     |1     |1.4349127 |
|22    |11     |1     |1.2901659 |
|22    |13     |1     |1.617328  |
|22    |14     |1     |1.389045  |
|22    |16     |1     |0.7093756 |
|22    |18     |3     |3.0116072 |
|22    |19     |1     |1.4644071 |
|22    |22     |5     |4.118836  |
|22    |25     |1     |0.97840077|
|22    |26     |1     |1.1323681 |
|22    |29     |3     |3.2431226 |
|22    |30     |5     |3.9942718 |
|22    |32     |4     |3.217519  |
|22    |33     |1     |0.8903809 |
|22    |35     |1     |0.7503733 |
+------+-------+------+----------+
only showing top 20 rows



#### Model evaluation

In [34]:
from pyspark.ml.evaluation import RegressionEvaluator
# Set up evaluator to compute RMSE
evaluator = RegressionEvaluator(
    metricName="rmse", 
    labelCol="rating", 
    predictionCol="prediction"
)

# Calculate RMSE
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error (RMSE) = {rmse}")

Root-mean-square error (RMSE) = 0.5691166521341573


In [35]:
sc.stop()