# Hybrid Recommendation System

**Team Structure:**
- Member 1: Infrastructure, Data Loading, Fusion & Evaluation
- Member 2: Collaborative Filtering (ALS)
- Member 3: Content-Based Filtering (TF-IDF + LSH)

## 1. Setup

### 1.1 Imports

In [1]:
import os
import sys
import urllib.request
import zipfile
from math import log2

# Fix for Windows
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, FloatType, StructType, StructField

### 1.2 Download Data

In [2]:
DATA_URL = "https://files.grouplens.org/datasets/movielens/ml-1m.zip"
DATA_DIR = "data"
DATASET_DIR = os.path.join(DATA_DIR, "ml-1m")
ZIP_PATH = os.path.join(DATA_DIR, "ml-1m.zip")

In [3]:
if not os.path.exists(DATA_DIR):
    os.makedirs(DATA_DIR)

In [None]:
if not os.path.exists(DATASET_DIR):

    if not os.path.exists(ZIP_PATH):
        print("Downloading MovieLens ml-1m...")
        urllib.request.urlretrieve(DATA_URL, ZIP_PATH)

    print("Extracting...")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR)

### 1.3 Spark Session

In [5]:
spark = SparkSession.builder.appName("MMDS").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/15 12:31:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### 1.4 Load Data

In [6]:
users_df = spark.read.text(os.path.join(DATASET_DIR, "users.dat")).select(
    F.split(F.col("value"), "::").getItem(0).cast(IntegerType()).alias("user_id"),
    F.split(F.col("value"), "::").getItem(1).alias("gender"),
    F.split(F.col("value"), "::").getItem(2).cast(IntegerType()).alias("age"),
    F.split(F.col("value"), "::").getItem(3).cast(IntegerType()).alias("occupation"),
    F.split(F.col("value"), "::").getItem(4).alias("zip_code")
)
users_df.count()

6040

In [7]:
items_df = spark.read.text(os.path.join(DATASET_DIR, "movies.dat")).select(
    F.split(F.col("value"), "::").getItem(0).cast(IntegerType()).alias("item_id"),
    F.split(F.col("value"), "::").getItem(1).alias("title"),
    F.split(F.col("value"), "::").getItem(2).alias("genres")
)
items_df.count()

3883

In [8]:
ratings_df = spark.read.text(os.path.join(DATASET_DIR, "ratings.dat")).select(
    F.split(F.col("value"), "::").getItem(0).cast(IntegerType()).alias("user_id"),
    F.split(F.col("value"), "::").getItem(1).cast(IntegerType()).alias("item_id"),
    F.split(F.col("value"), "::").getItem(2).cast(FloatType()).alias("rating"),
    F.split(F.col("value"), "::").getItem(3).cast(IntegerType()).alias("timestamp")
)
ratings_df.count()

1000209

In [9]:
users_df.show(5)

+-------+------+---+----------+--------+
|user_id|gender|age|occupation|zip_code|
+-------+------+---+----------+--------+
|      1|     F|  1|        10|   48067|
|      2|     M| 56|        16|   70072|
|      3|     M| 25|        15|   55117|
|      4|     M| 45|         7|   02460|
|      5|     M| 25|        20|   55455|
+-------+------+---+----------+--------+
only showing top 5 rows


In [10]:
items_df.show(5)

+-------+--------------------+--------------------+
|item_id|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Animation|Childre...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|        Comedy|Drama|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows


In [11]:
ratings_df.show(5)

+-------+-------+------+---------+
|user_id|item_id|rating|timestamp|
+-------+-------+------+---------+
|      1|   1193|   5.0|978300760|
|      1|    661|   3.0|978302109|
|      1|    914|   3.0|978301968|
|      1|   3408|   4.0|978300275|
|      1|   2355|   5.0|978824291|
+-------+-------+------+---------+
only showing top 5 rows


## 2. Exploratory Data Analysis

### 2.1 Rating matrix

In [12]:
num_users = users_df.count()
num_items = items_df.count()
num_ratings = ratings_df.count()
sparsity = (1 - (num_ratings / (num_users * num_items))) * 100

print(f"Users:            {num_users:,}")
print(f"Movies:           {num_items:,}")
print(f"Ratings:          {num_ratings:,}")
print(f"Sparsity:         {sparsity:.2f}%")
print(f"Avg ratings/user: {num_ratings/num_users:.1f}")
print(f"Avg ratings/movie:{num_ratings/num_items:.1f}")

Users:            6,040
Movies:           3,883
Ratings:          1,000,209
Sparsity:         95.74%
Avg ratings/user: 165.6
Avg ratings/movie:257.6


### 2.2 Rating distribution

In [13]:
ratings_df.groupBy("rating").count().orderBy("rating").show()



+------+------+
|rating| count|
+------+------+
|   1.0| 56174|
|   2.0|107557|
|   3.0|261197|
|   4.0|348971|
|   5.0|226310|
+------+------+



                                                                                

### 2.3 Genre distribution

In [14]:
items_df.select(F.explode(F.split(F.col("genres"), "\\|")).alias("genre")) \
    .groupBy("genre").count().orderBy(F.desc("count")).show()

+-----------+-----+
|      genre|count|
+-----------+-----+
|      Drama| 1603|
|     Comedy| 1200|
|     Action|  503|
|   Thriller|  492|
|    Romance|  471|
|     Horror|  343|
|  Adventure|  283|
|     Sci-Fi|  276|
| Children's|  251|
|      Crime|  211|
|        War|  143|
|Documentary|  127|
|    Musical|  114|
|    Mystery|  106|
|  Animation|  105|
|    Fantasy|   68|
|    Western|   68|
|  Film-Noir|   44|
+-----------+-----+



### 2.4 User gender distribution

In [15]:
users_df.groupBy("gender").count().show()

+------+-----+
|gender|count|
+------+-----+
|     F| 1709|
|     M| 4331|
+------+-----+



### 2.5 User age distribution

In [16]:
users_df.groupBy("age").count().orderBy("age").show()

+---+-----+
|age|count|
+---+-----+
|  1|  222|
| 18| 1103|
| 25| 2096|
| 35| 1193|
| 45|  550|
| 50|  496|
| 56|  380|
+---+-----+



## 3. Train/Test Split

In [17]:
train_df, test_df = ratings_df.randomSplit([0.8, 0.2], seed=42)
train_df = train_df.cache()
test_df = test_df.cache()

In [18]:
train_df.count()

                                                                                

800053

In [19]:
test_df.count()

                                                                                

200156

## 4. Collaborative Filtering (ALS)

Implement using `pyspark.ml.recommendation.ALS`

In [20]:
class CollaborativeFilter:

    def __init__(self, rank=10, regParam=0.1, maxIter=10):
        self.als = ALS(
            userCol="user_id",
            itemCol="item_id",
            ratingCol="rating",
            rank=rank,
            regParam=regParam,
            maxIter=maxIter,
            coldStartStrategy="drop",
            nonnegative=True
        )
        self.model = None

    def train(self, df):
        self.model = self.als.fit(df)

    def get_recommendations(self, df, k=10):
        """Get top-K recommendations"""
        if self.model is None:
            raise ValueError("Call train() first.")

        users = df.select("user_id").distinct()
        user_recs = self.model.recommendForUserSubset(users, k)

        return user_recs.select(
            F.col("user_id"),
            F.explode("recommendations").alias("rec")
        ).select(
            F.col("user_id"),
            F.col("rec.item_id").cast(IntegerType()).alias("item_id"),
            F.col("rec.rating").alias("prediction")
        )

    def predict(self, df):
        """Predict ratings for user-item pairs in test"""
        if self.model is None:
            raise ValueError("Train should be called first")
        return self.model.transform(df)

In [21]:
cf = CollaborativeFilter(rank=10, regParam=0.1, maxIter=10)
cf.train(train_df)

26/01/15 12:31:20 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

In [22]:
als_recs = cf.get_recommendations(test_df, k=100).withColumnRenamed("prediction", "als_score")

In [23]:
als_recs.show(10)



+-------+-------+---------+
|user_id|item_id|als_score|
+-------+-------+---------+
|      1|   3233| 4.569002|
|      1|    128|4.4839425|
|      1|    527| 4.477343|
|      1|    318| 4.457619|
|      1|   1851|4.4368587|
|      1|   3172|4.4209924|
|      1|    953|4.4157286|
|      1|    919| 4.412204|
|      1|    858| 4.402227|
|      1|   1207|4.3944917|
+-------+-------+---------+
only showing top 10 rows


                                                                                

### Bonus: Hyperparameter Tuning

In [24]:
# TODO

## 5. Content-Based Filtering (TF-IDF + LSH)

Implement using `pyspark.ml.feature` (Tokenizer, HashingTF, IDF, BucketedRandomProjectionLSH)

In [25]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, NGram, MinHashLSH
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import (
    col, split, concat_ws, regexp_extract, udf, array_intersect, 
    size, sum as _sum, desc, row_number
)

class ContentBasedFilter:
    def __init__(self, num_features=5000, bigram_features=3000, num_hash_tables=10):
        self.num_features = num_features
        self.bigram_features = bigram_features
        self.num_hash_tables = num_hash_tables
        self.jaccard_threshold = 0.1
        self.genre_weight = 4
        self.min_genre_overlap = 1
        

        self.lsh_model = None
        self.vector_model = None
        self.movies_binary = None
        self.similar_pairs = None

    def combine_binarize_udf(self):
        def process_vectors(v1, v2):
            if v1 is None: 
                return None
            indices = [int(i) for i in v1.indices]
            
            if v2 is not None:
                offset = int(v1.size)
                indices += [int(i) + offset for i in v2.indices]
                total_size = offset + int(v2.size)
            else:
                total_size = int(v1.size)
            
            values = [1.0] * len(indices)
            return Vectors.sparse(total_size, sorted(indices), values)
            
        return udf(process_vectors, VectorUDT())

    def train_features(self, items_df):
        print("building content features (tfidf + bigrams)")
        
        df = (
            items_df
            .withColumn("year", regexp_extract(col("title"), r"\((\d{4})\)", 1))
            .withColumn("decade", regexp_extract(col("title"), r"\((\d{3})\d\)", 1))
            .withColumn("genres_spaced", F.regexp_replace(col("genres"), r"\|", " "))
            .withColumn("content", concat_ws(
                " ", 
                col("title"),
                col("genres_spaced"), col("genres_spaced"), col("genres_spaced"), col("genres_spaced"),
                col("year"), col("decade")
            ))
        )

        stages = []

        stages += [
            Tokenizer(inputCol="content", outputCol="raw_words"),
            StopWordsRemover(inputCol="raw_words", outputCol="words")
        ]
        
        stages += [
            HashingTF(inputCol="words", outputCol="tf_uni", numFeatures=self.num_features),
            IDF(inputCol="tf_uni", outputCol="tfidf_uni", minDocFreq=1)
        ]
        
        stages += [
            NGram(n=2, inputCol="words", outputCol="bigrams"),
            HashingTF(inputCol="bigrams", outputCol="tf_bi", numFeatures=self.bigram_features),
            IDF(inputCol="tf_bi", outputCol="tfidf_bi", minDocFreq=1)
        ]
        
        pipeline = Pipeline(stages=stages)
        self.vector_model = pipeline.fit(df)
        features_df = self.vector_model.transform(df)
        
        combiner = self.combine_binarize_udf()
        self.movies_binary = (
            features_df
            .withColumn("binary_features", combiner("tfidf_uni", "tfidf_bi"))
            .select("item_id", "title", "genres", "binary_features")
            .cache()
        )
        
        print(f"checked {self.movies_binary.count()} movies")

    def build_lsh_index(self):
        print("indexing with minhash LSH")
        
        mh = MinHashLSH(
            inputCol="binary_features", 
            outputCol="hashes", 
            numHashTables=self.num_hash_tables,
            seed=42
        )
        self.lsh_model = mh.fit(self.movies_binary)
        
        dist_threshold = 1.0 - self.jaccard_threshold
        
        print(f"cmputing similarity graph")
        raw_pairs = self.lsh_model.approxSimilarityJoin(
            self.movies_binary, self.movies_binary, 
            threshold=dist_threshold, 
            distCol="jaccard_dist"
        )
        
        pairs = raw_pairs.select(
            col("datasetA.item_id").alias("item_a"),
            col("datasetB.item_id").alias("item_b"),
            (1.0 - col("jaccard_dist")).alias("similarity")
        ).filter("item_a != item_b")
        
        self.similar_pairs = pairs.cache()
        print(f"indexed {self.similar_pairs.count()} similar item pairs")

    def recommend_for_users(self, train_df, items_df, k=10):
        print("generating content-based recommendations")
        
        user_history = train_df.filter(col("rating") >= 4.0).select(
            col("user_id"), col("item_id").alias("seed_item"), col("rating")
        )

        candidates = user_history.join(
            self.similar_pairs,
            user_history.seed_item == self.similar_pairs.item_a
        )
        
        genre_df = items_df.select("item_id", split(col("genres"), r"\|").alias("genres_arr"))        
        
        candidates_enriched = (
            candidates.alias("c")
            .join(
                genre_df.alias("seed_g"), 
                col("c.seed_item") == col("seed_g.item_id")
            )
            .join(
                genre_df.alias("cand_g"), 
                col("c.item_b") == col("cand_g.item_id")
            )
            .select(
                "c.user_id", 
                col("c.item_b").alias("candidate_item"),
                "c.rating", 
                "c.similarity",
                size(array_intersect(
                    col("seed_g.genres_arr"), 
                    col("cand_g.genres_arr")
                )).alias("genre_overlap")
            )
        )

        filtered = candidates_enriched.filter(col("genre_overlap") >= self.min_genre_overlap)

        scored = filtered.withColumn(
            "score", 
            col("rating") * col("similarity") * (1.0 + 0.25 * col("genre_overlap"))
        )

        recs = scored.groupBy("user_id", "candidate_item").agg(_sum("score").alias("content_score"))
        
        seen_items = train_df.select("user_id", "item_id").distinct().alias("seen")
        recs_alias = recs.alias("recs")
        
        final_recs = recs_alias.join(
            seen_items,
            (col("recs.user_id") == col("seen.user_id")) & 
            (col("recs.candidate_item") == col("seen.item_id")),
            "left_anti"
        ).select(
            col("recs.user_id"), 
            col("recs.candidate_item").alias("item_id"), 
            col("recs.content_score")
        )

        window = Window.partitionBy("user_id").orderBy(desc("content_score"))
        return (
            final_recs
            .withColumn("rank", row_number().over(window))
            .filter(col("rank") <= k)
            .drop("rank")
        )

cb_filter = ContentBasedFilter()
cb_filter.train_features(items_df)
cb_filter.build_lsh_index()


building content features (tfidf + bigrams)


                                                                                

checked 3883 movies
indexing with minhash LSH
cmputing similarity graph




indexed 988212 similar item pairs


                                                                                

In [26]:
content_recs = cb_filter.recommend_for_users(train_df, items_df, k=100)

generating content-based recommendations


In [27]:
content_recs = content_recs.cache()
content_recs.show(5)



+-------+-------+-----------------+
|user_id|item_id|    content_score|
+-------+-------+-----------------+
|    148|   1591| 89.0484413983273|
|    148|   1744|85.95443869216115|
|    148|   1626|81.63936115826036|
|    148|   2058|73.81954842988361|
|    148|   2916|72.60978868408795|
+-------+-------+-----------------+
only showing top 5 rows


                                                                                

## 6.Fusion & Evaluation

In [28]:
ALPHA = 0.7
K = 10
RELEVANCE_THRESHOLD = 4.0

### 6.1 Normalization

In [29]:
def normalize(df, col_name):
    stats = df.agg(F.min(col_name).alias("min"), F.max(col_name).alias("max")).collect()[0]
    if stats["max"] == stats["min"]:
        return df.withColumn(col_name + "_norm", F.lit(0.5))
    return df.withColumn(col_name + "_norm", (F.col(col_name) - stats["min"]) / (stats["max"] - stats["min"]))

### 6.2 Hybrid Fusion

In [30]:
als_norm = normalize(als_recs, "als_score")
content_norm = normalize(content_recs, "content_score")

hybrid_recs = als_norm.select("user_id", "item_id", "als_score_norm") \
    .join(content_norm.select("user_id", "item_id", "content_score_norm"), ["user_id", "item_id"], "full_outer") \
    .fillna(0) \
    .withColumn("final_score", ALPHA * F.col("als_score_norm") + (1 - ALPHA) * F.col("content_score_norm"))

                                                                                

In [31]:
hybrid_recs.orderBy(F.desc("final_score")).show(10)

[Stage 343:>                                                        (0 + 4) / 4]

+-------+-------+------------------+------------------+------------------+
|user_id|item_id|    als_score_norm|content_score_norm|       final_score|
+-------+-------+------------------+------------------+------------------+
|   4028|   1796|               1.0|               0.0|               0.7|
|   4277|     53|0.6681412660360792|0.7317264247269524|0.6872168136433411|
|   4169|     53|0.6548254695380753|0.7582726855607065|0.6858596343448646|
|   2155|    108|0.9795053197530404|               0.0|0.6856537238271283|
|   5328|   1796|0.9702602974655985|               0.0|0.6791822082259189|
|   1445|   1138|0.9614800392212809|               0.0|0.6730360274548965|
|   4110|   2192|0.9608376664849905|               0.0|0.6725863665394933|
|   1445|   2962|0.9584632469582222|               0.0|0.6709242728707555|
|   4703|   1796|0.9517324311833929|               0.0| 0.666212701828375|
|   4277|   2624|0.6255933786116243| 0.754186712875991|0.6641713788909344|
+-------+-------+--------

                                                                                

In [32]:
hybrid_recs.orderBy(F.desc("final_score")).filter((F.col("als_score_norm") > 0.2) & (F.col("content_score_norm") > 0.2)).show(10)

                                                                                

+-------+-------+------------------+-------------------+------------------+
|user_id|item_id|    als_score_norm| content_score_norm|       final_score|
+-------+-------+------------------+-------------------+------------------+
|   4277|     53|0.6681412660360792| 0.7317264247269524|0.6872168136433411|
|   4169|     53|0.6548254695380753| 0.7582726855607065|0.6858596343448646|
|   4277|   2624|0.6255933786116243|  0.754186712875991|0.6641713788909344|
|     53|     53|0.7598887207001292|0.35001256829402444|0.6369258749782978|
|   1698|     53|0.7094904438537074| 0.4297322213632261| 0.625562977106563|
|   4169|   2503|0.6107187996310084| 0.6442511305292286|0.6207784989004744|
|    195|     53|0.6920743994723191| 0.4047206792058041|0.6058682833923645|
|   1680|     53|0.6438432894186257|  0.513718058524496|0.6048057201503868|
|   3902|    733| 0.766619280922882|0.20011356009613945|0.5966675646748592|
|   1448|     53|0.6440464533194415| 0.4829970743570718|0.5957316396307306|
+-------+---

In [33]:
als_pairs = als_recs.select("user_id", "item_id").distinct()
content_pairs = content_recs.select("user_id", "item_id").distinct()

overlap = als_pairs.intersect(content_pairs).count()
print(f"als pairs: {als_pairs.count()}")
print(f"content pairs: {content_pairs.count()}")
print(f"overlap: {overlap}")

                                                                                

als pairs: 603800




content pairs: 603709
overlap: 11312


                                                                                

### 6.3 Ground Truth

In [34]:
ground_truth = test_df.filter(F.col("rating") >= RELEVANCE_THRESHOLD) \
    .groupBy("user_id").agg(F.collect_list("item_id").alias("relevant_items"))

### Evaluation Functions

In [35]:
def get_top_k(recs_df, score_col, k):
    window = Window.partitionBy("user_id").orderBy(F.desc(score_col))
    return recs_df.withColumn("rank", F.row_number().over(window)) \
        .filter(F.col("rank") <= k) \
        .groupBy("user_id").agg(F.collect_list("item_id").alias("recommended_items"))

In [36]:
def precision_at_k(top_k_df, ground_truth_df, k):
    joined = top_k_df.join(ground_truth_df, "user_id")
    result = joined.withColumn("hits", F.size(F.array_intersect("recommended_items", "relevant_items"))) \
        .agg(F.avg(F.col("hits") / k)).collect()[0][0]
    return result or 0.0

In [37]:
def recall_at_k(top_k_df, ground_truth_df):
    joined = top_k_df.join(ground_truth_df, "user_id")
    result = joined.withColumn("hits", F.size(F.array_intersect("recommended_items", "relevant_items"))) \
        .withColumn("recall", F.when(F.size("relevant_items") > 0, F.col("hits") / F.size("relevant_items")).otherwise(0)) \
        .agg(F.avg("recall")).collect()[0][0]
    return result or 0.0

In [38]:
def ndcg_at_k(top_k_df, ground_truth_df, k):
    joined = top_k_df.join(ground_truth_df, "user_id")
    exploded = joined.select(
        "user_id",
        "relevant_items",
        F.posexplode("recommended_items").alias("pos", "item_id")
    ).withColumn("item_id", F.col("item_id").cast(IntegerType()))

    with_dcg = exploded \
        .withColumn("rel", F.when(F.array_contains("relevant_items", F.col("item_id")), 1.0).otherwise(0.0)) \
        .withColumn("dcg", F.col("rel") / F.log2(F.col("pos") + 2)) \
        .groupBy("user_id", "relevant_items").agg(F.sum("dcg").alias("dcg"))

    idcg_vals = [sum(1.0 / log2(i + 2) for i in range(n)) for n in range(k + 1)]
    idcg_map = F.create_map(*[x for i, v in enumerate(idcg_vals) for x in (F.lit(i), F.lit(v))])

    result = with_dcg \
        .withColumn("num_rel", F.least(F.size("relevant_items"), F.lit(k))) \
        .withColumn("idcg", idcg_map[F.col("num_rel")]) \
        .withColumn("ndcg", F.when(F.col("idcg") > 0, F.col("dcg") / F.col("idcg")).otherwise(0)) \
        .agg(F.avg("ndcg")).collect()[0][0]
    return result or 0.0

In [39]:
def evaluate(recs_df, score_col, name):
    if recs_df.count() == 0:
        print(f"{name}: No recommendations (not implemented)")
        return {"Precision@10": 0.0, "Recall@10": 0.0, "NDCG@10": 0.0}

    top_k = get_top_k(recs_df, score_col, K)
    p = precision_at_k(top_k, ground_truth, K)
    r = recall_at_k(top_k, ground_truth)
    n = ndcg_at_k(top_k, ground_truth, K)

    print(f"{name}: P@{K}={p:.4f}, R@{K}={r:.4f}, NDCG@{K}={n:.4f}")
    return {"Precision@10": p, "Recall@10": r, "NDCG@10": n}

### Evaluation

In [40]:
als_metrics = evaluate(als_recs, "als_score", "ALS")



ALS: P@10=0.0265, R@10=0.0194, NDCG@10=0.0249


                                                                                

In [41]:
content_metrics = evaluate(content_recs, "content_score", "Content-Based")



Content-Based: P@10=0.0280, R@10=0.0217, NDCG@10=0.0319


                                                                                

In [42]:
hybrid_metrics = evaluate(hybrid_recs, "final_score", "Hybrid")



Hybrid: P@10=0.0352, R@10=0.0240, NDCG@10=0.0359


                                                                                

### Bonus: GBT Re-Ranking

In [43]:
# TODO

## Results Summary

In [44]:
summary = [
    ("ALS", als_metrics["Precision@10"], als_metrics["Recall@10"], als_metrics["NDCG@10"]),
    ("Content-Based", content_metrics["Precision@10"], content_metrics["Recall@10"], content_metrics["NDCG@10"]),
    ("Hybrid", hybrid_metrics["Precision@10"], hybrid_metrics["Recall@10"], hybrid_metrics["NDCG@10"]),
]
spark.createDataFrame(summary, ["Model", "Precision@10", "Recall@10", "NDCG@10"]).show()

[Stage 1757:>                                                       (0 + 1) / 1]

+-------------+-------------------+--------------------+--------------------+
|        Model|       Precision@10|           Recall@10|             NDCG@10|
+-------------+-------------------+--------------------+--------------------+
|          ALS|0.02651287194918031|0.019449604815379978|0.024926569370517664|
|Content-Based|0.02800066867268473|0.021737730303375884| 0.03185693701620872|
|       Hybrid|0.03523905048478774|0.024030692726811353| 0.03587917595862495|
+-------------+-------------------+--------------------+--------------------+



                                                                                