In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml.linalg import SparseVector, Vectors
from pyspark.sql.functions import col, desc, sum as spark_sum, avg as spark_avg
import pyspark.sql.functions as F
import pyspark.sql.types as T

# Load dataset

In [4]:
spark = SparkSession.builder.getOrCreate()

In [11]:
import os

path = "/content/drive/MyDrive/MMDS/Endterm/Question3/ratings2k.csv"

if os.path.exists(path):
    print("Path exists!")
else:
    print("Path does not exist.")

Path exists!


In [12]:
df = spark.read.csv(path, header=True, inferSchema=True)

In [14]:
df.show(5)

+-----+----+----+------+
|index|user|item|rating|
+-----+----+----+------+
|    0|  73|  52|   4.0|
|    1|  36| 239|   3.0|
|    2|  72|  26|   1.0|
|    3|  59| 430|   2.5|
|    4|  72| 284|   3.0|
+-----+----+----+------+
only showing top 5 rows



In [15]:
INDEX_COL="index"
df = df.drop(INDEX_COL)

Number of Users and Number of Items

In [18]:
N_users = df.select('user').distinct().count()
N_items = df.select('item').distinct().count()

In [41]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, collect_list, struct, lit, explode
from pyspark.mllib.linalg import SparseVector
from pyspark.ml.linalg import Vectors
import numpy as np

class CollaborativeFiltering:
    def __init__(self,
                N: int, # N: Number of similar users to consider
                dataset: DataFrame # dataset: PySpark DataFrame
                        # contains the dataset (user | item | rating)
    ):
        self.N = N
        self.dataset = dataset

    def _pearson_correlation(self, vec1, vec2):
        """
        Calculate Pearson's correlation coefficient between two sparse vectors.

        Params:
            vec1: SparseVector for user 1
            vec2: SparseVector for user 2

        Return:
            Pearson's correlation coefficient
        """

        # Check that 2 different users have common items
        common_items = set(vec1.indices).intersection(set(vec2.indices))
        if not common_items:
            return 0

        ratings1 = np.array([vec1.values[np.where(vec1.indices == item)[0][0]]
                                            for item in common_items])

        ratings2 = np.array([vec2.values[np.where(vec2.indices == item)[0][0]]
                                            for item in common_items])

        mean1 = np.mean(ratings1)
        mean2 = np.mean(ratings2)

        numerator = np.sum((ratings1 - mean1) * (ratings2 - mean2))
        denominator = np.sqrt(np.sum((ratings1 - mean1) ** 2)
                                    * np.sum((ratings2 - mean2) ** 2))

        return numerator / denominator if denominator != 0 else 0

    def predict(self, user_input: SparseVector, num_recommendations: int):
        """
        Recommend products to a user based on collaborative filtering.

        Params:
            SparseVector containing ratings of the active user
            num_recommendations: Number of products to recommend
        Return:
            PySpark DataFrame containing the recommended products and scores
        """
        # Transform dataset into a user-item matrix (sparse vectors for each user)
        user_item_matrix = (
            self.dataset.groupBy("user")
            .agg(
                collect_list(struct("item", "rating")).alias("ratings")
            )
        )

        # Convert user input SparseVector to dict {key=item_id, value=item_value}
        sparse_user_input = {user_input.indices[i]: user_input.values[i]
                               for i in range(len(user_input.indices))}

        pearson_correlation = self._pearson_correlation
        def compute_similarity(sparse_user_input, row):
            other_user_ratings = {r['item']: r['rating'] for r in row['ratings']}
            similarity = pearson_correlation(sparse_user_input,
                                        other_user_ratings)
            return row['user'], similarity

        similarities_df = (
            user_item_matrix.rdd
            .map(lambda row: compute_similarity(sparse_user_input, row))
            .toDF(["user", "similarity"])
            .filter(col("similarity") > 0)
            .orderBy(col("similarity").desc())
            .limit(self.N)
        )

        # Get similar users
        similar_users = similarities_df.select("user", "similarity")

        # Get ratings from similar users and calculate weighted ratings
        weighted_ratings = (
            self.dataset.join(similar_users, "user")
            .withColumn("weighted_rating", col("rating") * col("similarity"))
        )

        # Aggregate and recommend items not already rated by the active user
        recommendations = (
            weighted_ratings.groupBy("item")
            .agg(F.expr("sum(weighted_rating) as score"))
            .filter(~col("item").isin(list(sparse_user_input.keys())))
            .orderBy(col("score").desc())
            .limit(num_recommendations)
        )

        return recommendations

# Testing

In [42]:
recommender = CollaborativeFiltering(N=10, dataset=df)

user_vector = Vectors.sparse(
    N_items,
    [(0, 5.0), (2, 4.0)]
)

recommendations = recommender.predict(user_vector, num_recommendations=5)

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pyspark/serializers.py", line 459, in dumps
    return cloudpickle.dumps(obj, pickle_protocol)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 73, in dumps
    cp.dump(obj)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/cloudpickle/cloudpickle_fast.py", line 632, in dump
    return Pickler.dump(self, obj)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/context.py", line 466, in __getnewargs__
    raise PySparkRuntimeError(
pyspark.errors.exceptions.base.PySparkRuntimeError: [CONTEXT_ONLY_VALID_ON_DRIVER] It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.


PicklingError: Could not serialize object: PySparkRuntimeError: [CONTEXT_ONLY_VALID_ON_DRIVER] It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.

In [None]:
recommendations.show()