In [137]:
# import pyspark
from pyspark.sql import SparkSession , DataFrame
from pyspark.ml.linalg import SparseVector , VectorUDT
from pyspark.sql.functions import col, collect_list, struct , udf , avg
from pyspark.sql.types import FloatType
import collections
import os

In [138]:
# Point PySpark (workers and driver) at the project's virtualenv interpreter.
# The path uses the Windows venv layout and is relative to the working
# directory, so it is applied only when that interpreter actually exists;
# on other machines the variables are left as they are and this cell no
# longer needs to be commented out by hand.
_venv_python = '.venv/Scripts/python.exe'
if os.path.isfile(_venv_python):
    os.environ['PYSPARK_PYTHON'] = _venv_python
    os.environ['PYSPARK_DRIVER_PYTHON'] = _venv_python
# Show which JDK the environment advertises (None when JAVA_HOME is unset).
print(os.environ.get("JAVA_HOME"))

C:\Program Files\Eclipse Adoptium\jdk-11.0.27.6-hotspot\


In [139]:
# Create the notebook's Spark session, or reuse one that is already running.
spark: SparkSession = (
    SparkSession.builder
    .appName("CF movielens")
    .getOrCreate()
)

In [140]:
# Read the ratings CSV (header row, inferred column types) and discard the
# timestamp column right away.
file_path = "ml-latest-small/ml-latest-small/ratings.csv"
ratings_df = (
    spark.read
    .csv(file_path, header=True, inferSchema=True)
    .drop("timestamp")
)
print("Number of ratings:", ratings_df.count())
ratings_df.show(n=5)

Number of ratings: 100836
+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
+------+-------+------+
only showing top 5 rows



In [141]:
# 90% / 10% train/test split; the fixed seed keeps the split repeatable.
train_df, test_df = ratings_df.randomSplit(weights=[0.9, 0.1], seed=42)
print("Number of training ratings:", train_df.count())

Number of training ratings: 90673


In [142]:
# Global mean rating (mu) over the training set.
avg_rating = train_df.agg(avg("rating")).first()[0]
print("Average rating in training set:", avg_rating)

# Per-user deviation: (mean rating given by the user) - mu
rating_deviation_of_user = (
    train_df
    .groupBy("userId")
    .agg((avg("rating") - avg_rating).alias("rating_deviation_user"))
)
rating_deviation_of_user.show(5)

# Per-movie deviation: (mean rating received by the movie) - mu
rating_deviation_of_movie = (
    train_df
    .groupBy("movieId")
    .agg((avg("rating") - avg_rating).alias("rating_deviation_movie"))
)
rating_deviation_of_movie.show(5)

# Pull both deviation tables to the driver as plain {id: deviation} dicts so
# they can be looked up from Python code.
rating_deviation_of_user_dict = {
    row["userId"]: row["rating_deviation_user"]
    for row in rating_deviation_of_user.collect()
}
rating_deviation_of_movie_dict = {
    row["movieId"]: row["rating_deviation_movie"]
    for row in rating_deviation_of_movie.collect()
}
@udf(returnType=FloatType())
def calculate_baseline_rating(userId, movieId):
    """Baseline estimate: global mean + user deviation + movie deviation.

    A user or movie that is missing from its lookup dictionary contributes
    a deviation of 0.0. The UDF is declared with a FloatType return.
    """
    # The terms are added in a fixed order: mean, then user, then movie.
    return (
        avg_rating
        + rating_deviation_of_user_dict.get(userId, 0.0)
        + rating_deviation_of_movie_dict.get(movieId, 0.0)
    )

# Attach the baseline estimate to every training row, then preview it.
baseline_rating_args = (col("userId"), col("movieId"))
train_df = train_df.withColumn("baseline_rating", calculate_baseline_rating(*baseline_rating_args))
del baseline_rating_args  # helper tuple only; keep the notebook namespace unchanged
train_df.select("userId", "movieId", "baseline_rating").show(n=5)


Average rating in training set: 3.503325135376573
+------+---------------------+
|userId|rating_deviation_user|
+------+---------------------+
|   148|  0.23417486462342696|
|   463|   0.3932265887613582|
|   471|   0.4057657737143363|
|   496| -0.15147328352472123|
|   243|    0.610960578909141|
+------+---------------------+
only showing top 5 rows

+-------+----------------------+
|movieId|rating_deviation_movie|
+-------+----------------------+
|   1580|  -0.02144107740555823|
|   2366|   0.13667486462342726|
|   3175|   0.08363138636255751|
|  32460|    0.7466748646234271|
|   1238|    0.5522304201789825|
+-------+----------------------+
only showing top 5 rows

+------+-------+---------------+
|userId|movieId|baseline_rating|
+------+-------+---------------+
|     1|      1|      4.7871666|
|     1|      3|      4.1031566|
|     1|      6|       4.805284|
|     1|     47|      4.8449597|
|     1|     50|      5.1087117|
+------+-------+---------------+
only showing top 5 rows



In [143]:
# Preview the first rows of the held-out split.
test_df.show(n=5)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    101|   5.0|
|     1|    151|   5.0|
|     1|    943|   4.0|
|     1|   1031|   5.0|
|     1|   1220|   5.0|
+------+-------+------+
only showing top 5 rows



In [144]:
# calculate RMSE
def calculate_rmse(predictions):
    """Root-mean-square error between actual and predicted ratings.

    Args:
        predictions: Spark DataFrame that has a ``rating`` column (the actual
            value) and a ``predicted_rating`` column.

    Returns:
        The square root of the average squared difference between the two
        columns.

    Raises:
        ValueError: if the average squared error comes back as ``None``,
            which is what the aggregate yields when there is nothing to
            average (for example an empty DataFrame).
    """
    squared_error = (col("rating") - col("predicted_rating")) ** 2
    mean_squared_error = predictions.agg(avg(squared_error)).first()[0]
    if mean_squared_error is None:
        # Without this guard the failure would be an opaque
        # "unsupported operand" TypeError from `None ** 0.5`.
        raise ValueError(
            "Cannot compute RMSE: no rows with both 'rating' and 'predicted_rating'"
        )
    return mean_squared_error ** 0.5

# Score the held-out ratings with the baseline model and report its RMSE.
predicted_ratings_df = test_df.withColumn("predicted_rating", calculate_baseline_rating(col("userId"), col("movieId")))
rmse = calculate_rmse(predicted_ratings_df)
print("RMSE of baseline model:", rmse)

RMSE of baseline model: 0.9062214757366772


In [145]:
# Normalized rating = observed rating minus the baseline estimate; only the
# ids and the normalized value are kept.
train_norm_df = train_df.select(
    "userId",
    "movieId",
    (col("rating") - col("baseline_rating")).alias("normalized_rating"),
)
train_norm_df.show(n=5)


+------+-------+--------------------+
|userId|movieId|   normalized_rating|
+------+-------+--------------------+
|     1|      1| -0.7871665954589844|
|     1|      3|-0.10315656661987305|
|     1|      6| -0.8052840232849121|
|     1|     47| 0.15504026412963867|
|     1|     50|-0.10871171951293945|
+------+-------+--------------------+
only showing top 5 rows



In [146]:
# Largest userId in the training data; the sparse vectors are sized from it.
last_user_id = train_df.agg({"userId": "max"}).first()[0]

@udf(returnType=VectorUDT())
def build_sparse_vector(ratings):
    """Turn a list of (index, value) entries into a SparseVector.

    Element 0 of each entry is used as the vector index and element 1 as the
    value. Entries are sorted first so the indices are in ascending order.
    The vector has ``last_user_id + 1`` slots, so an id can be used directly
    as an index.
    """
    ordered_pairs = sorted((entry[0], entry[1]) for entry in ratings)
    # dict keeps insertion order; if an index repeats, the last value wins.
    value_by_index = dict(ordered_pairs)
    return SparseVector(
        last_user_id + 1,
        list(value_by_index),
        list(value_by_index.values()),
    )

# One row per movie: gather its (userId, rating) pairs and pack them into a
# sparse vector indexed by userId.
# NOTE(review): the vectors are built from the raw `rating` column of
# train_df, not from the normalized ratings in train_norm_df — confirm this
# is intended.
sparse_vector_df = (
    train_df
    .groupBy("movieId")
    .agg(collect_list(struct("userId", "rating")).alias("ratings"))
    .select("movieId", build_sparse_vector(col("ratings")).alias("ratings_vector"))
)
sparse_vector_df.show()
sparse_vector_df.count()

+-------+--------------------+
|movieId|      ratings_vector|
+-------+--------------------+
|      1|(611,[1,5,7,17,18...|
|      2|(611,[6,8,18,19,2...|
|      3|(611,[1,6,19,32,4...|
|      4|(611,[6,14,84,262...|
|      5|(611,[6,31,43,45,...|
|      6|(611,[1,6,11,18,2...|
|      7|(611,[6,14,19,31,...|
|      8|(611,[6,20,43,274...|
|      9|(611,[151,179,217...|
|     10|(611,[6,8,11,19,2...|
|     11|(611,[6,8,33,35,3...|
|     12|(611,[19,44,120,1...|
|     13|(611,[6,19,20,288...|
|     14|(611,[90,109,182,...|
|     15|(611,[6,19,93,136...|
|     16|(611,[6,18,28,42,...|
|     17|(611,[6,31,33,38,...|
|     18|(611,[44,66,95,10...|
|     19|(611,[6,14,21,40,...|
|     20|(611,[78,199,217,...|
+-------+--------------------+
only showing top 20 rows



9358

In [147]:
# cosine similarity function
def cosine_similarity(vec1: SparseVector, vec2: SparseVector) -> float:
    """
    Cosine of the angle between two vectors: dot(vec1, vec2) / (|vec1| * |vec2|).

    Returns None when either argument is None, and 0.0 when either vector has
    a zero L2 norm. Otherwise a value close to 1 means the vectors point in
    nearly the same direction and a value close to 0 means they are nearly
    orthogonal.
    """
    if vec1 is None:
        return None
    if vec2 is None:
        return None

    inner_product = vec1.dot(vec2) # type: ignore
    left_norm, right_norm = vec1.norm(2), vec2.norm(2)

    # A zero-length vector has no direction; report "no similarity" rather
    # than dividing by zero.
    if left_norm == 0 or right_norm == 0:
        return 0.0
    return inner_product / (left_norm * right_norm)
