In [31]:
# import pyspark
from pyspark.sql import SparkSession , DataFrame
from pyspark.ml.feature import BucketedRandomProjectionLSH
from pyspark.ml.linalg import SparseVector , VectorUDT , Vectors
from pyspark.sql.functions import col, collect_list, struct , udf , avg
from pyspark.sql.types import FloatType , ArrayType, StructType, StructField , IntegerType
import collections
import os
import time

In [2]:
# I need this to run comment this code if you don't need it
os.environ['PYSPARK_PYTHON'] = '.venv/Scripts/python.exe'
os.environ['PYSPARK_DRIVER_PYTHON'] = '.venv/Scripts/python.exe'
print(os.environ.get("JAVA_HOME"))

C:\Program Files\Eclipse Adoptium\jdk-11.0.27.6-hotspot\


In [3]:
spark: SparkSession = SparkSession.builder \
    .appName("CF movielens") \
    .getOrCreate()

In [4]:
file_path = "ml-latest-small/ml-latest-small/ratings.csv"
ratings_df = spark.read.csv(file_path, header=True, inferSchema=True) \
    .drop("timestamp")
print("Number of ratings:", ratings_df.count())
ratings_df.show(5)

Number of ratings: 100836
+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
|     1|     47|   5.0|
|     1|     50|   5.0|
+------+-------+------+
only showing top 5 rows



In [5]:
train_df, test_df = ratings_df.randomSplit([0.9, 0.1], seed=42)
print("Number of training ratings:", train_df.count())

Number of training ratings: 90673


In [6]:
avg_rating = train_df.agg({"rating": "avg"}).collect()[0][0]
print("Average rating in training set:", avg_rating)
# (avg rating of user x ) - μ
rating_deviation_of_user = train_df.groupBy("userId") \
    .agg(avg("rating").alias("avg_rating")) \
    .withColumn("rating_deviation_user", col("avg_rating") - avg_rating) \
    .select("userId", "rating_deviation_user")
rating_deviation_of_user.show(5)
rating_deviation_of_movie = train_df.groupBy("movieId") \
    .agg(avg("rating").alias("avg_rating")) \
    .withColumn("rating_deviation_movie", col("avg_rating") - avg_rating) \
    .select("movieId", "rating_deviation_movie")
rating_deviation_of_movie.show(5)

rating_deviation_of_user_dict = rating_deviation_of_user.rdd \
    .map(lambda row: (row.userId, row.rating_deviation_user)) \
    .collectAsMap()
rating_deviation_of_movie_dict = rating_deviation_of_movie.rdd \
    .map(lambda row: (row.movieId, row.rating_deviation_movie)) \
    .collectAsMap()

def calculate_baseline_rating(userId, movieId):
    user_deviation = rating_deviation_of_user_dict.get(userId, 0.0)
    movie_deviation = rating_deviation_of_movie_dict.get(movieId, 0.0)
    return avg_rating + user_deviation + movie_deviation

calculate_baseline_rating_udf = udf(calculate_baseline_rating, FloatType())

train_df = train_df.withColumn(
    "baseline_rating",
    calculate_baseline_rating_udf(col("userId"), col("movieId"))
)
train_df.select("userId", "movieId", "baseline_rating").show(5)


Average rating in training set: 3.503325135376573
+------+---------------------+
|userId|rating_deviation_user|
+------+---------------------+
|   148|  0.23417486462342696|
|   463|   0.3932265887613582|
|   471|   0.4057657737143363|
|   496| -0.15147328352472123|
|   243|    0.610960578909141|
+------+---------------------+
only showing top 5 rows

+-------+----------------------+
|movieId|rating_deviation_movie|
+-------+----------------------+
|   1580|  -0.02144107740555823|
|   2366|   0.13667486462342726|
|   3175|   0.08363138636255751|
|  32460|    0.7466748646234271|
|   1238|    0.5522304201789825|
+-------+----------------------+
only showing top 5 rows

+------+-------+---------------+
|userId|movieId|baseline_rating|
+------+-------+---------------+
|     1|      1|      4.7871666|
|     1|      3|      4.1031566|
|     1|      6|       4.805284|
|     1|     47|      4.8449597|
|     1|     50|      5.1087117|
+------+-------+---------------+
only showing top 5 rows



In [7]:
test_df.show(5)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    101|   5.0|
|     1|    151|   5.0|
|     1|    943|   4.0|
|     1|   1031|   5.0|
|     1|   1220|   5.0|
+------+-------+------+
only showing top 5 rows



In [8]:
# calculate RMSE
def calculate_rmse(predictions):
    predictions = predictions.withColumnRenamed("rating", "actual_rating")
    predictions = predictions.withColumnRenamed("predicted_rating", "predicted_rating")
    rmse = predictions.withColumn(
        "squared_error",
        (col("actual_rating") - col("predicted_rating")) ** 2
    ).agg({"squared_error": "avg"}).collect()[0][0] ** 0.5
    return rmse

predicted_ratings_df = test_df.withColumn(
    "predicted_rating",
    calculate_baseline_rating_udf(col("userId"), col("movieId"))
)
rmse = calculate_rmse(predicted_ratings_df)
print("RMSE of baseline model:", rmse)

RMSE of baseline model: 0.9062214757366772


In [9]:
train_norm_df = train_df.withColumn(
    "rating",
    col("rating") - col("baseline_rating")
).withColumnRenamed("rating", "normalized_rating") \
.select("userId", "movieId", "normalized_rating")
train_norm_df.show(5)


+------+-------+--------------------+
|userId|movieId|   normalized_rating|
+------+-------+--------------------+
|     1|      1| -0.7871665954589844|
|     1|      3|-0.10315656661987305|
|     1|      6| -0.8052840232849121|
|     1|     47| 0.15504026412963867|
|     1|     50|-0.10871171951293945|
+------+-------+--------------------+
only showing top 5 rows



In [10]:
length_user = train_norm_df.select("userId").distinct().count() +1

@udf(returnType=VectorUDT())
def build_sparse_vector(ratings):
    user_rating_map = collections.OrderedDict(sorted([(r[0], r[1]) for r in ratings]))

    user_ids = list(user_rating_map.keys())
    values = list(user_rating_map.values())

    return Vectors.sparse(length_user, user_ids, values)

sparse_vector_df = train_norm_df.groupBy("movieId") \
    .agg(collect_list(struct("userId", "normalized_rating")).alias("ratings")) \
    .select("movieId", build_sparse_vector(col("ratings")).alias("ratings_vector"))
sparse_vector_df.show()
sparse_vector_df.count()

+-------+--------------------+
|movieId|      ratings_vector|
+-------+--------------------+
|      1|(611,[1,5,7,17,18...|
|      2|(611,[6,8,18,19,2...|
|      3|(611,[1,6,19,32,4...|
|      4|(611,[6,14,84,262...|
|      5|(611,[6,31,43,45,...|
|      6|(611,[1,6,11,18,2...|
|      7|(611,[6,14,19,31,...|
|      8|(611,[6,20,43,274...|
|      9|(611,[151,179,217...|
|     10|(611,[6,8,11,19,2...|
|     11|(611,[6,8,33,35,3...|
|     12|(611,[19,44,120,1...|
|     13|(611,[6,19,20,288...|
|     14|(611,[90,109,182,...|
|     15|(611,[6,19,93,136...|
|     16|(611,[6,18,28,42,...|
|     17|(611,[6,31,33,38,...|
|     18|(611,[44,66,95,10...|
|     19|(611,[6,14,21,40,...|
|     20|(611,[78,199,217,...|
+-------+--------------------+
only showing top 20 rows



9358

In [11]:
# cosine similarity function
def cosine_similarity(vec1: SparseVector, vec2: SparseVector) -> float:
    """
    Calculate the cosine similarity between two SparseVectors.
    Values near 1 indicate high similarity, while values near 0 indicate low similarity.
    """
    if vec1 is None or vec2 is None:
        return None
    dot_product = vec1.dot(vec2) # type: ignore
    norm1 = vec1.norm(2)
    norm2 = vec2.norm(2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot_product / (norm1 * norm2)

v1 = sparse_vector_df.select("ratings_vector").collect()[0][0]
v2 = sparse_vector_df.select("ratings_vector").collect()[2][0]
print("First two vectors:", v1, v2)
similarity = cosine_similarity(v1, v2)
print("Cosine similarity between first two vectors:", similarity)

First two vectors: (611,[1,5,7,17,18,19,21,27,31,32,33,40,43,44,45,46,50,54,57,63,64,66,68,71,73,78,82,86,89,91,93,96,98,103,107,112,119,121,130,132,134,135,137,140,141,144,145,151,153,155,156,159,160,161,166,167,169,171,177,178,179,182,185,186,191,193,201,202,206,213,214,216,217,223,226,232,233,239,240,247,249,252,263,264,266,270,273,274,275,276,277,279,280,288,290,291,292,293,298,304,307,314,323,328,330,332,334,336,337,339,341,347,350,353,357,359,364,367,372,373,378,380,381,382,385,389,391,396,399,411,412,420,422,432,436,438,448,451,453,456,460,462,468,469,470,471,474,476,477,480,483,484,488,490,492,500,504,509,514,517,522,524,525,528,529,533,534,541,544,550,555,559,560,561,562,567,570,572,573,579,580,584,587,590,596,597,599,600,601,603,604,605,606,607,608,609,610],[-0.7871665954589844,-0.04973268508911133,0.849456787109375,-0.13986873626708984,-0.6643929481506348,0.9539303779602051,-0.1831974983215332,-0.9882071018218994,0.6734814643859863,-1.1497859954833984,-1.1918792724609375,0.7

In [12]:
sparse_vector_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- ratings_vector: vector (nullable = true)



In [48]:
NORMALIZE = True
BUCKET_LENGTH = 0.1
NUM_HASH_TABLES: int  = 20
DISTANCE_THRESHOLD = 0.25

@udf(returnType=VectorUDT())
def normalize_vector(vector: SparseVector):
    norm = vector.norm(2)
    if norm == 0:
        return 
    return Vectors.sparse(
        vector.size,
        vector.indices,
        vector.values / norm
    )
    
if NORMALIZE:
    df = sparse_vector_df.withColumn(
        "features",
        normalize_vector(col("ratings_vector"))
    )
else:
    df = sparse_vector_df.withColumnRenamed("ratings_vector", "features")

brp = BucketedRandomProjectionLSH(
    inputCol="features",
    outputCol="hashes",
    bucketLength=BUCKET_LENGTH,
    numHashTables=NUM_HASH_TABLES
)

model = brp.fit(df)
transformed_df = model.transform(df)
transformed_df.show(5)

+-------+--------------------+--------------------+--------------------+
|movieId|      ratings_vector|            features|              hashes|
+-------+--------------------+--------------------+--------------------+
|      1|(611,[1,5,7,17,18...|(611,[1,5,7,17,18...|[[-1.0], [-1.0], ...|
|      2|(611,[6,8,18,19,2...|(611,[6,8,18,19,2...|[[-1.0], [-1.0], ...|
|      3|(611,[1,6,19,32,4...|(611,[1,6,19,32,4...|[[-1.0], [-1.0], ...|
|      4|(611,[6,14,84,262...|(611,[6,14,84,262...|[[-1.0], [-1.0], ...|
|      5|(611,[6,31,43,45,...|(611,[6,31,43,45,...|[[-1.0], [-1.0], ...|
+-------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [49]:
def find_similar_movies(movie_vector_q: SparseVector):
    """
    Find movies similar to the given movie_id based on the LSH model.
    Returns a list of tuples (movieId, similarity_score).
    """
    similar_movies = model.approxNearestNeighbors(
        transformed_df,
        movie_vector_q,
        numNearestNeighbors=10,
        distCol="distance",
    ).filter(
        col("movieId") != movie_id
    ).withColumn(
        "distance",
        1-col("distance") ** 2 / 2 # since A and B are normalized -> ∣∣A−B∣∣^2=2−2cos(θ) <=> cos(θ) = 1 - ∣∣A−B∣∣^2/2
    ).filter(
        col("distance") > DISTANCE_THRESHOLD
    ).select("movieId", "distance")
    return similar_movies.collect()
# Example usage
movie_id = 1  # Replace with the movieId you want to find similar movies for
movie_vector = transformed_df.filter(col("movieId") == movie_id).select("features").first()[0]
t = time.time()
similar_movies = find_similar_movies(movie_vector)
print(f"Similar movies to movieId {movie_id} (took {time.time()-t:.2f}s):")
for movie, score in similar_movies:
    print(f"MovieId: {movie}, Similarity Score: {score}")

Similar movies to movieId 1 (took 2.43s):
MovieId: 3114, Similarity Score: 0.28742060925799084


In [50]:
def approximate_rating(user_id: int, movie_id: int) -> float:
    """
    Approximate the rating for a user and movie using the LSH model.
    Returns the average rating of similar movies weighted by similarity.
    """
    movie_vector = transformed_df.filter(col("movieId") == movie_id).select("features").first()[0]
    similar_movies = find_similar_movies(movie_vector)
    if not similar_movies:
        return None
    
    total_weighted_rating = 0.0
    total_similarity = 0.0
    
    for sim_movie_id, similarity in similar_movies:
        rating = train_norm_df.filter(
            (col("userId") == user_id) & (col("movieId") == sim_movie_id)
        ).select("normalized_rating").first()
        
        if rating is not None:
            total_weighted_rating += rating[0] * similarity
            total_similarity += similarity
            
    if total_similarity == 0:
        return calculate_baseline_rating(user_id, movie_id)
    
    return total_weighted_rating / total_similarity + calculate_baseline_rating(user_id, movie_id)

# Example usage
user_id = 1  # Replace with the userId you want to approximate the rating for
movie_id = 101  # Replace with the movieId you want to approximate the rating for
approx_rating = approximate_rating(user_id, movie_id)
print(f"Approximate rating for userId {user_id} and movieId {movie_id}: {approx_rating} (actual rating: 5.0)")

Approximate rating for userId 1 and movieId 101: 4.778156346104909 (actual rating: 5.0)


In [None]:
DISTANCE_THRESHOLD_EUCLIDIAN = (2-2*DISTANCE_THRESHOLD)**0.5
near_df = model.approxSimilarityJoin(
    transformed_df.select("movieId", "features").limit(1000),
    transformed_df.select("movieId", "features").limit(10000),
    DISTANCE_THRESHOLD_EUCLIDIAN,
    distCol="distance"
).withColumn(
    "distance",
    1 - col("distance") ** 2 / 2  # since A and B are normalized -> ∣∣A−B∣∣^2=2−2cos(θ) <=> cos(θ) = 1 - ∣∣A−B∣∣^2/2
).filter(
    col("datasetA.movieId") != col("datasetB.movieId")
).select(
    col("datasetA.movieId").alias("movieId_A"),
    col("datasetB.movieId").alias("movieId_B"),
    "distance"
)
print(near_df.count())