In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=ec9e4e25bad3349243e7d62b58e926128bbbf0c5470f3d1f2f2c0d28c6c8987a
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

In [3]:
from pyspark import SparkContext
sc = spark.sparkContext

In [41]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [49]:
df_ratings = spark.read. \
option("header","True"). \
option("inferSchema","True"). \
csv("ratings.csv")

df_ratings_reduced = df_ratings.select("userId","movieId","rating")

df = df_ratings_reduced.withColumn("movieId_int", F.col("movieId").cast("int"))

joined = df.alias('df1').withColumnRenamed('userId', 'user1').join(df.alias('df2'), on=['movieId_int'], how='left') \
                        .filter(F.col('user1') < F.col('df2.userId'))

final_df = joined.groupBy('user1', 'df2.userId').agg(F.count('*').alias('same_movie_count'), 
                                                     F.round(F.sqrt(F.abs(F.sum(F.col('df1.rating')-F.col('df2.rating')))),2).alias('rating_diff_sum'))

final_df.show(10)

+-----+------+----------------+---------------+
|user1|userId|same_movie_count|rating_diff_sum|
+-----+------+----------------+---------------+
|    1|   587|              25|            2.0|
|    2|   582|               9|           2.35|
|    4|   599|             128|            5.2|
|    7|   475|              42|           7.38|
|    9|   564|               3|            0.0|
|   12|   589|               1|           0.71|
|   16|   584|               6|           2.24|
|   18|   552|              47|            3.0|
|   21|   600|             110|           7.07|
|   26|   594|               7|           2.74|
+-----+------+----------------+---------------+
only showing top 10 rows



In [50]:
final_df = final_df.withColumn("similarities", F.round((final_df.same_movie_count / final_df.rating_diff_sum),2))
final_df.show(5)

+-----+------+----------------+---------------+------------+
|user1|userId|same_movie_count|rating_diff_sum|similarities|
+-----+------+----------------+---------------+------------+
|    1|   587|              25|            2.0|        12.5|
|    2|   582|               9|           2.35|        3.83|
|    4|   599|             128|            5.2|       24.62|
|    7|   475|              42|           7.38|        5.69|
|    9|   564|               3|            0.0|        null|
+-----+------+----------------+---------------+------------+
only showing top 5 rows



In [51]:
final_df = final_df.na.fill(value=0,subset=["similarities"])

In [52]:
w = Window.partitionBy('user1').orderBy(F.desc('similarities'))
top10similarities = final_df.select('*', F.rank().over(w).alias('rank')).filter(F.col('rank') <= 10).drop('rank')
top10similarities = top10similarities.sort(top10similarities["user1"].asc(),top10similarities["similarities"].desc())
top10similarities.show(15)

+-----+------+----------------+---------------+------------+
|user1|userId|same_movie_count|rating_diff_sum|similarities|
+-----+------+----------------+---------------+------------+
|    1|   573|              66|            1.0|        66.0|
|    1|   171|              32|            1.0|        32.0|
|    1|   339|              34|           1.22|       27.87|
|    1|   380|             126|           4.69|       26.87|
|    1|   610|              69|            3.0|        23.0|
|    1|   532|              23|            1.0|        23.0|
|    1|   201|              39|           1.73|       22.54|
|    1|    45|             100|           4.58|       21.83|
|    1|    62|              29|           1.41|       20.57|
|    1|   382|              29|           1.41|       20.57|
|    2|   247|              15|           0.71|       21.13|
|    2|   212|              14|           0.71|       19.72|
|    2|   111|              17|            1.0|        17.0|
|    2|   274|          