In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.8/310.8 MB 2.3 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=0827af016a62a15c712b9c84d24c05bd940d61952f96e12b23f8d9922c239242
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
from pyspark.sql import SparkSession
# Create the session (or reuse one that already exists) against the
# "local[*]" master, then echo it so the notebook renders its summary.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
spark

In [3]:
from pyspark import SparkContext
# Keep a direct handle on the SparkContext exposed by the `spark` session.
sc = spark.sparkContext

In [4]:
from pyspark.sql import functions as F

In [5]:
# Read tags.csv, treating the first row as the header and letting Spark
# infer each column's type; then preview the first three rows.
df_tags = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .csv("tags.csv")
)
df_tags.show(3)

+------+-------+---------------+----------+
|userId|movieId|            tag| timestamp|
+------+-------+---------------+----------+
|     2|  60756|          funny|1445714994|
|     2|  60756|Highly quotable|1445714996|
|     2|  60756|   will ferrell|1445714992|
+------+-------+---------------+----------+
only showing top 3 rows



In [6]:
# Print the column names and types of the tags DataFrame loaded with inferSchema.
df_tags.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: integer (nullable = true)



In [9]:
# Reduce the tags frame to the tag text plus a composite "userId - movieId"
# string key (`to_join`) that is later used to match tags with ratings.
df_tags1 = df_tags.select(
    "tag",
    F.concat(F.col("userId"), F.lit(" - "), F.col("movieId")).alias("to_join"),
)
df_tags1.show(3)

+---------------+---------+
|            tag|  to_join|
+---------------+---------+
|          funny|2 - 60756|
|Highly quotable|2 - 60756|
|   will ferrell|2 - 60756|
+---------------+---------+
only showing top 3 rows



In [10]:
# Read ratings.csv with a header row and inferred column types, then
# preview the first three rows.
df_ratings = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .csv("ratings.csv")
)
df_ratings.show(3)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
|     1|      6|   4.0|964982224|
+------+-------+------+---------+
only showing top 3 rows



In [11]:
# Reduce the ratings frame to the composite "userId - movieId" string key
# (`to_join`, same format as in df_tags1) plus the rating value.
df_ratings1 = df_ratings.select(
    F.concat(F.col("userId"), F.lit(" - "), F.col("movieId")).alias("to_join"),
    "rating",
)
df_ratings1.show(3)

+-------+------+
|to_join|rating|
+-------+------+
|  1 - 1|   4.0|
|  1 - 3|   4.0|
|  1 - 6|   4.0|
+-------+------+
only showing top 3 rows



In [12]:
# Full outer join of tags and ratings on the composite key: rows present on
# only one side are kept, with nulls in the columns from the other side.
joinedDF = df_tags1.join(
    df_ratings1,
    on="to_join",
    how="outer",
)
joinedDF.show(5)

+--------+----+------+
| to_join| tag|rating|
+--------+----+------+
|   1 - 1|null|   4.0|
|1 - 1009|null|   3.0|
| 1 - 101|null|   5.0|
|1 - 1023|null|   5.0|
|1 - 1024|null|   5.0|
+--------+----+------+
only showing top 5 rows



In [13]:
# Discard every row that has a null in any column, i.e. keep only
# (user, movie) pairs that carry both a tag and a rating.
joinedDF_no_null = joinedDF.dropna()
joinedDF_no_null.show(10)

+------------+--------------------+------+
|     to_join|                 tag|rating|
+------------+--------------------+------+
|   103 - 260|                EPIC|   4.0|
|   103 - 260|    great soundtrack|   4.0|
|   103 - 296|       good dialogue|   5.0|
|   103 - 296|    great soundtrack|   5.0|
|   103 - 296|          non-linear|   5.0|
|106 - 106489|           adventure|   5.0|
|  106 - 4896|Everything you wa...|   5.0|
|   112 - 260|      classic sci-fi|   5.0|
|   112 - 260|engrossing adventure|   5.0|
|   112 - 260|                EPIC|   5.0|
+------------+--------------------+------+
only showing top 10 rows



In [17]:
# Lower-case the tag text so differently-cased spellings (e.g. "EPIC")
# land in the same group.
joinedDF_no_null = joinedDF_no_null.withColumn("tag", F.lower(F.col("tag")))

# Per tag: mean rating and number of occurrences. The aggregate columns keep
# Spark's generated names, "avg(rating)" and "count(tag)".
group_df = (
    joinedDF_no_null
    .groupBy("tag")
    .agg(F.mean("rating"), F.count("tag"))
)

# Highest mean rating first; ties are broken by the larger tag count.
sorted_df = group_df.orderBy(
    group_df["avg(rating)"].desc(),
    group_df["count(tag)"].desc(),
)
sorted_df.show(30)

+--------------------+-----------+----------+
|                 tag|avg(rating)|count(tag)|
+--------------------+-----------+----------+
|        great acting|        5.0|         3|
|               drama|        5.0|         3|
|          irreverent|        5.0|         3|
|     highly quotable|        5.0|         3|
|        space action|        5.0|         2|
|        cult classic|        5.0|         2|
|          very funny|        5.0|         2|
|            morality|        5.0|         2|
|         nick hornby|        5.0|         2|
|      classic sci-fi|        5.0|         2|
|          characters|        5.0|         2|
|          surrealism|        5.0|         2|
|         wall street|        5.0|         2|
|psychological thr...|        5.0|         2|
|      great dialogue|        5.0|         2|
|           offensive|        5.0|         2|
|         complicated|        5.0|         2|
|           satirical|        5.0|         2|
|    heroic bloodshed|        5.0|

In [18]:
# Show only tags used more than 10 times, so the ranking is not dominated by
# tags whose average comes from a handful of ratings.
sorted_df.where(sorted_df["count(tag)"] > 10).show(n=20)

+-----------------+------------------+----------+
|              tag|       avg(rating)|count(tag)|
+-----------------+------------------+----------+
|           satire| 4.708333333333333|        12|
|         mindfuck| 4.666666666666667|        15|
|     black comedy| 4.607142857142857|        14|
|     twist ending|             4.525|        20|
|     imdb top 250|               4.5|        11|
|      dark comedy|              4.45|        20|
|       psychology|             4.425|        20|
|    psychological| 4.409090909090909|        11|
|       disturbing| 4.384615384615385|        13|
|             dark|             4.375|        16|
|      atmospheric| 4.280487804878049|        41|
|         suspense|             4.275|        20|
|          surreal| 4.270833333333333|        24|
|           quirky|4.2272727272727275|        22|
|thought-provoking| 4.181818181818182|        22|
|            anime| 4.166666666666667|        12|
|           action| 4.142857142857143|        14|
