In [1]:
from pyspark.sql import SparkSession

# Most-rated movies: count the rating rows per movie and show the 20 movies
# with the highest counts. This ranks by rating volume, not rating quality.
spark = SparkSession.builder.appName("TopRatedMovies").getOrCreate()

rating_file_path = "gs://dataprocbucket_new/rating.csv"

# header=True takes column names from the first row of the file;
# inferSchema=True lets Spark derive the column types from the data.
ratings_df = spark.read.csv(rating_file_path, header=True, inferSchema=True)

# One row per movieID; the generated "count" column holds the number of rating rows.
movie_rating_counts = ratings_df.groupBy("movieID").count()

# Sort by count (descending) and break ties on movieID (ascending). Without a
# secondary key, rows with equal counts have no defined order, so the set of
# rows kept by limit(20) could differ between runs.
top_rated_movies = movie_rating_counts.orderBy(
    ["count", "movieID"], ascending=[False, True]
).limit(20)
top_rated_movies.show()

# Shut the session down; a later cell must call getOrCreate() again.
spark.stop()

+-------+-----+
|movieID|count|
+-------+-----+
|    356|81491|
|    318|81482|
|    296|79672|
|    593|74127|
|   2571|72674|
|    260|68717|
|    480|64144|
|    527|60411|
|    110|59184|
|   2959|58773|
|    589|57379|
|   1196|57361|
|      1|57309|
|   4993|55736|
|     50|55366|
|   1210|54917|
|   1198|54675|
|   2858|53689|
|    858|52498|
|   5952|51138|
+-------+-----+



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, round

# Best-rated movies: mean rating per movie, restricted to movies with enough
# ratings for the mean to be meaningful, top entries shown.
MIN_RATINGS = 20  # a movie needs at least this many ratings to be ranked
TOP_N = 20        # number of movies to display

spark = SparkSession.builder.appName("TopRatedMovies").getOrCreate()

rating_file_path = "gs://dataprocbucket_new/rating.csv"
ratings_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(rating_file_path)
)

# Per-movie aggregates. "avg_rating" is rounded to 2 decimals for display;
# "avg_rating_exact" keeps the full-precision mean so ranking is not distorted
# by rounding (many movies collapse to the same 2-decimal value).
movie_avg_rating = ratings_df.groupBy("movieID").agg(
    round(avg("rating"), 2).alias("avg_rating"),
    count("*").alias("num_ratings"),
    avg("rating").alias("avg_rating_exact"),
)

# Rank on the unrounded mean. Ties are broken by number of ratings (more first)
# and then by movieID, so the order and the limit() cutoff are deterministic.
top_rated_movies = (
    movie_avg_rating
    .filter(col("num_ratings") >= MIN_RATINGS)
    .orderBy(
        col("avg_rating_exact").desc(),
        col("num_ratings").desc(),
        col("movieID").asc(),
    )
    .limit(TOP_N)
)

# Show only the display columns; the exact mean was needed for ordering only.
top_rated_movies.select("movieID", "num_ratings", "avg_rating").show()

# Shut the session down; a later cell must call getOrCreate() again.
spark.stop()

+-------+-----------+----------+
|movieID|num_ratings|avg_rating|
+-------+-----------+----------+
| 171011|       1124|      4.48|
| 159817|       1747|      4.46|
|    318|      81482|      4.41|
| 170705|       1356|       4.4|
| 171495|        277|      4.33|
|    858|      52498|      4.32|
| 179135|        659|      4.29|
|     50|      55366|      4.28|
| 174551|         36|      4.28|
| 198185|        288|      4.27|
|   1221|      34188|      4.26|
| 163809|        546|      4.26|
| 176601|        456|      4.26|
| 142115|        564|      4.25|
|   2019|      13367|      4.25|
|    527|      60411|      4.25|
| 147250|        147|      4.25|
|   1203|      16569|      4.24|
| 159819|        229|      4.24|
| 147124|         29|      4.24|
+-------+-----------+----------+

