In [1]:
# Importing necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Create the Spark session used by every cell below, or reuse one that is
# already active in this kernel (getOrCreate makes re-running this cell safe).
spark = (
    SparkSession.builder
    .appName("Movie Ratings Analysis")
    .getOrCreate()
)

In [None]:
# Read the ratings CSV into a Spark DataFrame.
#   header=True      -> the first line of the file supplies the column names
#   inferSchema=True -> Spark scans the data to pick column types
# Later cells rely on the columns `userId`, `movieId` and `rating`.
csv_reader = spark.read.options(header=True, inferSchema=True)
ratings_data = csv_reader.csv("ratings.csv")

# Preview the first five rows as a sanity check on the load.
ratings_data.show(n=5)

In [None]:
# Minimum number of ratings a movie needs to be treated as "popular".
min_num_ratings = 50

# Both per-movie aggregates share the same grouping, so build it once.
ratings_by_movie = ratings_data.groupBy("movieId")

# Mean rating per movie -> columns: movieId, avg_rating
average_ratings_data = ratings_by_movie.agg(
    avg("rating").alias("avg_rating")
)

# Number of ratings per movie -> columns: movieId, num_ratings
ratings_count_data = ratings_by_movie.agg(
    count("rating").alias("num_ratings")
)

# Keep only the movies that reach the popularity threshold.
popular_movies_data = ratings_count_data.filter(
    col("num_ratings") >= min_num_ratings
)

In [None]:
# Attach the average rating to every popular movie. The join is an inner join
# on movieId, so the result has the columns movieId, num_ratings, avg_rating.
popular_movie_stats_data = popular_movies_data.join(average_ratings_data, on="movieId")

# Display the 10 best-rated popular movies.
# Sorting on avg_rating alone leaves the order of tied rows unspecified, so the
# ten rows shown could change from run to run. Ties are therefore broken by the
# number of ratings (more first) and finally by movieId, which makes the output
# reproducible.
popular_movie_stats_data.orderBy(
    col("avg_rating").desc(),
    col("num_ratings").desc(),
    col("movieId").asc(),
).show(10)

In [None]:
# Mean rating given by each user -> columns: userId, user_avg_rating
user_avg_ratings_data = ratings_data.groupBy("userId").agg(
    avg("rating").alias("user_avg_rating")
)

# Display the 10 users with the highest average rating.
# Several users can share the same average, and sorting on the average alone
# leaves the order of those ties unspecified. userId is added as a secondary
# sort key so the ten rows shown are the same on every run.
user_avg_ratings_data.orderBy(
    col("user_avg_rating").desc(),
    col("userId").asc(),
).show(10)

In [None]:
# Count how many times each distinct rating value occurs
# (columns: rating, count) ...
rating_occurrences = ratings_data.groupBy("rating").count()

# ... and order the rows by rating value, ascending, so that the table and the
# bar chart built from it later read from lowest to highest rating.
rating_distribution_data = rating_occurrences.orderBy("rating")

# Print the full distribution table.
rating_distribution_data.show()

In [None]:
# Rank users by how many ratings they submitted -> columns: userId, num_ratings.
# Users with the same num_ratings would otherwise come back in an unspecified
# order, so userId is used as a tie-breaker to keep the ranking reproducible.
top_users_data = (
    ratings_data.groupBy("userId")
    .agg(count("rating").alias("num_ratings"))
    .orderBy(col("num_ratings").desc(), col("userId").asc())
)

# Display the 10 most active users.
top_users_data.show(10)

In [None]:
# Bring the (small) aggregated distribution to the driver as a pandas frame;
# only one row per distinct rating value is collected.
rating_distribution_pd = rating_distribution_data.toPandas()

# Apply the seaborn theme before the figure is created so it takes effect.
sns.set_theme(style="whitegrid")

# Draw the bar chart on an explicit figure/axes pair.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; left unchanged here to keep the rendered colours identical.
fig, ax = plt.subplots(figsize=(10, 6))
bar_plot = sns.barplot(
    data=rating_distribution_pd,
    x='rating',
    y='count',
    palette='ocean',
    edgecolor='black',
    ax=ax,
)

# Annotate each bar with its count. seaborn places the bars at integer
# positions 0..n-1 in row order, so the row position is the x coordinate.
for bar_position, rating_count in enumerate(rating_distribution_pd['count']):
    bar_plot.text(
        bar_position,
        rating_count,
        f'{rating_count:,}',
        ha='center',
        va='bottom',
        fontsize=12,
        color='black',
    )

# Axis labels, title and tick label sizes.
ax.set_xlabel('Rating', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.set_title('Distribution of Movie Ratings', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Fit everything inside the figure and render it.
fig.tight_layout()
plt.show()

In [None]:
# Stop the Spark session created at the top of the notebook now that the
# analysis is finished. Run this cell last: cells that use `spark` or the
# DataFrames derived from it are expected to fail afterwards until the
# session cell is executed again.
spark.stop()