In [1]:
import os
# Find the latest version of spark 3.x  from https://downloads.apache.org/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.5.5'
spark_version = 'spark-3.5.5'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [69.9 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,675 kB]
Get:13 https://developer.download.nvidia.com/compute/cuda/repos/ub

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import json

In [3]:
# Initialize Spark session
spark = SparkSession.builder.appName("MoviesData").getOrCreate()

# Load the JSON file into a PySpark DataFrame
df = spark.read.json("movie_results.json")  # Replace with your JSON file path

# Show the structure of the JSON to understand the data
df.show(5)

+-----+--------------------+---------------------+---------+-----+--------------------+--------------------+-------+----------+--------------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+----------+-------+--------------------+--------+--------------------+--------------------+-----+------------+----------+
|adult|       backdrop_path|belongs_to_collection|   budget|error|              genres|            homepage|     id|   imdb_id|origin_country|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date|   revenue|runtime|    spoken_languages|  status|             tagline|               title|video|vote_average|vote_count|
+-----+--------------------+---------------------+---------+-----+--------------------+--------------------+-------+----------+--------------+-----------------+------------------

In [4]:
# Now join with ratings DataFrame (assuming 'ratings.csv' exists)
ratings_df = spark.read.csv('ratings.csv', header=True, inferSchema=True)
links_df = spark.read.csv('links.csv', header=True, inferSchema=True)

# Join df with links_df on the 'id' column of df and 'tmdbId' in links_df
df_with_movieId = df.join(links_df, df.id == links_df.tmdbId, how="inner")

# Join df_with_movieId with ratings_df on 'movieId'
final_df = df_with_movieId.join(ratings_df, df_with_movieId.movieId == ratings_df.movieId, how="inner")

# Create a new DataFrame containing only userId, id (tmdbId), and rating
user_rating_df = final_df.select("userId", "id","title", "rating")

# Show the new DataFrame
user_rating_df.show(5)


+------+---+--------------------+------+
|userId| id|               title|rating|
+------+---+--------------------+------+
|   610| 85|Raiders of the Lo...|   5.0|
|   606| 85|Raiders of the Lo...|   3.5|
|   603| 85|Raiders of the Lo...|   4.0|
|   601| 85|Raiders of the Lo...|   4.0|
|   600| 85|Raiders of the Lo...|   4.0|
+------+---+--------------------+------+
only showing top 5 rows



In [5]:
from pyspark.sql.types import IntegerType, FloatType

# Cast columns to correct types
user_rating_df = user_rating_df \
    .withColumn("userId", user_rating_df["userId"].cast(IntegerType())) \
    .withColumn("id", user_rating_df["Id"].cast(IntegerType())) \
    .withColumn("rating", user_rating_df["rating"].cast(FloatType()))

# Show the transformed DataFrame
user_rating_df.show(5)

+------+---+--------------------+------+
|userId| id|               title|rating|
+------+---+--------------------+------+
|   610| 85|Raiders of the Lo...|   5.0|
|   606| 85|Raiders of the Lo...|   3.5|
|   603| 85|Raiders of the Lo...|   4.0|
|   601| 85|Raiders of the Lo...|   4.0|
|   600| 85|Raiders of the Lo...|   4.0|
+------+---+--------------------+------+
only showing top 5 rows



In [6]:
# Split the data into training and test sets (80% train, 20% test)
(training_data, test_data) = user_rating_df.randomSplit([0.8, 0.2], seed=42)

# Show counts to verify split
print(f"Training data count: {training_data.count()}")
print(f"Test data count: {test_data.count()}")


Training data count: 59527
Test data count: 15029


In [7]:
# Initialize ALS model
als = ALS(
    maxIter=10,         # Maximum number of iterations
    regParam=0.1,       # Regularization parameter
    rank=10,            # Rank of the factorization (latent factors)
    userCol="userId",   # Column name for user
    itemCol="id",   # Column name for item
    ratingCol="rating", # Column name for rating
    coldStartStrategy="drop"  # Drop rows with NaN predictions
)

# Train the ALS model on the training data
model = als.fit(training_data)


In [8]:
# Make predictions on the test data
predictions = model.transform(test_data)

# Show the first few predictions
predictions.show(5)

+------+-----+--------------------+------+----------+
|userId|   id|               title|rating|prediction|
+------+-----+--------------------+------+----------+
|    42| 1645|      A Time to Kill|   5.0| 3.7372363|
|    62|  496|Borat: Cultural L...|   4.0| 3.7295632|
|    63|  496|Borat: Cultural L...|   3.0| 3.3783438|
|    63|99861|Avengers: Age of ...|   4.0| 2.9949172|
|    87| 7340|              Carrie|   3.5| 3.2054973|
+------+-----+--------------------+------+----------+
only showing top 5 rows



In [9]:
from pyspark.ml.evaluation import RegressionEvaluator

# Initialize the evaluator for different metrics
rmse_evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
r2_evaluator = RegressionEvaluator(metricName="r2", labelCol="rating", predictionCol="prediction")
mae_evaluator = RegressionEvaluator(metricName="mae", labelCol="rating", predictionCol="prediction")

# Calculate RMSE, R2, and MAE on the test data
rmse = rmse_evaluator.evaluate(predictions)
r2 = r2_evaluator.evaluate(predictions)
mae = mae_evaluator.evaluate(predictions)

# Print the evaluation metrics
print(f"Root Mean Squared Error (RMSE) = {rmse}")
print(f"R-squared (R2) = {r2}")
print(f"Mean Absolute Error (MAE) = {mae}")


Root Mean Squared Error (RMSE) = 0.8513211907740307
R-squared (R2) = 0.3232850925430484
Mean Absolute Error (MAE) = 0.6579991852370197


In [10]:
from pyspark.sql import functions as F
# Generate recommendations for a specific user
user_recommendations = model.recommendForUserSubset(training_data.filter(training_data.userId == 1), 10)


# Flatten the recommendations array
user_recommendations_flat = user_recommendations.withColumn("movie", F.explode("recommendations"))

# Extract id and rating from the struct (instead of movieId, we now use id)
user_recommendations_flat = user_recommendations_flat.withColumn("movieId", user_recommendations_flat.movie.getItem("id")) \
                                                     .withColumn("rating", user_recommendations_flat.movie.getItem("rating")) \
                                                     .drop("movie")

# Rename the 'userId' column in user_rating_df to avoid ambiguity
user_rating_df = user_rating_df.withColumnRenamed("userId", "rating_userId")

# Rename 'rating' column in user_recommendations_flat to avoid ambiguity
user_recommendations_flat = user_recommendations_flat.withColumnRenamed("rating", "predicted_rating")

# Perform the join as usual
recommendations_with_titles = user_recommendations_flat.join(user_rating_df,
                                                            user_recommendations_flat.movieId == user_rating_df.id,
                                                            how="inner")

# Select the relevant columns: userId, title, and the predicted_rating column
recommendations_with_titles = recommendations_with_titles.select("userId", "title", "predicted_rating")



#  Get distinct titles per user
distinct_recommendations = recommendations_with_titles.distinct()

distinct_recommendations.show(5)

+------+--------------------+----------------+
|userId|               title|predicted_rating|
+------+--------------------+----------------+
|     1|Hunt for the Wild...|        5.199287|
|     1|           The Trial|        5.218118|
|     1|Mamma Mia! Here W...|       5.2895665|
|     1|      Enter the Void|       5.4748116|
|     1|         Logan Lucky|       5.2895665|
+------+--------------------+----------------+
only showing top 5 rows

