In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=601da725972c9009a76e76f02ff69c5fb70e60d8b025ccf310564364376e2e66
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


## Analysis starts here

In [None]:
from pyspark.sql import SparkSession

# Create the Spark session (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName('Movie Recommendation System')
    .getOrCreate()
)

# Read the ratings CSV: the first row supplies column names, column types are inferred.
data = spark.read.csv('/content/rating.csv', header=True, inferSchema=True)

# Preview the first five rows.
data.show(n=5)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      2|   3.5|2005-04-02 23:53:47|
|     1|     29|   3.5|2005-04-02 23:31:16|
|     1|     32|   3.5|2005-04-02 23:33:39|
|     1|     47|   3.5|2005-04-02 23:32:07|
|     1|     50|   3.5|2005-04-02 23:29:40|
+------+-------+------+-------------------+
only showing top 5 rows



In [None]:
# Summary statistics (count, mean, stddev, min, max) of the ratings data.
rating_stats = data.describe()
rating_stats.show()

+-------+------------------+------------------+-----------------+
|summary|            userId|           movieId|           rating|
+-------+------------------+------------------+-----------------+
|  count|             97833|             97832|            97832|
|   mean|355.48587899788413|  8543.65226101889|3.509276105977594|
| stddev|192.62263141528499|19001.182096633605|1.061531765466636|
|    min|                 1|                 1|              0.5|
|    max|               685|            128488|              5.0|
+-------+------------------+------------------+-----------------+



In [None]:
# Inspect the inferred column names and types of the ratings data.
data.printSchema()

# Discard every row that contains a null in any column; the cleaned frame is what
# later gets joined with the movie metadata.
ratings_df = data.na.drop()


root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [None]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Hold out 30% of the ratings for evaluation; the seed is fixed for repeatability.
# Split the null-free `ratings_df` rather than the raw `data`: describe() reports
# fewer movieId/rating values than userId values, so the raw frame contains at least
# one incomplete row, and ALS needs non-null ids and ratings.
train, test = ratings_df.randomSplit([0.7, 0.3], seed = 42)

# Fit an ALS matrix-factorisation model on the training ratings.
# coldStartStrategy = "drop" removes rows whose prediction is NaN (user or movie not
# seen during training) so they cannot turn the RMSE into NaN.
als = ALS(maxIter = 5, regParam = 0.01, userCol = "userId", itemCol = "movieId", ratingCol = "rating", coldStartStrategy = "drop")
model = als.fit(train)

# Score the held-out ratings.
predictions = model.transform(test)

# RMSE between the actual `rating` and the model's `prediction` on the held-out set.
evaluator = RegressionEvaluator(metricName = "rmse", labelCol="rating", predictionCol = "prediction")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE): " + str(rmse))

Root Mean Squared Error (RMSE): 1.1537641880849052


In [None]:
# Top-10 recommended movies for each user, taken from the fitted ALS model.
user_recs = model.recommendForAllUsers(numItems=10)
user_recs.show(n=20)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{66934, 5.582471...|
|     2|[{2924, 7.167936}...|
|     3|[{48877, 7.07324}...|
|     4|[{2083, 7.635319}...|
|     5|[{4866, 11.025259...|
|     6|[{326, 9.224367},...|
|     7|[{2583, 5.5500984...|
|     8|[{3844, 8.260239}...|
|     9|[{34437, 8.222774...|
|    10|[{1211, 6.8058424...|
|    11|[{36525, 7.130010...|
|    12|[{942, 6.866213},...|
|    13|[{1176, 9.729571}...|
|    14|[{8973, 6.3701262...|
|    15|[{2937, 5.9242325...|
|    16|[{2583, 8.831705}...|
|    17|[{46970, 11.14433...|
|    18|[{27803, 5.183718...|
|    19|[{1211, 7.078035}...|
|    20|[{48322, 6.739762...|
+------+--------------------+
only showing top 20 rows



In [None]:
# Print the recommendation list of a single user.
user_id = 3  # Replace with the desired user ID

# Bring the (at most one) matching row of `user_recs` back to the driver.
recommendations = user_recs.filter(user_recs.userId == user_id).collect()

if not recommendations:
    print(f"No recommendations found for user {user_id}.")
else:
    print(f"Recommendations for user {user_id}:")
    # Each entry carries the recommended movieId and its predicted rating.
    top_movies = recommendations[0]['recommendations']
    for rec in top_movies:
        print(f"Movie ID: {rec['movieId']}, Predicted Rating: {rec['rating']}")

Recommendations for user 3:
Movie ID: 48877, Predicted Rating: 7.073239803314209
Movie ID: 66934, Predicted Rating: 6.661294460296631
Movie ID: 7371, Predicted Rating: 6.524487018585205
Movie ID: 56251, Predicted Rating: 6.485762596130371
Movie ID: 1934, Predicted Rating: 6.451491355895996
Movie ID: 36525, Predicted Rating: 6.446295261383057
Movie ID: 334, Predicted Rating: 6.422379493713379
Movie ID: 6270, Predicted Rating: 6.325499534606934
Movie ID: 5291, Predicted Rating: 6.304076194763184
Movie ID: 2068, Predicted Rating: 6.299541473388672


In [None]:
# Read the movie metadata CSV: header row present, column types inferred.
movies = spark.read.csv('/content/movie.csv', header=True, inferSchema=True)

# Preview the first five movies.
movies.show(n=5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [None]:
# Attach movie metadata to each cleaned rating by matching on movieId.
ratings_with_movies = ratings_df.join(movies, on='movieId', how='inner')
ratings_with_movies.show(n=5)

+-------+------+------+-------------------+--------------------+--------------------+
|movieId|userId|rating|          timestamp|               title|              genres|
+-------+------+------+-------------------+--------------------+--------------------+
|      2|     1|   3.5|2005-04-02 23:53:47|      Jumanji (1995)|Adventure|Childre...|
|     29|     1|   3.5|2005-04-02 23:31:16|City of Lost Chil...|Adventure|Drama|F...|
|     32|     1|   3.5|2005-04-02 23:33:39|Twelve Monkeys (a...|Mystery|Sci-Fi|Th...|
|     47|     1|   3.5|2005-04-02 23:32:07|Seven (a.k.a. Se7...|    Mystery|Thriller|
|     50|     1|   3.5|2005-04-02 23:29:40|Usual Suspects, T...|Crime|Mystery|Thr...|
+-------+------+------+-------------------+--------------------+--------------------+
only showing top 5 rows



In [None]:
# Selection criteria: an exact title, or a genre name appearing in the genres string.
movie_title = "Toy Story (1995)"
genre = "Adventure"

# Keep movies that satisfy either criterion.
title_matches = movies.title == movie_title
genre_matches = movies.genres.contains(genre)
selected_movies = movies.filter(title_matches | genre_matches)
selected_movies.show()

# Gather the movieId of every selected movie into a Python list on the driver.
selected_movie_ids = []
for selected_row in selected_movies.collect():
    selected_movie_ids.append(selected_row['movieId'])


+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      8| Tom and Huck (1995)|  Adventure|Children|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     13|        Balto (1995)|Adventure|Animati...|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     29|City of Lost Chil...|Adventure|Drama|F...|
|     33|Wings of Courage ...|Adventure|Romance...|
|     44|Mortal Kombat (1995)|Action|Adventure|...|
|     53|     Lamerica (1994)|     Adventure|Drama|
|     56|Kids of the Round...|Adventure|Childre...|
|     60|Indian in the Cup...|Adventure|Childre...|
|     86| White Squall (1996)|Action|Adventure|...|
|     95| Broken Arrow (1996)|Action|Adventure|...|
|    101|Bottle Rocket (1996)|Adventure|Comedy|...|
|    107|Muppet Treasure I...|Adventure|Childre...|
|    112|Rum

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|     1|[{4103, 5.249239}...|
|     2|[{41569, 6.626610...|
|     3|[{2720, 6.3664618...|
|     4|[{2384, 8.900747}...|
|     5|[{1566, 8.614773}...|
|     6|[{87232, 7.281047...|
|     7|[{31658, 5.734188...|
|     8|[{87232, 9.710864...|
|     9|[{1884, 3.4004688...|
|    10|[{940, 5.146997},...|
|    11|[{940, 6.930997},...|
|    12|[{2077, 6.428654}...|
|    13|[{1030, 9.953707}...|
|    14|[{940, 5.4838448}...|
|    15|[{26662, 5.742769...|
|    16|[{5463, 6.5301504...|
|    17|[{5301, 7.8078485...|
|    18|[{55269, 7.644380...|
|    19|[{1030, 10.300052...|
|    20|[{59387, 5.549627...|
+------+--------------------+
only showing top 20 rows



In [None]:
# Repeat the ALS experiment with an 80/20 split to see how more training data
# affects the error. The seed is fixed for repeatability.
# Split the null-free `ratings_df` rather than the raw `data`: describe() reports
# fewer movieId/rating values than userId values, so the raw frame contains at least
# one incomplete row, and ALS needs non-null ids and ratings.
# NOTE: this cell rebinds `train`, `test`, `model`, `predictions` and `rmse`; the
# `user_recs` frame computed earlier still comes from the previous model.
train, test = ratings_df.randomSplit([0.8, 0.2], seed = 42)

# Fit an ALS matrix-factorisation model on the training ratings.
# coldStartStrategy = "drop" removes rows whose prediction is NaN (user or movie not
# seen during training) so they cannot turn the RMSE into NaN.
als = ALS(maxIter = 5, regParam = 0.01, userCol = "userId", itemCol = "movieId", ratingCol = "rating", coldStartStrategy = "drop")
model = als.fit(train)

# Score the held-out ratings.
predictions = model.transform(test)

# RMSE between the actual `rating` and the model's `prediction` on the held-out set.
evaluator = RegressionEvaluator(metricName = "rmse", labelCol="rating", predictionCol = "prediction")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE): " + str(rmse))

Root Mean Squared Error (RMSE): 1.11732560245536
