In [1]:
# 🎬 Movie Recommendation System (Collaborative + User-based)

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt

In [2]:
#Step 1: Load datasets
movies = pd.read_csv("movies.csv")   # movieId, title, genres
ratings = pd.read_csv("ratings.csv") # userId, movieId, rating, timestamp

print("✅ Data loaded successfully")
print("Movies:", movies.shape)
print("Ratings:", ratings.shape)

✅ Data loaded successfully
Movies: (9742, 3)
Ratings: (100836, 4)


In [3]:
#Step 2: Merge datasets
data = pd.merge(ratings, movies, on="movieId")
print("✅ Data merged. Shape:", data.shape)

✅ Data merged. Shape: (100836, 6)


In [5]:
#Step 3: Create User-Movie Rating Matrix
user_movie_matrix = data.pivot_table(index="userId", columns="title", values="rating")
print("✅ User-Movie matrix created. Shape:", user_movie_matrix.shape)

✅ User-Movie matrix created. Shape: (610, 9719)


In [6]:
# Fill NaN with 0
user_movie_matrix = user_movie_matrix.fillna(0)

In [7]:
#Step 4: Compute Movie Similarity (Item-Based CF)
movie_similarity = cosine_similarity(user_movie_matrix.T)
movie_similarity_df = pd.DataFrame(movie_similarity, index=user_movie_matrix.columns, columns=user_movie_matrix.columns)

print("✅ Similarity matrix ready!")


✅ Similarity matrix ready!


In [8]:
#Step 5: Movie Recommendation Function
def recommend_movies(movie_name, num_recommendations=5):
    if movie_name not in movie_similarity_df.columns:
        print("❌ Movie not found in database.")
        return []

    similar_scores = movie_similarity_df[movie_name].sort_values(ascending=False)
    recommendations = similar_scores.iloc[1:num_recommendations+1].index.tolist()

    print(f"\n🎬 Because you liked **{movie_name}**, you might also enjoy:")
    for i, rec in enumerate(recommendations, 1):
        print(f"{i}. {rec}")
    return recommendations

In [9]:
#Step 6: User-Based Collaborative Filtering ---
#Train-Test split for evaluation
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [10]:
# Build user-item matrix for training
train_matrix = train.pivot_table(index="userId", columns="movieId", values="rating").fillna(0)
user_similarity = cosine_similarity(train_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=train_matrix.index, columns=train_matrix.index)

print("✅ User similarity computed!")

✅ User similarity computed!


In [11]:
# Function to predict rating using user-based CF
def predict_rating(user_id, movie_id):
    if movie_id not in train_matrix.columns:
        return 0

    sim_scores = user_similarity_df[user_id]
    ratings = train_matrix[movie_id]

    # Weighted average of ratings
    numerator = np.dot(sim_scores, ratings)
    denominator = np.abs(sim_scores).sum()
    if denominator == 0:
        return 0
    return numerator / denominator

In [12]:
#Step 7: Evaluate Model with RMSE
y_true, y_pred = [], []
for row in test.itertuples():
    true_rating = row.rating
    predicted = predict_rating(row.userId, row.movieId)
    if predicted > 0:
        y_true.append(true_rating)
        y_pred.append(predicted)

rmse = sqrt(mean_squared_error(y_true, y_pred))
print(f"\n📊 Model Evaluation: RMSE = {rmse:.4f}")


📊 Model Evaluation: RMSE = 3.2249


In [13]:
#Step 8: Example Runs
recommend_movies("Toy Story (1995)", num_recommendations=5)


🎬 Because you liked **Toy Story (1995)**, you might also enjoy:
1. Toy Story 2 (1999)
2. Jurassic Park (1993)
3. Independence Day (a.k.a. ID4) (1996)
4. Star Wars: Episode IV - A New Hope (1977)
5. Forrest Gump (1994)


['Toy Story 2 (1999)',
 'Jurassic Park (1993)',
 'Independence Day (a.k.a. ID4) (1996)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Forrest Gump (1994)']

In [14]:
recommend_movies("Jurassic Park (1993)", num_recommendations=5)


🎬 Because you liked **Jurassic Park (1993)**, you might also enjoy:
1. Terminator 2: Judgment Day (1991)
2. Forrest Gump (1994)
3. Braveheart (1995)
4. Fugitive, The (1993)
5. Speed (1994)


['Terminator 2: Judgment Day (1991)',
 'Forrest Gump (1994)',
 'Braveheart (1995)',
 'Fugitive, The (1993)',
 'Speed (1994)']

In [15]:
recommend_movies("Richie Rich (1994)", num_recommendations=5)


🎬 Because you liked **Richie Rich (1994)**, you might also enjoy:
1. Next Karate Kid, The (1994)
2. Jury Duty (1995)
3. Son in Law (1993)
4. RoboCop 3 (1993)
5. Tom and Huck (1995)


['Next Karate Kid, The (1994)',
 'Jury Duty (1995)',
 'Son in Law (1993)',
 'RoboCop 3 (1993)',
 'Tom and Huck (1995)']

In [16]:
# Predict a rating for a user
sample_user = 1
sample_movie = 1  # movieId = 1 -> Toy Story
pred = predict_rating(sample_user, sample_movie)
print(f"\n⭐ Predicted rating of user {sample_user} for movieId {sample_movie}: {pred:.2f}")


⭐ Predicted rating of user 1 for movieId 1: 1.46
