In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load datasets
ratings_data = pd.read_csv("/content/drive/MyDrive/CSE426_DataMining&WarehouseLAB/Assignment01/ratings.csv")
movies_data = pd.read_csv("/content/drive/MyDrive/CSE426_DataMining&WarehouseLAB/Assignment01/movies.csv")


In [4]:
# Display raw data
ratings_data.head()
movies_data.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Step 1: Create movie-to-movie similarity matrix
user_movie_pivot = ratings_data.pivot_table(index='userId', columns='movieId', values='rating')
similarity_matrix = user_movie_pivot.corr(method='pearson')  # Using Pearson correlation
similarity_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.330978,0.487109,1.0,0.310971,0.106465,0.208402,0.968246,0.095913,-0.021409,...,,,,,,,,,,
2,0.330978,1.0,0.419564,,0.562791,0.16351,0.430261,0.415227,0.27735,0.016626,...,,,,,,,,,,
3,0.487109,0.419564,1.0,,0.602266,0.345069,0.554088,0.333333,0.458591,-0.050276,...,,,,,,,,,,
4,1.0,,,1.0,0.654654,,0.203653,,,0.870388,...,,,,,,,,,,
5,0.310971,0.562791,0.602266,0.654654,1.0,0.291302,0.609119,0.555556,0.319173,0.218263,...,,,,,,,,,,


In [6]:
# Step 2: Movie recommendation based on a given movie
def get_similar_movies(target_movie_id, num_recommendations=5):
    if target_movie_id not in similarity_matrix:
        return "Selected movie not found in the dataset."

    similarity_scores = similarity_matrix[target_movie_id].dropna()
    top_matches = similarity_scores.sort_values(ascending=False)[1:num_recommendations+1]

    top_movies = movies_data[movies_data["movieId"].isin(top_matches.index)][["movieId", "title"]]
    return top_movies


In [7]:
# Example: Recommend movies similar to movieId = 1
top_recommendations = get_similar_movies(1, num_recommendations=5)
top_recommendations

Unnamed: 0,movieId,title
1467,1992,Child's Play 2 (1990)
2648,3545,Cabaret (1972)
4158,5988,Quicksilver (1986)
4190,6041,Amen. (2002)
6141,43919,Date Movie (2006)


In [14]:
selected_user = int(input("Enter your user ID: "))
user_rated = ratings_data[ratings_data['userId'] == selected_user]
user_rated.head()


Enter your user ID: 15


Unnamed: 0,userId,movieId,rating,timestamp
1434,15,1,2.5,1510577970
1435,15,44,1.0,1299424916
1436,15,47,3.5,1510571970
1437,15,158,1.0,1299424840
1438,15,172,1.0,1299424762


In [9]:
# Step 2: Find the highest-rated movie by the user
fav_movie = user_rated.loc[user_rated['rating'].idxmax()]
fav_movie

Unnamed: 0,1440
userId,15.0
movieId,260.0
rating,5.0
timestamp,1510572000.0


In [10]:
# Step 3: Identify movies not rated by the user
all_movie_ids = set(movies_data["movieId"])
rated_by_user = set(user_rated["movieId"])
not_rated_yet = all_movie_ids - rated_by_user

unseen_movies = movies_data[movies_data["movieId"].isin(not_rated_yet)]
unseen_movies.head()


Unnamed: 0,movieId,title,genres
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller


In [11]:
# Step 4: Recommend movies not rated by the user, sorted by average rating
def recommend_unseen_top_movies(user_id, num_movies=5):
    user_history = ratings_data[ratings_data["userId"] == user_id]
    movies_rated = set(user_history["movieId"])
    full_list = set(movies_data["movieId"])
    movies_left = full_list - movies_rated

    candidate_movies = movies_data[movies_data["movieId"].isin(movies_left)]

    avg_movie_scores = ratings_data.groupby("movieId")["rating"].mean()

    final_recommendations = candidate_movies.merge(
        avg_movie_scores, on="movieId", how="left"
    ).sort_values(by="rating", ascending=False).head(num_movies)

    return final_recommendations[["movieId", "title", "rating"]]


In [13]:
user_id = int(input("Enter your user ID: "))
final_suggestions = recommend_unseen_top_movies(user_id=user_id, num_movies=10)
final_suggestions


Enter your user ID: 15


Unnamed: 0,movieId,title,rating
45,53,Lamerica (1994),5.0
9576,187717,Won't You Be My Neighbor? (2018),5.0
4709,7122,King of Hearts (1966),5.0
4694,7096,Rivers and Tides (2001),5.0
4673,7071,"Woman Under the Influence, A (1974)",5.0
8695,131237,What Men Talk About (2010),5.0
6103,44851,Go for Zucker! (Alles auf Zucker!) (2004),5.0
6109,44943,9/11 (2002),5.0
6131,45503,Peaceful Warrior (2006),5.0
6203,47736,"Chump at Oxford, A (1940)",5.0
