In [1]:
# ✅ Import libraries
from pymongo import MongoClient
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# ✅ Connect to MongoDB Atlas
MONGO_URI = "mongodb+srv://admin:Ananya%402801@cluster0.jann4mj.mongodb.net/movie_recommender?retryWrites=true&w=majority&tls=true"
client = MongoClient(MONGO_URI)
db = client["movie_recommender"]
collection = db["histories"]

In [3]:
# ✅ Load data
data = list(collection.find())
df = pd.DataFrame(data)

In [4]:
# ✅ Drop irrelevant columns
df = df.drop(columns=['_id', 'timestamp', '__v'], errors='ignore')

In [5]:
# ✅ Convert ObjectId to string for 'user'
df['user'] = df['user'].astype(str)

In [6]:
df

Unnamed: 0,user,title
0,67fcab79c055d149ee6dca27,OMG – Oh My God!
1,67fcab79c055d149ee6dca27,OMG – Oh My God!
2,67fcab79c055d149ee6dca27,Uri: The Surgical Strike
3,67fcab79c055d149ee6dca27,Uri: The Surgical Strike
4,67fcab79c055d149ee6dca27,Black Friday (2007 film)
5,67fcab79c055d149ee6dca27,Black Friday (2007 film)
6,67fcab79c055d149ee6dca27,Gol Maal
7,67fcab79c055d149ee6dca27,Gol Maal
8,67fcab79c055d149ee6dca27,Jaane Bhi Do Yaaro
9,67fcab79c055d149ee6dca27,Jaane Bhi Do Yaaro


In [7]:
# ✅ Create user-movie matrix
user_movie_matrix = pd.crosstab(df['user'], df['title'])
user_movie_matrix

title,2 States,Andaz Apna Apna,Article 15,Bareilly Ki Barfi,Beta,Bhoot (film),Black Friday (2007 film),Chhalia,Dehraadun Diary,Dhoom 2,...,Section 375,Shamitabh,Talvar,Tezaab,Uri,Uri: The Surgical Strike,Vishwaroopam,War,Welcome,Yeh Jawaani Hai Deewani
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
661eabc123abc123abc123aa,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
661eabc123abc123abc123bb,0,1,0,0,1,0,0,0,0,1,...,0,0,0,1,0,0,0,1,1,0
661eabc123abc123abc123cc,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
67fcab79c055d149ee6dca27,0,0,0,0,0,2,2,1,1,0,...,3,1,0,0,0,2,1,0,0,0


In [8]:
# ✅ Compute cosine similarity
user_similarity = cosine_similarity(user_movie_matrix)
user_similarity_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)

In [9]:
# ✅ Function to recommend movies to a user
def recommend_movies_for_user(target_user_id, top_n_similar_users=5):
    if target_user_id not in user_similarity_df.index:
        return f"User {target_user_id} not found."
    
    # Get similar users
    similar_users = user_similarity_df[target_user_id].sort_values(ascending=False)[1:top_n_similar_users+1].index
    
    # Movies watched by similar users but not by target user
    target_user_movies = set(df[df['user'] == target_user_id]['title'])
    recommendations = set()
    
    for user in similar_users:
        user_movies = set(df[df['user'] == user]['title'])
        recommendations.update(user_movies - target_user_movies)
    
    return sorted(recommendations)

In [10]:
# ✅ Example: Recommend movies for a specific user
target_user = df['user'].iloc[1]  # Or any known user ID from the DataFrame
recommend_movies_for_user(target_user)

['2 States',
 'Andaz Apna Apna',
 'Article 15',
 'Bareilly Ki Barfi',
 'Beta',
 'Dhoom 2',
 'Dil To Pagal Hai',
 'Dilwale Dulhania Le Jayenge',
 'Hum Tum',
 'Judaai',
 'Kabhi Khushi Kabhie Gham',
 'Kahaani',
 'Kuch Kuch Hota Hai',
 'Luka Chuppi',
 'Masaan',
 'Mimi',
 'Mr. India',
 'October',
 'Pathaan',
 'Piku',
 'Raazi',
 'Ram Lakhan',
 'Rang De Basanti',
 'Sardar Udham',
 'Talvar',
 'Tezaab',
 'Uri',
 'War',
 'Welcome',
 'Yeh Jawaani Hai Deewani']

In [12]:
# ✅ Create movie-user matrix (transpose of user-movie)
movie_user_matrix = pd.crosstab(df['title'], df['user'])

In [13]:
# ✅ Compute item-item cosine similarity
item_similarity = cosine_similarity(movie_user_matrix)
item_similarity_df = pd.DataFrame(item_similarity, index=movie_user_matrix.index, columns=movie_user_matrix.index)

In [14]:
def recommend_similar_movies(movie_title, top_n=5):
    """
    Recommends movies similar to the given movie title based on item-item similarity.
    
    Parameters:
        movie_title (str): Title of the reference movie.
        top_n (int): Number of similar movies to return.
    
    Returns:
        list: List of recommended movie titles.
    """
    if movie_title not in item_similarity_df.index:
        return f"❌ Movie '{movie_title}' not found in similarity matrix."

    # Get similarity scores and sort them
    similar_scores = item_similarity_df[movie_title].sort_values(ascending=False)

    # Skip the movie itself (first one), return top_n similar movies
    recommended = similar_scores.iloc[1:top_n+1].index.tolist()

    return recommended

In [16]:
# Test the function
recommend_similar_movies("2 States")  # replace with any movie title from your DB

['Dil To Pagal Hai',
 'Mimi',
 'Luka Chuppi',
 'Kuch Kuch Hota Hai',
 'Kabhi Khushi Kabhie Gham']

In [18]:
with open('user_similarity.pkl', 'wb') as f:
    pickle.dump(user_similarity_df, f)

In [19]:
with open('item_similarity.pkl', 'wb') as f:
    pickle.dump(user_similarity_df, f)