In [15]:
import os
from getpass import getpass

import pandas as pd
import psycopg2

# Connection settings are read from the standard PostgreSQL environment
# variables so that no password is stored in the notebook. If PGPASSWORD is
# not set, the password is requested interactively instead.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    dbname=os.environ.get("PGDATABASE", "moviedb"),
    user=os.environ.get("PGUSER", "ajinkyaambadkar"),
    password=os.environ.get("PGPASSWORD") or getpass("PostgreSQL password: "),
)

# Load one row per (movie, genre) pair, with the movie title and genre name
# resolved through the join tables. The connection is closed even if the
# query fails.
try:
    movie_genres_df = pd.read_sql("""
        SELECT mg.movie_id, m.title, g.genre_name
        FROM movie_genres mg
        JOIN movies m ON mg.movie_id = m.movie_id
        JOIN genres g ON mg.genre_id = g.genre_id
    """, conn)
finally:
    conn.close()

movie_genres_df.head()


  movie_genres_df = pd.read_sql("""


Unnamed: 0,movie_id,title,genre_name
0,242,Kolya (1996),Comedy
1,302,L.A. Confidential (1997),Crime
2,302,L.A. Confidential (1997),Film-Noir
3,302,L.A. Confidential (1997),Mystery
4,302,L.A. Confidential (1997),Thriller


In [16]:
# One-hot encode the genre label of every (movie, genre) row.
one_hot = pd.get_dummies(movie_genres_df['genre_name'])
movies_with_genres = pd.concat([movie_genres_df[['movie_id', 'title']], one_hot], axis=1)

# Collapse to one row per movie. `max` (rather than `sum`) keeps every genre
# flag binary even if the same (movie, genre) pair appears more than once in
# the query result; a count above 1 would skew the cosine similarity.
movies_genre_matrix = (
    movies_with_genres
    .groupby(['movie_id', 'title'])
    .max()
    .astype(int)
    .reset_index()
)

movies_genre_matrix.to_csv("data/processed/movie_genres_matrix.csv", index=False)

# Preview goes last: only the final expression of a cell is rendered.
movies_genre_matrix.head()

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Feature matrix: every column except the two identifier columns.
genre_features = movies_genre_matrix.drop(columns=['movie_id', 'title'])

# Pairwise cosine similarity between the genre vectors of all movies;
# row/column i corresponds to row i of movies_genre_matrix.
cosine_sim = cosine_similarity(genre_features)

# Index mapping: movie title → row index.
# Series.drop_duplicates() removes duplicate *values* (the row positions,
# which are already unique), so it never removed repeated titles. Filtering on
# the index keeps the first row for each title and guarantees that
# movie_indices[title] is a scalar.
movie_indices = pd.Series(movies_genre_matrix.index, index=movies_genre_matrix['title'])
movie_indices = movie_indices[~movie_indices.index.duplicated(keep='first')]


In [18]:
def recommend_similar_movies(title, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Similarity is read from the module-level ``cosine_sim`` matrix, using
    ``movie_indices`` to translate the title into a row position and
    ``movies_genre_matrix`` to translate positions back into titles.

    Args:
        title: Movie title; must match an entry of ``movie_indices`` exactly.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of up to ``top_n`` titles ordered by decreasing similarity, or
        ``["Movie not found."]`` when ``title`` is not in ``movie_indices``.
    """
    if title not in movie_indices:
        return ["Movie not found."]

    idx = movie_indices[title]
    # A title that occurs more than once in the index yields a Series rather
    # than a scalar; fall back to its first row position.
    if hasattr(idx, "iloc"):
        idx = idx.iloc[0]
    idx = int(idx)

    # Exclude the query movie by position instead of dropping whatever sorts
    # first: several movies can tie at the top similarity, in which case the
    # query is not guaranteed to be the first element after sorting.
    candidates = [(pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    best = candidates[:max(top_n, 0)]

    return movies_genre_matrix.iloc[[pos for pos, _ in best]]['title'].tolist()


In [19]:
# Genre-based neighbours for a title that is present in the matrix.
recommend_similar_movies(title="Toy Story (1995)")


['Aladdin and the King of Thieves (1996)',
 'Aladdin (1992)',
 'Goofy Movie, A (1995)',
 'Santa Clause, The (1994)',
 'Home Alone (1990)',
 'Aristocats, The (1970)',
 'D3: The Mighty Ducks (1996)',
 'Love Bug, The (1969)',
 'Wrong Trousers, The (1993)',
 'Grand Day Out, A (1992)']

In [9]:
# The lookup is an exact match against the stored titles, so this spelling
# is reported as not found.
recommend_similar_movies(title="shawshank redemption (1994)")


['Movie not found.']

In [20]:
from surprise import SVD, Dataset, Reader
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load the ratings in this cell so that it runs on a fresh kernel; otherwise
# ratings_df only exists once a later cell has been executed.
ratings_df = pd.read_csv("data/processed/ratings_clean.csv")

# Train the SVD model on every available rating (no hold-out split).
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)
trainset = data.build_full_trainset()
# Fixed seed so that repeated runs produce the same predictions.
model = SVD(random_state=42)
model.fit(trainset)

# Genre matrix for cosine similarity; row/column i corresponds to row i of
# movies_genre_matrix.
genre_features = movies_genre_matrix.drop(columns=['movie_id', 'title'])
cosine_sim = cosine_similarity(genre_features)

# Mapping from title → index (for cosine_sim lookup).
# Series.drop_duplicates() compares values (the row positions, already
# unique), so it cannot remove repeated titles; de-duplicate on the index so
# each title maps to exactly one row.
movie_indices = pd.Series(movies_genre_matrix.index, index=movies_genre_matrix['title'])
movie_indices = movie_indices[~movie_indices.index.duplicated(keep='first')]


In [32]:
import numpy as np

# Blend weight: the collaborative-filtering prediction is multiplied by alpha
# and the content-based similarity by (1 - alpha).
alpha = 0.7

# Load the processed CSV inputs.
movies_genre_matrix = pd.read_csv("data/processed/movie_genres_matrix.csv")
ratings_df = pd.read_csv("data/processed/ratings_clean.csv")

# movie_id → title lookup used to translate rating IDs into titles.
movie_id_to_title = pd.read_csv(
    "data/processed/movies_clean.csv",
    index_col="movie_id",
)["title"].to_dict()


def _first_position(lookup):
    """Collapse a ``movie_indices`` lookup to a single integer row position."""
    # A title that appears more than once in the index yields a Series.
    if hasattr(lookup, "iloc"):
        lookup = lookup.iloc[0]
    return int(lookup)


def hybrid_recommend(user_id, top_n=10):
    """Recommend unrated movies by blending collaborative and content scores.

    For every movie the user has not rated, the score is
    ``alpha * cf_pred + (1 - alpha) * cb_score`` where ``cf_pred`` is
    ``model.predict(user_id, movie_id).est`` and ``cb_score`` is the
    ``cosine_sim`` entry between the candidate and the user's top-rated movie.
    Relies on the module-level ``ratings_df``, ``movie_id_to_title``,
    ``movie_indices``, ``cosine_sim``, ``model`` and ``alpha``.

    Args:
        user_id: User to recommend for, as it appears in ``ratings_df['user_id']``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of strings ``"<title> (ID: <movie_id>) - Score: <score>"``
        ordered by decreasing score, or a single-element list with an
        explanatory message when the user has no ratings or the user's
        top-rated movie has no entry in ``movie_indices``.
    """
    user_ratings = ratings_df[ratings_df['user_id'] == user_id]

    # Set membership keeps the unrated filter linear in the number of movies.
    rated_movies = set(user_ratings['movie_id'])
    unrated_movies = [mid for mid in ratings_df['movie_id'].unique() if mid not in rated_movies]
    print(f"User {user_id} has {len(unrated_movies)} unrated movies")

    if user_ratings.empty:
        return ["User has no ratings. Cannot recommend."]

    # The user's highest-rated movie anchors the content-based similarity.
    top_movie_id = int(user_ratings.sort_values(by='rating', ascending=False)['movie_id'].iloc[0])
    top_movie_title = movie_id_to_title.get(top_movie_id)

    if top_movie_title not in movie_indices:
        return ["User's top rated movie not in similarity matrix."]

    top_movie_idx = _first_position(movie_indices[top_movie_title])

    scores = []
    for mid in unrated_movies:
        # Candidates without a title or without a similarity row are skipped.
        movie_title = movie_id_to_title.get(mid)
        if movie_title is None or movie_title not in movie_indices:
            continue

        # Collaborative filtering score.
        cf_pred = model.predict(user_id, mid).est

        # Content-based score (similarity with the user's top movie). Only
        # lookup failures fall back to 0.0; anything else propagates.
        try:
            cb_score = float(cosine_sim[top_movie_idx][_first_position(movie_indices[movie_title])])
        except (IndexError, KeyError, TypeError, ValueError):
            cb_score = 0.0

        # NOTE(review): cf_pred is a rating estimate while cb_score is a cosine
        # similarity, so the two terms are on different scales and the content
        # term contributes comparatively little — consider normalising cf_pred.
        final_score = float(alpha * cf_pred + (1 - alpha) * cb_score)
        scores.append((mid, final_score))

    # Highest blended score first, truncated to the requested length.
    best = sorted(scores, key=lambda pair: pair[1], reverse=True)[:top_n]

    return [
        f"{movie_id_to_title.get(mid, 'Unknown')} (ID: {mid}) - Score: {score:.2f}"
        for mid, score in best
    ]



In [33]:
# Ten hybrid recommendations for user 100.
hybrid_recommend(user_id=100, top_n=10)


User 100 has 1622 unrated movies


['Amadeus (1984) (ID: 191) - Score: 3.24',
 'Secrets & Lies (1996) (ID: 285) - Score: 3.22',
 'Sling Blade (1996) (ID: 223) - Score: 3.15',
 "Schindler's List (1993) (ID: 318) - Score: 3.14",
 'Psycho (1960) (ID: 185) - Score: 3.10',
 '12 Angry Men (1957) (ID: 178) - Score: 3.05',
 'Close Shave, A (1995) (ID: 408) - Score: 3.04',
 'Some Folks Call It a Sling Blade (1993) (ID: 963) - Score: 3.04',
 'Usual Suspects, The (1995) (ID: 12) - Score: 3.02',
 'Three Colors: Red (1994) (ID: 59) - Score: 3.01']