In [9]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings("ignore")

In [10]:
movies = pd.read_csv("data/movies_metadata.csv", low_memory=False)
ratings = pd.read_csv("data/ratings_small.csv")  
links = pd.read_csv("data/links_small.csv")

In [11]:
movies = movies[['id','title','genres','overview']].copy()
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
movies = movies.dropna(subset=['id'])
movies['id'] = movies['id'].astype(int)
movies['overview'] = movies['overview'].fillna("")

In [12]:
def convert_genres(obj):
    genres = []
    for i in ast.literal_eval(obj):
        genres.append(i['name'])
    return " ".join(genres)

In [None]:
movies['tags'] = movies['overview'] + " " + movies['genres'].astype(str)

cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies['tags'])

# Recommender function 
def recommend_content(movie_title, top_n=5):
    if movie_title not in movies['title'].values:
        return ["Movie not found in dataset."]
    
    idx = movies[movies['title'] == movie_title].index[0]
    movie_vector = vectors[idx] 
    distances = cosine_similarity(movie_vector, vectors).flatten() 
    movie_list = sorted(list(enumerate(distances)), key=lambda x: x[1], reverse=True)[1:top_n+1]
    return [movies.iloc[i[0]]['title'] for i in movie_list]
    


In [14]:
links = links.dropna(subset=['tmdbId'])
links['tmdbId'] = links['tmdbId'].astype(int)

ratings_links = ratings.merge(links[['movieId','tmdbId']], on='movieId')
user_item = ratings_links.pivot_table(index='userId', columns='tmdbId', values='rating').fillna(0)

sparse_matrix = csr_matrix(user_item.values)
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(sparse_matrix)

0,1,2
,n_neighbors,5
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


In [19]:
from unittest import result


def recommend_collaborative(user_id, top_n=5):
    if user_id not in user_item.index:
        return ["User not found."]
    distances, indices = model_knn.kneighbors([user_item.loc[user_id]], n_neighbors=6)
    similar_users = [user_item.index[i] for i in indices.flatten()[1:]]
    sim_ratings = ratings_links[ratings_links['userId'].isin(similar_users)]
    recs = sim_ratings.groupby('tmdbId')['rating'].mean().sort_values(ascending=False).head(top_n)
    result = []
    for tid in recs.index:
        title = movies[movies['id']==tid]['title']
        if not title.empty:
            result.append(title.values[0])
    return result
print(result)


<module 'unittest.result' from 'C:\\Program Files\\Python313\\Lib\\unittest\\result.py'>


In [20]:
print("🎬 Content-based (similar to 'Avatar'):")
print(recommend_content("Avatar"))

print("\n👤 Collaborative (for userId=2):")
print(recommend_collaborative(1))


🎬 Content-based (similar to 'Avatar'):
['Avatar 2', 'The Flash 2 - Revenge of the Trickster', 'Fly Me to the Moon', 'Rebirth of Mothra', 'Tales of an Ancient Empire']

👤 Collaborative (for userId=2):
['The Godfather', 'The Godfather: Part II', 'Fargo', 'The Shawshank Redemption', 'Pulp Fiction']
