In [1]:
import pandas as pd

# Read the two '::'-delimited .dat files. They carry no header row, so column
# names are supplied explicitly; the python engine is requested because the
# separator is longer than one character.
ratings = pd.read_csv(
    'ratings.dat',
    sep='::',
    engine='python',
    names=['userId', 'movieId', 'rating', 'timestamp'],
)
movies = pd.read_csv(
    'movies.dat',
    sep='::',
    engine='python',
    names=['movieId', 'title', 'genres'],
    encoding='ISO-8859-1',
)

# Attach title/genres to every rating row. A left join keeps ratings whose
# movieId has no match in `movies` (their title/genres are then NaN).
merged_df = pd.merge(ratings, movies, on='movieId', how='left')


In [2]:
# Preview the first rows of the ratings table (one row per user/movie rating).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Preview the first rows of the movie catalogue; genres are '|'-separated strings.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Titles with fewer ratings than this are left out of the item matrix
# (speeds things up & improves quality).
min_ratings = 20

# Movies (rows) x users (cols); cells hold the rating, NaN where unrated.
pivot = merged_df.pivot_table(values='rating', index='title', columns='userId')

# Keep only the rows whose title reaches the popularity threshold.
ratings_per_title = merged_df.groupby('title').size()
popular_titles = ratings_per_title[ratings_per_title >= min_ratings].index
pivot = pivot.loc[popular_titles]

# Mean-center per user: subtract each user's average (taken over the retained
# titles) from that user's column. Unrated cells are then set to 0 so they add
# nothing to the cosine computation.
user_means = pivot.mean(axis=0)
pivot_centered = (pivot - user_means).fillna(0)


In [5]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Sparse copy of the centered movie x user matrix (one row per title).
X = csr_matrix(pivot_centered.to_numpy())

# Exhaustive (brute-force) neighbour search under cosine distance.
knn = NearestNeighbors(algorithm='brute', metric='cosine')
knn.fit(X)

# Lookups between a title and its row position in X.
idx_to_title = list(pivot_centered.index)
title_to_idx = {name: pos for pos, name in enumerate(idx_to_title)}


In [6]:
def recommend(title, top_n=10):
    """Return the movies closest to ``title`` under the fitted cosine k-NN model.

    Parameters
    ----------
    title : str
        Exact movie title; must be a key of ``title_to_idx``.
    top_n : int, default 10
        Maximum number of recommendations to return. Fewer are returned when
        the model was fitted on fewer than ``top_n + 1`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``num_ratings`` (number of rows for that title in
        ``merged_df``), ordered by ``num_ratings`` descending. The query title
        itself is never part of the result.

    Raises
    ------
    ValueError
        If ``title`` is unknown; the message lists close matches found by
        ``difflib``.
    """
    if title not in title_to_idx:
        # simple fuzzy fallback: show close matches if exact title missing
        import difflib
        matches = difflib.get_close_matches(title, pivot_centered.index, n=5, cutoff=0.6)
        raise ValueError(f"Title not found: {title}. Did you mean: {matches}?")

    idx = title_to_idx[title]

    # Ask for one extra neighbour to leave room for the query itself, but never
    # more rows than the model holds (kneighbors rejects larger requests).
    n_neighbors = min(top_n + 1, X.shape[0])
    _, indices = knn.kneighbors(X[idx], n_neighbors=n_neighbors)

    # Remove the query by row index instead of assuming it is the first hit:
    # when another row is at the same distance, the query can come back at a
    # later position, and dropping position 0 would discard a real neighbour
    # while recommending the query to itself.
    rec_indices = [i for i in indices.flatten() if i != idx][:top_n]
    rec_titles = [idx_to_title[i] for i in rec_indices]

    # attach counts for nicer display
    counts = merged_df.groupby('title').size()
    out = pd.DataFrame({'title': rec_titles})
    out['num_ratings'] = out['title'].map(counts)
    # Stable sort: among equal counts, nearer neighbours stay first.
    return (
        out.sort_values('num_ratings', ascending=False, kind='mergesort')
        .reset_index(drop=True)
    )

# Example:
# recommend("Avatar (2009)", top_n=10)


In [7]:
# Example query. The returned table is ordered by num_ratings, not by similarity.
recommend('Bait (2000)', top_n=10)

Unnamed: 0,title,num_ratings
0,"Art of War, The (2000)",144
1,Loser (2000),138
2,Get Carter (2000),100
3,"Watcher, The (2000)",94
4,Urban Legends: Final Cut (2000),92
5,"Crossing Guard, The (1995)",50
6,"Crew, The (2000)",40
7,Johns (1996),26
8,Man of the House (1995),24
9,Curdled (1996),20


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Replace missing genre strings with '' so the vectorizer only receives text.
genre_strings = movies['genres'].fillna('')
movies['genres'] = genre_strings

# Bag-of-words counts over the genre strings (at most 200 vocabulary terms,
# English stop words removed).
# NOTE(review): the default tokenizer splits on punctuation, so a hyphenated
# genre name would presumably become two tokens — confirm that is intended.
cv = CountVectorizer(stop_words='english', max_features=200)
count_matrix = cv.fit_transform(genre_strings)

# Movie x movie cosine similarity of the genre count vectors.
content_sim = cosine_similarity(count_matrix)


In [10]:
# Users (rows) x movies (cols) matrix of ratings; unrated pairs are filled with 0.
user_movie_ratings = (
    ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
)

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
# Item-item similarity: transpose first so that each row is one movie's
# vector of user ratings.
collab_sim = cosine_similarity(user_movie_ratings.transpose())

In [16]:
# 1. Movie IDs that appear in both the catalogue and the ratings
common_movie_ids = list(set(movies['movieId']) & set(ratings['movieId']))

# 2. Keep only catalogue rows for those IDs and renumber the rows from 0
movies = (
    movies
    .loc[lambda frame: frame['movieId'].isin(common_movie_ids)]
    .reset_index(drop=True)
)

# 3. Keep only rating rows for those IDs (this is the long-format ratings
#    frame, not a pivot table)
ratings = ratings.loc[lambda frame: frame['movieId'].isin(common_movie_ids)]

# 4. content_sim and collab_sim must be recomputed after this filtering



In [17]:
# 1. Movie IDs shared by the catalogue and the ratings
catalogue_ids = set(movies['movieId'])
rated_ids = set(ratings['movieId'])
common_movie_ids = list(catalogue_ids.intersection(rated_ids))

# 2. Restrict the catalogue to the shared IDs; row labels restart at 0
keep_movie = movies['movieId'].isin(common_movie_ids)
movies = movies[keep_movie].reset_index(drop=True)

# 3. Restrict the long-format ratings frame to the shared IDs
keep_rating = ratings['movieId'].isin(common_movie_ids)
ratings = ratings[keep_rating]

# 4. content_sim and collab_sim must be recomputed after this filtering


In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Turn each movie's genre string into bag-of-words counts. Missing genres are
# replaced with '' first so every entry is a string.
genre_strings = movies['genres'].fillna('')
movies['genres'] = genre_strings
cv = CountVectorizer(stop_words='english', max_features=200)
count_matrix = cv.fit_transform(genre_strings)

# Content similarity matrix: cosine similarity between every pair of movies,
# in the row order of `movies`.
content_sim = cosine_similarity(count_matrix)


In [20]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Create user-movie rating matrix (rows: users, columns: movies, unrated -> 0).
#
# pivot() returns its columns in sorted movieId order, whereas content_sim
# follows the row order of `movies`. hybrid_sim adds the two matrices
# element-wise and hybrid_recommend() looks rows up by position in `movies`,
# so both matrices must use one ordering. Reindexing the columns to
# movies['movieId'] guarantees that (and a matching shape) instead of relying
# on `movies` already being sorted by movieId. A movie without any rating gets
# an all-zero column.
user_movie_ratings = (
    ratings
    .pivot(index='userId', columns='movieId', values='rating')
    .reindex(columns=movies['movieId'].to_numpy())
    .fillna(0)
)

# Item-Item similarity (transpose so movies are rows)
collab_sim = cosine_similarity(user_movie_ratings.T)


In [21]:
# Normalize: scale each similarity matrix by its own largest entry.
content_peak = content_sim.max()
collab_peak = collab_sim.max()
content_sim_norm = content_sim / content_peak
collab_sim_norm = collab_sim / collab_peak

# Hybrid similarity: equal-weight blend of the two normalized matrices
# (50-50 weight, can tune later).
hybrid_sim = content_sim_norm * 0.5 + collab_sim_norm * 0.5


In [22]:
def hybrid_recommend(movie_title, top_n=10):
    """Return the titles most similar to ``movie_title`` according to ``hybrid_sim``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``movies['title']``; the first matching row
        is used.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles ordered by decreasing hybrid similarity. The
        queried movie is never included.
    str
        A "not found" message when no row of ``movies`` has that title.
    """
    # hybrid_sim is addressed by row position, so resolve the position of the
    # first matching row instead of relying on the frame's index labels.
    positions = (movies['title'] == movie_title).to_numpy().nonzero()[0]
    if positions.size == 0:
        return f"Movie '{movie_title}' not found in dataset."
    idx = int(positions[0])

    if top_n <= 0:
        return []

    scores = hybrid_sim[idx]
    # Exclude the query by index rather than skipping the first sorted entry:
    # if a lower-index movie ties with the self-similarity, the query is not
    # first after sorting and would be returned as its own recommendation.
    candidates = (i for i in range(len(scores)) if i != idx)
    # sorted() is stable, so tied scores keep their order of appearance in `movies`.
    ranked = sorted(candidates, key=lambda i: scores[i], reverse=True)

    titles = movies['title']
    return [titles.iloc[i] for i in ranked[:top_n]]

# Example: print the top-5 hybrid recommendations for one title. The call
# returns a list of titles, or a message string if the title is not found.
print(hybrid_recommend("Toy Story (1995)", top_n=5))


['Toy Story 2 (1999)', "Bug's Life, A (1998)", 'Aladdin (1992)', 'Chicken Run (2000)', 'American Tail, An (1986)']
