In [4]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process

class MovieRecommender:
    """Item-based movie recommender built on a sparse user x movie rating matrix.

    The ratings CSV must provide ``userId``, ``movieId`` and ``rating``
    columns; the movies CSV must provide ``movieId`` and ``title`` columns.
    Similar movies are found with a brute-force nearest-neighbour search over
    the movies' rating vectors (one vector per movie, one entry per user).
    """

    def __init__(self, ratings_path, movies_path):
        """Load both CSV files and build the rating matrix and id mappers.

        Args:
            ratings_path: Path of the ratings CSV file.
            movies_path: Path of the movies CSV file.
        """
        self.ratings = pd.read_csv(ratings_path)
        self.movies = pd.read_csv(movies_path)
        (
            self.matrix,
            self.user_mapper,
            self.movie_mapper,
            self.user_inv_mapper,
            self.movie_inv_mapper,
        ) = self.create_matrix(self.ratings)

    def create_matrix(self, df):
        """Build the sparse user x movie rating matrix and its index mappers.

        Args:
            df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.

        Returns:
            A 5-tuple ``(matrix, user_mapper, movie_mapper, user_inv_mapper,
            movie_inv_mapper)`` where ``matrix`` is a ``csr_matrix`` of shape
            ``(n_users, n_movies)``, the mappers translate raw ids to
            row/column indices, and the inverse mappers translate indices
            back to raw ids. Indices follow the sorted order of the ids.
        """
        # return_inverse yields each row's dense index directly, so ids are
        # scanned once instead of being looked up row by row in a dict.
        user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
        movie_ids, item_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)
        n_users, n_movies = len(user_ids), len(movie_ids)

        user_mapper = dict(zip(user_ids, range(n_users)))
        movie_mapper = dict(zip(movie_ids, range(n_movies)))
        user_inv_mapper = dict(zip(range(n_users), user_ids))
        movie_inv_mapper = dict(zip(range(n_movies), movie_ids))

        matrix = csr_matrix(
            (df["rating"].to_numpy(), (user_index, item_index)),
            shape=(n_users, n_movies),
        )

        return matrix, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

    def movie_finder(self, title):
        """Fuzzy-match ``title`` against the movies table.

        Args:
            title: Free-text movie title to look for.

        Returns:
            ``(movie_id, matched_title)`` for the closest title found.

        Raises:
            ValueError: If there are no titles to match against.
        """
        all_titles = self.movies['title'].tolist()
        closest_match = process.extractOne(title, all_titles)
        if closest_match is None:
            # extractOne returns None when it has no candidate to offer.
            raise ValueError(f"no movie title could be matched against {title!r}")
        matched_title = closest_match[0]
        # Several rows may share a title; the first one wins.
        title_rows = self.movies['title'] == matched_title
        matched_id = self.movies.loc[title_rows, 'movieId'].iloc[0]
        return matched_id, matched_title

    def find_similar_movies(self, movie_id, k=50, metric='cosine'):
        """Return the ids of up to ``k`` movies most similar to ``movie_id``.

        Args:
            movie_id: Raw movie id; it must occur in the ratings data.
            k: Maximum number of neighbours to return.
            metric: Distance metric handed to ``NearestNeighbors``.

        Returns:
            A list of at most ``k`` raw movie ids, nearest first, never
            containing ``movie_id`` itself.

        Raises:
            KeyError: If ``movie_id`` has no ratings and therefore no vector.
        """
        if movie_id not in self.movie_mapper:
            raise KeyError(
                f"movie_id {movie_id!r} does not appear in the ratings data, "
                "so it has no rating vector to compare"
            )
        # After transposing, each row is one movie; CSR keeps row access cheap.
        matrix = self.matrix.T.tocsr()
        movie_ind = self.movie_mapper[movie_id]
        movie_vec = matrix[movie_ind]
        # One extra neighbour because the query movie is normally its own
        # nearest neighbour; clamp so small catalogues do not make the
        # estimator reject the request.
        n_neighbors = min(k + 1, matrix.shape[0])
        kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
        kNN.fit(matrix)
        neighbour = kNN.kneighbors(movie_vec, return_distance=False)
        neighbour_ids = [self.movie_inv_mapper[n] for n in neighbour[0] if n != movie_ind]
        # With tied distances the query row may be missing from the result,
        # leaving k + 1 candidates; never hand back more than k.
        return neighbour_ids[:k]


In [5]:
# Example usage: build the recommender from the ml-25m CSV files and print
# the titles of the movies most similar to a fuzzy-matched seed title.
recommender = MovieRecommender("ml-25m/ratings.csv", "ml-25m/movies.csv")
movie_id, movie_title = recommender.movie_finder("Toy Story")
recommendations = recommender.find_similar_movies(movie_id)

# Build the id -> title lookup once instead of rescanning the movies frame
# for every recommendation (first row wins if a movieId is repeated).
title_by_id = recommender.movies.drop_duplicates("movieId").set_index("movieId")["title"]

print(f"Because you watched {movie_title}:")
for rec_id in recommendations:  # not `id`, which would shadow the builtin
    print(title_by_id.loc[rec_id])


Because you watched Toy Story (1995):
Star Wars: Episode IV - A New Hope (1977)
Toy Story 2 (1999)
Back to the Future (1985)
Forrest Gump (1994)
Jurassic Park (1993)
Star Wars: Episode VI - Return of the Jedi (1983)
Independence Day (a.k.a. ID4) (1996)
Lion King, The (1994)
Aladdin (1992)
Star Wars: Episode V - The Empire Strikes Back (1980)
Shrek (2001)
Men in Black (a.k.a. MIB) (1997)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
Monsters, Inc. (2001)
Matrix, The (1999)
Pulp Fiction (1994)
Groundhog Day (1993)
Mission: Impossible (1996)
Willy Wonka & the Chocolate Factory (1971)
Shawshank Redemption, The (1994)
Finding Nemo (2003)
Silence of the Lambs, The (1991)
Terminator 2: Judgment Day (1991)
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
E.T. the Extra-Terrestrial (1982)
Braveheart (1995)
Apollo 13 (1995)
Indiana Jones and the Last Crusade (1989)
Beauty and the Beast (1991)
Sixth Sense, The (1999)
Fargo (1996)
Bug's Life, A (1998)
Incredibles, The (20

In [10]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
import ast  # To safely evaluate string literals as lists

class MovieRecommender:
    """Item-based movie recommender with an optional genre filter.

    The ratings CSV must provide ``userId``, ``movieId`` and ``rating``
    columns; the movies CSV must provide ``movieId``, ``title`` and
    ``genres`` columns. Similar movies are found with a brute-force
    nearest-neighbour search over the movies' rating vectors and can then be
    narrowed down to a single genre.
    """

    # Neighbours requested beyond ``k`` so that the query movie itself and
    # candidates rejected by the genre filter can be discarded.
    _EXTRA_CANDIDATES = 10

    def __init__(self, ratings_path, movies_path):
        """Load both CSV files and build the rating matrix and id mappers.

        Args:
            ratings_path: Path of the ratings CSV file.
            movies_path: Path of the movies CSV file; every ``genres`` cell
                must be the text of a Python literal (a list of genre names
                is what the genre filter expects).
        """
        self.ratings = pd.read_csv(ratings_path)
        self.movies = pd.read_csv(movies_path)
        # literal_eval parses the stored text without executing arbitrary code.
        self.movies['genres'] = self.movies['genres'].apply(ast.literal_eval)
        (
            self.matrix,
            self.user_mapper,
            self.movie_mapper,
            self.user_inv_mapper,
            self.movie_inv_mapper,
        ) = self.create_matrix(self.ratings)

    def create_matrix(self, df):
        """Build the sparse user x movie rating matrix and its index mappers.

        Args:
            df: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.

        Returns:
            A 5-tuple ``(matrix, user_mapper, movie_mapper, user_inv_mapper,
            movie_inv_mapper)`` where ``matrix`` is a ``csr_matrix`` of shape
            ``(n_users, n_movies)``, the mappers translate raw ids to
            row/column indices, and the inverse mappers translate indices
            back to raw ids. Indices follow the sorted order of the ids.
        """
        # return_inverse yields each row's dense index directly, so ids are
        # scanned once instead of being looked up row by row in a dict.
        user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
        movie_ids, item_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)
        n_users, n_movies = len(user_ids), len(movie_ids)

        user_mapper = dict(zip(user_ids, range(n_users)))
        movie_mapper = dict(zip(movie_ids, range(n_movies)))
        user_inv_mapper = dict(zip(range(n_users), user_ids))
        movie_inv_mapper = dict(zip(range(n_movies), movie_ids))

        matrix = csr_matrix(
            (df["rating"].to_numpy(), (user_index, item_index)),
            shape=(n_users, n_movies),
        )

        return matrix, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

    def movie_finder(self, title):
        """Fuzzy-match ``title`` against the movies table.

        Args:
            title: Free-text movie title to look for.

        Returns:
            ``(movie_id, matched_title)`` for the closest title found.

        Raises:
            ValueError: If there are no titles to match against.
        """
        all_titles = self.movies['title'].tolist()
        closest_match = process.extractOne(title, all_titles)
        if closest_match is None:
            # extractOne returns None when it has no candidate to offer.
            raise ValueError(f"no movie title could be matched against {title!r}")
        matched_title = closest_match[0]
        # Several rows may share a title; the first one wins.
        title_rows = self.movies['title'] == matched_title
        matched_id = self.movies.loc[title_rows, 'movieId'].iloc[0]
        return matched_id, matched_title

    def find_similar_movies(self, movie_id, k=50, metric='cosine', desired_genre=None):
        """Return the ids of up to ``k`` movies most similar to ``movie_id``.

        Args:
            movie_id: Raw movie id; it must occur in the ratings data.
            k: Maximum number of movies to return.
            metric: Distance metric handed to ``NearestNeighbors``.
            desired_genre: When truthy, keep only neighbours whose ``genres``
                entry contains this value. ``None`` or ``''`` disables the
                filter.

        Returns:
            A list of at most ``k`` raw movie ids, nearest first, never
            containing ``movie_id`` itself. With a genre filter the list can
            be shorter than ``k`` because only ``k + _EXTRA_CANDIDATES``
            neighbours are examined.

        Raises:
            KeyError: If ``movie_id`` has no ratings and therefore no vector.
        """
        if movie_id not in self.movie_mapper:
            raise KeyError(
                f"movie_id {movie_id!r} does not appear in the ratings data, "
                "so it has no rating vector to compare"
            )
        # After transposing, each row is one movie; CSR keeps row access cheap.
        matrix = self.matrix.T.tocsr()
        movie_ind = self.movie_mapper[movie_id]
        movie_vec = matrix[movie_ind]
        # Over-fetch so self-exclusion and genre filtering still leave enough
        # candidates; clamp so small catalogues do not make the estimator
        # reject the request.
        n_neighbors = min(k + self._EXTRA_CANDIDATES, matrix.shape[0])
        kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
        kNN.fit(matrix)
        neighbours = kNN.kneighbors(movie_vec, return_distance=False)
        neighbour_ids = [self.movie_inv_mapper[n] for n in neighbours[0] if n != movie_ind]

        if not desired_genre:
            # The over-fetched surplus must not leak into the result.
            return neighbour_ids[:max(k, 0)]

        # Resolve the genres of all candidates in one pass; the first row wins
        # if a movieId is repeated in the movies table.
        candidates = self.movies[self.movies['movieId'].isin(neighbour_ids)]
        candidates = candidates.drop_duplicates('movieId')
        genres_by_id = dict(zip(candidates['movieId'], candidates['genres']))

        filtered_ids = []
        for candidate_id in neighbour_ids:
            if len(filtered_ids) >= k:
                break
            # Candidates absent from the movies table have no known genre and
            # are skipped rather than aborting the whole query.
            if desired_genre in genres_by_id.get(candidate_id, ()):
                filtered_ids.append(candidate_id)
        return filtered_ids


    # Usage
# Example usage: build the genre-aware recommender and print the titles of
# the movies most similar to a fuzzy-matched seed title.
recommender = MovieRecommender("ml-25m/ratings.csv", "ml-25m/movies2.csv")
movie_id, movie_title = recommender.movie_finder("Toy Story")
# An empty desired_genre is falsy, so no genre filtering is applied here.
recommendations = recommender.find_similar_movies(movie_id, desired_genre='')

# Build the id -> title lookup once instead of rescanning the movies frame
# for every recommendation (first row wins if a movieId is repeated).
title_by_id = recommender.movies.drop_duplicates("movieId").set_index("movieId")["title"]

print(f"Because you watched {movie_title}:")
for rec_id in recommendations:  # not `id`, which would shadow the builtin
    print(title_by_id.loc[rec_id])