In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
from IPython.display import display 

# Import Data

In [2]:
# Read only the columns that later cells use from each CSV.
df_movies = pd.read_csv(
    "assets/movies.csv",
    usecols=["movieId", "title"],
)
df_ratings = pd.read_csv(
    "assets/ratings.csv",
    usecols=["userId", "movieId", "rating"],
)

# For each frame: row count (thousands-separated) followed by a preview of the first rows.
print(f"Movies dataframe: ({len(df_movies):,} rows)")
display(df_movies.head())

print(f"Ratings dataframe: ({len(df_ratings):,} rows)")
display(df_ratings.head())

Movies dataframe: (58,098 rows)


Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


Ratings dataframe: (27,753,444 rows)


Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5


# Create Sparse Matrix

In [3]:
# in part credit to Kevy / ChatGPT

# Row i of the matrix has to describe the movie stored at position i of df_movies,
# so the movie categories come from df_movies rather than from the ratings themselves
# (this is what fixed the earlier index-mismatch error).
movies_cat = pd.Categorical(df_ratings['movieId'], categories=df_movies['movieId'])
users_cat = pd.Categorical(df_ratings['userId'])

# create sparse matrix (rows = movies, columns = users, values = ratings)
# The shape is passed explicitly: without it csr_matrix infers the dimensions from the
# largest code that actually occurs, so movies at the end of df_movies that have no
# ratings would be missing from the matrix and its row count would no longer match
# len(df_movies).
mat_movies_users = csr_matrix(
    (df_ratings['rating'], (movies_cat.codes, users_cat.codes)),
    shape=(len(movies_cat.categories), len(users_cat.categories)),
)

mat_movies_users

<58098x283228 sparse matrix of type '<class 'numpy.float64'>'
	with 27753444 stored elements in Compressed Sparse Row format>

# Set up Model

In [4]:
# Cosine similarity is the distance metric; the "brute" algorithm compares the
# query against every vector.
model_KNN = NearestNeighbors(algorithm="brute", metric="cosine")
model_KNN.fit(mat_movies_users)  # fit on the sparse matrix

# Recommendation Function

In [5]:

def recommender(movie_name, model = model_KNN, data = mat_movies_users, n_recommendations = 9):
    """
    Print the movies whose rating vectors are closest to the best title match.

    movie_name: free-text title; fuzzy-matched against df_movies["title"]
    model: fitted neighbours model exposing kneighbors()
    data: matrix whose row idx holds the rating vector of the movie at idx in df_movies
    n_recommendations: number of titles to print

    Prints the selected title followed by a numbered list of neighbours, closest
    first. Returns None.
    """
    # Third element of the fuzzywuzzy result is used as the key of the matched title.
    # NOTE(review): it is used both as a df_movies label and as a row of `data`,
    # which assumes df_movies keeps its default 0..n-1 index.
    idx = process.extractOne(movie_name, df_movies["title"])[2]

    # Ask for one extra neighbour because the selected movie is normally among its
    # own nearest neighbours.
    _distances, indices = model.kneighbors(data[idx], n_neighbors = n_recommendations + 1)

    # Remove the selected movie explicitly instead of assuming it is the first hit:
    # movies with identical rating vectors tie at distance 0, so the query may appear
    # at any position among them (or not at all). Truncating afterwards keeps the
    # list at n_recommendations in every case.
    neighbours = [i for i in indices.flatten() if i != idx][:n_recommendations]

    # print results:
    print(f"Movie Selected: \"{df_movies.loc[idx]['title']}\"\n") # selected movie title
    for rank, i in enumerate(neighbours, start = 1): # closest to farthest
        print(f"{rank}. {df_movies.loc[i]['title']}")

# Test function for index mismatch search

In [6]:
def test(indices):
    """
    test that indices align properly, insert any iterable of multiple indices to check
    """
    for idx in indices:
        id = df_movies.loc[idx]["movieId"]
        sum_dataframe = df_ratings[df_ratings["movieId"] == id]["rating"].sum()
        sum_matrix = mat_movies_users[idx].sum()

        errors = 0

        if sum_dataframe != sum_matrix:
            print(f"Comparing index {idx}, result: {sum_dataframe == sum_matrix} ({sum_dataframe, sum_matrix})")
            errors += 1
        
    if errors == 0:
        print("All indices tested with no mismatches")
    print(f"{errors} errors found")

In [7]:
# Spot-check a contiguous run of row indices: 9700 up to and including 9741.
indices = list(range(9700, 9742))
test(indices)

All indices tested with no mismatches
0 errors found


# Enter a movie to get recommendations

In [8]:
# A partial title is enough: the name is fuzzy-matched against the movie titles.
recommender(movie_name="amityville")

Movie Selected: "Amityville 1992: It's About Time (1992)"

1. Amityville: A New Generation (1993)
2. Amityville: Dollhouse (1996)
3. Amityville Curse, The (1990)
4. Amityville 3-D (1983)
5. Amityville II: The Possession (1982)
6. Children of the Corn II: The Final Sacrifice (1993)
7. Children of the Corn III (1994)
8. Body Parts (1991)
9. Gate II: Trespassers, The (1990)
