In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix

from sklearn.neighbors import NearestNeighbors

from fuzzywuzzy import process

from IPython.display import display # used for displaying multiple dataframes in same cell, or in print statements

# Read in Dataset

In [2]:
# Load only the columns the recommender needs from each CSV file.
df_movies = pd.read_csv(
    "assets/movies.csv",
    usecols = ["movieId", "title"],
)
df_ratings = pd.read_csv(
    "assets/ratings.csv",
    usecols = ["userId", "movieId", "rating"],
)

# Preview each dataframe: a formatted row count followed by its first rows.
for title, dataframe in (("Movies", df_movies), ("Ratings", df_ratings)):
    print(f"{title} dataframe: ({len(dataframe):,} rows)")
    display(dataframe.head())
    print("")

# TODO: consider changing datatypes

Movies dataframe: (58,098 rows)


Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)



Ratings dataframe: (27,753,444 rows)


Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5





# Create Sparse Matrix

In [3]:
# credit Kevy

# Encode the ids as categoricals: `.codes` gives each rating a dense
# 0-based position, `.categories` keeps the original id for each position.
movies = pd.Categorical(df_ratings["movieId"])
users = pd.Categorical(df_ratings["userId"])

# Build the movie x user matrix in COO style (values, (row, column));
# rows follow the movie codes and columns follow the user codes.
mat_movies_users = csr_matrix(
    (
        df_ratings["rating"],
        (movies.codes, users.codes),
    )
)

# show the matrix summary (shape, dtype, number of stored ratings)
mat_movies_users

<53889x283228 sparse matrix of type '<class 'numpy.float64'>'
	with 27753444 stored elements in Compressed Sparse Row format>

# Create Model (KNN)

In [4]:
# Neighbours are ranked by cosine distance, i.e. by the angle between two
# movies' rating vectors. The "brute" algorithm compares the query against
# every movie in the fitted matrix.
model_KNN = NearestNeighbors(
    metric = "cosine",
    algorithm = "brute",
)

# fit the model on the movie x user sparse matrix
model_KNN.fit(mat_movies_users)

# Recommend Movies

In [5]:
def recommender(movie_name, data = mat_movies_users, model = model_KNN, n_recommendations = 9, debug_mode = False):
    """Print the movies rated most similarly to the best title match.

    The input string is fuzzy-matched against ``df_movies["title"]``; the
    matched movie's row of ``data`` is then passed to ``model.kneighbors``
    and the titles of the nearest rows are printed, closest first.

    Parameters
    ----------
    movie_name : str
        Free-text title to look up.
    data : sparse matrix
        Movie x user rating matrix. Its rows must be in the order of
        ``movies.categories`` (the categorical built from
        ``df_ratings['movieId']``), as ``mat_movies_users`` is.
    model : NearestNeighbors
        Model already fitted on ``data``.
    n_recommendations : int
        Number of titles to print.
    debug_mode : bool
        When True, also print the intermediate indices and distances.

    Returns
    -------
    None
        Results are printed, not returned.
    """

    # match input string with closest resembling title in dataframe
    # extractOne on a Series returns a tuple formatted as ("title", score, index)
    # where score is title similarity, and index is the label of the matched row in df_movies
    idx = process.extractOne(movie_name, df_movies["title"])[2]
    movie_id = df_movies.loc[idx, "movieId"]
    selected_title = df_movies.loc[idx, "title"]

    # Rows of the sparse matrix follow the categorical codes of the rated
    # movieIds, NOT the df_movies index: df_movies also lists movies without
    # any rating, so the two numberings drift apart. Translate through the
    # movieId to find the matching matrix row.
    row_movie_ids = movies.categories
    row = row_movie_ids.get_indexer([movie_id])[0]

    print(f"Movie Selected: \"{selected_title}\"\n") # selected movie title

    if row == -1: # get_indexer returns -1 when the movieId has no row in the matrix
        print("No ratings found for this movie, so no recommendations can be made.")
        return

    # calculating closest vectors (cosine distance) to the selected row of the sparse matrix
    distances, indices = model.kneighbors(data[row], n_neighbors = n_recommendations + 1) # +1 to compensate for excluding the selected movie itself

    # flatten results (from [[0, 1, 2, 3]] to [0, 1, 2, 3])
    distances = distances.flatten()
    indices = indices.flatten()

    # drop the selected movie by row number rather than by position: with
    # tied distances it is not guaranteed to be the first neighbour returned
    not_self = indices != row
    distances = distances[not_self][:n_recommendations]
    indices = indices[not_self][:n_recommendations]

    if debug_mode: # print some more information if debug mode is set to True
        print(idx) # df_movies index label of matched title
        print(row) # sparse matrix row of matched title
        print(indices) # sparse matrix rows of closest n titles
        print(distances) # cosine distance of each of those rows to the input title
        print("")

    # print results, translating matrix rows back to titles through the movieId
    title_by_movie_id = df_movies.set_index("movieId")["title"]
    for rank, neighbour_row in enumerate(indices, start = 1): # closest to farthest
        neighbour_id = row_movie_ids[neighbour_row]
        neighbour_title = title_by_movie_id.get(neighbour_id, f"<unknown movieId {neighbour_id}>")
        print(f"{rank}. {neighbour_title}")

### Enter a movie title to get recommendations

In [6]:
# look up the closest title to the search string and print its recommendations
recommender(movie_name = "amityville")

Movie Selected: "Amityville 1992: It's About Time (1992)"

1. Amityville: A New Generation (1993)
2. Amityville: Dollhouse (1996)
3. Amityville Curse, The (1990)
4. Amityville 3-D (1983)
5. Amityville II: The Possession (1982)
6. Children of the Corn II: The Final Sacrifice (1993)
7. Children of the Corn III (1994)
8. Body Parts (1991)
9. Gate II: Trespassers, The (1990)
