In [4]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors  
from fuzzywuzzy import process

In [5]:
movies = "../Labb/Small/movies.csv"
ratings = "../Labb/Small/ratings.csv"

df_movies = pd.read_csv(movies, usecols=['movieId', 'title'], dtype={'movieId': 'int32', 'title': 'str'})
df_ratings = pd.read_csv(ratings, usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [6]:
# Sparse Matrix
# Example of a sparse matrix
#        Users
#       [4,4,5] A
# Movies[3,3,4]  B == Cos(A,B) => 0.95
#       [3,2,1]

movie_users = df_ratings.pivot(index="movieId", columns="userId", values="rating").fillna(0) 
mat_movies_users = csr_matrix(movie_users.values) # Creates a sparce matrix

In [7]:
df_movies.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [8]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20) # brute is used because the data is sparse## Examples of difference in calculating for distance in KNN

- Ecludian Distance is the square root of the sum of the squared differences between the two points
- Manhattan distance is the sum of the absolute differences between the two points
- Minkowski Distance is the sum of the absolute differences between the two points raised to the power of n

- Cosine similarity is the dot product of the two vectors divided by the product of the magnitude of the two vectors

In [9]:
model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20) # brute is used because the data is sparse

In [10]:
model_knn.fit(mat_movies_users) # Fitting the model

In [11]:
# Recommender function
# Recommender(movie_name) => [list of recommended movies] recommends to us.

def recommender(movie_name, data, model, n_recommendations):
    model.fit(data) # Fitting the model
    idx=process.extractOne(movie_name, df_movies['title'])[2] # Extracting the index of the movie
    print('Movie Selected: ', df_movies['title'][idx], 'Index: ', idx) # Printing the movie and index
    print('Searching for recommendations...') # Printing message
    distances, indices = model.kneighbors(data[idx], n_neighbors=n_recommendations+1) # Extracting distances and indices
    for i in indices:
        print(df_movies['title'][i].where(i != idx)) # Printing the recommendations

recommender("Princess and the frog", mat_movies_users, model_knn, 20) # Testing the recommender function

Movie Selected:  Princess and the Frog, The (2009) Index:  7207
Searching for recommendations...
7207                                                  NaN
7428                                        Sintel (2010)
7364                                Enter the Void (2009)
8960                             Da geht noch was! (2013)
7377                       Kids Are All Right, The (2010)
6757                                Strangers, The (2008)
8181                            Way, Way Back, The (2013)
7109           Christopher Columbus: The Discovery (1992)
5039    Legend, The (Legend of Fong Sai-Yuk, The) (Fon...
7337                             Cemetery Junction (2010)
8396                                          Noah (2014)
7392                             Two Escobars, The (2010)
8234               Batman: Mystery of the Batwoman (2003)
7112                             It Might Get Loud (2008)
7902    Dragon Ball: Sleeping Princess in Devil's Cast...
8346                     Jack Rya