In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors  
from fuzzywuzzy import process

In [2]:
movies = "../Labb/Small/movies.csv"
ratings = "../Labb/Small/ratings.csv"

df_movies = pd.read_csv(movies, usecols=['movieId', 'title'], dtype={'movieId': 'int32', 'title': 'str'})
df_ratings = pd.read_csv(ratings, usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [3]:
df_movies['title']

0                                Toy Story (1995)
1                                  Jumanji (1995)
2                         Grumpier Old Men (1995)
3                        Waiting to Exhale (1995)
4              Father of the Bride Part II (1995)
                          ...                    
9737    Black Butler: Book of the Atlantic (2017)
9738                 No Game No Life: Zero (2017)
9739                                 Flint (2017)
9740          Bungo Stray Dogs: Dead Apple (2018)
9741          Andrew Dice Clay: Dice Rules (1991)
Name: title, Length: 9742, dtype: object

In [4]:
# Sparse Matrix
# Example of a sparse matrix
#        Users
#       [4,4,5] A
# Movies[3,3,4]  B == Cos(A,B) => 0.95
#       [3,2,1]

movie_users = df_ratings.pivot(index="movieId", columns="userId", values="rating").fillna(0) 
mat_movies_users = csr_matrix(movie_users.values) # Creates a sparce matrix

In [5]:
mat_movies_users

<9724x610 sparse matrix of type '<class 'numpy.float32'>'
	with 100836 stored elements in Compressed Sparse Row format>

In [6]:
df_movies.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [7]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20) # brute is used because the data is sparse## Examples of difference in calculating for distance in KNN

- Ecludian Distance is the square root of the sum of the squared differences between the two points
- Manhattan distance is the sum of the absolute differences between the two points
- Minkowski Distance is the sum of the absolute differences between the two points raised to the power of n

- Cosine similarity is the dot product of the two vectors divided by the product of the magnitude of the two vectors

In [8]:
model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20) # brute is used because the data is sparse

In [9]:
model_knn.fit(mat_movies_users) # Fitting the model

In [28]:
# Recommender function
# Recommender(movie_name) => [list of recommended movies] recommends to us.

def recommender(movie_name, data, model, n_recommendations):
    idx=process.extractOne(movie_name, df_movies['title'])[2] # Extracting the index of the movie
    print('Movie Selected: ', df_movies['title'][idx], 'Index: ', idx) # Printing the movie and index
    print('Searching for recommendations...') # Printing message
    distances, indices = model.kneighbors(data[idx], n_neighbors=n_recommendations+1) # Extracting distances and indices
    print(distances, indices, idx)
    print()
    for i in indices:
        print(df_movies['title'][i].where(i != idx)) # Printing the recommendations

recommender("The revenant", mat_movies_users, model_knn, 10) # Testing the recommender function

Movie Selected:  The Revenant (2015) Index:  8990
Searching for recommendations...
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]] [[8545 5585 9321 9616 8990 7108 7057 9117 8151 4940 8071]] 8990

8545               Nightcrawler (2014)
5585       Sonatine (Sonachine) (1993)
9321    Pelé: Birth of a Legend (2016)
9616     Maz Jobrani: Immigrant (2017)
8990                               NaN
7108                    G-Force (2009)
7057                   Year One (2009)
9117             Idaho Transfer (1973)
8151                 Iron Man 3 (2013)
4940                Man on Fire (2004)
8071       John Dies at the End (2012)
Name: title, dtype: object


In [25]:
def recommender_tester(movie_name, data, model, n_recommendations):
    idx=process.extractOne(movie_name, df_movies['title'])[2] # Extracting the index of the movie
    print('Movie Selected: ', df_movies['title'][idx], 'Index: ', idx) # Printing the movie and index
    print('Searching for recommendations...') # Printing message
    distances, indices = model.kneighbors(data[idx], n_neighbors=n_recommendations+1) # Extracting distances and indices
    #print(distances, indices)
    for i in range(indices):
        print(df_movies['title'][indices[0]]) # Printing the recommendations

recommender_tester("The revenant", mat_movies_users, model_knn, 10) # Testing the recommender function

Movie Selected:  The Revenant (2015) Index:  8990
Searching for recommendations...


TypeError: only integer scalar arrays can be converted to a scalar index