In [1]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors  
from fuzzywuzzy import process

In [2]:
movies = "../Labb/Small/movies.csv"
ratings = "../Labb/Small/ratings.csv"

df_movies = pd.read_csv(movies, usecols=['movieId', 'title'], dtype={'movieId': 'int32', 'title': 'str'})
df_ratings = pd.read_csv(ratings, usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [3]:
df_movies['title']

0                                Toy Story (1995)
1                                  Jumanji (1995)
2                         Grumpier Old Men (1995)
3                        Waiting to Exhale (1995)
4              Father of the Bride Part II (1995)
                          ...                    
9737    Black Butler: Book of the Atlantic (2017)
9738                 No Game No Life: Zero (2017)
9739                                 Flint (2017)
9740          Bungo Stray Dogs: Dead Apple (2018)
9741          Andrew Dice Clay: Dice Rules (1991)
Name: title, Length: 9742, dtype: object

In [4]:
# merge the the title column from df_movies to df_ratings

df = pd.merge(df_ratings, df_movies, on='movieId')

In [5]:
df

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)
...,...,...,...,...
100831,610,160341,2.5,Bloodmoon (1997)
100832,610,160527,4.5,Sympathy for the Underdog (1971)
100833,610,160836,3.0,Hazard (2005)
100834,610,163937,3.5,Blair Witch (2016)


In [6]:
# Sparse Matrix
# Example of a sparse matrix
#        Users
#       [4,4,5] A
# Movies[3,3,4]  B == Cos(A,B) => 0.95
#       [3,2,1]

movie_users = df.pivot(index="movieId", columns="userId", values="rating").fillna(0) 
mat_movies_users = csr_matrix(movie_users.values) # Creates a sparce matrix

model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20) # brute is used because the data is sparse## Examples of difference in calculating for distance in KNN

- Ecludian Distance is the square root of the sum of the squared differences between the two points
- Manhattan distance is the sum of the absolute differences between the two points
- Minkowski Distance is the sum of the absolute differences between the two points raised to the power of n

- Cosine similarity is the dot product of the two vectors divided by the product of the magnitude of the two vectors

In [7]:
model_knn = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20) # brute is used because the data is sparse

In [8]:
model_knn.fit(mat_movies_users) # Fitting the model

In [9]:

def recommender(movie_name, data, model, n_recommendations):
    model.fit(data) # Fitting the model
    idx=process.extractOne(movie_name, df_movies['title'])[2] # Extracting the index of the movie
    print('Movie Selected: ', df_movies['title'][idx], 'Index: ', idx) # Printing the movie and index
    print('Searching for recommendations...') # Printing message
    distances, indices = model.kneighbors(data[idx], n_neighbors=n_recommendations+1) # Extracting distances and indices
    for i in indices:
        print(df_movies['title'][i].where(i != idx)) # Printing the recommendations

recommender("star trek", mat_movies_users, model_knn, 10) # Testing the recommender function



Movie Selected:  Star Trek: Generations (1994) Index:  287
Searching for recommendations...
287                                  NaN
275                      Stargate (1994)
307      Clear and Present Danger (1994)
509                        Batman (1989)
126                Batman Forever (1995)
337                     True Lies (1994)
378                   Cliffhanger (1993)
398                 Fugitive, The (1993)
508            Dances with Wolves (1990)
418                 Jurassic Park (1993)
138    Die Hard: With a Vengeance (1995)
Name: title, dtype: object


In [10]:
df_ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [11]:
movie_users

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
df_movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [15]:
import numpy as np


def test(indices):
    for idx in indices:
        id = df_movies.loc[idx]["movieId"]
        sum_dataframe = df_ratings[df_ratings["movieId"] == id]["rating"].sum()
        sum_matrix = mat_movies_users[idx].sum()

        print(f"Comparing index {idx}, result: {sum_dataframe == sum_matrix}")

indices = np.random.randint(800, 820, 10)
test(indices)

Comparing index 807, result: True
Comparing index 811, result: True
Comparing index 807, result: True
Comparing index 805, result: True
Comparing index 817, result: False
Comparing index 801, result: True
Comparing index 803, result: True
Comparing index 805, result: True
Comparing index 811, result: True
Comparing index 817, result: False
