# 1.0 Recommender system - warm up

In [12]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process                      # matches strings even though not completly same

data from https://grouplens.org/datasets/movielens/latest/

In [13]:
df_movies = pd.read_csv('../lab/data/movies_small.csv', usecols=['movieId', 'title'], dtype={'movieID':'int32', 'title': 'str'})
df_ratings = pd.read_csv('../lab/data/ratings_small.csv', usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [14]:
df_movies.head(3)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)


In [15]:
df_ratings.head(3)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0


In [16]:
# creating a pivot table with NaNs and replcing NaN with 0 by fillna(0)
movies_users = df_ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)
movies_users.head(3)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0


In [17]:
movies_users.values

# each row is a vector

array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]], dtype=float32)

In [18]:
# crating a sparse matrix
# indexes position in table (row, col) and gives value for all non-zero entries
matrix_movies_users = csr_matrix(movies_users.values) # same result with just (movies_users)
print(type(matrix_movies_users))
print(matrix_movies_users.shape)
print(matrix_movies_users[0]) 

<class 'scipy.sparse._csr.csr_matrix'>
(9724, 610)
  (0, 0)	4.0
  (0, 4)	4.0
  (0, 6)	4.5
  (0, 14)	2.5
  (0, 16)	4.5
  (0, 17)	3.5
  (0, 18)	4.0
  (0, 20)	3.5
  (0, 26)	3.0
  (0, 30)	5.0
  (0, 31)	3.0
  (0, 32)	3.0
  (0, 39)	5.0
  (0, 42)	5.0
  (0, 43)	3.0
  (0, 44)	4.0
  (0, 45)	5.0
  (0, 49)	3.0
  (0, 53)	3.0
  (0, 56)	5.0
  (0, 62)	5.0
  (0, 63)	4.0
  (0, 65)	4.0
  (0, 67)	2.5
  (0, 70)	5.0
  :	:
  (0, 559)	3.0
  (0, 560)	4.0
  (0, 561)	4.5
  (0, 566)	3.5
  (0, 569)	4.0
  (0, 571)	4.0
  (0, 572)	5.0
  (0, 578)	4.0
  (0, 579)	3.0
  (0, 583)	5.0
  (0, 586)	5.0
  (0, 589)	4.0
  (0, 595)	4.0
  (0, 596)	4.0
  (0, 598)	3.0
  (0, 599)	2.5
  (0, 600)	4.0
  (0, 602)	4.0
  (0, 603)	3.0
  (0, 604)	4.0
  (0, 605)	2.5
  (0, 606)	4.0
  (0, 607)	2.5
  (0, 608)	3.0
  (0, 609)	5.0


In [19]:
# create model:
# metrics: distance (types: Euclidean, Manhattan, Minkowski) or cosine similarity between 2 vectors (rows) -> 0 - 1 where 1 is totally equal
# algoritm: brute compares one vector with all other in dataset
# guessing 20 neighbours

model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)

In [20]:
# train model

print(model_knn.fit(matrix_movies_users))

NearestNeighbors(algorithm='brute', metric='cosine', n_neighbors=20)


In [21]:
# create function recommender(movie_name) -> list of recommended movies
# data is matrix_movies_users
# idx is index: fuzzywuzzy process used

def recommender(movie_name):
    idx = process.extractOne(movie_name, df_movies['title'])
    print(idx)

recommender('grumpier old men')

('Grumpier Old Men (1995)', 95, 2)


In [22]:
df_movies['title'][2]

'Grumpier Old Men (1995)'

In [25]:
def recommender(movie_name, data, model, n_recommendations):
    model.fit(data)
    idx = process.extractOne(movie_name, df_movies['title'])[2] # to get 2 in example for 'grumpier old men', index in df_movies
    print(f'Movie selected:  {df_movies["title"][idx]};  Index:  {idx}')
    print('Searching for recommendations....')
    distances, indices = model.kneighbors(data[idx], n_neighbors = n_recommendations+1) # distances is cos similarity
    print(distances, indices)
    print([df_movies["title"][i].where(i!=idx) for i in indices]) # list of movie recommendations based on indices; != skip incex for movie_name

recommender('Rocky', matrix_movies_users, model_knn, 5)


Movie selected:  Rocky (1976);  Index:  1431
Searching for recommendations....
[[0.         0.43333    0.4694751  0.4910149  0.50314593 0.5138645 ]] [[1431 1415 2579  728 3263 1630]]
[1431                             NaN
1415             Going My Way (1944)
2579    Good Morning, Vietnam (1987)
728                     Giant (1956)
3263           Something Wild (1986)
1630                     Rope (1948)
Name: title, dtype: object]


In [24]:
print(f' {df_movies["title"][2]}')

 Grumpier Old Men (1995)
