# **Movie Recommender with implicit feedback**

## using the "implicit" package and Bayesian-Personalized-Ranking algorithm with MovieLens 100k dataset

In [1]:
import pandas as pd
import numpy as np
import implicit
from scipy.sparse import coo_matrix, csr_matrix

### Load dataset from .csv (MovieLens 100k)

Download data from: https://www.kaggle.com/rajmehra03/movielens100k

In [2]:
df_ratings = pd.read_csv("data/ratings.csv", sep = ",")
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
df_movies = pd.read_csv("data/movies.csv", sep = ",")
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# get an array with all titles
titles = np.array(df_movies['title'])
titles

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       ..., 'The Beatles: Eight Days a Week - The Touring Years (2016)',
       'The Gay Desperado (1936)', "Women of '69, Unboxed"], dtype=object)

# Create a sparse matrix from rating data

In [5]:
# remove timestamp column and reorder columns
df_ratings.drop(axis= 1, labels= "timestamp",inplace= True)
df_ratings = df_ratings[df_ratings.movieId < len(titles)]
df_ratings = df_ratings[["movieId","userId","rating"]]

In [6]:
# create a sparse matrix with rows => movieId, column => userId and elements in matrix => rating
row = np.array(df_ratings.movieId)
col = np.array(df_ratings.userId)
data = np.array(df_ratings.rating)

ratings = csr_matrix((data, (row, col)))
ratings

<9019x672 sparse matrix of type '<class 'numpy.float64'>'
	with 84739 stored elements in Compressed Sparse Row format>

In [7]:
# All ratings under 4 stars are 0 (only 4 and 5 is positive feedback)
ratings.data[ratings.data < 4] = 0
ratings

<9019x672 sparse matrix of type '<class 'numpy.float64'>'
	with 84739 stored elements in Compressed Sparse Row format>

In [8]:
# remove entries with zero value
ratings.eliminate_zeros()
ratings.data

array([4., 5., 5., ..., 5., 4., 5.])

In [9]:
# make all 4 or 5 values to 1
ratings.data = np.ones(len(ratings.data))
ratings.data

array([1., 1., 1., ..., 1., 1., 1.])

# Train BPR Model

In [10]:
model = implicit.bpr.BayesianPersonalizedRanking()
model.fit(ratings)

100%|██████████| 100/100 [00:00<00:00, 110.84it/s, correct=91.07%, skipped=17.84%]


# Find similiar movies

In [11]:
tosort = np.arange(len(titles))
user_count = np.ediff1d(ratings.indptr)
user_count

array([  0, 159,  41, ...,   0,   0,   1], dtype=int32)

In [12]:
tosort = np.arange(len(titles))
user_count = np.ediff1d(ratings.indptr)

to_generate = sorted(tosort[0:(len(user_count)-1)], key=lambda x: -user_count[x])
to_generate[0:10]

[318, 296, 356, 593, 260, 527, 2571, 1196, 608, 2858]

In [13]:
l_similiar_items = []

for movieid in to_generate:
    # if this movie has no ratings, skip over (for instance 'Graffiti Bridge' has
    # no ratings > 4 meaning we've filtered out all data for it.
    if ratings.indptr[movieid] != ratings.indptr[movieid + 1]:
        title = titles[movieid]
        for other, score in model.similar_items(movieid, 3):
            #print(movieid, "\t ",title, " - ", titles[other], " - ", score)
            l_similiar_items.append([movieid,title,titles[other],score])

In [14]:
l_similiar_items[0:12]

[[318, 'Crow, The (1994)', 'Crow, The (1994)', 2.882679],
 [318, 'Crow, The (1994)', 'Red Rock West (1992)', 2.1094866],
 [318,
  'Crow, The (1994)',
  'Scream of Stone (Cerro Torre: Schrei aus Stein) (1991)',
  1.8931646],
 [296, 'Tom & Viv (1994)', 'Tom & Viv (1994)', 2.9374273],
 [296, 'Tom & Viv (1994)', 'When Night Is Falling (1995)', 2.3656325],
 [296, 'Tom & Viv (1994)', 'Lamerica (1994)', 2.2914941],
 [356, 'Mirage (1995)', 'Mirage (1995)', 2.72784],
 [356, 'Mirage (1995)', 'Batman (1989)', 1.6823716],
 [356,
  'Mirage (1995)',
  'Mrs. Parker and the Vicious Circle (1994)',
  1.6592661],
 [593, 'Angus (1995)', 'Angus (1995)', 2.5134737],
 [593, 'Angus (1995)', 'Tom & Viv (1994)', 1.9215567],
 [593, 'Angus (1995)', 'When Night Is Falling (1995)', 1.8807013]]

In [15]:
# Search for specific movies
for entry in l_similiar_items:
    if "Juman" in entry[1]:
        print(entry)

[1, 'Jumanji (1995)', 'Jumanji (1995)', 2.6192956]
[1, 'Jumanji (1995)', 'Crime and Punishment in Suburbia (2000)', 1.8780766]
[1, 'Jumanji (1995)', '2001: A Space Odyssey (1968)', 1.6143974]


# Find similiar users

In [16]:
l_users = np.unique(df_ratings.userId)
l_users[0:10]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10], dtype=int64)

In [17]:
l_similar_users = []

for user in l_users:
        for other, score in model.similar_users(user, 3):
            l_similar_users.append([user,other,score])

In [18]:
pd.DataFrame(l_similar_users[0:20], columns = ["userId","similar_userId","score"])

Unnamed: 0,userId,similar_userId,score
0,1,1,1.182443
1,1,581,1.069442
2,1,549,1.068702
3,2,2,2.41716
4,2,354,2.370593
5,2,121,2.364691
6,3,3,1.420081
7,3,353,1.245931
8,3,29,1.245247
9,4,4,3.446882


# **Make recommendations for users**

In [19]:
user_items = ratings.T.tocsr() # transpose matrix to get user_item from item_user
user_items

<672x9019 sparse matrix of type '<class 'numpy.float64'>'
	with 44339 stored elements in Compressed Sparse Row format>

In [20]:
dict_user_recommendations = dict()
number_of_recommendations = 10 # number of recommendations per user

# make recommendations for all users
for user in l_users:
    dict_user_recommendations[user] = model.recommend(user, user_items, N=number_of_recommendations, filter_already_liked_items=True, filter_items=None, recalculate_user=False)

In [21]:
# create dictionary with UserId as key and a list of recommendations as values (with movienames)
dict_user_recommendations_names = dict()

for user in dict_user_recommendations:
    num_recommendation = 0
    dict_user_recommendations_names[user] = list(range(0,number_of_recommendations))
    for recommendation in dict_user_recommendations[user]:
        dict_user_recommendations_names[user][num_recommendation] = titles[recommendation[0]] # translate movieid to moviename
        num_recommendation = num_recommendation + 1

## Show example recommendations

In [22]:
dict_user_recommendations_names[1] # example entry

['Halfmoon (Paul Bowles - Halbmond) (1995)',
 'Swingers (1996)',
 'Once Were Warriors (1994)',
 'Private Parts (1997)',
 'Daytrippers, The (1996)',
 'Monty Python and the Holy Grail (1975)',
 'In Search of the Castaways (1962)',
 'Van, The (1996)',
 'Abyss, The (1989)',
 'Gabbeh (1996)']

In [23]:
dict_user_recommendations_names[581] # example entry (most similar user to 1)

['Once Were Warriors (1994)',
 'Halfmoon (Paul Bowles - Halbmond) (1995)',
 'Swingers (1996)',
 'Quest, The (1996)',
 'Abyss, The (1989)',
 'Goodfellas (1990)',
 'Monty Python and the Holy Grail (1975)',
 'People vs. Larry Flynt, The (1996)',
 'My Own Private Idaho (1991)',
 'Bambi (1942)']

In [24]:
dict_user_recommendations_names[368] # example entry (second similar user to 1)

['Halfmoon (Paul Bowles - Halbmond) (1995)',
 'Tom & Viv (1994)',
 'Anaconda (1997)',
 'Switchback (1997)',
 'Austin Powers: International Man of Mystery (1997)',
 'In Search of the Castaways (1962)',
 'Hunted, The (1995)',
 'Lamerica (1994)',
 'Babe (1995)',
 'Van, The (1996)']

### Show intersection between the recommendations for similar users

In [25]:
def intersect(a, b):
    return list(set(a) & set(b))

intersection = (intersect(dict_user_recommendations_names[1], dict_user_recommendations_names[581])) # most similar to user 1
print("Same recommendations for both users: ", intersection)
print("Number of same recommendations for both users: ", len(intersection))

Same recommendations for both users:  ['Once Were Warriors (1994)', 'Abyss, The (1989)', 'Monty Python and the Holy Grail (1975)', 'Halfmoon (Paul Bowles - Halbmond) (1995)', 'Swingers (1996)']
Number of same recommendations for both users:  5


In [26]:
intersection = (intersect(dict_user_recommendations_names[1], dict_user_recommendations_names[368])) # second most similar to user 1
print("Same recommendations for both users: ", intersection)
print("Number of same recommendations for both users: ", len(intersection))

Same recommendations for both users:  ['In Search of the Castaways (1962)', 'Halfmoon (Paul Bowles - Halbmond) (1995)', 'Van, The (1996)']
Number of same recommendations for both users:  3


In [27]:
from implicit.evaluation import precision_at_k, train_test_split,mean_average_precision_at_k

In [28]:
train, test = train_test_split(ratings)

In [40]:
model2 = implicit.bpr.BayesianPersonalizedRanking(iterations = 50)
model2.fit(train)

100%|██████████| 50/50 [00:00<00:00, 126.35it/s, correct=73.65%, skipped=14.42%]


In [41]:
# explaination of precicion at k: https://medium.com/@m_n_malaeb/recall-and-precision-at-k-for-recommender-systems-618483226c54
p = precision_at_k(model2, train.T.tocsr(), test.T.tocsr(), K=10, num_threads=8)
p

100%|██████████| 672/672 [00:00<00:00, 2939.75it/s]


0.19128831528001844