### Stage 1. Data preparation

In [34]:
# import
import pandas as pd
import numpy as np

# scipy library sparse module will be needed 
# for working with sparse matrices (more on that below)
from scipy.sparse import csr_matrix

# from sklearn we import k-nearest neighbors algorithm
from sklearn.neighbors import NearestNeighbors

In [35]:
# load both tables into dataframes; the files are addressed by relative path,
# so they must already be available in the working directory
# compression = 'infer' (the pandas default) detects the zip archive from the file extension
movies = pd.read_csv('movies.csv', compression = 'infer')
ratings = pd.read_csv('ratings.csv.zip', compression = 'infer')

In [36]:
# drop the genres column, which this notebook does not use
# the result is assigned back (instead of inplace = True) and errors = 'ignore' is set,
# so re-running this cell does not raise a KeyError once the column is already gone
movies = movies.drop(columns = ['genres'], errors = 'ignore')

# preview the first rows of the movies table
movies.head(3)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)


In [37]:
# same for the ratings table: the timestamp column is not needed
# assigning the result with errors = 'ignore' keeps the cell safe to re-run
# (an inplace drop raises a KeyError the second time, when the column no longer exists)
ratings = ratings.drop(columns = ['timestamp'], errors = 'ignore')

# preview the first rows of the ratings table
ratings.head(3)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0


In [38]:
# build the preference matrix with the pivot function (a pivot table):
# one row per movie (movieId), one column per user (userId), cells hold the rating
# (movie, user) pairs without a rating come out as NaN
user_item_matrix = ratings.pivot(
    index = 'movieId',
    columns = 'userId',
    values = 'rating',
)
user_item_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [39]:
# a missing rating (NaN) is replaced by zero
# fillna returns the filled dataframe, which we bind back to the same name
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
# look at the dimensions of the matrix; the pivot put movieId on the rows
# and userId on the columns, so the shape reads (movies, users)
user_item_matrix.shape

(9724, 610)

#### Now let's remove inactive users and movies with few ratings

In [41]:
# number of ratings each movie received: group by movieId and count the rating column
movies_votes = ratings.groupby('movieId')['rating'].count()

# the same count, this time per user
users_votes = ratings.groupby('userId')['rating'].count()

In [42]:
# ids that pass the activity thresholds (despite the name, each "mask" is an Index of labels):
# movies rated more than 10 times and users who gave more than 50 ratings
movie_mask = movies_votes.index[(movies_votes > 10).to_numpy()]
user_mask = users_votes.index[(users_votes > 50).to_numpy()]

In [43]:
# keep only the rows of sufficiently rated movies and the columns of active users,
# selecting on both axes in a single .loc call
user_item_matrix = user_item_matrix.loc[movie_mask, user_mask]

In [44]:
# see how many movies (rows) and users (columns) are left after filtering
user_item_matrix.shape

(2121, 378)

#### We have almost completed the first stage. What remains is to convert our sparse matrix to compressed sparse row (CSR) format using the csr_matrix function of the SciPy library.

In [45]:
# pack the preference table into SciPy's compressed sparse row (CSR) format
# to_numpy() hands csr_matrix the bare array of values, without index or column labels
csr_data = csr_matrix(user_item_matrix.to_numpy())

# print the stored (non-zero) entries of the first 2 rows and first 5 columns
# as "(row, column) value" and compare them with the table above
print(csr_data[:2, :5])

  (0, 0)	4.0
  (0, 3)	4.5
  (1, 2)	4.0


In [46]:
# move movieId out of the index into an ordinary column, leaving a 0..n-1 row index,
# so that a movie's row position can be found from its movieId
# the 'userId' label on the column axis is cleared first
user_item_matrix = (
    user_item_matrix
    .rename_axis(None, axis = 'columns')
    .reset_index()
)
user_item_matrix.head()

Unnamed: 0,movieId,1,4,6,7,10,11,15,16,17,...,600,601,602,603,604,605,606,607,608,610
0,1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
1,2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
2,3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
4,6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0


### Stage 2. Model training

In [47]:
# nearest-neighbour search over the movie rows: cosine distance, exhaustive (brute-force)
# comparison, n_jobs = -1 to use all available processors
# n_neighbors = 20 is only the default for queries that do not pass their own n_neighbors
knn = NearestNeighbors(
    n_neighbors = 20,
    metric = 'cosine',
    algorithm = 'brute',
    n_jobs = -1,
)

# fit the model on the sparse rating matrix
knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

### Stage 3. Making recommendations

#### Set the initial search parameters

In [48]:
# text to look for in the movie titles: the recommendations are based on the movie it finds
search_word = 'Matrix'
# number of recommendations we want to receive
recommendations = 10

#### Find the movie index in the preference matrix

In [49]:
# find the rows of the movies dataframe whose title contains the search text
# regex = False makes the match literal: titles end in a "(year)" suffix, and with the
# default regex matching a query such as "Toy Story (1995)" would never match
# na = False treats a missing title as "no match" instead of breaking the boolean filter
movie_search = movies[movies['title'].str.contains(search_word, regex = False, na = False)]
movie_search

Unnamed: 0,movieId,title
1939,2571,"Matrix, The (1999)"
4351,6365,"Matrix Reloaded, The (2003)"
4639,6934,"Matrix Revolutions, The (2003)"


In [50]:
# there can be several matches; for simplicity we take the first one that is still
# present in the preference matrix (movies with too few ratings were filtered out of it,
# so the very first title match is not guaranteed to be there)
rated_matches = movie_search.loc[
    movie_search['movieId'].isin(user_item_matrix['movieId']), 'movieId'
]
if rated_matches.empty:
    raise ValueError(f"no movie matching {search_word!r} is present in the preference matrix")

# translate that movieId into the row position inside the preference matrix
# csr_data was built from the same table, so this is also the movie's row in csr_data
# flatnonzero gives a true position, independent of the dataframe's index labels
movie_id = int(np.flatnonzero((user_item_matrix['movieId'] == rated_matches.iloc[0]).to_numpy())[0])
movie_id

901

#### Finding similar films

In [51]:
# query the fitted model with the row of our movie
# one neighbour more than the number of recommendations is requested,
# because the query movie itself is part of the fitted data and is returned as well
distances, indices = knn.kneighbors(
    X = csr_data[movie_id],
    n_neighbors = recommendations + 1,
)

In [52]:
# row indexes (positions in csr_data) of the neighbours found for the query movie
indices

array([[ 901, 1002,  442,  454,  124,  735,  954, 1362, 1157, 1536,  978]],
      dtype=int64)

In [53]:
# cosine distances to those neighbours, in the same order as the indexes
distances

array([[0.        , 0.22982441, 0.25401128, 0.27565617, 0.27760886,
        0.28691008, 0.29111012, 0.31393358, 0.31405926, 0.31548004,
        0.31748544]])

In [54]:
# kneighbors() returns 2-D arrays with one row per query; we passed a single movie,
# so row 0 holds everything and tolist() turns it into a plain list
# taking row 0 (rather than squeeze()) still gives a list when only one neighbour
# is returned, where squeeze() would collapse the array to a scalar and break zip()
indices_list = indices[0].tolist()
distances_list = distances[0].tolist()

# then, using the zip and list functions, pair each neighbour index with its distance
indices_distances = list(zip(indices_list, distances_list))

# each element is a tuple
print(type(indices_distances[0]))

# and look at the first three pairs/tuples
print(indices_distances[:3])

<class 'tuple'>
[(901, 0.0), (1002, 0.22982440568634488), (442, 0.25401128310081567)]


In [55]:
# drop the query movie itself by its row index, not by assuming it is the first element:
# another movie with the same cosine direction also has distance 0 and may sort ahead of it
# then sort by distance (the second element of each tuple) in ascending order
# and keep at most `recommendations` entries
indices_distances_sorted = sorted(
    (pair for pair in indices_distances if pair[0] != movie_id),
    key = lambda pair: pair[1],
)[:recommendations]
indices_distances_sorted

[(1002, 0.22982440568634488),
 (442, 0.25401128310081567),
 (454, 0.27565616686043737),
 (124, 0.2776088577731709),
 (735, 0.2869100842838125),
 (954, 0.2911101181714415),
 (1362, 0.31393358217709477),
 (1157, 0.31405925934381695),
 (1536, 0.3154800434449465),
 (978, 0.31748544046311844)]

#### It remains to find which films correspond to the indexes we found

In [56]:
# movieId -> title lookup, built once instead of filtering the movies dataframe
# on every iteration; drop_duplicates keeps the first title if a movieId repeats
titles_by_id = movies.drop_duplicates('movieId').set_index('movieId')['title']

# for every (row index, distance) pair:
#   - read the movieId stored in that row of the preference matrix (taken from the
#     movieId column directly, so it keeps its integer type)
#   - look up the title for that movieId
#   - store the title and distance as a dictionary, one per recommended movie
recom_list = [
    {'Title' : titles_by_id.loc[user_item_matrix['movieId'].iloc[row]], 'Distance' : dist}
    for row, dist in indices_distances_sorted
]

In [57]:
# look at the first element: a dictionary with the keys 'Title' and 'Distance'
recom_list[0]

{'Title': 'Fight Club (1999)', 'Distance': 0.22982440568634488}

In [58]:
# it remains to convert our list to a dataframe
# the index starts from 1, as it should for a ranking; its length is taken from the list
# itself, so the cell also works when fewer than `recommendations` movies were collected
recom_df = pd.DataFrame(
    recom_list,
    columns = ['Title', 'Distance'],
    index = range(1, len(recom_list) + 1),
)
recom_df

Unnamed: 0,Title,Distance
1,Fight Club (1999),0.229824
2,Star Wars: Episode V - The Empire Strikes Back...,0.254011
3,Star Wars: Episode VI - Return of the Jedi (1983),0.275656
4,Star Wars: Episode IV - A New Hope (1977),0.277609
5,Saving Private Ryan (1998),0.28691
6,"Sixth Sense, The (1999)",0.29111
7,"Lord of the Rings: The Fellowship of the Ring,...",0.313934
8,Gladiator (2000),0.314059
9,"Lord of the Rings: The Return of the King, The...",0.31548
10,American Beauty (1999),0.317485
