In [None]:
import numpy as np
import pandas as pd

In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip

In [None]:
# Both files are read as '::'-separated text with no header row, so the column
# names are supplied explicitly. A multi-character separator needs the
# pure-Python parser engine.
data = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    engine='python',
    names=['user_id', 'movie_id', 'rating', 'time'],
)

# The movie catalogue is decoded as latin-1.
movie_data = pd.read_csv(
    'ml-1m/movies.dat',
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['movie_id', 'title', 'genre'],
)

In [None]:
# Preview the first five rows of the movie catalogue.
movie_data.head(n=5)

In [None]:
# Dense (users x movies) rating matrix. Ids are 1-based, so id i is stored at
# row/column i - 1; cells without a rating keep the initial value 0.
ratings_mat = np.zeros(
    (int(data.user_id.max()), int(data.movie_id.max())), dtype=np.float32
)

# A single vectorised fancy-index assignment replaces the per-row
# DataFrame.iterrows() loop, which is very slow on a large ratings table.
# If a (user, movie) pair occurs more than once, the last occurrence wins,
# exactly as it did with the sequential loop.
ratings_mat[
    data.user_id.to_numpy(dtype=np.intp) - 1,
    data.movie_id.to_numpy(dtype=np.intp) - 1,
] = data.rating.to_numpy()

In [None]:
# Centre every row of ratings_mat on its own mean. The mean is taken over the
# whole row, so cells still holding the initial 0 are included in it.
normalised_mat = ratings_mat - ratings_mat.mean(axis=1, keepdims=True)

# Transpose and scale by sqrt(number of rows of ratings_mat - 1). After the
# transpose, the rows of A correspond to the columns of ratings_mat.
A = np.transpose(normalised_mat) / np.sqrt(len(ratings_mat) - 1)

# Reduced SVD: A = U @ diag(S) @ Vt, keeping only min(A.shape) components.
U, S, Vt = np.linalg.svd(A, full_matrices=False)

In [None]:
def top_cosine_similarity(data, movie_id, top_n=10):
    """Return the indexes of the rows of ``data`` most similar to one movie.

    Similarity is the cosine of the angle between the query row and every row
    of ``data`` (the query row itself is included, so it normally comes first).

    Args:
        data: 2-D array with one feature vector per row.
        movie_id: 1-based id of the query row (row ``movie_id - 1``).
        top_n: Number of row indexes to return.

    Returns:
        1-D integer array of 0-based row indexes, ordered from most to least
        similar, truncated to ``top_n`` entries.

    Raises:
        IndexError: If ``movie_id`` does not address a row of ``data``.
    """
    index = movie_id - 1
    # Reject ids below 1 explicitly: a negative index would otherwise wrap
    # around and silently score a different row.
    if not 0 <= index < data.shape[0]:
        raise IndexError(
            'movie_id {0} is out of range for data with {1} rows'.format(
                movie_id, data.shape[0]))

    movie_row = data[index, :]
    # Row-wise Euclidean norms.
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    dot_products = np.dot(movie_row, data.T)
    denominator = magnitude[index] * magnitude

    # Rows with zero norm have no defined cosine similarity. Give them -inf so
    # they rank last, without dividing by zero (no warnings, no NaN).
    similarity = np.full(denominator.shape, -np.inf)
    defined = denominator > 0
    similarity[defined] = dot_products[defined] / denominator[defined]

    # Stable sort keeps tied rows in index order, so results are deterministic.
    sort_indexes = np.argsort(-similarity, kind='stable')
    return sort_indexes[:top_n]

def print_similar_movies(movie_data, movie_id, top_indexes):
    """Print the title of a movie followed by the titles of its neighbours.

    Args:
        movie_data: DataFrame with ``movie_id`` and ``title`` columns.
        movie_id: 1-based id of the query movie.
        top_indexes: 0-based row indexes of the recommended movies; each is
            converted to a movie id by adding 1.

    Raises:
        IndexError: If ``movie_id`` has no row in ``movie_data``.
    """
    # Build the id -> title table once instead of filtering the whole frame
    # for every recommendation. setdefault keeps the first title when an id
    # is repeated, which is what `.values[0]` on a filtered frame returned.
    titles = {}
    for known_id, title in zip(movie_data['movie_id'], movie_data['title']):
        titles.setdefault(known_id, title)

    if movie_id not in titles:
        raise IndexError(
            'movie_id {0} not found in movie_data'.format(movie_id))

    print('Recommendations for {0}: \n'.format(titles[movie_id]))
    for recommended_id in np.asarray(top_indexes) + 1:
        recommended_id = int(recommended_id)
        # An index may map to an id with no row in movie_data; print a
        # placeholder rather than aborting part-way through the list.
        print(titles.get(
            recommended_id,
            '<unknown movie_id {0}>'.format(recommended_id)))

In [None]:
k = 50        # number of leading singular vectors kept per movie
movie_id = 1  # 1-based id of the query movie
top_n = 10    # number of titles to print (the query movie itself is included)

# ratings_mat is (users x movies) and A is its scaled transpose, so A is
# (movies x users). In A = U @ diag(S) @ Vt the rows of U therefore correspond
# to movies, while the columns of Vt correspond to users. The previous
# Vt[:k, :].T was a (users x k) matrix, so the search compared users and then
# read the resulting row numbers as movie ids. The movie factors are U[:, :k].
sliced = U[:, :k]
indexes = top_cosine_similarity(sliced, movie_id, top_n)
print_similar_movies(movie_data, movie_id, indexes)