In [3]:
import numpy as np
import pandas as pd

In [4]:
movie_ratings_data = pd.read_csv('../ml-1m/ratings.dat', names=['user_id', 'movie_id', 'rating', 'time'],
    engine='python', delimiter='::')

In [5]:
movie_ratings_data

Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [4]:
movie_data = pd.read_csv('../ml-1m/movies.dat', names=['movie_id', 'title', 'genre'],
    engine='python', delimiter='::')

In [5]:
ratings_mat = np.ndarray(
    shape=(np.max(movie_ratings_data.user_id.values), np.max(movie_ratings_data.movie_id.values)),
    dtype=np.uint8)
ratings_mat[movie_ratings_data.user_id.values-1, movie_ratings_data.movie_id.values-1] = movie_ratings_data.rating.values

In [6]:
ratings_mat.shape

(6040, 3952)

In [7]:
ratings_mat.shape

(6040, 3952)

In [8]:
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T

In [124]:
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
U, S, V = np.linalg.svd(A, full_matrices = False)

In [125]:
def top_cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1 # Movie id starts from 1
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

# Helper function to print top N similar movies
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
    movie_data[movie_data.movie_id == movie_id].title.values[0]))
    for id in top_indexes + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

In [126]:
U.shape

(3952, 3952)

In [127]:
V.shape

(3952, 6040)

In [128]:
S.shape

(3952,)

In [13]:
k = 10
movie_id = 0 # Grab an id from movies.dat
top_n = 10

sliced = U[:, :k] # representative data
indexes = top_cosine_similarity(sliced, movie_id, top_n)
indexes
# print_similar_movies(movie_data, movie_id, indexes)

array([3951, 2685, 3175, 2332, 2907, 2335, 3892, 3565, 3184, 3077])

In [14]:
print(ratings_mat[2685][0:100])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [15]:
ratings_mat[0:].shape

(6040, 3952)

In [16]:
ratings_mat

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [115]:
index = 0
user_row = ratings_mat[index, :]
top_indexes = np.argsort(predict[0])[::-1]

In [116]:
for id in top_indexes[:10] + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

Heat (1995)
Sudden Death (1995)
Postino, Il (The Postman) (1994)
GoldenEye (1995)
Waiting to Exhale (1995)
Friday (1995)
Mr. Wrong (1996)
Assassins (1995)
How to Make an American Quilt (1995)
Georgia (1995)


In [7]:
user_row_reshaped = np.zeros(3952).reshape(1,3952)
user_row_reshaped[0,0] = 5

In [10]:
user_row_reshaped

array([[5., 0., 0., ..., 0., 0., 0.]])

In [106]:
def query(q,V):
    #find q*v, w
    prd=np.dot(q,V)
    Vt=np.transpose(V)
    other=np.dot(prd,Vt)
    return other

In [117]:
predict = np.dot(user_row_reshaped,U)

In [118]:
predict

array([[-8.47053764e+00, -1.29701053e+00, -1.68704034e+00, ...,
         2.69361131e-06, -2.55276403e-06, -9.73256536e-02]])

In [119]:
top_indexes = np.argsort(predict[0])[::-1]

In [120]:
top_indexes

array([ 5,  8, 57, ..., 14,  4,  0])

In [121]:
movie_ratings_sorted

In [122]:
for id in top_indexes[:10] + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

Heat (1995)
Sudden Death (1995)
Postino, Il (The Postman) (1994)
GoldenEye (1995)
Waiting to Exhale (1995)
Friday (1995)
Mr. Wrong (1996)
Assassins (1995)
How to Make an American Quilt (1995)
Georgia (1995)


In [31]:
V.shape

(6040, 6040)

In [67]:
U.shape

(3952, 3952)

In [37]:
S

array([2.01680812e+01, 8.62790194e+00, 7.39264526e+00, ...,
       1.46912634e-15, 1.46912634e-15, 1.11368253e-15])

In [69]:
S[0]

20.168081247879016