In [1]:
import numpy as np
import pandas as pd


In [2]:
ratingData = pd.io.parsers.read_csv('ratings.dat', names=['user_id', 'movie_id', 'rating', 'time'],engine='python', delimiter='::')
movieData = pd.io.parsers.read_csv('movies.dat',names=['movie_id', 'title', 'genre'],engine='python', delimiter='::')

print(ratingData)

         user_id  movie_id  rating       time
0              1      1193       5  978300760
1              1       661       3  978302109
2              1       914       3  978301968
3              1      3408       4  978300275
4              1      2355       5  978824291
5              1      1197       3  978302268
6              1      1287       5  978302039
7              1      2804       5  978300719
8              1       594       4  978302268
9              1       919       4  978301368
10             1       595       5  978824268
11             1       938       4  978301752
12             1      2398       4  978302281
13             1      2918       4  978302124
14             1      1035       5  978301753
15             1      2791       4  978302188
16             1      2687       3  978824268
17             1      2018       4  978301777
18             1      3105       5  978301713
19             1      2797       4  978302039
20             1      2321       3

In [3]:
print(movieData)

      movie_id                                              title  \
0            1                                   Toy Story (1995)   
1            2                                     Jumanji (1995)   
2            3                            Grumpier Old Men (1995)   
3            4                           Waiting to Exhale (1995)   
4            5                 Father of the Bride Part II (1995)   
5            6                                        Heat (1995)   
6            7                                     Sabrina (1995)   
7            8                                Tom and Huck (1995)   
8            9                                Sudden Death (1995)   
9           10                                   GoldenEye (1995)   
10          11                     American President, The (1995)   
11          12                 Dracula: Dead and Loving It (1995)   
12          13                                       Balto (1995)   
13          14                    

In [4]:
ratingMatrix = np.ndarray(
    shape=(np.max(ratingData.movie_id.values), np.max(ratingData.user_id.values)),
    dtype=np.uint8)

ratingMatrix[ratingData.movie_id.values-1, ratingData.user_id.values-1] = ratingData.rating.values
print(ratingMatrix)

[[5 0 0 ... 0 0 3]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [5]:
normalizedMatrix = ratingMatrix - np.asarray([(np.mean(ratingMatrix, 1))]).T
print(normalizedMatrix)

[[ 3.57400662 -1.42599338 -1.42599338 ... -1.42599338 -1.42599338
   1.57400662]
 [-0.37152318 -0.37152318 -0.37152318 ... -0.37152318 -0.37152318
  -0.37152318]
 [-0.23874172 -0.23874172 -0.23874172 ... -0.23874172 -0.23874172
  -0.23874172]
 ...
 [-0.03278146 -0.03278146 -0.03278146 ... -0.03278146 -0.03278146
  -0.03278146]
 [-0.02582781 -0.02582781 -0.02582781 ... -0.02582781 -0.02582781
  -0.02582781]
 [-0.24288079 -0.24288079 -0.24288079 ... -0.24288079 -0.24288079
  -0.24288079]]


In [6]:
"""### Computing SVD"""

A = normalizedMatrix.T / np.sqrt(ratingMatrix.shape[0] - 1)
U, S, V = np.linalg.svd(A)


In [7]:
"""### Calculate cosine similarity, sort by most similar and return the top N"""

def similar(ratingData, movie_id, top_n):
    index = movie_id - 1 # Movie id starts from 1
    movie_row = ratingData[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', ratingData, ratingData)) #Einstein summation |  traditional matrix multiplication and is equivalent to np.matmul(a,b)
    similarity = np.dot(movie_row, ratingData.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity) #Perform an indirect sort along the given axis (Last axis)
    return sort_indexes[:top_n]

In [8]:
"""### Select k principal components to represent the movies, a movie_id to find recommendations and print the top_n results"""

k = 50
movie_id = 2
top_n = 5

sliced = V.T[:, :k] # representative data
indexes = similar(sliced, movie_id, top_n)

print('Recommendations for Movie {0}: \n'.format(
movieData[movieData.movie_id == movie_id].title.values[0]))
for id in indexes + 1:
    print(movieData[movieData.movie_id == id].title.values[0])

  import sys


Recommendations for Movie Jumanji (1995): 

Jumanji (1995)
Hook (1991)
Indian in the Cupboard, The (1995)
NeverEnding Story II: The Next Chapter, The (1990)
Dragonheart (1996)
