In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df_rating = pd.read_csv('/content/drive/MyDrive/lab2/ratings.dat', sep='::', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])
df_movie = pd.read_csv('TMDB_movies.dat', sep='::', header=None, names=['movie_id', 'title', 'genre'],encoding='ISO-8859-1')

mxu matrix with movies as row and users as column

In [4]:
ratings_mat = np.ndarray(
    shape=(np.max(df_rating.movie_id.values), np.max(df_rating.user_id.values)),
    dtype=np.uint8)
ratings_mat[df_rating.movie_id.values-1, df_rating.user_id.values-1] = df_rating.rating.values

In [5]:
ratings_mat

array([[  5, 222,  31, ..., 127,   0,   3],
       [  0,  61,  27, ..., 127,   0,   0],
       [  0, 154,  22, ..., 127,   0,   0],
       ...,
       [  0, 149, 152, ..., 127,   0,   0],
       [  0, 243, 147, ..., 127,   0,   0],
       [192,  81, 143, ..., 127,   0,   0]], dtype=uint8)

normalising the matrix

In [6]:
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T

SVD on matrix

In [7]:
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
U, S, V = np.linalg.svd(A)

function for cosine similarity 

In [8]:
def cosine_similarity(data, movie_id, top_n=10):
    index = movie_id - 1 
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]

function for printing top 10 similar movies

In [9]:
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
    movie_data[movie_data.movie_id == movie_id].title.values[0]))
    for id in top_indexes + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])

reommendation using top k components from V.T and top n movies to given movie(movie_id)

In [11]:
k = 50
movie_id = 10
n = 10
sliced = V.T[:, :k] 
indexes = cosine_similarity(sliced, movie_id, n)

In [12]:
print_similar_movies(df_movie, movie_id, indexes)

Recommendations for GoldenEye (1995): 

GoldenEye (1995)
World Is Not Enough, The (1999)
Universal Soldier (1992)
Soldier's Daughter Never Cries, A (1998)
Ronin (1998)
Rock, The (1996)
Die Hard 2 (1990)
Clear and Present Danger (1994)
Jules and Jim (Jules et Jim) (1961)
Tomorrow Never Dies (1997)


In [13]:
indexes1 = cosine_similarity(sliced, 306, n)
print_similar_movies(df_movie,306,indexes1)

Recommendations for Three Colors: Red (1994): 

Three Colors: Red (1994)
Outlaw, The (1943)
Best in Show (2000)
Primary Colors (1998)
Gridlock'd (1997)
Romeo and Juliet (1968)
Last Emperor, The (1987)
Happy Go Lovely (1951)
Castle, The (1997)
Love & Human Remains (1993)


In [14]:
indexes2 = cosine_similarity(sliced, 3527, n)
print_similar_movies(df_movie,3527,indexes2)

Recommendations for Predator (1987): 

Predator (1987)
Aliens (1986)
Star Trek: The Wrath of Khan (1982)
Superman (1978)
Mad Max (1979)
Terminator, The (1984)
Star Wars: Episode VI - Return of the Jedi (1983)
Tron (1982)
Mad Max 2 (a.k.a. The Road Warrior) (1981)
Star Wars: Episode V - The Empire Strikes Back (1980)


covariance matrix 

In [15]:
covMatrix = np.cov(normalised_mat,bias=True)

eigen values and vectors for covariance matrix

In [16]:
from numpy.linalg import eig
eigenvalues, eigenvectors = eig(covMatrix)

In [17]:
eigenvalues

array([21205212.47706374,  1897624.10513425,   736459.27580307, ...,
              0.        ,        0.        ,        0.        ])

In [18]:
eigenvectors

array([[ 0.01079223,  0.01633014,  0.00805679, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.0142919 ,  0.02286348,  0.01042404, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.0148288 ,  0.02470356, -0.02050741, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.01688721, -0.00375855, -0.02356996, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.01669735, -0.00398967,  0.01911945, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.01590264, -0.00279717, -0.00267882, ...,  0.        ,
         0.        ,  0.        ]])

reommendation using top k eigen vectors from egienvectors and top n movies for given movie (movie_id)

In [19]:
k = 50
movie_id = 10
n = 10
sliced1 = eigenvectors[:, :k] 
indexes_1 = cosine_similarity(sliced1, movie_id, n)
print_similar_movies(df_movie,movie_id,indexes_1)

Recommendations for GoldenEye (1995): 

GoldenEye (1995)
World Is Not Enough, The (1999)
Universal Soldier (1992)
Soldier's Daughter Never Cries, A (1998)
Ronin (1998)
Rock, The (1996)
Die Hard 2 (1990)
Clear and Present Danger (1994)
Jules and Jim (Jules et Jim) (1961)
Tomorrow Never Dies (1997)


In [20]:
indexes_2 = cosine_similarity(sliced1,306, n)
print_similar_movies(df_movie,306,indexes_2)

Recommendations for Three Colors: Red (1994): 

Three Colors: Red (1994)
Outlaw, The (1943)
Best in Show (2000)
Primary Colors (1998)
Gridlock'd (1997)
Romeo and Juliet (1968)
Last Emperor, The (1987)
Happy Go Lovely (1951)
Castle, The (1997)
Love & Human Remains (1993)
