In [1]:
import numpy as np
import pandas as pd

In [2]:
column_list_ratings = ["UserID", "MovieID", "Ratings","Timestamp"]
ratings_data  = pd.read_csv('ratings.dat', sep='::', names = column_list_ratings, engine='python')
column_list_movies = ["MovieID", "Title", "Genres"]
movies_data = pd.read_csv('movies.dat', sep = '::', names = column_list_movies, engine='python', encoding='iso-8859-1')
column_list_users = ["UserID", "Gender", "Age", "Occupation", "Zip-code"]
user_data = pd.read_csv("users.dat", sep = "::", names = column_list_users, engine='python')

In [3]:
max_user_id = np.max(ratings_data.UserID.values)
max_movie_id = np.max(ratings_data.MovieID.values)
ratings_matrix = np.ndarray(shape = (max_user_id, max_movie_id), dtype=np.uint8)
ratings_matrix[ratings_data.UserID.values - 1, ratings_data.MovieID.values - 1] = ratings_data.Ratings.values
print(ratings_matrix.shape)

(6040, 3952)


In [4]:
std = ratings_matrix.std(axis = 0)
mean = ratings_matrix.mean(axis = 0)
# Z-score Normalization
ratings_matrix = (ratings_matrix - mean)/std

# Making sure we don't have nan values which happens when divided by 0.
ratings_matrix[np.isnan(ratings_matrix)] = 0

  ratings_matrix = (ratings_matrix - mean)/std


In [5]:
U, S, VT = np.linalg.svd(ratings_matrix, full_matrices=True)
S = np.diag(S)

print("U Shape: ", U.shape, "\n")
print("VT Shape: ", VT.shape, "\n")
print("S Shape: ", S.shape, "\n")

U Shape:  (6040, 6040) 

VT Shape:  (3952, 3952) 

S Shape:  (3952, 3952) 



In [6]:
ranks_dict = {}
for r in (100, 1000, 2000, 3000):
    ranks_dict[r] = U[:, :r]@S[:r, :r]@VT[:r, :]

In [7]:
for r in (100, 1000, 2000, 3000):
    for u in (0, 1, 2):
        print("user - ", u + 1, " and rank - ", r, ": ", ranks_dict[r][u][1376])
    print('\n')

user -  1  and rank -  100 :  -0.4803656455160061
user -  2  and rank -  100 :  0.07829760883842156
user -  3  and rank -  100 :  -0.06949374727318039


user -  1  and rank -  1000 :  -0.5317376879295509
user -  2  and rank -  1000 :  0.16743400583236406
user -  3  and rank -  1000 :  -0.1801823192251527


user -  1  and rank -  2000 :  -0.1924455185277844
user -  2  and rank -  2000 :  -0.2714729114815892
user -  3  and rank -  2000 :  -0.47038753847043435


user -  1  and rank -  3000 :  -0.3837378301306327
user -  2  and rank -  3000 :  -0.3585367048689856
user -  3  and rank -  3000 :  -0.5975119399601544




In [8]:
def magnitude(arr):
    return np.sqrt(arr@arr)

def find_movie_data(movie_data, movieID):
    return movie_data[movieID == movie_data.MovieID.values].values[0]

# Sort the movies based on cosine similarity
def top_cosine_similarity(data, movie_id, top_n=5):
    movie_index = movie_id - 1
    movie_column = data[:, movie_index]
    movie_magnitude = magnitude(movie_column)

    magnitudes_array = np.apply_along_axis(magnitude, 0, data)

    XTY = np.dot(movie_column, data)
    modX_modY = movie_magnitude * magnitudes_array
    cosine_similarities = XTY / modX_modY
    cosine_similarities[np.isnan(cosine_similarities)] = -1

    similar_movie_indexes = np.argsort(cosine_similarities)[::-1][1:top_n + 1]
    return similar_movie_indexes

def print_similar_movies(movie_data, movieID, top_indexes):
    print(
        f'Most Similar movies to "{find_movie_data(movie_data, movieID)[1]}" with movieId "{movieID}" are',
        '\n'
    )
    titles = np.array(())
    genres = np.array(())
    for i in top_indexes:
        particular_movie_data = find_movie_data(movie_data, i + 1)
        titles = np.append(titles, particular_movie_data[1])
        genres = np.append(genres, particular_movie_data[2])
    similar_movies_df = pd.DataFrame({"Movie ID": top_indexes + 1, "Movie Title": titles, "Movie Genres": genres})
    similar_movies_df.index += 1
    print(similar_movies_df)


movie_id = 1377
top_similar_indexes = top_cosine_similarity(ranks_dict[1000], movie_id, 5)
print_similar_movies(movies_data, movie_id, top_similar_indexes)

Most Similar movies to "Batman Returns (1992)" with movieId "1377" are 

   Movie ID            Movie Title                     Movie Genres
1       153  Batman Forever (1995)    Action|Adventure|Comedy|Crime
2      1562  Batman & Robin (1997)           Action|Adventure|Crime
3       592          Batman (1989)     Action|Adventure|Crime|Drama
4      2616      Dick Tracy (1990)                     Action|Crime
5       380       True Lies (1994)  Action|Adventure|Comedy|Romance


  cosine_similarities = XTY / modX_modY
