In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('ml-latest-small/movies.csv')
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
top_user_movies = []

print("To start, we'll need the IDs of your top 3 rated movies as well as the ratings you'd give them.")

try:
    movie_id_list = input("Please enter the IDs of your top 3 rated movies (separated by space): ").split(' ')
    movie_id_list = [int(item) for item in movie_id_list]
    if any(movie not in df['movieId'].values for movie in movie_id_list):
        raise ValueError("That movie ID doesn't exist!")
    if len(movie_id_list) != 3:
        raise ValueError('There must be exactly 3 movie IDs!')
    
except ValueError as err:
    print(f'\n\x1b[31m{err} Please enter the IDs again, separated by a space.\x1b[0m')
except:
    print('\n\x1b[31mInvalid format! Please enter the IDs again, separated by a space.\x1b[0m')

try:
    movie_ratings_list = input("Please enter the ratings out of 10 for each of your movies (separated by space): ").split(' ')
    movie_ratings_list = [int(item) for item in movie_ratings_list]
    if any(rating > 10 or rating < 1 for rating in movie_ratings_list):
        raise ValueError('Ratings must be from 1 to 10!')
    if len(movie_ratings_list) != 3:
        raise ValueError('There must be exactly 3 ratings!')
    
except ValueError as err:
    print(f'\n\x1b[31m{err} Please enter the ratings again, separated by a space.\x1b[0m')
except:
    print('\n\x1b[31mInvalid format! Please enter the ratings again, separated by a space.\x1b[0m')

top_user_movies.extend({'movie_id': movie_id, 'movie_title': df[df['movieId'] == movie_id]['title'].iloc[0], 'rating': rating} for movie_id, rating in list(zip(movie_id_list, movie_ratings_list)))

To start, we'll need the IDs of your top 3 rated movies as well as the ratings you'd give them.


In [4]:
print('\x1b[33m\n\t\t----------------------Your Top 3 Movies----------------------\x1b[0m')
for item in top_user_movies:
    print(f"\x1b[32mID:\x1b[0m {item['movie_id']:<8}\x1b[32mTitle:\x1b[0m {item['movie_title']:<60}\x1b[32mYour Rating:\x1b[0m {item['rating']}/10")

[33m
		----------------------Your Top 3 Movies----------------------[0m
[32mID:[0m 1       [32mTitle:[0m Toy Story (1995)                                            [32mYour Rating:[0m 7/10
[32mID:[0m 2       [32mTitle:[0m Jumanji (1995)                                              [32mYour Rating:[0m 8/10
[32mID:[0m 3       [32mTitle:[0m Grumpier Old Men (1995)                                     [32mYour Rating:[0m 9/10


In [5]:
top_movie_genres = set()

for item in top_user_movies:
    top_movie_genres.update(genre for genre in df[df['movieId'] == item['movie_id']]['genres'].iloc[0].split('|'))

top_movie_genres = list(top_movie_genres)
top_movie_genres

['Adventure', 'Fantasy', 'Comedy', 'Animation', 'Children', 'Romance']

In [6]:
rated_movies_one_hot_matrix = {}

for movie_id in movie_id_list:
    movie_genres = df[df['movieId'] == movie_id]['genres'].iloc[0].split('|')
    rated_movies_one_hot_matrix[movie_id] = np.array([1 if genre in movie_genres else 0 for genre in top_movie_genres])


In [7]:
rated_movies_matrix = []
for one_hot, rating in zip(rated_movies_one_hot_matrix.values(), movie_ratings_list):
    rated_movies_matrix.append([item * rating for item in one_hot])

rated_movies_matrix = np.array(rated_movies_matrix)

In [8]:
user_profile = np.array([item/rated_movies_matrix.sum() for item in rated_movies_matrix.sum(axis=0)])
user_profile

array([0.19480519, 0.19480519, 0.20779221, 0.09090909, 0.19480519,
       0.11688312])

In [9]:
mask = df['movieId'].isin(movie_id_list)

unrated_df = df[~mask]
unrated_df.head()

Unnamed: 0,movieId,title,genres
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children


In [10]:
unrated_movies_one_hot_matrix = {}

for movie_id in unrated_df['movieId']:
    movie_genres = df[df['movieId'] == movie_id]['genres'].iloc[0].split('|')
    unrated_movies_one_hot_matrix[movie_id] = np.array([1 if genre in movie_genres else 0 for genre in top_movie_genres])


In [11]:
unrated_movies_matrix = {}

for movie_id, one_hot in unrated_movies_one_hot_matrix.items():
    one_hot = np.array(one_hot)
    unrated_movies_matrix[movie_id] = one_hot * user_profile


In [12]:
predicted_rating_list = {}
for movie_id, movie_weights in unrated_movies_matrix.items():
    predicted_rating_list[movie_id] = round(movie_weights.sum()*10, 1)


In [13]:
sorted_dict = dict(sorted(predicted_rating_list.items(), key=lambda item: item[1], reverse=True))
recommended_movies = dict(list(sorted_dict.items())[:5])

In [14]:
recommendation_df = df[df['movieId'].isin(list(recommended_movies.keys()))].copy()
recommendation_df.loc[:, 'predicted_rating'] = recommendation_df['movieId'].map(recommended_movies)

recommendation_df = recommendation_df.sort_values(by='predicted_rating', ascending=False)
recommendation_df

Unnamed: 0,movieId,title,genres,predicted_rating
3194,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,10.0
6626,56152,Enchanted (2007),Adventure|Animation|Children|Comedy|Fantasy|Mu...,10.0
7530,84637,Gnomeo & Juliet (2011),Adventure|Animation|Children|Comedy|Fantasy|Ro...,10.0
7805,92348,Puss in Boots (Nagagutsu o haita neko) (1969),Adventure|Animation|Children|Comedy|Fantasy|Ro...,10.0
222,258,"Kid in King Arthur's Court, A (1995)",Adventure|Children|Comedy|Fantasy|Romance,9.1
