In [37]:
from IPython.display import display
import pandas as pd
import numpy as np

In [38]:
def select_user(expert_users, ratings_genres_df):

    random_user = expert_users.sample()
    random_user = random_user['user']

    seen_movies = pd.merge(random_user, ratings_genres_df, how='left', left_on='user', right_on='user')


    return seen_movies

def get_favourite_movies(seen_movies):
        return seen_movies.loc[seen_movies['rating'] == np.max(seen_movies['rating'])]

In [39]:
def get_genre_set(ratings_genres_df):
    genre_set = set()
    for genres in ratings_genres_df.genres:
        genre_set.update(genres.split('|'))

    return genre_set

In [40]:
def get_powerset(items):
  powerset = list(items)
  n = len(powerset)
  return [[powerset[k] for k in range(n) if i&1<<k] for i in range(2**n)]

def get_favourite_genres_powerset(genre_set, favourite_movies, num_genres=3):
    genre_set_list = list(genre_set)
    genre_count = np.zeros(len(genre_set_list))

    for genre in favourite_movies.genres:
        genre_list = genre.split('|')
        for i in range(len(genre_list)):
            for j in range(len(genre_set_list)):
                if genre_list[i] == genre_set_list[j]:
                    genre_count[j] += 1

    favourite_genres = []
    while len(favourite_genres) < num_genres:
        if np.max(genre_count) == 0:
            break

        fav_genre = genre_set_list[np.argmax(genre_count)]
        genre_count[np.argmax(genre_count)] = 0

        if fav_genre == "(no genres listed)":
            continue

        favourite_genres.append(fav_genre)


    return get_powerset(favourite_genres)

In [41]:
def get_movies_with_genres(find_genres, genre_df):

    items = []

    for genres in find_genres:
        if not genres:
                continue
        for genre in genres:
            temp = genre_df.loc[(genre_df[genre] == True)]
            items.extend(temp.item)

    out = genre_df[genre_df['item'].isin(items)]
    out.index = np.arange(len(out.index))

    return out

In [42]:
def get_unseen_movies_ratings_genres(movies_df, ratings_df, movies_with_genres, seen_movies):
    unseen_movies = list(pd.concat([movies_df.item, seen_movies.item]).drop_duplicates(keep=False))
    unseen_movies_ratings = ratings_df[ratings_df['item'].isin(unseen_movies)]
    unseen_movies_with_genres = movies_with_genres[movies_with_genres['item'].isin(unseen_movies)]

    return unseen_movies, unseen_movies_ratings, unseen_movies_with_genres

In [43]:
def get_average_ratings(unseen_movies_with_genres, average_ratings_df):
    return average_ratings_df.loc[average_ratings_df['item'].isin(unseen_movies_with_genres.item)]

In [44]:
def get_top_movies(average_ratings_df):
    top_rating = np.max(average_ratings_df.average_rating)

    top_movies = []
    for item in average_ratings_df.index:
        if average_ratings_df.loc[item].average_rating < top_rating - (top_rating/10):
            continue
        top_movies.append(item)

    return top_movies

In [45]:
def get_favourite_tags(favourite_movies, tags_df):
    favourite_movies_list = list(favourite_movies.item)

    favourite_tags = set()
    for item in favourite_movies_list:
        movie_tags = tags_df.loc[tags_df.index == item]
        favourite_tags.update(list(movie_tags.tag))

    return favourite_tags


In [51]:
def dice_coefficient(fav_tag, movie_tag):
    return (2 * len(fav_tag.intersection(movie_tag))) / (len(fav_tag) + len(movie_tag))

def find_k_best(k, movie_similarity, movies_df):
    movie_list = []
    for i in range(k):
        recommend_movie = max(movie_similarity, key=movie_similarity.get)
        movie_list.append(recommend_movie)
        movie_similarity[recommend_movie] = 0

    return movies_df[movies_df['item'].isin(movie_list)]

def get_movie_recommendations(top_movies, tags_df, favourite_tags, movies_df, k):
    movie_similarity = {}
    for item in top_movies:
        tag_set = set()
        movie_tags = tags_df.loc[tags_df['item'] == item]
        tag_set.update(list(movie_tags.tag))
        movie_similarity[item] = dice_coefficient(favourite_tags, tag_set)

    return find_k_best(k, movie_similarity, movies_df)



In [47]:
def content_recommender(expert_users_df, ratings_genres_df, movies_df, ratings_df, genre_df, tags_df, average_ratings_df):

    k = 10

    seen_movies = select_user(expert_users_df, ratings_genres_df)

    favourite_movies = get_favourite_movies(seen_movies)

    genre_set = get_genre_set(ratings_genres_df)

    genre_list = get_favourite_genres_powerset(genre_set, favourite_movies)

    movies_with_genres = get_movies_with_genres(genre_list, genre_df)

    unseen_movies, unseen_movies_ratings, unseen_movies_with_genres = get_unseen_movies_ratings_genres(movies_df, ratings_df, movies_with_genres, seen_movies)

    unseen_average_ratings = get_average_ratings(unseen_movies_with_genres, average_ratings_df)

    top_movies = get_top_movies(unseen_average_ratings)

    favourite_tags = get_favourite_tags(favourite_movies, tags_df)

    recommendations = get_movie_recommendations(top_movies, tags_df, favourite_tags, movies_df, 10)

    return recommendations

In [48]:
data_folder = "./processed_data"

ratings_df = pd.read_csv(data_folder + "/ratings.csv")
movies_df = pd.read_csv(data_folder + "/movies.csv")

expert_users_df = pd.read_csv(data_folder + "/expert_users.csv")
ratings_genres_df = pd.read_csv(data_folder + "/ratings_genres.csv")

genre_df = pd.read_csv(data_folder + "/genre.csv")
tags_df = pd.read_csv(data_folder + "/tags.csv")

average_ratings_df = pd.read_csv(data_folder + "/average_ratings.csv")



In [52]:
rec = content_recommender(expert_users_df, ratings_genres_df, movies_df, ratings_df, genre_df, tags_df, average_ratings_df)
display(rec)

Unnamed: 0,item,title,year,genres
8000,8817,"trouble with girls, the",1969,Comedy|Drama
8104,8923,tess,1979,Drama|Romance
8267,25834,captains courageous,1937,Adventure|Drama
8273,25842,topper,1937,Comedy|Fantasy|Romance
