In [None]:
import os

import numpy as np
import pandas as pd
from py2neo import Graph

In [None]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

## Connect to neo4j database

In [None]:
# Connect to local database 'Kaggle Movie Database' with data
# extracted from https://www.kaggle.com/rounakbanik/the-movies-dataset
#
# Connection settings are read from the environment (NEO4J_URI, NEO4J_USER,
# NEO4J_PASSWORD) so that real credentials never need to be written into the
# notebook. The fallbacks are the values of the local demo database, which
# keeps the notebook runnable unchanged when no variable is set.

graph = Graph(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "ilovemovies"),
    ),
)

# Recommendation engines

Approaches based on guide http://guides.neo4j.com/sandbox/recommendations

## Content-based filtering on movie genre, cast, crew, production company and keywords

Recommend items that are similar to those that a user rated highly previously.

### Weighted sum of common traits between movies

In [None]:
def get_n_similar_movies_on_common_traits(movie_name, n):
    """Rank movies by a weighted count of the traits they share with `movie_name`.

    A candidate must share at least one genre with the reference movie (the
    genre match is the only non-optional one). Shared actors, directors,
    writers, production companies and keywords are then counted per candidate.
    Actor, director and writer matches weigh 2; genre, production company and
    keyword matches weigh 1.

    Parameters
    ----------
    movie_name : value matched against the `name` property of `Movie` nodes.
    n : maximum number of rows returned (Cypher `LIMIT`).

    Returns
    -------
    pandas.DataFrame
        One row per recommended movie with the individual trait scores and
        `final_score`, highest score first. Built from the query's records,
        so it is an empty frame when the query returns nothing.
    """
    query = '''
    
    // find movies with common genres with previously watched movies (m)
    
    MATCH (m:Movie {name: $movie_name})-[:BELONGS_TO]->(g:Genre)<-[:BELONGS_TO]-(rec:Movie)
    WITH m, rec, COUNT(*) AS genre_score
    
    // find movies with common actors with previously watched movies (m)
    
    OPTIONAL MATCH (m)<-[:ACTED_IN]-(a:Actor)-[:ACTED_IN]->(rec)
    WITH m, rec, genre_score, COUNT(a) AS cast_score
    
    // find movies with common directors with previously watched movies (m)
    
    OPTIONAL MATCH (m)<-[:DIRECTED]-(d:Director)-[:DIRECTED]->(rec)
    WITH m, rec, genre_score, cast_score, COUNT(d) AS dir_score
    
    // find movies with common writers with previously watched movies (m)
    
    OPTIONAL MATCH (m)<-[:WROTE]-(w:Writer)-[:WROTE]->(rec)
    WITH m, rec, genre_score, cast_score, dir_score, COUNT(w) AS wtr_score
    
    // find movies with common producers with previously watched movies (m)
    
    OPTIONAL MATCH (m)<-[:PRODUCED]-(p:ProductionCompany)-[:PRODUCED]->(rec)
    WITH m, rec, genre_score, cast_score, dir_score, wtr_score, COUNT(p) AS prod_score
    
    // find movies with common keywords with previously watched movies (m)
    
    OPTIONAL MATCH (m)<-[:DESCRIBES]-(k:Keyword)-[:DESCRIBES]->(rec)
    WITH m, rec, genre_score, cast_score, dir_score, wtr_score, prod_score, COUNT(k) AS key_score 
    
    // calculate similarity score

    RETURN rec.name AS recommendation, $movie_name AS similar_to,
    genre_score, cast_score, dir_score, wtr_score, prod_score, key_score,
    (1*genre_score)+(2*cast_score)+(2*dir_score)+(2*wtr_score)+(1*prod_score)+(1*key_score) AS final_score
    ORDER BY final_score DESC LIMIT $n
    
    '''

    # Values are bound as Cypher parameters, never interpolated into the query text.
    cypher_params = {'movie_name': movie_name, 'n': n}
    rows = graph.run(query, parameters=cypher_params).data()

    return pd.DataFrame(rows)

In [None]:
def get_movie_recommendations_on_common_traits(user_id, top_n_movies, n_top_recommendations):
    """Recommend unseen movies that share traits with a user's best-rated movies.

    The query keeps at most `top_n_movies` movies the user rated 3 or higher
    (highest rating first), then looks for other movies sharing at least one
    genre with them that the user has not rated. Each (liked movie, candidate)
    pair is scored as the user's rating multiplied by the weighted count of
    shared genres, actors, directors, writers, production companies and
    keywords.

    Parameters
    ----------
    user_id : value matched against the `id` property of `User` nodes.
    top_n_movies : how many of the user's top-rated movies to start from.
    n_top_recommendations : maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        One row per (recommendation, similar_to) pair, highest score first;
        the same recommendation can therefore appear more than once.
    """
    query = '''
    
    
        // find the top positively rated movies (m) from user

        MATCH (u:User {id: $user_id})-[r:RATED]->(m:Movie) WHERE r.rating >= 3

        WITH u, m, r.rating AS r ORDER BY r DESC LIMIT $top_n_movies
        
        
        // find other movies (rec) with common traits with the user favorite movies (m)

        MATCH (m)-[:BELONGS_TO]->(g:Genre)<-[:BELONGS_TO]-(rec:Movie)
        WHERE NOT EXISTS ((u)-[:RATED]->(rec)) AND m <> rec
        WITH u, m, r, rec, COUNT(*) AS gs

        OPTIONAL MATCH (m)<-[:ACTED_IN]-(a:Actor)-[:ACTED_IN]->(rec)
        WITH u, m, r, rec, gs, COUNT(a) AS cs

        OPTIONAL MATCH (m)<-[:DIRECTED]-(d:Director)-[:DIRECTED]->(rec)
        WITH u, m, r, rec, gs, cs, COUNT(d) AS ds

        OPTIONAL MATCH (m)<-[:WROTE]-(w:Writer)-[:WROTE]->(rec)
        WITH u, m, r, rec, gs, cs, ds, COUNT(w) AS ws

        OPTIONAL MATCH (m)<-[:PRODUCED]-(p:ProductionCompany)-[:PRODUCED]->(rec)
        WITH u, m, r, rec, gs, cs, ds, ws, COUNT(p) AS ps

        OPTIONAL MATCH (m)<-[:DESCRIBES]-(k:Keyword)-[:DESCRIBES]->(rec)
        WITH u, m, r, rec, gs, cs, ds, ws, ps, COUNT(k) AS ks 


        // calculate score considering the rating of movie m and common traits with recommendation
        
        RETURN u.id AS user, 
        rec.name AS recommendation,
        m.name AS similar_to, 
        r AS rating, 
        gs AS genre, cs AS cast, (ds + ws + ps) AS crew, ks as keywords,
        (r)*((1*gs)+(2*cs)+(2*ds)+(2*ws)+(1*ps)+(1*ks)) AS score
        ORDER BY score DESC LIMIT $n_top_recommendations

    '''

    cypher_params = {
        'user_id': user_id,
        'top_n_movies': top_n_movies,
        'n_top_recommendations': n_top_recommendations,
    }
    rows = graph.run(query, parameters=cypher_params).data()

    return pd.DataFrame(rows)

In [None]:
# Ten movies sharing the most traits with a given title

get_n_similar_movies_on_common_traits(movie_name='Catch Me If You Can', n=10)

In [None]:
# Recommendations for one user, derived from traits shared with the movies they rated highest

get_movie_recommendations_on_common_traits(
    user_id=6,
    top_n_movies=10,
    n_top_recommendations=25,
)

### Using jaccard index similarity metric

The Jaccard index is a number between 0 and 1 that indicates how similar two sets are. The Jaccard index of two identical sets is 1. If two sets do not have a common element, then the Jaccard index is 0. The Jaccard index is calculated by dividing the size of the intersection of the two sets by the size of their union: $J(A, B) = |A \cap B| / |A \cup B|$.

In [None]:
def get_n_similar_movies_jaccard(movie_name, n):
    """Rank movies by a Jaccard-style similarity to `movie_name`.

    Characteristics are the nodes reachable from a movie through any of the
    BELONGS_TO, ACTED_IN, DIRECTED, WROTE, PRODUCED or DESCRIBES relationships.
    The score is the number of characteristics shared by both movies divided
    by the size of the combined list of characteristic names.

    Parameters
    ----------
    movie_name : value matched against the `name` property of `Movie` nodes.
    n : maximum number of rows returned (Cypher `LIMIT`).

    Returns
    -------
    pandas.DataFrame
        One row per recommended movie with the shared characteristics, their
        count and `jaccard_score`, highest score first.
    """
    query = '''
    
    // find movies with common characteristics with previously watched movies (m)
    
    MATCH (m:Movie {name: $movie_name})-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(n)-
    [:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(rec:Movie)
    
    WITH m, rec, COUNT(n) AS intersection, COLLECT(n.name) as i
    
    
    // find all characteristics of movie m
    
    MATCH (m)-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(mn)
    WITH m, rec, intersection, i, COLLECT(mn.name) AS s1
    
    
    // find all characteristics of movie to recommend
    
    MATCH (rec)-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(rn)
    WITH m, rec, intersection, i, s1, COLLECT(rn.name) AS s2
    
    
    // calculate jaccard score
    
    WITH m, rec, intersection, i, s1+[x IN s2 WHERE NOT x IN s1] AS union, s1, s2
    
    RETURN rec.name AS recommendation, m.name AS similar_to, 
    intersection as nr_similarities, i AS similarities, 
    ((1.0*intersection)/SIZE(union)) AS jaccard_score ORDER BY jaccard_score DESC LIMIT $n
    '''

    cypher_params = {'movie_name': movie_name, 'n': n}
    rows = graph.run(query, parameters=cypher_params).data()

    return pd.DataFrame(rows)


In [None]:
def get_movie_recommendations_jaccard_similarity(user_id, top_n_movies, n_top_recommendations):
    """Recommend unseen movies by Jaccard-style similarity to a user's best-rated movies.

    The query keeps at most `top_n_movies` movies the user rated 3 or higher
    (highest rating first) and finds other movies sharing characteristics with
    them (nodes linked by BELONGS_TO, ACTED_IN, DIRECTED, WROTE, PRODUCED or
    DESCRIBES). Movies the user has already rated are excluded, matching the
    common-traits recommender. Each pair is scored as the user's rating
    multiplied by (shared characteristics / size of the combined list of
    characteristic names).

    Parameters
    ----------
    user_id : value matched against the `id` property of `User` nodes.
    top_n_movies : how many of the user's top-rated movies to start from.
    n_top_recommendations : maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        One row per (recommendation, similar_to) pair, highest score first;
        the same recommendation can therefore appear more than once.
    """
    query = '''
    
        // find the top positively rated movies from user

        MATCH (u:User {id: $user_id})-[r:RATED]->(m:Movie) WHERE r.rating >= 3

        WITH u, m, r.rating AS r ORDER BY r DESC LIMIT $top_n_movies
        
        
        // find movies with common characteristics with watched movie m,
        // skipping movies the user has already rated
        
        MATCH (m)-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(n)-
        [:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(rec:Movie)
        WHERE NOT EXISTS ((u)-[:RATED]->(rec)) AND m <> rec

        WITH u, m, r, rec, COUNT(n) AS intersection, COLLECT(n.name) as i
        
        
        // find all characteristics of movie m

        MATCH (m)-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(mn)
        WITH u, m, r, rec, intersection, i, COLLECT(mn.name) AS s1
        
        
        // find all characteristics of movie to recommend

        MATCH (rec)-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(rn)
        WITH u, m, r, rec, intersection, i, s1, COLLECT(rn.name) AS s2

        
        // calculate jaccard score and add weight of movie rating

        WITH u, m, r, rec, intersection, i, s1+[x IN s2 WHERE NOT x IN s1] AS union, s1, s2

        RETURN 
        u.id AS user,
        rec.name AS recommendation, 
        m.name AS similar_to, 
        r AS rating,
        intersection as nr_similarities,
        i AS similarities, 
        r * ((1.0*intersection)/SIZE(union)) AS score
        ORDER BY score DESC LIMIT $n_top_recommendations

    '''

    cypher_params = {
        'user_id': user_id,
        'top_n_movies': top_n_movies,
        'n_top_recommendations': n_top_recommendations,
    }
    rows = graph.run(query, parameters=cypher_params).data()

    return pd.DataFrame(rows)

In [None]:
# Ten movies closest to a given title by Jaccard similarity

get_n_similar_movies_jaccard(movie_name='Notting Hill', n=10)

In [None]:
# Jaccard-based recommendations for one user
get_movie_recommendations_jaccard_similarity(
    user_id=2,
    top_n_movies=10,
    n_top_recommendations=25,
)

## Collaborative filtering

Find similar users in the network. Assuming that similar users have similar preferences, what are the movies those similar users like?

### Using cosine similarity

In [None]:
# The cosine similarity of two users will tell us how similar two users' preferences for movies are. 
# Users with a high cosine similarity will have similar preferences.


def get_movie_recommendations_user_similarity_cosine(user_id, n_top_similar_users, n_top_recommendations):
    """Recommend movies liked by the users most similar to `user_id` (cosine similarity).

    For every other user, the ratings both users gave to the same movies are
    collected and compared with `gds.alpha.similarity.cosine`; pairs with 10 or
    fewer co-rated movies are discarded. The `n_top_similar_users` users with
    the HIGHEST similarity are kept, and every movie they rated that the target
    user has not rated is scored as the sum of `similarity * rating` over
    those users.

    Parameters
    ----------
    user_id : value matched against the `id` property of `User` nodes.
    n_top_similar_users : number of most-similar users to draw recommendations from.
    n_top_recommendations : maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns `recommendation` and `score`, highest score first.
    """
    # NOTE: the similarity ordering must be DESC. Cypher sorts ascending by
    # default, which would keep the least similar users instead.
    query = '''
    
        MATCH (u1:User {id: $user_id})-[r1:RATED]->(m:Movie)<-[r2:RATED]-(u2:User)
        WHERE u1 <> u2
        
        WITH u1, u2, COLLECT(r1.rating) AS u1ratings, COLLECT(r2.rating) AS u2ratings
        WHERE size(u1ratings) > 10
        
        WITH u1, u2, gds.alpha.similarity.cosine(u1ratings, u2ratings) AS similarity 
        ORDER BY similarity DESC LIMIT $n_top_similar_users

        MATCH (u2)-[r3:RATED]->(rec:Movie)
        WHERE NOT EXISTS ((u1)-[:RATED]->(rec:Movie))

        RETURN rec.name as recommendation, SUM( similarity * r3.rating) AS score
        ORDER BY score DESC LIMIT $n_top_recommendations
    
    '''

    cypher_params = {
        'user_id': user_id,
        'n_top_similar_users': n_top_similar_users,
        'n_top_recommendations': n_top_recommendations,
    }
    recommendations = graph.run(query, parameters=cypher_params).data()

    return pd.DataFrame(recommendations)

In [None]:
# Take the users selected by cosine similarity to `user_id` and list the movies
# they rated which `user_id` hasn't watched yet

cos_recommendations = get_movie_recommendations_user_similarity_cosine(
    user_id=100,
    n_top_similar_users=10,
    n_top_recommendations=15,
)
cos_recommendations

### Using Pearson similarity

Pearson similarity is well-suited for product recommendations because it takes into account the fact that different users have different mean ratings: on average, some users tend to give higher ratings than others. Since Pearson similarity considers differences from the mean, this metric accounts for these discrepancies.



In [None]:
def get_movie_recommendations_user_similarity_pearson(user_id, n_top_similar_users, n_top_recommendations):
    """Recommend movies liked by the users most similar to `user_id` (Pearson similarity).

    Each user's ratings are turned into a vector with
    `gds.alpha.similarity.asVector`; users sharing more than 10 rated movies
    with the target user are compared using `gds.alpha.similarity.pearson`.
    The `n_top_similar_users` most similar users are kept, and every movie
    they rated that the target user has not rated is scored as the sum of
    `similarity * rating` over those users.

    Parameters
    ----------
    user_id : value matched against the `id` property of `User` nodes.
    n_top_similar_users : number of most-similar users to draw recommendations from.
    n_top_recommendations : maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        Columns `recommendation` and `score`, highest score first.
    """
    # The query calls gds.* and apoc.* functions, so it presumably needs the
    # Graph Data Science and APOC plugins installed on the server.
    query = '''
    
        MATCH (u1:User {id: $user_id})-[r1:RATED]->(m:Movie)
        WITH u1, gds.alpha.similarity.asVector(m, r1.rating) as u1Vector
        
        MATCH (u2:User)-[r2:RATED]->(m:Movie) WHERE u1<>u2
        
        WITH u1, u2, u1Vector, gds.alpha.similarity.asVector(m, r2.rating) as u2Vector
        WHERE size(apoc.coll.intersection([v in u1Vector | v.category], [v in u2Vector | v.category])) > 10
        
        WITH u1, u2,  gds.alpha.similarity.pearson(u1Vector, u2Vector, {vectorType: "maps"}) AS similarity
        ORDER BY similarity DESC
        LIMIT $n_top_similar_users

        MATCH (u2)-[r:RATED]->(rec:Movie) WHERE NOT EXISTS( (u1)-[:RATED]->(rec) )
        RETURN rec.name as recommendation, SUM( similarity * r.rating) AS score
        ORDER BY score DESC LIMIT $n_top_recommendations

    '''

    cypher_params = {
        'user_id': user_id,
        'n_top_similar_users': n_top_similar_users,
        'n_top_recommendations': n_top_recommendations,
    }
    rows = graph.run(query, parameters=cypher_params).data()

    return pd.DataFrame(rows)

In [None]:
# Pearson-based collaborative-filtering recommendations for one user
pearson_recommendations = get_movie_recommendations_user_similarity_pearson(
    user_id=10,
    n_top_similar_users=10,
    n_top_recommendations=25,
)

pearson_recommendations