In [1]:
from py2neo import Graph
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_columns', None)

## Connect to neo4j database

In [3]:
# Connect to local database 'Kaggle Movie Database' with data
# extracted from https://www.kaggle.com/rounakbanik/the-movies-dataset

graph = Graph("bolt://localhost:7687", auth=("neo4j", "ilovemovies"))

# Recommendation engines

Approaches based on guide http://guides.neo4j.com/sandbox/recommendations

## Content based filtering based on movie genre, cast, crew, production co. and keywords

Recommend items that are similar to those that a user rated highly previously.

### Weighted sum of common traits between movies

In [4]:
def get_n_similar_movies_on_common_traits(movie_name, n):
    
    query = '''
    
    // find movies with common genres with previously watched movies (m)
    
    MATCH (m:Movie {name: $movie_name})-[:BELONGS_TO]->(g:Genre)<-[:BELONGS_TO]-(rec:Movie)
    WITH m, rec, COUNT(*) AS genre_score
    
    // find movies with common actors with previously watched movies (m)
    
    OPTIONAL MATCH (m)<-[:ACTED_IN]-(a:Actor)-[:ACTED_IN]->(rec)
    WITH m, rec, genre_score, COUNT(a) AS cast_score
    
    // find movies with common directors with previously watched movies (m)
    
    OPTIONAL MATCH (m)<-[:DIRECTED]-(d:Director)-[:DIRECTED]->(rec)
    WITH m, rec, genre_score, cast_score, COUNT(d) AS dir_score
    
    // find movies with common writers with previously watched movies (m)
    
    OPTIONAL MATCH (m)<-[:WROTE]-(w:Writer)-[:WROTE]->(rec)
    WITH m, rec, genre_score, cast_score, dir_score, COUNT(w) AS wtr_score
    
    // find movies with common producers with previously watched movies (m)
    
    OPTIONAL MATCH (m)<-[:PRODUCED]-(p:ProductionCompany)-[:PRODUCED]->(rec)
    WITH m, rec, genre_score, cast_score, dir_score, wtr_score, COUNT(p) AS prod_score
    
    // find movies with common keywords with previously watched movies (m)
    
    OPTIONAL MATCH (m)<-[:DESCRIBES]-(k:Keyword)-[:DESCRIBES]->(rec)
    WITH m, rec, genre_score, cast_score, dir_score, wtr_score, prod_score, COUNT(k) AS key_score 
    
    // calculate similarity score

    RETURN rec.name AS recommendation, $movie_name AS similar_to,
    genre_score, cast_score, dir_score, wtr_score, prod_score, key_score,
    (1*genre_score)+(2*cast_score)+(2*dir_score)+(2*wtr_score)+(1*prod_score)+(1*key_score) AS final_score
    ORDER BY final_score DESC LIMIT $n
    
    '''
        
    similar_movies = graph.run(query, parameters = {'movie_name': movie_name, 
                                                    'n': n}).data()
    
    return pd.DataFrame(similar_movies)

In [5]:
def get_movie_recommendations_on_common_traits(user_id, top_n_movies, n_top_recommendations):

    query = '''
    
    
        // find the top positively rated movies (m) from user

        MATCH (u:User {id: $user_id})-[r:RATED]->(m:Movie) WHERE r.rating >= 3

        WITH u, m, r.rating AS r ORDER BY r DESC LIMIT $top_n_movies
        
        
        // find other movies (rec) with common traits with the user favorite movies (m)

        MATCH (m)-[:BELONGS_TO]->(g:Genre)<-[:BELONGS_TO]-(rec:Movie)
        WHERE NOT EXISTS ((u)-[:RATED]->(rec)) AND m <> rec
        WITH u, m, r, rec, COUNT(*) AS gs

        OPTIONAL MATCH (m)<-[:ACTED_IN]-(a:Actor)-[:ACTED_IN]->(rec)
        WITH u, m, r, rec, gs, COUNT(a) AS cs

        OPTIONAL MATCH (m)<-[:DIRECTED]-(d:Director)-[:DIRECTED]->(rec)
        WITH u, m, r, rec, gs, cs, COUNT(d) AS ds

        OPTIONAL MATCH (m)<-[:WROTE]-(w:Writer)-[:WROTE]->(rec)
        WITH u, m, r, rec, gs, cs, ds, COUNT(w) AS ws

        OPTIONAL MATCH (m)<-[:PRODUCED]-(p:ProductionCompany)-[:PRODUCED]->(rec)
        WITH u, m, r, rec, gs, cs, ds, ws, COUNT(p) AS ps

        OPTIONAL MATCH (m)<-[:DESCRIBES]-(k:Keyword)-[:DESCRIBES]->(rec)
        WITH u, m, r, rec, gs, cs, ds, ws, ps, COUNT(k) AS ks 


        // calculate score considering the rating of movie m and common traits with recommendation
        
        RETURN u.id AS user, 
        rec.name AS recommendation,
        m.name AS similar_to, 
        r AS rating, 
        gs AS genre, cs AS cast, (ds + ws + ps) AS crew, ks as keywords,
        (r)*((1*gs)+(2*cs)+(2*ds)+(2*ws)+(1*ps)+(1*ks)) AS score
        ORDER BY score DESC LIMIT $n_top_recommendations

    '''


    r = graph.run(query, parameters = {'user_id': user_id,
                                       'top_n_movies': top_n_movies,
                                       'n_top_recommendations': n_top_recommendations}).data()

    return pd.DataFrame(r)

In [19]:
# Find similar movies

get_n_similar_movies_on_common_traits('Catch Me If You Can', n = 10)

Unnamed: 0,recommendation,similar_to,genre_score,cast_score,dir_score,wtr_score,prod_score,key_score,final_score
0,The Departed,Catch Me If You Can,2,2,0,0,0,0,6
1,Badlands,Catch Me If You Can,2,1,0,0,0,0,4
2,The Green Mile,Catch Me If You Can,2,1,0,0,0,0,4
3,The Aviator,Catch Me If You Can,1,1,0,0,0,1,4
4,The Terminal,Catch Me If You Can,1,1,0,0,1,0,4
5,Nick of Time,Catch Me If You Can,2,1,0,0,0,0,4
6,Bordertown,Catch Me If You Can,2,1,0,0,0,0,4
7,Lucky Number Slevin,Catch Me If You Can,2,0,0,0,0,1,3
8,Free Money,Catch Me If You Can,1,1,0,0,0,0,3
9,The Contract,Catch Me If You Can,2,0,0,0,0,1,3


In [20]:
# Recommend movies to user based on common traits with previous liked movies

get_movie_recommendations_on_common_traits(user_id = 6, top_n_movies = 10, n_top_recommendations = 25)

Unnamed: 0,user,recommendation,similar_to,rating,genre,cast,crew,keywords,score
0,6,The Shipping News,Notes on a Scandal,4.5,2,2,0,3,40.5
1,6,The Piano Teacher,Notes on a Scandal,4.5,2,0,0,6,36.0
2,6,Last Tango in Paris,Notes on a Scandal,4.5,2,0,0,5,31.5
3,6,Heavenly Creatures,Notes on a Scandal,4.5,1,0,0,6,31.5
4,6,Torrente 2: Mission in Marbella,Torrente 3 The Protector,4.5,2,2,0,1,31.5
5,6,Timecop,Hard Target,4.0,3,1,2,0,28.0
6,6,Body of Evidence,Notes on a Scandal,4.5,2,0,0,4,27.0
7,6,Cloud 9,Notes on a Scandal,4.5,2,0,0,4,27.0
8,6,Revolutionary Road,Notes on a Scandal,4.5,2,0,2,2,27.0
9,6,Manhattan,Notes on a Scandal,4.5,2,0,0,4,27.0


### Using jaccard index similarity metric

The Jaccard index is a number between 0 and 1 that indicates how similar two sets are. The Jaccard index of two identical sets is 1. If two sets do not have a common element, then the Jaccard index is 0. The Jaccard is calculated by dividing the size of the intersection of two sets by the union of the two sets.

In [9]:
def get_n_similar_movies_jaccard(movie_name, n):
    
    query = '''
    
    // find movies with common characteristics with previously watched movies (m)
    
    MATCH (m:Movie {name: $movie_name})-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(n)-
    [:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(rec:Movie)
    
    WITH m, rec, COUNT(n) AS intersection, COLLECT(n.name) as i
    
    
    // find all characteristics of movie m
    
    MATCH (m)-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(mn)
    WITH m, rec, intersection, i, COLLECT(mn.name) AS s1
    
    
    // find all characteristics of movie to recommend
    
    MATCH (rec)-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(rn)
    WITH m, rec, intersection, i, s1, COLLECT(rn.name) AS s2
    
    
    // calculate jaccard score
    
    WITH m, rec, intersection, i, s1+[x IN s2 WHERE NOT x IN s1] AS union, s1, s2
    
    RETURN rec.name AS recommendation, m.name AS similar_to, 
    intersection as nr_similarities, i AS similarities, 
    ((1.0*intersection)/SIZE(union)) AS jaccard_score ORDER BY jaccard_score DESC LIMIT $n
    '''
    
    similar_movies = graph.run(query, parameters = {'movie_name': movie_name, 
                                                    'n': n}).data()
    
    return pd.DataFrame(similar_movies)


In [10]:
def get_movie_recommendations_jaccard_similarity(user_id, top_n_movies, n_top_recommendations):

    query = '''
    
        // find the top positively rated movies from user

        MATCH (u:User {id: $user_id})-[r:RATED]->(m:Movie) WHERE r.rating >= 3

        WITH u, m, r.rating AS r ORDER BY r DESC LIMIT $top_n_movies
        
        
        // find movies with common characteristics with watched movie m
        
        MATCH (m)-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(n)-
        [:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(rec:Movie)

        WITH u, m, r, rec, COUNT(n) AS intersection, COLLECT(n.name) as i
        
        
        // find all characteristics of movie m

        MATCH (m)-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(mn)
        WITH u, m, r, rec, intersection, i, COLLECT(mn.name) AS s1
        
        
        // find all characteristics of movie to recommend

        MATCH (rec)-[:BELONGS_TO|:ACTED_IN|:DIRECTED|:WROTE|:PRODUCED|:DESCRIBES]-(rn)
        WITH u, m, r, rec, intersection, i, s1, COLLECT(rn.name) AS s2

        
        // calculate jaccard score and add weight of movie rating

        WITH u, m, r, rec, intersection, i, s1+[x IN s2 WHERE NOT x IN s1] AS union, s1, s2

        RETURN 
        u.id AS user,
        rec.name AS recommendation, 
        m.name AS similar_to, 
        r AS rating,
        intersection as nr_similarities,
        i AS similarities, 
        r * ((1.0*intersection)/SIZE(union)) AS score
        ORDER BY score DESC LIMIT $n_top_recommendations

    '''


    r = graph.run(query, parameters = {'user_id': user_id,
                                       'top_n_movies': top_n_movies,
                                       'n_top_recommendations': n_top_recommendations}).data()

    return pd.DataFrame(r)

In [11]:
# Find similar movies with jaccard similarity

get_n_similar_movies_jaccard('Notting Hill', 10)

Unnamed: 0,recommendation,similar_to,nr_similarities,similarities,jaccard_score
0,Love Actually,Notting Hill,9,"[Hugh Grant, Richard Curtis, Richard Curtis, R...",0.183673
1,About a Boy,Notting Hill,6,"[Hugh Grant, Romance, Comedy, Drama, friendshi...",0.133333
2,Jack & Sarah,Notting Hill,5,"[Romance, Comedy, Drama, Polygram Filmed Enter...",0.116279
3,Love on the Run,Notting Hill,4,"[Romance, Comedy, Drama, bookshop]",0.1
4,When Harry Met Sally...,Notting Hill,4,"[Romance, Comedy, Drama, friendship]",0.097561
5,My Best Friend's Wedding,Notting Hill,4,"[Julia Roberts, Romance, Comedy, new love]",0.097561
6,The Mikado,Notting Hill,3,"[Romance, Comedy, Drama]",0.096774
7,Pretty Woman,Notting Hill,4,"[Julia Roberts, Romance, Comedy, friendship]",0.095238
8,Confessions of a Dangerous Mind,Notting Hill,4,"[Julia Roberts, Romance, Comedy, Drama]",0.095238
9,Dan in Real Life,Notting Hill,4,"[Romance, Comedy, Drama, bookshop]",0.090909


In [12]:
get_movie_recommendations_jaccard_similarity(user_id = 2, top_n_movies = 10, n_top_recommendations = 25)

Unnamed: 0,user,recommendation,similar_to,rating,nr_similarities,similarities,score
0,2,The Godfather: Part II,The Conversation,5.0,9,"[John Cazale, Francis Ford Coppola, Francis Fo...",1.451613
1,2,The Driver,48 Hrs.,5.0,6,"[Walter Hill, Walter Hill, Action, Drama, Crim...",1.304348
2,2,The Godfather,The Conversation,5.0,7,"[Francis Ford Coppola, Francis Ford Coppola, F...",1.129032
3,2,The Assassination Bureau,48 Hrs.,5.0,5,"[Action, Comedy, Crime, Thriller, Paramount Pi...",1.086957
4,2,A Woman Is a Woman,Contempt,5.0,6,"[Jean-Luc Godard, Jean-Luc Godard, Jean-Luc Go...",1.0
5,2,Rumble Fish,The Conversation,5.0,6,"[Francis Ford Coppola, Francis Ford Coppola, F...",1.0
6,2,The Real McCoy,48 Hrs.,5.0,5,"[Action, Drama, Crime, Thriller, prison]",1.0
7,2,The Godfather: Part III,The Conversation,5.0,7,"[Francis Ford Coppola, Francis Ford Coppola, F...",1.0
8,2,Hustle,48 Hrs.,5.0,6,"[Action, Comedy, Drama, Crime, Thriller, Param...",1.0
9,2,Marie Antoinette,Lost in Translation,4.0,7,"[Sofia Coppola, Sofia Coppola, Sofia Coppola, ...",1.0


## Collaborative filtering

Find similar users in the network. Assuming that similar users have similar preferences, what are the movies those similar users like?

### Using cosine similarity

In [13]:
# The cosine similarity of two users will tell us how similar two users' preferences for movies are. 
# Users with a high cosine similarity will have similar preferences.


def get_movie_recommendations_user_similarity_cosine(user_id, n_top_similar_users, n_top_recommendations):
    
    query = '''
    
        MATCH (u1:User {id: $user_id})-[r1:RATED]->(m:Movie)<-[r2:RATED]-(u2:User)
        WHERE u1 <> u2
        
        WITH u1, u2, COLLECT(r1.rating) AS u1ratings, COLLECT(r2.rating) AS u2ratings
        WHERE size(u1ratings) > 10
        
        WITH u1, u2, gds.alpha.similarity.cosine(u1ratings, u2ratings) AS similarity 
        ORDER BY similarity LIMIT $n_top_similar_users

        MATCH (u2)-[r3:RATED]->(rec:Movie)
        WHERE NOT EXISTS ((u1)-[:RATED]->(rec:Movie))

        RETURN rec.name as recommendation, SUM( similarity * r3.rating) AS score
        ORDER BY score DESC LIMIT $n_top_recommendations
    
    '''
    
    
    recommendations = graph.run(query, parameters = {'user_id': user_id,
                                                   'n_top_similar_users': n_top_similar_users,
                                                   'n_top_recommendations': n_top_recommendations
                                                  }).data()
    
    return pd.DataFrame(recommendations)

In [14]:
# find the n most similar users to the user_id and the movies they liked which the user_id hasn't watched yet

cos_recommendations = get_movie_recommendations_user_similarity_cosine(user_id = 100, 
                                                                       n_top_similar_users = 10, 
                                                                       n_top_recommendations = 15)
cos_recommendations

Unnamed: 0,recommendation,score
0,The 39 Steps,37.816973
1,Dawn of the Dead,31.138773
2,Shaft in Africa,27.110095
3,Terminator 3: Rise of the Machines,24.601402
4,Sissi,23.039801
5,Titanic,22.487188
6,Interview with the Vampire,21.96244
7,Monsieur Hulot's Holiday,21.616616
8,48 Hrs.,21.172894
9,M,20.0933


### Using Pearson similarity

Pearson similarity is well-suited for product recommendations because it takes into account the fact that different users will have different mean ratings: on average some users will tend to give higher ratings than others. Since Pearson similarity considers differences about the mean, this metric will account for these discrepancies.



In [15]:
def get_movie_recommendations_user_similarity_pearson(user_id, n_top_similar_users, n_top_recommendations):
    
    query = '''
    
        MATCH (u1:User {id: $user_id})-[r1:RATED]->(m:Movie)
        WITH u1, gds.alpha.similarity.asVector(m, r1.rating) as u1Vector
        
        MATCH (u2:User)-[r2:RATED]->(m:Movie) WHERE u1<>u2
        
        WITH u1, u2, u1Vector, gds.alpha.similarity.asVector(m, r2.rating) as u2Vector
        WHERE size(apoc.coll.intersection([v in u1Vector | v.category], [v in u2Vector | v.category])) > 10
        
        WITH u1, u2,  gds.alpha.similarity.pearson(u1Vector, u2Vector, {vectorType: "maps"}) AS similarity
        ORDER BY similarity DESC
        LIMIT $n_top_similar_users

        MATCH (u2)-[r:RATED]->(rec:Movie) WHERE NOT EXISTS( (u1)-[:RATED]->(rec) )
        RETURN rec.name as recommendation, SUM( similarity * r.rating) AS score
        ORDER BY score DESC LIMIT $n_top_recommendations

    '''
    
    
    recommendations = graph.run(query, parameters = {'user_id': user_id,
                                                   'n_top_similar_users': n_top_similar_users,
                                                   'n_top_recommendations': n_top_recommendations
                                                  }).data()
    return pd.DataFrame(recommendations)

In [16]:
pearson_recommendations = get_movie_recommendations_user_similarity_pearson(user_id = 10, 
                                                                            n_top_similar_users = 10,
                                                                            n_top_recommendations = 25)

pearson_recommendations

Unnamed: 0,recommendation,score
0,Dawn of the Dead,28.464686
1,The Thomas Crown Affair,27.979029
2,Terminator 3: Rise of the Machines,19.37498
3,Sleepless in Seattle,18.954318
4,The Talented Mr. Ripley,17.972219
5,Men in Black II,17.788819
6,Once Were Warriors,17.648453
7,Scarface,17.601811
8,Cold Mountain,17.58352
9,Solaris,17.451119
