In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from py2neo import Graph

%matplotlib inline

In [4]:
# Open the py2neo handle used by every later cell, authenticating as user 'neo4j'.
# NOTE(review): the first argument looks like a relative filesystem path rather
# than a bolt/http URI, yet the repr printed by the next cell reports
# bolt://localhost:7687 — confirm how py2neo interprets this value.
graph = Graph('event_db2/installation-3.4.6/import', username = 'neo4j')

In [5]:
# Display the Graph repr to check which server the handle points at
graph

<Graph database=<Database uri='bolt://localhost:7687' secure=False user_agent='py2neo/4.1.0 neo4j-python/1.6.2 Python/3.6.5-final-0 (darwin)'> name='data'>

In [7]:
# --- Users -----------------------------------------------------------------
# Pipe-separated file without a header row; n_u is the number of users read.
user = pd.read_csv(
    'ml-100k 3/u.user',
    sep='|',
    header=None,
    names=['id', 'age', 'gender', 'occupation', 'zip code'],
)
n_u = len(user)

# --- Genres ----------------------------------------------------------------
# Each line is "name|id"; n_g is the number of genres read.
genre = pd.read_csv(
    'ml-100k 3/u.genre',
    sep='|',
    header=None,
    names=['name', 'id'],
)
n_g = len(genre)

# --- Movies ----------------------------------------------------------------
# Format : id | title | release date | | IMDb url | "genres"
# "genres" is a 0/1 vector of size n_g: genres[i]=1 when the movie belongs to
# genre i. Those trailing columns are labelled with the genre ids themselves.
movie_col = ['id', 'title', 'release date', 'useless', 'IMDb url'] + genre['id'].tolist()
movie = pd.read_csv(
    'ml-100k 3/u.item',
    sep='|',
    header=None,
    names=movie_col,
    encoding='cp1252',
).fillna('unknown')  # missing fields become the literal string 'unknown'
n_m = len(movie)

# --- Ratings ---------------------------------------------------------------
# Tab-separated file without a header row; n_r is the number of ratings read.
rating_col = ['user_id', 'item_id', 'rating', 'timestamp']
rating = pd.read_csv(
    'ml-100k 3/u.data',
    sep='\t',
    header=None,
    names=rating_col,
)
n_r = len(rating)

In [9]:
##### User nodes: one node per user, keyed by its user_id #####
# MERGE only creates a node when no node with that user_id exists yet,
# so re-running this section does not duplicate users.
tx = graph.begin()
statement = "MERGE (a:`User`{user_id:{A}}) RETURN a"

# Queue one MERGE per user id inside the same transaction
user_params = [{"A": uid} for uid in user['id']]
for params in user_params:
    tx.run(statement, params)

tx.commit()


##### Genre nodes: one node per genre, carrying genre_id and name #####
tx = graph.begin()
statement = "MERGE (a:`Genre`{genre_id:{A}, name:{B}}) RETURN a"

# Walk the two columns side by side: A receives the id, B the name
for genre_name, genre_id in zip(genre['name'], genre['id']):
    tx.run(statement, {"A": genre_id, "B": genre_name})

tx.commit()

In [18]:
##### Create the Movie nodes with properties movie_id, title and url ; then create the Is_genre edges #####
tx = graph.begin()
statement1 = "MERGE (a:`Movie`{movie_id:{A}, title:{B}, url:{C}}) RETURN a"
statement2 = ("MATCH (t:`Genre`{genre_id:{D}}) "
              "MATCH (a:`Movie`{movie_id:{A}, title:{B}, url:{C}}) MERGE (a)-[r:`Is_genre`]->(t) RETURN r")

# The one-hot genre columns of `movie` are labelled with the genre ids, so the
# flag for genre g is read by label. This avoids assuming a fixed number of
# trailing columns, or that a genre's row position in `genre` equals its id.
genre_ids = genre['id'].tolist()

# Looping over movies m (m is the row label of `movie`)
for m, row in movie.iterrows():
    # Same parameters for the node and its edges, with movie_id always sent as a plain int
    movie_params = {"A": int(row.loc['id']), "B": row.loc['title'], "C": row.loc['IMDb url']}

    # Create "Movie" node
    tx.run(statement1, movie_params)

    # Link the movie to every Genre whose flag column equals 1
    for g in genre_ids:
        if row.loc[g] == 1:
            tx.run(statement2, dict(movie_params, D=int(g)))

    # Every 100 movies, push queued statements to the server for execution to avoid one massive "commit"
    if m % 100 == 0:
        tx.process()

# End with a "commit"
tx.commit()


##### Has_rated edges: (User)-[Has_rated {rating}]->(Movie) #####
tx = graph.begin()
statement = ("MATCH (u:`User`{user_id:{A}}) "
             "MATCH (m:`Movie`{movie_id:{C}}) MERGE (u)-[r:`Has_rated`{rating:{B}}]->(m) RETURN r")

# One statement per rating row: look up both endpoints by id and merge the
# relationship carrying the rating value. Only the three columns used are read.
rating_rows = rating[['user_id', 'rating', 'item_id']].itertuples(name=None)
for r, uid, stars, mid in rating_rows:
    tx.run(statement, {"A": int(uid), "B": int(stars), "C": int(mid)})
    # Flush the queued statements whenever the row label is a multiple of 100
    if r % 100 == 0:
        tx.process()

tx.commit()


In [19]:
# Index the id property of each node label so lookups by id are cheap
index_statements = (
    'CREATE INDEX ON :User(user_id)',
    'CREATE INDEX ON :Movie(movie_id)',
    'CREATE INDEX ON :Genre(genre_id)',
)
index_cursors = [graph.run(cypher) for cypher in index_statements]

# Keep the cursor of the last statement as the cell's displayed value
index_cursors[-1]

<py2neo.database.Cursor at 0x11504ff28>

In [33]:
# Target user of the recommendation, and minimum similarity for another user
# to count as "similar" to the target
user_id = 8
threshold = 0.5

# In Strategy 1, the similarity between two users u1 and u2 is the proportion of movies they have in common
# (number of u1's movies also rated by u2, divided by the number of movies rated by u1).
# The score of one given movie m is the proportion of users similar to u1 who rated m.

query = (### Similarity normalization : count number of movies seen by u1 ###
  # Count movies rated by u1 as countm
  'MATCH (u1:`User` {user_id:{user_id}})-[:`Has_rated`]->(m1:`Movie`) '
  'WITH count(m1) as countm '
  ### Score normalization : count number of users who are considered similar to u1 ###
  # Retrieve all users u2 who share at least one movie with u1
  'MATCH (u1:`User` {user_id:{user_id}})-[:`Has_rated`]->(m1:`Movie`) '
  'MATCH (m1)<-[r:`Has_rated`]-(u2:`User`) '
  'WHERE NOT u2=u1 '
  # Compute similarity
  'WITH u2, countm, tofloat(count(r))/countm as sim '
  # Keep users u2 whose similarity with u1 is above some threshold
  'WHERE sim>{threshold} '
  # Count number of similar users as countu
  'WITH count(u2) as countu, countm '
  ### Recommendation ###
  # Retrieve all users u2 who share at least one movie with u1
  'MATCH (u1:`User` {user_id:{user_id}})-[:`Has_rated`]->(m1:`Movie`) '
  'MATCH (m1)<-[r:`Has_rated`]-(u2:`User`) '
  'WHERE NOT u2=u1 '
  # Compute similarity
  'WITH u1, u2,countu, tofloat(count(r))/countm as sim '
  # Keep users u2 whose similarity with u1 is above some threshold
  'WHERE sim>{threshold} '
  # Retrieve movies m that were rated by at least one similar user, but not by u1
  'MATCH (m:`Movie`)<-[r:`Has_rated`]-(u2) '
  'WHERE NOT (m)<-[:`Has_rated`]-(u1) '
  # Compute score and return the list of suggestions ordered by score
  'RETURN DISTINCT m, tofloat(count(r))/countu as score ORDER BY score DESC ')

# The query only reads, so it is run directly on the graph (as the index
# statements are) and the returned cursor is kept in `result`; previously the
# cursor was discarded and `result` was never assigned.
result = graph.run(query, {'user_id': user_id, 'threshold': threshold})

In [32]:
# Display `result`, which must already be bound by an earlier cell.
# NOTE(review): this cell's execution count (In [32]) precedes the query
# cell's (In [33]) — confirm the notebook still works under Restart & Run All.
result

<py2neo.database.Cursor at 0x1130f3908>