# Hello

![](./localhost.png)

In [1]:
# Import py2neo and connect to Neo4j
import os

from py2neo import Graph

# Connection settings. The defaults below are only an example; override them
# with environment variables rather than committing real credentials here.
NEO4J_HOST = os.environ.get("NEO4J_HOST", "localhost")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "kiss")

# 7687 is the Bolt port, so it is passed as `bolt_port`. The original passed it
# as `http_port`, which names the HTTP/REST endpoint instead.
graph = Graph(bolt=True, host=NEO4J_HOST, bolt_port=7687, user=NEO4J_USER, password=NEO4J_PASSWORD)

In [2]:
# Hello world, sanity check: count every node currently stored in the graph.
node_count_query = "MATCH (a) RETURN COUNT(a) AS numberOfNodes"
graph.run(node_count_query).evaluate()

0

In [3]:
# Create all the constraints
# One uniqueness constraint on `id` per node label used by the CSV imports.

constraint_statements = [
    "CREATE CONSTRAINT ON (p:Product) ASSERT p.id IS UNIQUE;",
    "CREATE CONSTRAINT ON (a:Aisle) ASSERT a.id IS UNIQUE;",
    "CREATE CONSTRAINT ON (d:Department) ASSERT d.id IS UNIQUE;",
    "CREATE CONSTRAINT ON (u:User) ASSERT u.id IS UNIQUE;",
]
constraint_cursors = [graph.run(statement) for statement in constraint_statements]

# Leave the cursor of the last statement as the cell's displayed value.
constraint_cursors[-1]

<py2neo.graph.Cursor at 0x21720818940>

In [4]:
# Load the CSV files
# Files are located in the neo4j_home/import folder
# products first
#
# Every node is MERGEd on `id` only, which is the property covered by the
# uniqueness constraints. The name is set with ON CREATE. Merging on
# {id, name} together would try to create a second node, and so violate the
# constraint, whenever the same id appears with a slightly different name.
# Using MERGE for the product and its relationships also makes the cell safe
# to re-run: a second run matches the existing data instead of failing or
# duplicating relationships.

load_products_query = """
USING PERIODIC COMMIT
LOAD CSV WITH HEADERS FROM "file:///products_clean.csv" AS line
MERGE (product:Product {id: toInteger(line.product_id)})
  ON CREATE SET product.name = line.product_name
MERGE (aisle:Aisle {id: toInteger(line.aisle_id)})
  ON CREATE SET aisle.name = line.aisle
MERGE (department:Department {id: toInteger(line.department_id)})
  ON CREATE SET department.name = line.department
MERGE (product)-[:FOUND_IN]->(aisle)
MERGE (product)-[:TYPE_OF]->(department);
"""
graph.run(load_products_query)

<py2neo.graph.Cursor at 0x217208186a0>

In [5]:
# users next
# This will take a while...
#
# For each CSV row: look up the already-imported product, get-or-create the
# user, then create a BOUGHT relationship carrying the row's total_orders as
# the integer property `order_total`.

load_users_query = """
USING PERIODIC COMMIT 
LOAD CSV WITH HEADERS FROM "file:///users_orders.csv" AS line WITH line
MATCH (product:Product {id: toInteger(line.product_id)})
MERGE (user:User {id: toInteger(line.user_id)})
CREATE (user)-[b:BOUGHT]->(product)
SET b.order_total = toInteger(line.total_orders);
"""
graph.run(load_users_query)

<py2neo.graph.Cursor at 0x21720818cc0>

## Collaborative filtering

In [65]:
import pandas as pd

In [8]:
# Import number generator to generate a random user ID to query
from random import randint

Next user is: 151258


In [12]:
# Draw the user ID to query. randint includes both endpoints.
# NOTE(review): confirm that 0 and 206210 are valid user IDs in the dataset.
lowest_user_id, highest_user_id = 0, 206210
random_user = randint(lowest_user_id, highest_user_id)

selected_message = "The selected user ID is: {}"
print(selected_message.format(random_user))

Next user is: 83539


What if we go by aisle? What else do other users buy just as much or more in the same aisle?

In [14]:
# Aisle-based collaborative filtering.
# Start from a product the user bought, move to its aisle, and collect the
# other products in that aisle that another user bought at least as often
# (b2.order_total >= b1.order_total) and that our user has not bought.
# Each matching path adds one to a product's count.
# NOTE(review): `{user_id}` is the legacy Cypher parameter syntax; newer Neo4j
# releases expect `$user_id` - confirm against the target server version.
aisle_query = """
MATCH (user:User {id: {user_id}})-[b1:BOUGHT]->()-[:FOUND_IN]->()<-[:FOUND_IN]-(otherproduct:Product)<-[b2:BOUGHT]-(otheruser:User)
WHERE NOT (user)-[:BOUGHT]->(otherproduct) AND b2.order_total >= b1.order_total
RETURN otherproduct.name AS recommendation,
       COUNT(*) AS usersInCommon
ORDER BY usersInCommon DESC
LIMIT 10"""
results = graph.run(aisle_query, user_id=random_user)

for record in results:
    print(record)

('otherproduct.name': 'Organic Baby Spinach', 'usersInCommon': 8568)
('otherproduct.name': 'Organic Blueberries', 'usersInCommon': 4136)
('otherproduct.name': 'Organic Grape Tomatoes', 'usersInCommon': 4110)
('otherproduct.name': 'Organic Raspberries', 'usersInCommon': 4002)
('otherproduct.name': 'Seedless Red Grapes', 'usersInCommon': 3169)
('otherproduct.name': 'Organic Baby Carrots', 'usersInCommon': 3123)
('otherproduct.name': 'Organic Baby Arugula', 'usersInCommon': 2985)
('otherproduct.name': 'Organic Peeled Whole Baby Carrots', 'usersInCommon': 2566)
('otherproduct.name': 'Clementines, Bag', 'usersInCommon': 2077)
('otherproduct.name': 'Original Hummus', 'usersInCommon': 1563)


## Simple Version

What else do other users buy, when they bought a shared product at least as often as our user did?

In [27]:
# Simple collaborative filtering.
# Find other users who bought one of our user's products at least as often as
# they did, then rank everything else those users bought that our user has
# not. Each matching path adds one to a product's count.
simple_query = """
MATCH (user:User {id: {user_id}})-[b1:BOUGHT]->(:Product)<-[b2:BOUGHT]-(otheruser:User)
MATCH (otheruser)-[:BOUGHT]->(rec:Product)
WHERE NOT EXISTS( (user)-[:BOUGHT]->(rec)) AND b2.order_total >= b1.order_total
RETURN rec.name, COUNT(*) AS usersWhoAlsoBought
ORDER BY usersWhoAlsoBought DESC 
LIMIT 10"""
results = graph.run(simple_query, user_id=random_user)

for record in results:
    print(record)

('rec.name': 'Bag of Organic Bananas', 'usersWhoAlsoBought': 741)
('rec.name': 'Organic Strawberries', 'usersWhoAlsoBought': 629)
('rec.name': 'Organic Hass Avocado', 'usersWhoAlsoBought': 596)
('rec.name': 'Organic Raspberries', 'usersWhoAlsoBought': 525)
('rec.name': 'Organic Baby Spinach', 'usersWhoAlsoBought': 520)
('rec.name': 'Organic Blueberries', 'usersWhoAlsoBought': 403)
('rec.name': 'Banana', 'usersWhoAlsoBought': 398)
('rec.name': 'Organic Lemon', 'usersWhoAlsoBought': 319)
('rec.name': 'Limes', 'usersWhoAlsoBought': 264)
('rec.name': 'Organic Zucchini', 'usersWhoAlsoBought': 235)


Consider the type of food.

In [70]:
# Pick a fresh user ID for the next experiments. randint includes both ends.
# NOTE(review): confirm that 0 and 206210 are valid user IDs in the dataset.
lowest_user_id, highest_user_id = 0, 206210
random_user = randint(lowest_user_id, highest_user_id)

next_user_message = "Next user is: {}"
print(next_user_message.format(random_user))

Next user is: 157770


In [71]:
# Aisle-preference recommendations, in three stages:
#   1. `mean`  - the user's average order_total over everything they bought.
#   2. `score` - per aisle, how many of the user's products in that aisle were
#                bought at least `mean` times.
#   3. Recommend products in those aisles the user has not bought, ranked by
#      the summed aisle score.
aisle_score_query = """
MATCH (user:User {id: {user_id}})-[b:BOUGHT]->(p:Product)
WITH user, avg(b.order_total) AS mean

MATCH (user)-[b:BOUGHT]->(p:Product)-[:FOUND_IN]->(a:Aisle)
WHERE b.order_total >= mean

WITH user, a, COUNT(*) AS score

MATCH (a)<-[:FOUND_IN]-(rec:Product)
WHERE NOT EXISTS((user)-[:BOUGHT]->(rec))

RETURN rec.name AS recommendation, COLLECT(DISTINCT a.name) AS productType, SUM(score) AS sscore
ORDER BY sscore DESC LIMIT 10
"""
results = graph.run(aisle_score_query, user_id=random_user)

for record in results:
    print(record)

('recommendation': 'Dark Chocolate Stump Town Coffee', 'productType': ['candy chocolate'], 'sscore': 2)
('recommendation': 'Sweet And Savory Thai Rice Crackers', 'productType': ['crackers'], 'sscore': 2)
('recommendation': 'Gluten Free Cracklebred Multigrain', 'productType': ['crackers'], 'sscore': 2)
('recommendation': 'Milk Chocolate Candies Fun Size Packs', 'productType': ['candy chocolate'], 'sscore': 2)
('recommendation': 'Delicate Wafers Layered with Chocolate Creme and Covered in a Milk Chocolatey Coating Milk Chocolate Candy', 'productType': ['candy chocolate'], 'sscore': 2)
('recommendation': 'Skinny Cow Dreamy Clusters Dark Chocolate Candy Pouches', 'productType': ['candy chocolate'], 'sscore': 2)
('recommendation': 'Feijoas', 'productType': ['fresh fruits'], 'sscore': 2)
('recommendation': 'Grape Licorice Twists', 'productType': ['candy chocolate'], 'sscore': 2)
('recommendation': 'Whole Seeded Watermelon', 'productType': ['fresh fruits'], 'sscore': 2)
('recommendation': 'Go

Not helpful with the same scores...

Let's see if similarity metrics will help instead.

### 1) Cosine Similarity

The cosine similarity of two users tells us how similar their product preferences are. Users with a high cosine similarity have similar preferences.

In [46]:
# Cosine similarity between the selected user (p1) and every other user (p2).
# It is computed over the products both bought, using order_total as the
# vector components: dot product divided by the product of the two vector
# lengths. Only pairs sharing more than 10 products are kept.
cosine_query = """
MATCH (p1:User {id: {user_id}})-[x:BOUGHT]->(p:Product)<-[y:BOUGHT]-(p2:User)
WITH COUNT(p) AS numberproducts, SUM(x.order_total * y.order_total) AS xyDotProduct,
SQRT(REDUCE(xDot = 0.0, a IN COLLECT(x.order_total) | xDot + a^2)) AS xLength,
SQRT(REDUCE(yDot = 0.0, b IN COLLECT(y.order_total) | yDot + b^2)) AS yLength,
p1, p2 WHERE numberproducts > 10
RETURN p2.id, xyDotProduct / (xLength * yLength) AS cosim
ORDER BY cosim DESC 
LIMIT 10"""
results = graph.run(cosine_query, user_id=random_user)

for record in results:
    print(record)

('p2.id': 169980, 'sim': 0.9969876781034773)
('p2.id': 172952, 'sim': 0.9801803870196637)
('p2.id': 43659, 'sim': 0.9778117301634254)
('p2.id': 32373, 'sim': 0.9772545497599154)
('p2.id': 23301, 'sim': 0.9763251586321616)
('p2.id': 55857, 'sim': 0.9736320448389624)
('p2.id': 35072, 'sim': 0.9684002494125045)
('p2.id': 95504, 'sim': 0.9682818167765578)
('p2.id': 178096, 'sim': 0.9671176244289773)
('p2.id': 165458, 'sim': 0.9669875568304563)


### 2) Pearson Similarity

This is particularly well-suited for product recommendations because it takes into account the fact that different users have different mean order totals: some people buy almost everything from Instacart, while others prefer to shop in person. Since Pearson similarity considers deviations from each user's mean, this metric accounts for these discrepancies.

In [58]:
# Pearson similarity between the selected user (u1) and every other user (u2).
#   - u1_mean / u2_mean: each user's average order_total over all purchases.
#   - totalorders: the (r1, r2) relationship pairs for products both bought;
#     only users sharing more than 10 products are kept.
#   - pearson = nom / denom, skipping pairs whose denominator is zero.
pearson_query = """
MATCH (u1:User {id: {user_id}})-[r:BOUGHT]->(m:Product)
WITH u1, avg(r.order_total) AS u1_mean

MATCH (u1)-[r1:BOUGHT]->(m:Product)<-[r2:BOUGHT]-(u2)
WITH u1, u1_mean, u2, COLLECT({r1: r1, r2: r2}) AS totalorders WHERE size(totalorders) > 10

MATCH (u2)-[r:BOUGHT]->(m:Product)
WITH u1, u1_mean, u2, avg(r.order_total) AS u2_mean, totalorders

UNWIND totalorders AS r

WITH sum( (r.r1.order_total - u1_mean) * (r.r2.order_total - u2_mean) ) AS nom,
     sqrt( sum( (r.r1.order_total - u1_mean)^2) * sum( (r.r2.order_total - u2_mean) ^2)) AS denom,
     u1, u2 WHERE denom <> 0

RETURN u2.id, nom/denom AS pearson
ORDER BY pearson DESC 
LIMIT 10"""
results = graph.run(pearson_query, user_id=random_user)

for record in results:
    print(record)

('u2.id': 35072, 'pearson': 0.9727175394285357)
('u2.id': 43659, 'pearson': 0.9637119084411767)
('u2.id': 165458, 'pearson': 0.9293957775127478)
('u2.id': 95504, 'pearson': 0.9201349269162116)
('u2.id': 152404, 'pearson': 0.8765293905143908)
('u2.id': 165996, 'pearson': 0.8226642814199816)
('u2.id': 194017, 'pearson': 0.787541481037643)
('u2.id': 169980, 'pearson': 0.739962078431307)
('u2.id': 165005, 'pearson': 0.7358519473824435)
('u2.id': 172952, 'pearson': 0.6187372937371)


I have a problem with so many 0.90+ scores for the cosine similarity metric. The Pearson scores seem to be better spread...

In [63]:
# Recommendations weighted by Pearson similarity.
# The similarity computation is the same as in the Pearson cell, but instead
# of returning the neighbours it keeps the 10 most similar users. It then
# scores every product they bought that u1 has not, as the sum of
# (pearson * order_total) across those neighbours.
pearson_rec_query = """
MATCH (u1:User {id: {user_id}})-[r:BOUGHT]->(m:Product)
WITH u1, avg(r.order_total) AS u1_mean

MATCH (u1)-[r1:BOUGHT]->(m:Product)<-[r2:BOUGHT]-(u2)
WITH u1, u1_mean, u2, COLLECT({r1: r1, r2: r2}) AS totalorders WHERE size(totalorders) > 10

MATCH (u2)-[r:BOUGHT]->(m:Product)
WITH u1, u1_mean, u2, avg(r.order_total) AS u2_mean, totalorders

UNWIND totalorders AS r

WITH sum( (r.r1.order_total - u1_mean) * (r.r2.order_total - u2_mean) ) AS nom,
     sqrt( sum( (r.r1.order_total - u1_mean)^2) * sum( (r.r2.order_total - u2_mean) ^2)) AS denom,
     u1, u2 WHERE denom <> 0

WITH u1, u2, nom/denom AS pearson
ORDER BY pearson DESC LIMIT 10

MATCH (u2)-[r:BOUGHT]->(m:Product) WHERE NOT EXISTS( (u1)-[:BOUGHT]->(m) )

RETURN m.name AS recommendation, SUM( pearson * r.order_total) AS score
ORDER BY score DESC 
LIMIT 10"""
results = graph.run(pearson_rec_query, user_id=random_user)

for record in results:
    print(record)

('recommendation': 'Clementines', 'score': 93.32503170845067)
('recommendation': 'Strawberries', 'score': 72.32054522971809)
('recommendation': '0% Greek Strained Yogurt', 'score': 61.81156358012525)
('recommendation': 'Organic Mexican Blend Finely Shredded Cheese', 'score': 56.702986634710285)
('recommendation': 'Almonds', 'score': 52.808198036681276)
('recommendation': 'Organic Spring Mix', 'score': 48.82757182433385)
('recommendation': 'Organic Romaine Leaf', 'score': 45.730714460175)
('recommendation': 'Organic Extra Virgin Olive Oil', 'score': 44.19657590190201)
('recommendation': 'Vanilla Greek Yogurt 0% Fat', 'score': 40.16461553291979)
('recommendation': 'Organic Reduced Fat Milk', 'score': 39.95795223529057)


#### **Conclusion**

I am quite happy using the Pearson similarity to find other products from similar users based on the total number of orders for that product in lieu of a ratings system.

## Content-based filtering

What are other products similar to what you are looking at?

In [None]:
# Content-based filtering: given the product being viewed, find other products
# that share its aisle and/or department and rank them by how many of those
# categories they have in common.
#
# The original query could not run. It returned `otherGroup` and `topic`,
# which were never bound. It traversed FOUND_IN from the aisle to the product,
# although the import creates (product)-[:FOUND_IN]->(aisle). It also filtered
# on a leftover "Graph Database" placeholder.
# This version follows the schema created by the import cells:
#   (Product)-[:FOUND_IN]->(Aisle), (Product)-[:TYPE_OF]->(Department)

# Example product; replace with the product the shopper is looking at.
product_name = "Organic Baby Spinach"

result = graph.run('''
MATCH (product:Product {name: {product_name}})-[:FOUND_IN|TYPE_OF]->(category)<-[:FOUND_IN|TYPE_OF]-(otherproduct:Product)
RETURN otherproduct.name AS recommendation,
       COUNT(category) AS categoriesInCommon,
       COLLECT(category.name) AS categories
ORDER BY categoriesInCommon DESC, recommendation
LIMIT 10
''', product_name=product_name)

for row in result:
    print(row)

## Making recommendations

What to buy next? Meat -> Vegetable -> Wine or other way round