# Hello

![](./localhost.png)

In [21]:
# Standard library: credential lookup and the random number generator used
# later to pick a random user ID and product ID to query.
import os
from getpass import getpass
from random import randint

# Import py2neo and connect to Neo4j
from py2neo import Graph

# Connection settings are read from the environment so that no credentials are
# stored in the notebook. Set NEO4J_HOST / NEO4J_USER / NEO4J_PASSWORD for your
# own Neo4j instance; the password is prompted for when it is not set.
neo4j_host = os.environ.get("NEO4J_HOST", "localhost")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

# 7687 is the Bolt port, so it is passed as bolt_port rather than http_port.
graph = Graph(bolt=True, host=neo4j_host, bolt_port=7687, user=neo4j_user, password=neo4j_password)

In [37]:
# Sanity check: count every node in the database to confirm the connection works.
node_count_query = "MATCH (a) RETURN COUNT(a) AS numberOfNodes"
graph.run(node_count_query).evaluate()

242654

In [None]:
# # Create all the constraints

# graph.run("CREATE CONSTRAINT ON (p:Product) ASSERT p.id IS UNIQUE;")
# graph.run("CREATE CONSTRAINT ON (a:Aisle) ASSERT a.id IS UNIQUE;")
# graph.run("CREATE CONSTRAINT ON (d:Department) ASSERT d.id IS UNIQUE;")
# graph.run("CREATE CONSTRAINT ON (u:User) ASSERT u.id IS UNIQUE;")

In [None]:
# # Load the CSV files
# # Files are located in the neo4j_home/import folder
# # products first

# graph.run("""
# USING PERIODIC COMMIT 
# LOAD CSV WITH HEADERS FROM "file:///products_clean.csv" AS line WITH line
# CREATE (product:Product {id: toInteger(line.product_id), name: line.product_name})
# MERGE (aisle:Aisle {id: toInteger(line.aisle_id), name: line.aisle})
# MERGE (department:Department {id: toInteger(line.department_id),name: line.department})
# CREATE (product)-[:FOUND_IN]->(aisle)
# CREATE (product)-[:TYPE_OF]->(department);
# """)

In [None]:
# # users next
# # This will take a while...

# graph.run("""
# USING PERIODIC COMMIT 
# LOAD CSV WITH HEADERS FROM "file:///users_orders.csv" AS line WITH line
# MATCH (product:Product {id: toInteger(line.product_id)})
# MERGE (user:User {id: toInteger(line.user_id)})
# CREATE (user)-[b:BOUGHT]->(product)
# SET b.order_total = toInteger(line.total_orders);
# """)

## Collaborative filtering

In [28]:
# Pick a user ID at random for the collaborative-filtering queries below.
# NOTE(review): randint includes both endpoints - confirm that 0 and 206210 are valid user IDs.
selection_message = "The selected user ID is: {}"
random_user = randint(0, 206210)
print(selection_message.format(random_user))

The selected user ID is: 12570


What if we go by aisle? What else do other users buy just as much or more in the same aisle?

In [29]:
# Aisle-based collaborative filtering: starting from the products the selected
# user bought, walk to their aisles and on to other products in those aisles
# that other users bought at least as often (b2.order_total >= b1.order_total),
# skipping anything the selected user already bought.
# Note: COUNT(*) counts matching paths, so a user reachable through several of
# the selected user's products contributes more than once to usersInCommon.
# The $user_id parameter syntax is used because {user_id} was removed in Neo4j 4.0.
aisle_cf_query = """
MATCH (user:User {id: $user_id})-[b1:BOUGHT]->()-[:FOUND_IN]->()<-[:FOUND_IN]-(otherproduct:Product)<-[b2:BOUGHT]-(otheruser:User)
WHERE NOT (user)-[:BOUGHT]->(otherproduct) AND b2.order_total >= b1.order_total
RETURN otherproduct.name AS recommendation,
       COUNT(*) AS usersInCommon
ORDER BY usersInCommon DESC
LIMIT 10"""
results = graph.run(aisle_cf_query, user_id=random_user)

for row in results:
    print(row)

('recommendation': 'Banana', 'usersInCommon': 37169)
('recommendation': 'Organic Strawberries', 'usersInCommon': 32013)
('recommendation': 'Bag of Organic Bananas', 'usersInCommon': 31485)
('recommendation': 'Raspberries', 'usersInCommon': 24473)
('recommendation': 'Organic Hass Avocado', 'usersInCommon': 22389)
('recommendation': 'Hass Avocados', 'usersInCommon': 21427)
('recommendation': 'Strawberries', 'usersInCommon': 19063)
('recommendation': 'Large Lemon', 'usersInCommon': 18414)
('recommendation': 'Organic Avocado', 'usersInCommon': 18205)
('recommendation': 'Limes', 'usersInCommon': 16428)


## Simple Version (the KNN of Recommendations)

What else do other users, who buy the same products at least as often, also buy?

In [30]:
# Simple neighbour-based recommendation: find other users who bought one of the
# selected user's products at least as often, then count what else those users
# bought that the selected user has not.
# The $user_id parameter syntax is used because {user_id} was removed in Neo4j 4.0.
knn_query = """
MATCH (user:User {id: $user_id})-[b1:BOUGHT]->(:Product)<-[b2:BOUGHT]-(otheruser:User)
MATCH (otheruser)-[:BOUGHT]->(rec:Product)
WHERE NOT EXISTS( (user)-[:BOUGHT]->(rec)) AND b2.order_total >= b1.order_total
RETURN rec.name, COUNT(*) AS usersWhoAlsoBought
ORDER BY usersWhoAlsoBought DESC
LIMIT 10"""
results = graph.run(knn_query, user_id=random_user)

for row in results:
    print(row)

('rec.name': 'Bag of Organic Bananas', 'usersWhoAlsoBought': 11493)
('rec.name': 'Organic Strawberries', 'usersWhoAlsoBought': 9445)
('rec.name': 'Banana', 'usersWhoAlsoBought': 8294)
('rec.name': 'Strawberries', 'usersWhoAlsoBought': 7252)
('rec.name': 'Organic Hass Avocado', 'usersWhoAlsoBought': 5604)
('rec.name': 'Organic Blueberries', 'usersWhoAlsoBought': 5204)
('rec.name': 'Clementines', 'usersWhoAlsoBought': 4243)
('rec.name': 'Hass Avocados', 'usersWhoAlsoBought': 4071)
('rec.name': 'Organic Avocado', 'usersWhoAlsoBought': 4036)
('rec.name': 'Raspberries', 'usersWhoAlsoBought': 3982)


Consider the type of food.

In [12]:
# Draw a fresh user ID for the next set of queries.
# NOTE(review): randint includes both endpoints - confirm that 0 and 206210 are valid user IDs.
next_user_message = "Next user is: {}"
random_user = randint(0, 206210)
print(next_user_message.format(random_user))

Next user is: 159738


In [31]:
# Aisle-weighted recommendation:
#   1. compute the selected user's mean order_total,
#   2. score each aisle by the number of the user's purchases at or above that mean,
#   3. recommend products from those aisles that the user has not bought,
#      ranked by the summed aisle score.
# The $user_id parameter syntax is used because {user_id} was removed in Neo4j 4.0.
aisle_score_query = """
MATCH (user:User {id: $user_id})-[b:BOUGHT]->(p:Product)
WITH user, avg(b.order_total) AS mean

MATCH (user)-[b:BOUGHT]->(p:Product)-[:FOUND_IN]->(a:Aisle)
WHERE b.order_total >= mean

WITH user, a, COUNT(*) AS score

MATCH (a)<-[:FOUND_IN]-(rec:Product)
WHERE NOT EXISTS((user)-[:BOUGHT]->(rec))

RETURN rec.name AS recommendation, COLLECT(DISTINCT a.name) AS productType, SUM(score) AS sscore
ORDER BY sscore DESC LIMIT 10
"""
results = graph.run(aisle_score_query, user_id=random_user)

for row in results:
    print(row)

('recommendation': 'Organic Spring Mix', 'productType': ['packaged vegetables fruits'], 'sscore': 2)
('recommendation': 'Organic Portabello Mushroom Caps', 'productType': ['packaged vegetables fruits'], 'sscore': 2)
('recommendation': 'Organic Apple Fuji 3 Lb Bag', 'productType': ['packaged vegetables fruits'], 'sscore': 2)
('recommendation': 'Packaged Organic Apricots', 'productType': ['packaged vegetables fruits'], 'sscore': 2)
('recommendation': 'Grapes, Certified Organic, California, Black Seedless', 'productType': ['packaged vegetables fruits'], 'sscore': 2)
('recommendation': 'Sliced Baby Bellas Mushrooms', 'productType': ['packaged vegetables fruits'], 'sscore': 2)
('recommendation': 'Parmesan Crusted Cod', 'productType': ['frozen meat seafood'], 'sscore': 2)
('recommendation': 'Organic Baby Romaine Lettuce', 'productType': ['packaged vegetables fruits'], 'sscore': 2)
('recommendation': 'Santa Fe Caesar Complete Salad Kit', 'productType': ['packaged vegetables fruits'], 'sscore'

Not helpful with the same scores...

Let's see if similarity metrics will help instead.

### 1) Cosine Similarity

The cosine similarity of two users tells us how similar their product preferences are. Users with a high cosine similarity have similar preferences.

In [32]:
# Cosine similarity between the selected user and every other user, computed
# over the order totals of the products both have bought. Only pairs sharing
# more than 10 products are kept; the 10 most similar users are returned.
# The $user_id parameter syntax is used because {user_id} was removed in Neo4j 4.0.
cosine_query = """
MATCH (p1:User {id: $user_id})-[x:BOUGHT]->(p:Product)<-[y:BOUGHT]-(p2:User)
WITH COUNT(p) AS numberproducts, SUM(x.order_total * y.order_total) AS xyDotProduct,
SQRT(REDUCE(xDot = 0.0, a IN COLLECT(x.order_total) | xDot + a^2)) AS xLength,
SQRT(REDUCE(yDot = 0.0, b IN COLLECT(y.order_total) | yDot + b^2)) AS yLength,
p1, p2 WHERE numberproducts > 10
RETURN p2.id, xyDotProduct / (xLength * yLength) AS cosim
ORDER BY cosim DESC
LIMIT 10"""
results = graph.run(cosine_query, user_id=random_user)

for row in results:
    print(row)

('p2.id': 93378, 'cosim': 1.0)
('p2.id': 37075, 'cosim': 1.0)
('p2.id': 34556, 'cosim': 1.0)
('p2.id': 14221, 'cosim': 0.9999999999999999)
('p2.id': 169980, 'cosim': 0.9969876781034773)
('p2.id': 194440, 'cosim': 0.9950590839400425)
('p2.id': 35214, 'cosim': 0.993781603687568)
('p2.id': 145881, 'cosim': 0.989762410697451)
('p2.id': 94615, 'cosim': 0.9879933919232682)
('p2.id': 59361, 'cosim': 0.9850449679330249)


### 2) Pearson Similarity

This is particularly well-suited for product recommendations because it takes into account the fact that different users will have different mean total orders: on average some people do buy only from Instacart, while some prefer to go out of their house. Since Pearson similarity considers differences about the mean, this metric will account for these discrepancies.

In [33]:
# Pearson similarity between the selected user (u1) and every other user (u2):
#   - u1_mean / u2_mean are each user's mean order_total over all their purchases,
#   - totalorders pairs up the two users' BOUGHT relationships on shared products
#     (only pairs with more than 10 shared products are kept),
#   - nom / denom is the correlation of the mean-centred order totals;
#     rows with a zero denominator are dropped to avoid dividing by zero.
# The $user_id parameter syntax is used because {user_id} was removed in Neo4j 4.0.
pearson_query = """
MATCH (u1:User {id: $user_id})-[r:BOUGHT]->(m:Product)
WITH u1, avg(r.order_total) AS u1_mean

MATCH (u1)-[r1:BOUGHT]->(m:Product)<-[r2:BOUGHT]-(u2)
WITH u1, u1_mean, u2, COLLECT({r1: r1, r2: r2}) AS totalorders WHERE size(totalorders) > 10

MATCH (u2)-[r:BOUGHT]->(m:Product)
WITH u1, u1_mean, u2, avg(r.order_total) AS u2_mean, totalorders

UNWIND totalorders AS r

WITH sum( (r.r1.order_total - u1_mean) * (r.r2.order_total - u2_mean) ) AS nom,
     sqrt( sum( (r.r1.order_total - u1_mean)^2) * sum( (r.r2.order_total - u2_mean) ^2)) AS denom,
     u1, u2 WHERE denom <> 0

RETURN u2.id, nom/denom AS pearson
ORDER BY pearson DESC
LIMIT 10"""
results = graph.run(pearson_query, user_id=random_user)

for row in results:
    print(row)

('u2.id': 43783, 'pearson': 0.9168641652177419)
('u2.id': 148698, 'pearson': 0.8901667246271118)
('u2.id': 8855, 'pearson': 0.8838304369607415)
('u2.id': 194597, 'pearson': 0.8005758992467286)
('u2.id': 92766, 'pearson': 0.6848314049687728)
('u2.id': 53449, 'pearson': 0.63079342810736)
('u2.id': 178096, 'pearson': 0.6285083027687466)
('u2.id': 22624, 'pearson': 0.6202638734661828)
('u2.id': 110023, 'pearson': 0.6190677785368518)
('u2.id': 166372, 'pearson': 0.6125638918316887)


I have a problem with so many 0.90+ scores for the cosine similarity metric. The Pearson scores seem to be better spread...

## FINAL: Collaborative Filtering = KNN + Pearson 

Combine the simple implementation of KNN with the Pearson correlation scoring system:

In [34]:
# Final collaborative filter: compute the Pearson similarity between the
# selected user (u1) and every other user (u2) as in the previous query, keep the
# 10 most similar users, then score every product they bought that u1 has not
# by the sum of (similarity * order_total).
# The $user_id parameter syntax is used because {user_id} was removed in Neo4j 4.0.
knn_pearson_query = """
MATCH (u1:User {id: $user_id})-[r:BOUGHT]->(m:Product)
WITH u1, avg(r.order_total) AS u1_mean

MATCH (u1)-[r1:BOUGHT]->(m:Product)<-[r2:BOUGHT]-(u2)
WITH u1, u1_mean, u2, COLLECT({r1: r1, r2: r2}) AS totalorders WHERE size(totalorders) > 10

MATCH (u2)-[r:BOUGHT]->(m:Product)
WITH u1, u1_mean, u2, avg(r.order_total) AS u2_mean, totalorders

UNWIND totalorders AS r

WITH sum( (r.r1.order_total - u1_mean) * (r.r2.order_total - u2_mean) ) AS nom,
     sqrt( sum( (r.r1.order_total - u1_mean)^2) * sum( (r.r2.order_total - u2_mean) ^2)) AS denom,
     u1, u2 WHERE denom <> 0

WITH u1, u2, nom/denom AS pearson
ORDER BY pearson DESC LIMIT 10

MATCH (u2)-[r:BOUGHT]->(m:Product) WHERE NOT EXISTS( (u1)-[:BOUGHT]->(m) )

RETURN m.name AS recommendation, SUM(pearson * r.order_total) AS score
ORDER BY score DESC
LIMIT 10"""
results = graph.run(knn_pearson_query, user_id=random_user)

for row in results:
    print(row)

('recommendation': 'Unsweetened Almondmilk', 'score': 52.047186777626735)
('recommendation': 'Organic Broccoli Florets', 'score': 48.864540201612535)
('recommendation': 'Organic Large Extra Fancy Fuji Apple', 'score': 38.32933883790394)
('recommendation': 'Organic Lemon', 'score': 33.556738843469866)
('recommendation': 'French Vanilla Sugar Free Liquid Coffee Creamer', 'score': 32.4658862670795)
('recommendation': 'Organic Cucumber', 'score': 29.447750413657232)
('recommendation': 'Granny Smith Apples', 'score': 27.595168463440466)
('recommendation': 'Lunchables Turkey & Cheddar with Crackers', 'score': 26.589060791314516)
('recommendation': 'Organic Yellow Onion', 'score': 23.96909917390705)
('recommendation': 'Black Plum', 'score': 23.21670107815513)


#### **Conclusion**

I am quite happy using the Pearson similarity to find other products from similar users based on the total number of orders for that product in lieu of a ratings system.

## Content-based filtering

What are other products similar to what you are looking at? Let's bring in the categorical nature of some of the products. 

If we know what products a user has bought, we can use this information to recommend similar products: Recommend products similar to those the user has already bought.

In [None]:
# Gotchas: string matching is case sensitive ('Bacon' matches, 'bacon' does not),
# and so are labels ('Product' works, 'PRODUCT' does not).
# The query lists products that share an aisle with any product whose name
# contains 'Bacon', counting the matches per recommended product and aisle.
bacon_aisle_query = """
MATCH (m:Product)-[:FOUND_IN]->(g:Aisle)<-[:FOUND_IN]-(rec:Product)
WHERE m.name CONTAINS 'Bacon'
WITH rec.name AS recommendation, g.name AS aisle, COUNT(*) AS commonAisles
RETURN recommendation, aisle, commonAisles
ORDER BY commonAisles DESC 
LIMIT 10"""
result = graph.run(bacon_aisle_query)

for row in result:
    print(row)

99 commonAisles is a problem. lol. What if we take a user and then recommend based on the types of products, either by aisle or by department?

In [None]:
# Draw a fresh user ID for the content-based queries below.
# NOTE(review): randint includes both endpoints - confirm that 0 and 206210 are valid user IDs.
next_user_message = "Next user is: {}"
random_user = randint(0, 206210)
print(next_user_message.format(random_user))

#### By Aisle

In [None]:
# Content-based recommendation by aisle: for every product the user bought,
# collect unbought products from the same aisle. Each recommendation carries
# its per-aisle match counts (scoreComponents) and their total (score).
# The $user_id parameter syntax is used because {user_id} was removed in Neo4j 4.0.
by_aisle_query = '''
MATCH (u:User {id: $user_id})-[r:BOUGHT]->(m:Product), (m)-[:FOUND_IN]->(g:Aisle)<-[:FOUND_IN]-(rec:Product)
WHERE NOT EXISTS( (u)-[:BOUGHT]->(rec) )
WITH rec.name AS recommendation, [g.name, COUNT(*)] AS scores
RETURN recommendation, COLLECT(scores) AS scoreComponents, REDUCE (s=0,x in COLLECT(scores) | s+x[1]) AS score
ORDER BY score DESC
LIMIT 10'''
result = graph.run(by_aisle_query, user_id=random_user)

for row in result:
    print(row)

#### By Department

In [None]:
# Content-based recommendation by department: for every product the user
# bought, collect unbought products of the same department. Each recommendation
# carries its per-department match counts (scoreComponents) and their total (score).
# The $user_id parameter syntax is used because {user_id} was removed in Neo4j 4.0.
by_department_query = '''
MATCH (u:User {id: $user_id})-[r:BOUGHT]->(m:Product), (m)-[:TYPE_OF]->(g:Department)<-[:TYPE_OF]-(rec:Product)
WHERE NOT EXISTS( (u)-[:BOUGHT]->(rec) )
WITH rec.name AS recommendation, [g.name, COUNT(*)] AS scores
RETURN recommendation, COLLECT(scores) AS scoreComponents, REDUCE (s=0,x in COLLECT(scores) | s+x[1]) AS score
ORDER BY score DESC
LIMIT 10'''
result = graph.run(by_department_query, user_id=random_user)

for row in result:
    print(row)

Separately (Aisle vs Department), I think they are pretty rubbish recommendations. Let's try a weighted sum of these two components in the 'score'.

In [35]:
# Weighted content score for products related to anything named '...Bacon...':
# 5 points per shared-aisle match (gs) plus 3 points per row of the optional
# department match (as).
# Note: COUNT(*) after OPTIONAL MATCH also counts the row produced when no
# department matches, so `as` is never 0 here.
bacon_weighted_query = """
MATCH (m:Product) WHERE m.name CONTAINS 'Bacon'
MATCH (m)-[:FOUND_IN]->(g:Aisle)<-[:FOUND_IN]-(rec:Product)

WITH m, rec, COUNT(*) AS gs

OPTIONAL MATCH (m)-[:TYPE_OF]->(a:Department)<-[:TYPE_OF]-(rec)
WITH m, rec, gs, COUNT(*) AS as

RETURN rec.name AS recommendation, (5*gs)+(3*as) AS score ORDER BY score DESC 
LIMIT 10
"""
result = graph.run(bacon_weighted_query)

for row in result:
    print(row)

('recommendation': 'Vegetable Combo Sp', 'score': 8)
('recommendation': 'Original Pork Sausage Patties', 'score': 8)
('recommendation': 'Naturals Savory Turkey Breakfast Sausage', 'score': 8)
('recommendation': 'Organic Creamy Deluxe Macaroni Dinner Rotini & White Cheddar Sauce', 'score': 8)
('recommendation': 'Gluten Free Corn Dogs', 'score': 8)
('recommendation': "Meat Lover's Pizza", 'score': 8)
('recommendation': 'Bacon', 'score': 8)
('recommendation': 'Thick Slices Canadian Bacon', 'score': 8)
('recommendation': 'Smoked Cheddar Beef Franks', 'score': 8)
('recommendation': 'Rich & Hearty Savory Beef Barley Vegetable Soup', 'score': 8)


In [36]:
# Same weighted content score as for 'Bacon', this time seeded with products
# whose name contains 'Turkey': 5 points per shared-aisle match (gs) plus
# 3 points per row of the optional department match (as).
turkey_weighted_query = """
MATCH (m:Product) WHERE m.name CONTAINS 'Turkey'
MATCH (m)-[:FOUND_IN]->(g:Aisle)<-[:FOUND_IN]-(rec:Product)

WITH m, rec, COUNT(*) AS gs

OPTIONAL MATCH (m)-[:TYPE_OF]->(a:Department)<-[:TYPE_OF]-(rec)
WITH m, rec, gs, COUNT(*) AS as

RETURN rec.name AS recommendation, (5*gs)+(3*as) AS score ORDER BY score DESC 
LIMIT 10
"""
result = graph.run(turkey_weighted_query)

for row in result:
    print(row)

('recommendation': 'Turkey Cotto Salami', 'score': 8)
('recommendation': 'Premium Deli Honey Ham', 'score': 8)
('recommendation': 'Mexican Papaya', 'score': 8)
('recommendation': 'Jamaican Jerk Hemp Seed Salad', 'score': 8)
('recommendation': 'Southern Cornmeal Crusted Cod Fillets', 'score': 8)
('recommendation': 'Organic Chicken Thighs', 'score': 8)
('recommendation': 'Cheddar Cheese Burrito', 'score': 8)
('recommendation': 'Dinosaur Shapes Chicken Breast Nuggets', 'score': 8)
('recommendation': 'Peppercorn Sauce', 'score': 8)
('recommendation': 'Beef With Vegetables & Barley Condensed Soup', 'score': 8)


If we also count the users who bought the product more than 5 times (`order_total > 5`), the query takes a long time to run...

In [None]:
# May take a while on a large graph.
#
# Weighted content score with a popularity term:
#   gs = shared-aisle matches (weight 5),
#   as = rows of the optional department match (weight 3),
#   ds = users who bought the seed product with order_total > 5 (weight 1).
# ds depends only on the seed product m, so it is aggregated once per m before
# expanding to the candidate products; the original re-matched every buyer of m
# for each (m, rec) pair, which is what made the query slow.
# No parameters are passed: the query does not reference a user.
bacon_buyers_query = '''
MATCH (m:Product) WHERE m.name CONTAINS 'Bacon'

OPTIONAL MATCH (m)<-[b:BOUGHT]-(d:User) WHERE b.order_total > 5
WITH m, COUNT(d) AS ds

MATCH (m)-[:FOUND_IN]->(g:Aisle)<-[:FOUND_IN]-(rec:Product)
WITH m, ds, rec, COUNT(*) AS gs

OPTIONAL MATCH (m)-[:TYPE_OF]->(a:Department)<-[:TYPE_OF]-(rec)
WITH m, rec, gs, ds, COUNT(*) AS as

RETURN rec.name AS recommendation, (5*gs)+(3*as)+(1*ds) AS score
ORDER BY score DESC
LIMIT 10
'''
result = graph.run(bacon_buyers_query)

for row in result:
    print(row)

But everything is "stuck" in the same category/type versus the previous one.

In [None]:
# Variant of the weighted 'Bacon' score that uses COUNT(a) instead of COUNT(*)
# for the department term, so a candidate with no shared department gets 0
# department points rather than 1 row counted.
bacon_count_a_query = """
MATCH (m:Product) WHERE m.name CONTAINS 'Bacon'
MATCH (m)-[:FOUND_IN]->(g:Aisle)<-[:FOUND_IN]-(rec:Product)

WITH m, rec, COUNT(*) AS gs

OPTIONAL MATCH (m)-[:TYPE_OF]->(a:Department)<-[:TYPE_OF]-(rec)
WITH m, rec, gs, COUNT(a) AS as

RETURN rec.name AS recommendation, (5*gs)+(3*as) AS score ORDER BY score DESC 
LIMIT 10
"""
result = graph.run(bacon_count_a_query)

for row in result:
    print(row)

So far, we used 'common traits' to traverse the network and return results. As we can see the 'scores' for most of these are the same. No doubt they seem to produce the same order but for reproducibility and for robustness, we need a form of similarity scoring.

### 1) Jaccard Index

From [Wikipedia](https://en.wikipedia.org/wiki/Jaccard_index): The Jaccard index, also known as Intersection over Union and the Jaccard similarity coefficient (originally coined coefficient de communauté by Paul Jaccard), is a statistic used for comparing the similarity and diversity of sample sets. The Jaccard coefficient measures similarity between finite sample sets, and is defined as the size of the intersection divided by the size of the union of the sample sets.

**TL;DR** The Jaccard index is a number between 0 and 1 that indicates how similar two sets are. The Jaccard index of two identical sets is 1. If two sets do not have a common element, then the Jaccard index is 0. 

We can calculate the Jaccard index for sets of products via their aisles to determine how similar two products are.

In [None]:
# Pick a product ID at random for the Jaccard queries below.
# NOTE(review): randint includes both endpoints - confirm that 0 and 49687 are valid product IDs.
product_message = "Random product is: {}"
random_product = randint(0, 49687)
print(product_message.format(random_product))

In [None]:
# Look up the name of the randomly selected product.
# The $product_id parameter syntax is used because {product_id} was removed in Neo4j 4.0.
product_name_query = '''
MATCH (m:Product {id: $product_id})
RETURN m.name
'''
product = graph.run(product_name_query, product_id=random_product)

for row in product:
    print(row)

In [None]:
# Jaccard index between the selected product and every other product, using
# the aisles each product is found in as the sets being compared:
#   intersection = number of shared aisles,
#   s1 / s2      = aisle names of the selected / other product,
#   unionSet     = s1 plus the members of s2 that are not already in s1.
# Changes from the first draft: filter() (removed in Neo4j 4.0) is replaced by
# a list comprehension, the never-read collection of shared aisle names is
# dropped, and {product_id} becomes $product_id.
jaccard_aisle_query = '''
MATCH (m:Product {id: $product_id})-[:FOUND_IN]->(a:Aisle)<-[:FOUND_IN]-(other:Product)
WITH m, other, COUNT(a) AS intersection

MATCH (m)-[:FOUND_IN]->(ma:Aisle)
WITH m, other, intersection, COLLECT(ma.name) AS s1

MATCH (other)-[:FOUND_IN]->(oa:Aisle)
WITH m, other, intersection, s1, COLLECT(oa.name) AS s2

WITH m, other, intersection, s1 + [x IN s2 WHERE NOT x IN s1] AS unionSet, s1, s2

RETURN other.name AS recommendation, s1, s2, ((1.0*intersection)/SIZE(unionSet)) AS jaccard ORDER BY jaccard DESC
LIMIT 10
'''
result = graph.run(jaccard_aisle_query, product_id=random_product)

for row in result:
    print(row)

In [None]:
# Jaccard index between the selected product and every other product, using
# the departments each product belongs to as the sets being compared:
#   intersection = number of shared departments,
#   s1 / s2      = department names of the selected / other product,
#   unionSet     = s1 plus the members of s2 that are not already in s1.
# Changes from the first draft: filter() (removed in Neo4j 4.0) is replaced by
# a list comprehension, the never-read collection of shared department names is
# dropped, and {product_id} becomes $product_id.
jaccard_department_query = '''
MATCH (m:Product {id: $product_id})-[:TYPE_OF]->(d:Department)<-[:TYPE_OF]-(other:Product)
WITH m, other, COUNT(d) AS intersection

MATCH (m)-[:TYPE_OF]->(md:Department)
WITH m, other, intersection, COLLECT(md.name) AS s1

MATCH (other)-[:TYPE_OF]->(od:Department)
WITH m, other, intersection, s1, COLLECT(od.name) AS s2

WITH m, other, intersection, s1 + [x IN s2 WHERE NOT x IN s1] AS unionSet, s1, s2

RETURN other.name AS recommendation, s1, s2, ((1.0*intersection)/SIZE(unionSet)) AS jaccard ORDER BY jaccard DESC
LIMIT 10
'''
result = graph.run(jaccard_department_query, product_id=random_product)

for row in result:
    print(row)

Once again, separating the aisle and department, will just yield a 1.0 Jaccard score, because they _are_ the same. So let's mix the two up.

In [None]:
# Jaccard index over aisles and departments combined: the sets being compared
# are the names of every node a product reaches through TYPE_OF or FOUND_IN.
# Results are ordered from most to least similar.
# Changes from the first draft: filter() (removed in Neo4j 4.0) is replaced by
# a list comprehension, the never-read collection of shared names is dropped,
# {product_id} becomes $product_id, and the relationship alternatives use the
# `:A|B` form instead of `:A|:B`.
jaccard_mixed_query = '''
MATCH (m:Product {id: $product_id})-[:TYPE_OF|FOUND_IN]-(t)<-[:TYPE_OF|FOUND_IN]-(other:Product)
WITH m, other, COUNT(t) AS intersection

MATCH (m)-[:TYPE_OF|FOUND_IN]-(mt)
WITH m, other, intersection, COLLECT(mt.name) AS s1

MATCH (other)-[:TYPE_OF|FOUND_IN]-(ot)
WITH m, other, intersection, s1, COLLECT(ot.name) AS s2

WITH m, other, intersection, s1 + [x IN s2 WHERE NOT x IN s1] AS unionSet, s1, s2

RETURN other.name AS recommendation, s1, s2, ((1.0*intersection)/SIZE(unionSet)) AS jaccard ORDER BY jaccard DESC
LIMIT 10'''
result = graph.run(jaccard_mixed_query, product_id=random_product)

for row in result:
    print(row)

In [None]:
# Same combined aisle + department Jaccard index, ordered ascending to show
# the least similar products among those sharing at least one aisle or department.
# Changes from the first draft: filter() (removed in Neo4j 4.0) is replaced by
# a list comprehension, the never-read collection of shared names is dropped,
# {product_id} becomes $product_id, and the relationship alternatives use the
# `:A|B` form instead of `:A|:B`.
jaccard_mixed_asc_query = '''
MATCH (m:Product {id: $product_id})-[:TYPE_OF|FOUND_IN]-(t)<-[:TYPE_OF|FOUND_IN]-(other:Product)
WITH m, other, COUNT(t) AS intersection

MATCH (m)-[:TYPE_OF|FOUND_IN]-(mt)
WITH m, other, intersection, COLLECT(mt.name) AS s1

MATCH (other)-[:TYPE_OF|FOUND_IN]-(ot)
WITH m, other, intersection, s1, COLLECT(ot.name) AS s2

WITH m, other, intersection, s1 + [x IN s2 WHERE NOT x IN s1] AS unionSet, s1, s2

RETURN other.name AS recommendation, s1, s2, ((1.0*intersection)/SIZE(unionSet)) AS jaccard ORDER BY jaccard ASC
LIMIT 10'''
result = graph.run(jaccard_mixed_asc_query, product_id=random_product)

for row in result:
    print(row)

I realised that Jaccard scores will be whole numbers, because for each product it can be found in one aisle and one department respectively. It doesn't have multiple aisles or multiple departments. So, I now turn to clustering to see if I can use it for something.

### 2) Clustering

In [3]:
from igraph import Graph as IGraph

In [12]:
# Preview the ten heaviest aisle/department pairs. The weight of a pair is the
# number of nodes that are both FOUND_IN the aisle and TYPE_OF the department.
top_pairs_query = """
MATCH (a:Aisle)<-[:FOUND_IN]-()-[:TYPE_OF]->(d:Department)
RETURN a.name AS aisleName, d.name AS departmentName, COUNT(*) AS weight
ORDER BY weight DESC
LIMIT 10"""
result = graph.run(top_pairs_query)

for row in result:
    print(row)

('aisleName': 'candy chocolate', 'departmentName': 'snacks', 'weight': 1245)
('aisleName': 'ice cream ice', 'departmentName': 'frozen', 'weight': 1089)
('aisleName': 'yogurt', 'departmentName': 'dairy eggs', 'weight': 1026)
('aisleName': 'chips pretzels', 'departmentName': 'snacks', 'weight': 986)
('aisleName': 'tea', 'departmentName': 'beverages', 'weight': 891)
('aisleName': 'packaged cheese', 'departmentName': 'dairy eggs', 'weight': 891)
('aisleName': 'frozen meals', 'departmentName': 'frozen', 'weight': 880)
('aisleName': 'cookies cakes', 'departmentName': 'snacks', 'weight': 874)
('aisleName': 'energy granola bars', 'departmentName': 'snacks', 'weight': 832)
('aisleName': 'spices seasonings', 'departmentName': 'pantry', 'weight': 797)


In [13]:
# Fetch every aisle/department pair with its weight; these rows become the
# weighted edge list handed to igraph in the next cell.
edge_list_query = """
MATCH (a:Aisle)<-[:FOUND_IN]-()-[:TYPE_OF]->(d:Department)
RETURN a.name AS aisleName, d.name AS departmentName, COUNT(*) AS weight"""
cluster = graph.run(edge_list_query)

In [14]:
# Build an igraph graph from the (aisleName, departmentName, weight) rows;
# weights=True stores the third column as the 'weight' edge attribute.
ig = IGraph.TupleList(edges=cluster, weights=True)
ig

<igraph.Graph at 0x1dc9ad82b88>

In [15]:
# Run walktrap community detection on the weighted graph, turn the resulting
# dendrogram into a flat clustering, and show how many clusters were found.
clusters = ig.community_walktrap(weights='weight').as_clustering()
len(clusters)

16

In [16]:
# Let's take a look at the 'clusters'.
# Build one record per igraph vertex: 'id' and 'label' are both the vertex
# name, and 'group' is the cluster the vertex belongs to. The cluster
# membership is read with the vertex's own index, which avoids searching the
# vertex list by name once per node.
nodes = [
    {
        'id': vertex['name'],
        'label': vertex['name'],
        'group': clusters.membership[vertex.index],
    }
    for vertex in ig.vs
]

nodes[:20]

[{'group': 0,
  'id': 'canned fruit applesauce',
  'label': 'canned fruit applesauce'},
 {'group': 0, 'id': 'canned goods', 'label': 'canned goods'},
 {'group': 1, 'id': 'trail mix snack mix', 'label': 'trail mix snack mix'},
 {'group': 1, 'id': 'snacks', 'label': 'snacks'},
 {'group': 1, 'id': 'chips pretzels', 'label': 'chips pretzels'},
 {'group': 2, 'id': 'fresh herbs', 'label': 'fresh herbs'},
 {'group': 2, 'id': 'produce', 'label': 'produce'},
 {'group': 3, 'id': 'prepared soups salads', 'label': 'prepared soups salads'},
 {'group': 3, 'id': 'deli', 'label': 'deli'},
 {'group': 4,
  'id': 'preserved dips spreads',
  'label': 'preserved dips spreads'},
 {'group': 4, 'id': 'pantry', 'label': 'pantry'},
 {'group': 5,
  'id': 'specialty wines champagnes',
  'label': 'specialty wines champagnes'},
 {'group': 5, 'id': 'alcohol', 'label': 'alcohol'},
 {'group': 6, 'id': 'eggs', 'label': 'eggs'},
 {'group': 6, 'id': 'dairy eggs', 'label': 'dairy eggs'},
 {'group': 7, 'id': 'frozen desser

In [None]:
# # Write it back into the database
# #
# # Create constraint on cluster name

# graph.run("CREATE CONSTRAINT ON (cluster:Cluster) ASSERT cluster.name IS UNIQUE;")

In [17]:
# # Write it back into the database
# #
# # Writing Aisle first

# graph.run('''
# UNWIND {params} AS p 
# MATCH (a:Aisle {name: p.id})
# MERGE (cluster:Cluster {name: p.group})
# MERGE (a)-[:IN_CLUSTER]->(cluster)
# ''', params = nodes)

<py2neo.graph.Cursor at 0x1dc9ae0a278>

In [18]:
# # Write it back into the database
# #
# # Writing Department next

# graph.run('''
# UNWIND {params} AS p 
# MATCH (d:Department {name: p.id})
# MERGE (cluster:Cluster {name: p.group})
# MERGE (d)-[:IN_CLUSTER]->(cluster)
# ''', params = nodes)

<py2neo.graph.Cursor at 0x1dc9ae1a828>

### Novelty Recommendation

I found that having clusters would allow me to group aisles together. A good use for this is to find something for "Try Something New" but not too foreign.

In [22]:
# Draw a fresh user ID for the novelty recommendation queries below.
# NOTE(review): randint includes both endpoints - confirm that 0 and 206210 are valid user IDs.
next_user_message = "Next user is: {}"
random_user = randint(0, 206210)
print(next_user_message.format(random_user))

Next user is: 194981


In [27]:
# Find the cluster the user buys from most often (by number of purchased
# products whose aisle is in the cluster), then list that cluster's aisles
# ranked by how many FOUND_IN relationships point at each aisle.
# The $user_id parameter syntax is used because {user_id} was removed in Neo4j 4.0.
top_cluster_aisles_query = '''
MATCH (user:User {id: $user_id})-[:BOUGHT]->(product)-[:FOUND_IN]->(a:Aisle)-[:IN_CLUSTER]->(cluster)
WITH user, cluster, COUNT(*) AS times
ORDER BY times DESC
LIMIT 1
WITH user, cluster
MATCH (cluster)<-[:IN_CLUSTER]-(a)<-[:FOUND_IN]-()
WITH cluster, a.name AS aisles, COUNT(a) as commonAisles
RETURN aisles, commonAisles
ORDER BY commonAisles DESC
LIMIT 5'''
result = graph.run(top_cluster_aisles_query, user_id=random_user)

for row in result:
    print(row)

('aisles': 'cereal', 'commonAisles': 454)
('aisles': 'hot cereal pancake mixes', 'commonAisles': 303)
('aisles': 'granola', 'commonAisles': 185)
('aisles': 'breakfast bars pastries', 'commonAisles': 173)


In [23]:
# Novelty recommendation: this surfaces products from the opposite end of what
# the user orders most often - is this my novelty recommendation?
#   1. find the cluster the user buys from most often,
#   2. within that cluster pick the aisle with the fewest FOUND_IN relationships,
#   3. return products from that aisle that someone bought more than 10 times,
#      ranked by the highest order_total seen for each product.
# Fix: the aisle lookup now reads (:Aisle {name: x}). Without the colon,
# `Aisle` is a variable name rather than a label, so the match had to consider
# every node in the graph that has a matching `name` property.
# The $user_id parameter syntax is used because {user_id} was removed in Neo4j 4.0.
novelty_query = '''
MATCH (user:User {id: $user_id})-[:BOUGHT]->(product)-[:FOUND_IN]->(a:Aisle)-[:IN_CLUSTER]->(cluster)
WITH user, cluster, COUNT(*) AS times
ORDER BY times DESC
LIMIT 1
WITH user, cluster
MATCH (cluster)<-[:IN_CLUSTER]-(a)<-[:FOUND_IN]-()
WITH cluster, a.name AS aisles, COUNT(a) as commonAisles
ORDER BY commonAisles ASC
LIMIT 1
WITH aisles AS x
MATCH (:Aisle {name: x})<-[:FOUND_IN]-(otherProducts)<-[b:BOUGHT]-()
WHERE b.order_total > 10
RETURN DISTINCT otherProducts.name, MAX(b.order_total) AS orderTotal
ORDER BY orderTotal DESC
LIMIT 10'''
result = graph.run(novelty_query, user_id=random_user)

for row in result:
    print(row)

('otherProducts.name': 'Peanut Butter Gluten Free Breakfast Bars', 'orderTotal': 60)
('otherProducts.name': 'Nutri-Grain Soft Baked Strawberry Cereal Breakfast Bars', 'orderTotal': 58)
('otherProducts.name': "Kellogg's Pop-Tarts Frosted Strawberry Pastries", 'orderTotal': 58)
('otherProducts.name': 'Crunchy Granola Bars Variety Pack', 'orderTotal': 58)
('otherProducts.name': 'Special K Protein Greek Yogurt And Fruit Bar 4.76oz', 'orderTotal': 57)
('otherProducts.name': 'Whole Wheat Blueberry Fig Bars', 'orderTotal': 56)
('otherProducts.name': 'Frosted Toaster Pastries, Cherry Pomegran', 'orderTotal': 53)
('otherProducts.name': 'Pop-Tarts Frosted Raspberry Toaster Pastries', 'orderTotal': 53)
('otherProducts.name': 'Frosted Cherry Pop-Tarts Toaster Pastries', 'orderTotal': 52)
('otherProducts.name': 'Oat Blueberry Lemon Whenever Bars', 'orderTotal': 51)


## Making recommendations

### 1) Collaborative Filtering
Using total_orders as an indicator of some sort of rating (high number of total_orders = highly rated), the recommender is able to generate a list of other products based on the buying habits (total_orders) from a pool of users, using the Pearson correlation as scores and KNN as a traversal method. 

### 2) Content-based Filtering
This didn't work out as well as hoped, mainly because each product is a TYPE_OF exactly one department and FOUND_IN exactly one aisle. A product is not a meat and a vegetable at the same time. This made content-based filtering non-existent for this dataset. However, being a graph database, the nodes do allow me to implement a search function instead.

### 3) Clusters of Products
I was able to infer clusters of products from the Aisles and the Departments that the products were found in. Clusters consist mostly of 4 Aisle nodes and 1 Department node on average. From there, we would be able to traverse up from Product to Aisle to Cluster and find the Aisle where the User bought the least from. Traverse out of the cluster and find all Products in the least popular Aisle and recommend Products that other people seem to like more. 