In [1]:
# Print the interpreter this kernel is running on, then install the Neo4j
# driver with that same interpreter's pip so the package lands in the
# kernel's environment rather than whichever `pip` is first on PATH.
import sys
print(sys.executable)
!{sys.executable} -m pip install neo4j

/Users/Brian/anaconda3/bin/python


In [61]:
import getpass
import os

from neo4j import GraphDatabase

# Connection settings for the Neo4j instance.
# Values are read from the environment so that credentials never live in the
# notebook itself; URI and USER fall back to the local defaults used before.
# SECURITY: a real password used to be hardcoded on this line. If that value
# was ever committed or shared, rotate it on the database.
URI = os.environ.get("NEO4J_URI", "neo4j://127.0.0.1:7687")
USER = os.environ.get("NEO4J_USER", "neo4j")
# Prompt interactively (input is not echoed) when NEO4J_PASSWORD is not set.
PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass.getpass("Neo4j password: ")


class GraphDB:
    """Small convenience wrapper around a Neo4j driver.

    The driver is built from the module-level ``URI``, ``USER`` and
    ``PASSWORD`` settings. Helper methods build Cypher queries for creating
    and looking up nodes and relationships.

    SECURITY NOTE: labels, relationship types and property *keys* are
    interpolated directly into the Cypher text (only property *values* are
    sent as query parameters). Pass identifiers from trusted code only, never
    from user input.

    The instance can be used as a context manager so that the driver is
    closed when the block exits.
    """

    # Class-level default; init_driver() assigns the real driver per instance.
    driver = None
    # When True, helper methods print short progress/diagnostic messages.
    enable_logging = True

    def __init__(self):
        self.get_driver()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def get_driver(self):
        """Return the driver, creating it on first use.

        Returns:
            The driver object, or None if it could not be created (the
            failure reason is printed).
        """
        if self.driver is not None:
            # Return the existing driver object itself. The previous code
            # *called* it (``self.driver()``), which raises TypeError.
            return self.driver
        try:
            self.init_driver()
            return self.driver
        except Exception as e:
            print(f"Failed to initialize driver due to {e}")
            return None

    def init_driver(self):
        """Create the driver from the module-level connection settings."""
        self.driver = GraphDatabase.driver(URI, auth=(USER, PASSWORD))
        return

    def close(self):
        """Close the driver, if any, and forget it. Safe to call repeatedly."""
        if self.driver is not None:
            self.driver.close()
            self.driver = None

    @staticmethod
    def _prop_map(properties, param_prefix=None):
        """Build the inside of a Cypher property map for ``properties``.

        Without a prefix each key maps to a same-named parameter
        (``name: $name``); with a prefix it maps to a field of that map
        parameter (``name: $prop1.name``).
        """
        if param_prefix:
            return ", ".join(f"{key}: ${param_prefix}.{key}" for key in properties)
        return ", ".join(f"{key}: ${key}" for key in properties)

    def run_query(self, query, parameters=None, single=False):
        """
        Run a Cypher query with optional parameters.

        Args:
            query (str): The Cypher query to execute.
            parameters (dict, optional): Query parameters.
            single (bool): If True, return only the first result.

        Returns:
            list | dict | None: A list of record dicts, or, when ``single`` is
            True, the first record dict (None if there were no records).

        Raises:
            RuntimeError: If no driver is available.
        """
        driver = self.get_driver()
        if driver is None:
            raise RuntimeError("Neo4j driver is not initialized; cannot run query.")

        with driver.session() as session:
            result = session.run(query, parameters or {})
            # Materialize inside the session: results cannot be read once
            # the session has been closed.
            records = [r.data() for r in result]

        if single:
            return records[0] if records else None
        return records

    def _insert_node(self, label, properties):
        """Unconditionally CREATE a node and return its record."""
        query = f"CREATE (n:{label} $props) RETURN n"
        return self.run_query(query, {"props": properties}, single=True)

    def create_node(self, label, properties):
        """Return the node matching ``label``/``properties``, creating it if absent.

        Returns:
            dict | None: The record ``{"n": {...}}`` of the existing or newly
            created node.
        """
        # One lookup serves as both the existence check and the fetch.
        existing = self.get_node(label, properties)
        if existing is not None:
            if self.enable_logging:
                print("Node already exists")
            return existing
        return self._insert_node(label, properties)

    def get_nodes_matching_label(self, label):
        """Return the records of all nodes carrying ``label``."""
        query = f"MATCH (n:{label}) RETURN n"
        return self.run_query(query)

    def get_node(self, label, properties):
        """Return the first node matching ``label`` and all ``properties``, or None."""
        prop_str = self._prop_map(properties)
        query = f"MATCH (n:{label} {{{prop_str}}}) RETURN n LIMIT 1"
        return self.run_query(query, properties, single=True)

    def node_exists(self, label, properties):
        """Return True if a node matching ``label`` and ``properties`` exists."""
        return self.get_node(label, properties) is not None

    def relationship_exists(self, label1, prop1, rel_type, label2, prop2):
        """Return True if a ``rel_type`` edge exists from node 1 to node 2."""
        return self.get_relationship(label1, prop1, rel_type, label2, prop2) is not None

    def get_relationship(self, label1, prop1, rel_type, label2, prop2):
        """Return the first ``rel_type`` edge from node 1 to node 2, or None.

        Returns:
            dict | None: ``{"relationship": <type>, "edge_properties": {...}}``.
        """
        prop1_str = self._prop_map(prop1, "prop1")
        prop2_str = self._prop_map(prop2, "prop2")

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}})-[r:{rel_type}]->(b:{label2} {{{prop2_str}}})
        RETURN type(r) AS relationship, properties(r) AS edge_properties LIMIT 1
        """
        return self.run_query(query, {"prop1": prop1, "prop2": prop2}, single=True)

    def create_relationship(self, label1, prop1, rel_type, label2, prop2, edge_prop=None):
        """
        Create a directed relationship from node 1 to node 2.

          1. Create either node if it does not exist yet.
          2. If the relationship already exists, return it.
          3. Otherwise create it and return it.

        Args:
            label1, prop1: Label and identifying properties of the start node.
            rel_type (str): Relationship type.
            label2, prop2: Label and identifying properties of the end node.
            edge_prop (dict, optional): Properties to set on a new relationship.

        Returns:
            dict | None: ``{"relationship": <type>, "edge_properties": {...}}``.
        """
        # Make sure both endpoints exist. The node is inserted directly so
        # that existence is not checked a second time.
        for label, props in ((label1, prop1), (label2, prop2)):
            if not self.node_exists(label, props):
                if self.enable_logging:
                    print(f"{label} node does not exist — creating it.")
                self._insert_node(label, props)

        # One lookup serves as both the existence check and the fetch.
        existing = self.get_relationship(label1, prop1, rel_type, label2, prop2)
        if existing is not None:
            if self.enable_logging:
                print(f"Relationship '{rel_type}' already exists between {label1} and {label2}.")
            return existing

        prop1_str = self._prop_map(prop1, "prop1")
        prop2_str = self._prop_map(prop2, "prop2")

        params = {"prop1": prop1, "prop2": prop2}
        if edge_prop:
            edge_prop_clause = f"{{{self._prop_map(edge_prop, 'edge_prop')}}}"
            params["edge_prop"] = edge_prop
        else:
            edge_prop_clause = ""

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}}), (b:{label2} {{{prop2_str}}})
        CREATE (a)-[r:{rel_type} {edge_prop_clause}]->(b)
        RETURN type(r) AS relationship, properties(r) AS edge_properties
        """

        result = self.run_query(query, params, single=True)
        if self.enable_logging:
            print(f"Created new relationship '{rel_type}' between {label1} and {label2}.")
        return result
    

In [18]:
# Debug check: show which pandas installation this kernel imports.
import pandas as pd
print(pd.__file__)

/Users/Brian/anaconda3/lib/python3.7/site-packages/pandas/__init__.py


In [62]:
# Instantiate the wrapper; GraphDB.__init__ calls get_driver(), which builds
# the Neo4j driver from the URI/USER/PASSWORD settings defined above.
my_graph = GraphDB()

In [17]:
# Create a Recipe node (create_node returns the existing node instead if one
# with the same label and properties is already present) and show the record.
node = my_graph.create_node("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3})
print(node)

{'n': {'name': 'Shepards Pie2', 'Recipe_id': 3}}


In [18]:
# Debug check: confirm the wrapper is holding a driver object.
print(my_graph.driver)

<neo4j._sync.driver.Neo4jDriver object at 0x7fdf1f23db00>


In [19]:
# Same helper with a different label: create (or fetch) an Ingredient node.
node = my_graph.create_node("Ingredient", {"name": "potato"})
print(node)



{'n': {'name': 'potato'}}


In [32]:
# Link the recipe to an ingredient with a `has_ingredient` edge.
# create_relationship creates either endpoint node first if it is missing.
edge = my_graph.create_relationship("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3}, "has_ingredient", "Ingredient", {"name": "peas"})

Created new relationship 'has_ingredient' between Recipe and Ingredient.


In [33]:
# Repeat the identical call: the edge now exists, so create_relationship
# returns the existing relationship instead of creating a duplicate.
edge = my_graph.create_relationship("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3}, "has_ingredient", "Ingredient", {"name": "peas"})

Relationship 'has_ingredient' already exists between Recipe and Ingredient.


In [34]:
# Link the same recipe to an ingredient that has no node yet; the Ingredient
# node is created on the fly before the edge is added.
edge = my_graph.create_relationship("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3}, "has_ingredient", "Ingredient", {"name": "carrots"})

Ingredient node does not exist — creating it.
Created new relationship 'has_ingredient' between Recipe and Ingredient.


In [52]:
import ast
import os

import pandas as pd

# Input file lives under ./sample relative to the notebook's working directory.
csv_path = os.path.join(os.getcwd(), "sample", "FoodKG_ingredients_normalized.csv")

df = pd.read_csv(csv_path)
print(df.head(5))

# Each cell is a list literal wrapped in an extra layer of quotes, so it has
# to be decoded twice: the first pass strips the outer quotes (str -> str),
# the second turns the inner text into an actual Python list.
df["ingredients_normalized"] = (
    df["ingredients_normalized"]
    .apply(ast.literal_eval)
    .apply(ast.literal_eval)
)

# One row per (recipe id, ingredient) pair, with a fresh 0..n-1 index.
df_exploded = (
    df.explode("ingredients_normalized")
    .reset_index(drop=True)
    .rename(columns={"ingredients_normalized": "ingredient"})
)

print(df_exploded.head(5))


   id                             ingredients_normalized
0   0  "['brown sugar', 'evaporated milk', 'vanilla',...
1   1  "['chipped beef', 'chicken', 'mushroom soup', ...
2   2  "['corn', 'cream cheese', 'butter', 'garlic po...
3   3  "['chicken', 'chicken gravy', 'mushroom soup',...
4   4  "['peanut butter', 'graham cracker', 'butter',...
   id       ingredient
0   0      brown sugar
1   0  evaporated milk
2   0          vanilla
3   0            pecan
4   0           butter


In [65]:
# Display the exploded frame: one row per (recipe id, ingredient) pair.
df_exploded

Unnamed: 0,id,ingredient
0,0,brown sugar
1,0,evaporated milk
2,0,vanilla
3,0,pecan
4,0,butter
...,...,...
91693,12299,mushroom soup
91694,12299,soy mayonnaise
91695,12299,chicken seasoning
91696,12299,turmeric


In [57]:
def Create_Nodes_Edges_Recipes_to_Ingredients(graph_intf, RecipeID, IngredientTitle):
    """Link a recipe to one of its ingredients in the graph.

    Calls ``graph_intf.create_relationship`` to connect the Recipe node to the
    Ingredient node with a ``has_ingredient`` edge.

    Args:
        graph_intf: Object exposing ``create_relationship(label1, prop1,
            rel_type, label2, prop2)`` (e.g. a GraphDB instance).
        RecipeID: Recipe identifier, stored as the ``Recipe_id`` property.
        IngredientTitle: Ingredient name, stored as the ``name`` property.

    Returns:
        Whatever ``create_relationship`` returns for the edge. Previously the
        result was bound to an unused local and discarded.
    """
    # TEMP: placeholder title until real recipe titles are available.
    # NOTE(review): the earlier demo cells identify Recipe nodes by a `name`
    # property, whereas this uses `Title` — confirm which key is intended.
    RecipeTitle = f"Title_{RecipeID}"

    return graph_intf.create_relationship(
        "Recipe",
        {"Title": RecipeTitle, "Recipe_id": RecipeID},
        "has_ingredient",
        "Ingredient",
        {"name": IngredientTitle},
    )
    

In [63]:
# Silence GraphDB's per-call status messages before the bulk load below.
my_graph.enable_logging = False

In [64]:
# Bulk load: one create_relationship call per (recipe, ingredient) row.
# Every row costs several database round-trips, so the full frame takes a
# long time; the row index is printed every 500 rows as a progress marker.
for idx, row in df_exploded.iterrows():
    if not idx % 500:
        print(idx)

    RecipeID, IngredientTitle = row["id"], row["ingredient"]
    Create_Nodes_Edges_Recipes_to_Ingredients(my_graph, RecipeID, IngredientTitle)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000


KeyboardInterrupt: 