In [1]:
# Show which interpreter backs this kernel, then run pip through that same
# interpreter so the neo4j package is installed where the kernel can import it.
import sys
print(sys.executable)
!{sys.executable} -m pip install neo4j

/Users/Brian/anaconda3/bin/python


In [36]:
import os
import time

from neo4j import GraphDatabase

# Connection settings for the Neo4j instance used by GraphDB.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into (and committed with) the notebook.
URI = os.environ.get("NEO4J_URI", "bolt://67.58.49.84:7687")
USER = os.environ.get("NEO4J_USER", "neo4j")
# FIXME(security): the literal fallback below is kept only so existing runs keep
# working; rotate this password and drop the fallback once NEO4J_PASSWORD is set.
PASSWORD = os.environ.get("NEO4J_PASSWORD", "x7t4p2ks")


class GraphDB:
    """Convenience wrapper around a Neo4j driver for building the recipe graph.

    Connection settings are read from the module-level ``URI``, ``USER`` and
    ``PASSWORD`` names. Labels, relationship types and property *keys* are
    interpolated directly into the Cypher text, so they must come from trusted
    code; property *values* are always sent as query parameters.

    The instance can be used as a context manager (``with GraphDB() as g:``)
    so the driver is closed when the block exits.
    """

    # Class-level defaults. init_driver() assigns an instance attribute that
    # shadows `driver`; `enable_logging` gates the progress prints below.
    driver = None
    enable_logging = True

    def __init__(self):
        """Try to open the driver immediately; a failure is printed, not raised."""
        self.get_driver()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False  # never suppress exceptions raised inside the block

    def get_driver(self):
        """Return the driver, creating it on first use.

        Returns:
            The driver object, or None when it could not be created (the
            error is printed rather than raised).
        """
        if self.driver is not None:
            # Return the object itself -- the driver is not callable.
            return self.driver
        try:
            self.init_driver()
        except Exception as e:
            print(f"Failed to initialize driver due to {e}")
            return None
        return self.driver

    def init_driver(self):
        """Create the driver from the module-level URI / USER / PASSWORD."""
        self.driver = GraphDatabase.driver(URI, auth=(USER, PASSWORD))
        return

    def close(self):
        """Close the driver if one is open; safe to call more than once."""
        if self.driver is not None:
            self.driver.close()
            self.driver = None

    @staticmethod
    def _prop_map(keys, param=None):
        """Render the inside of a Cypher property map for the given keys.

        With ``param=None`` each key is bound to a top-level parameter
        (``name: $name``); otherwise to a field of the map parameter
        ``param`` (``name: $prop1.name``).
        """
        prefix = f"${param}." if param else "$"
        return ", ".join(f"{key}: {prefix}{key}" for key in keys)

    def run_query(self, query, parameters=None, single=False):
        """
        Run a Cypher query with optional parameters.

        Args:
            query (str): The Cypher query to execute.
            parameters (dict, optional): Query parameters.
            single (bool): If True, return only the first result.

        Returns:
            list | dict | None: All records as dicts, or with ``single=True``
            the first record (None when the query returned no rows).

        Raises:
            RuntimeError: If no driver is available and one cannot be created.
        """
        driver = self.get_driver()
        if driver is None:
            raise RuntimeError("Neo4j driver is not available; cannot run query.")

        with driver.session() as session:
            result = session.run(query, parameters or {})
            # Materialize inside the session; the result is consumed here.
            records = [record.data() for record in result]

        if single:
            return records[0] if records else None
        return records

    def create_node(self, label, properties):
        """Return the node matching ``properties``, creating it if absent.

        Args:
            label (str): Node label (interpolated into the query).
            properties (dict): Properties used both to look up and to create.

        Returns:
            dict | None: A record of the form ``{"n": {...}}``.
        """
        # A single lookup serves as both the existence check and the result.
        existing = self.get_node(label, properties)
        if existing is not None:
            if self.enable_logging:
                print("Node already exists")
            return existing

        query = f"CREATE (n:{label} $props) RETURN n"
        return self.run_query(query, {"props": properties}, single=True)

    def get_nodes_matching_label(self, label):
        """Return every node carrying ``label`` as a list of ``{"n": ...}`` records."""
        query = f"MATCH (n:{label}) RETURN n"
        return self.run_query(query)

    def get_node(self, label, properties):
        """Return the first node with ``label`` matching all ``properties``, or None."""
        prop_str = self._prop_map(properties.keys())
        query = f"MATCH (n:{label} {{{prop_str}}}) RETURN n LIMIT 1"
        return self.run_query(query, properties, single=True)

    def node_exists(self, label, properties):
        """Return True when at least one node matches ``label`` and ``properties``."""
        return self.get_node(label, properties) is not None

    def relationship_exists(self, label1, prop1, rel_type, label2, prop2):
        """Return True when a ``rel_type`` edge exists from the first node to the second."""
        return self.get_relationship(label1, prop1, rel_type, label2, prop2) is not None

    def get_relationship(self, label1, prop1, rel_type, label2, prop2):
        """Fetch one ``rel_type`` edge going from node 1 to node 2.

        Returns:
            dict | None: ``{"relationship": <type>, "edge_properties": {...}}``
            or None when no such edge exists.
        """
        prop1_str = self._prop_map(prop1.keys(), "prop1")
        prop2_str = self._prop_map(prop2.keys(), "prop2")

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}})-[r:{rel_type}]->(b:{label2} {{{prop2_str}}})
        RETURN type(r) AS relationship, properties(r) AS edge_properties LIMIT 1
        """
        return self.run_query(query, {"prop1": prop1, "prop2": prop2}, single=True)

    def create_ingredient_nodes(self, ingredient_list):
        """MERGE one ``Ingredient`` node per name in ``ingredient_list`` (one query)."""
        query = """
        UNWIND $ingredients AS name
        MERGE (i:Ingredient {name: name})
        """
        return self.run_query(query, {"ingredients": ingredient_list})

    def create_recipe_nodes(self, recipe_list):
        """MERGE one ``Recipe`` node per record, keyed on ``id``, and set its title.

        recipe_list =
            [
              {"title": "Shepards Pie", "id": 0},
              ...
              {"title": "Apple Pie", "id": 12299},
            ]
        """
        query = """
            UNWIND $recipes AS r
            MERGE (rec:Recipe {id: r.id})
            SET rec.title = r.title
            """
        return self.run_query(query, {"recipes": recipe_list})

    def create_hasIngredient_edges(self, edge_list, batch_size=5000):
        """MERGE ``hasIngredient`` edges between existing Recipe and Ingredient nodes.

        Args:
            edge_list (list[dict]): Records with ``id`` (recipe id) and
                ``ingredient`` (ingredient name). Records whose endpoints do
                not exist match nothing and are skipped by the query.
            batch_size (int): Number of records sent per query.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            # A negative step would make range() empty and silently insert nothing.
            raise ValueError("batch_size must be a positive integer")

        # The query text does not depend on the batch, so build it once.
        query = """
            UNWIND $edges AS e
            MATCH (r:Recipe {id: e.id})
            MATCH (ing:Ingredient {name: e.ingredient})
            MERGE (r)-[:hasIngredient]->(ing)
            """
        total = len(edge_list)
        for i in range(0, total, batch_size):
            batch = edge_list[i : i + batch_size]
            start = time.perf_counter()
            self.run_query(query, {"edges": batch})
            elapsed = time.perf_counter() - start
            if self.enable_logging:
                print(f"Inserted batch {i//batch_size+1} "
                  f"({len(batch)} relationships) in {elapsed:.2f}s")
        return

    def create_relationship(self, label1, prop1, rel_type, label2, prop2, edge_prop=None):
        """
        Create a relationship between two nodes:
          1. Create nodes if they don't exist (using existing helpers)
          2. If the relationship exists, return it and print a message
          3. Otherwise, create and return it

        Returns:
            dict | None: ``{"relationship": <type>, "edge_properties": {...}}``.

        The checks and the creation run as separate queries, so this is not
        atomic with respect to other writers.
        """
        # Make sure both endpoints exist before touching the edge.
        for label, props in ((label1, prop1), (label2, prop2)):
            if not self.node_exists(label, props):
                if self.enable_logging:
                    print(f"{label} node does not exist — creating it.")
                self.create_node(label, props)

        # One lookup doubles as the existence check and the value to return.
        existing = self.get_relationship(label1, prop1, rel_type, label2, prop2)
        if existing is not None:
            if self.enable_logging:
                print(f"Relationship '{rel_type}' already exists between {label1} and {label2}.")
            return existing

        # Build relationship creation query
        prop1_str = self._prop_map(prop1.keys(), "prop1")
        prop2_str = self._prop_map(prop2.keys(), "prop2")

        params = {"prop1": prop1, "prop2": prop2}
        if edge_prop:
            edge_prop_clause = f"{{{self._prop_map(edge_prop.keys(), 'edge_prop')}}}"
            params["edge_prop"] = edge_prop
        else:
            edge_prop_clause = ""

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}}), (b:{label2} {{{prop2_str}}})
        CREATE (a)-[r:{rel_type} {edge_prop_clause}]->(b)
        RETURN type(r) AS relationship, properties(r) AS edge_properties
        """

        result = self.run_query(query, params, single=True)
        if self.enable_logging:
            print(f"Created new relationship '{rel_type}' between {label1} and {label2}.")
        return result
    

In [18]:
import pandas as pd
# Print the path pandas was loaded from (shows which installation the kernel picked up).
print(pd.__file__)

/Users/Brian/anaconda3/lib/python3.7/site-packages/pandas/__init__.py


In [37]:
# Instantiate the wrapper; GraphDB.__init__ calls get_driver(), which tries to open the Neo4j driver.
my_graph = GraphDB()

In [17]:
# create_node returns the already-stored node when one with these exact properties exists; otherwise it creates it.
node = my_graph.create_node("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3})
print(node)

{'n': {'name': 'Shepards Pie2', 'Recipe_id': 3}}


In [18]:
# Inspect the driver held by the wrapper (the class default is None until init_driver() sets it).
print(my_graph.driver)

<neo4j._sync.driver.Neo4jDriver object at 0x7fdf1f23db00>


In [19]:
# Same get-or-create call, this time for an Ingredient node keyed on its name.
node = my_graph.create_node("Ingredient", {"name": "potato"})
print(node)



{'n': {'name': 'potato'}}


In [32]:
# Link the recipe to the 'peas' ingredient; create_relationship creates either endpoint node first if it is missing.
edge = my_graph.create_relationship("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3}, "has_ingredient", "Ingredient", {"name": "peas"})

Created new relationship 'has_ingredient' between Recipe and Ingredient.


In [33]:
# Identical call repeated: create_relationship checks for the edge first and returns the existing one instead of creating another.
edge = my_graph.create_relationship("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3}, "has_ingredient", "Ingredient", {"name": "peas"})

Relationship 'has_ingredient' already exists between Recipe and Ingredient.


In [34]:
# Same recipe, different ingredient: exercises the path where the Ingredient node has to be created before the edge.
edge = my_graph.create_relationship("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3}, "has_ingredient", "Ingredient", {"name": "carrots"})

Ingredient node does not exist — creating it.
Created new relationship 'has_ingredient' between Recipe and Ingredient.


In [1]:
import pandas as pd
import ast
import os

# Location of the normalized-ingredients export, relative to the working directory.
csv_path = os.path.join(os.getcwd(), 'data/_backup', 'FoodKG_ingredients_normalized.csv')

df = pd.read_csv(csv_path)
print(df.head(5))

# The lists are stored with an extra layer of quotes, so each cell has to be
# parsed twice: the first pass yields the inner string, the second the list.
df["ingredients_normalized"] = (
    df["ingredients_normalized"]
    .apply(ast.literal_eval)
    .apply(ast.literal_eval)
)

# One row per (recipe id, ingredient) pair.
df_exploded = (
    df.explode("ingredients_normalized")
    .reset_index(drop=True)
    .rename(columns={"ingredients_normalized": "ingredient"})
)

print(df_exploded.head(5))




   id                             ingredients_normalized
0   0  "['brown sugar', 'evaporated milk', 'vanilla',...
1   1  "['chipped beef', 'chicken', 'mushroom soup', ...
2   2  "['corn', 'cream cheese', 'butter', 'garlic po...
3   3  "['chicken', 'chicken gravy', 'mushroom soup',...
4   4  "['peanut butter', 'graham cracker', 'butter',...
   id       ingredient
0   0      brown sugar
1   0  evaporated milk
2   0          vanilla
3   0            pecan
4   0           butter


In [3]:
# Peek at the first five rows of the ingredient column.
df_exploded[['ingredient']].head(5)

Unnamed: 0,ingredient
0,brown sugar
1,evaporated milk
2,vanilla
3,pecan
4,butter


In [4]:
# Get unique ingredients
unique_df = df_exploded["ingredient"].dropna().unique()

# Convert back to a DataFrame
unique_df = pd.DataFrame(unique_df, columns=["ingredient"])

# Save to CSV
unique_df.to_csv("unique_ingredients.csv", index=False)

In [16]:
import time

# Distinct, non-null ingredient names as a plain Python list.
ingredient_list = df_exploded["ingredient"].dropna().unique().tolist()

# Wall-clock timing of the single-query MERGE of all Ingredient nodes.
start = time.time()
my_graph.create_ingredient_nodes(ingredient_list)
end = time.time()

print(f"Time elapsed: {end - start:.4f} seconds")

Time elapsed: 2.2393 seconds


In [27]:
# Placeholder title per row: the recipe id followed by the literal text 'title'.
df_exploded['RecipeId'] = df_exploded['id'].map(lambda recipe_id: str(recipe_id) + 'title')

# One {"title": ..., "id": ...} record per distinct recipe.
unique_recipes = (
    df_exploded[["RecipeId", "id"]]
    .drop_duplicates()
    .rename(columns={"RecipeId": "title"})
    .to_dict(orient="records")
)

# Wall-clock timing of the Recipe node load.
start = time.time()
my_graph.create_recipe_nodes(unique_recipes)
end = time.time()

print(f"Time elapsed: {end - start:.4f} seconds")

Time elapsed: 31.6044 seconds


In [28]:
# Number of recipe records that were passed to create_recipe_nodes.
print(len(unique_recipes))

12300


In [30]:
# Note kept as a bare string literal: the Cypher statements inside it are not
# executed by this cell, the string is only echoed as the cell's output.
"""
Deleted all the nodes and created below indexes. Rerunning again to see diff

CREATE INDEX ingredient_name_index IF NOT EXISTS
FOR (i:Ingredient) ON (i.name);

CREATE INDEX recipe_id_index IF NOT EXISTS FOR (r:Recipe) ON (r.id);

"""

'CREATE INDEX ingredient_name_index IF NOT EXISTS\nFOR (i:Ingredient) ON (i.name);\n\nCREATE INDEX recipe_id_index IF NOT EXISTS FOR (r:Recipe) ON (r.id);\n\n'

In [32]:
# Repeat the timed Ingredient load (same call and input as the earlier timing cell) for comparison.
start = time.time()
my_graph.create_ingredient_nodes(ingredient_list)
end = time.time()
print(f"Time elapsed: {end - start:.4f} seconds")

Time elapsed: 0.7695 seconds


In [33]:
# Repeat the timed Recipe load (same call and input as the earlier timing cell) for comparison.
start = time.time()
my_graph.create_recipe_nodes(unique_recipes)
end = time.time()

print(f"Time elapsed: {end - start:.4f} seconds")

Time elapsed: 2.3412 seconds


In [34]:
# Display the (id, ingredient) columns, one row per recipe-ingredient pair.
df_exploded[['id', 'ingredient']]

Unnamed: 0,id,ingredient
0,0,brown sugar
1,0,evaporated milk
2,0,vanilla
3,0,pecan
4,0,butter
...,...,...
91693,12299,mushroom soup
91694,12299,soy mayonnaise
91695,12299,chicken seasoning
91696,12299,turmeric


In [38]:
# One {"id": ..., "ingredient": ...} record per recipe-ingredient pair; rows with a missing value are dropped.
edges = df_exploded[['id', 'ingredient']].dropna().to_dict(orient="records")

# Time the batched edge load; create_hasIngredient_edges prints per-batch timings while enable_logging is on.
start = time.time()
my_graph.create_hasIngredient_edges(edges, batch_size=5000)
end = time.time()
print(f"Time elapsed: {end - start:.4f} seconds")

Inserted batch 1 (5000 relationships) in 1.86s
Inserted batch 2 (5000 relationships) in 1.43s
Inserted batch 3 (5000 relationships) in 2.76s
Inserted batch 4 (5000 relationships) in 1.74s
Inserted batch 5 (5000 relationships) in 0.99s
Inserted batch 6 (5000 relationships) in 0.56s
Inserted batch 7 (5000 relationships) in 0.43s
Inserted batch 8 (5000 relationships) in 0.34s
Inserted batch 9 (5000 relationships) in 0.35s
Inserted batch 10 (5000 relationships) in 0.35s
Inserted batch 11 (5000 relationships) in 0.34s
Inserted batch 12 (5000 relationships) in 0.33s
Inserted batch 13 (5000 relationships) in 0.51s
Inserted batch 14 (5000 relationships) in 0.45s
Inserted batch 15 (5000 relationships) in 0.37s
Inserted batch 16 (5000 relationships) in 0.31s
Inserted batch 17 (5000 relationships) in 0.50s
Inserted batch 18 (5000 relationships) in 0.41s
Inserted batch 19 (1697 relationships) in 0.18s
Time elapsed: 14.2283 seconds


In [57]:
def Create_Nodes_Edges_Recipes_to_Ingredients(graph_intf, RecipeID, IngredientTitle):
    """Link one recipe to one ingredient with a ``has_ingredient`` relationship.

    Args:
        graph_intf: Object exposing
            ``create_relationship(label1, prop1, rel_type, label2, prop2)``.
        RecipeID: Value stored on the Recipe node as ``Recipe_id``; also used
            to build the placeholder title.
        IngredientTitle: Value stored on the Ingredient node as ``name``.

    Returns:
        Whatever ``graph_intf.create_relationship`` returns. The result used
        to be assigned and then dropped, so callers always received None.
    """
    # TEMP: the real recipe title is not available yet, so use a dummy name.
    RecipeTitle = f"Title_{RecipeID}"

    # NOTE(review): these property keys ("Title", "Recipe_id") and the
    # relationship type differ from what GraphDB.create_recipe_nodes /
    # create_hasIngredient_edges write ("title", "id", hasIngredient) -- confirm
    # which schema is intended before mixing the two loaders.
    return graph_intf.create_relationship(
        "Recipe",
        {"Title": RecipeTitle, "Recipe_id": RecipeID},
        "has_ingredient",
        "Ingredient",
        {"name": IngredientTitle},
    )
    

In [63]:
# Turn off the wrapper's status prints; GraphDB methods check enable_logging before printing.
my_graph.enable_logging = False