In [1]:
import sys

print(sys.executable)
!{sys.executable} -m pip install neo4j

/Users/Brian/anaconda3/bin/python


In [52]:
!{sys.executable} -m pip install psycopg2-binary


Collecting psycopg2-binary
[?25l  Downloading https://files.pythonhosted.org/packages/c1/d3/30a58e2399ad0d7830bb2c3c07b2e937e2ea8fe53e1d9e4b95d03d995362/psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl (2.8MB)
[K     |████████████████████████████████| 2.8MB 4.3MB/s eta 0:00:01
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.9


In [50]:
from sqlalchemy import create_engine

In [66]:
from neo4j import GraphDatabase

# Update with your credentials
URI = "bolt://67.58.49.84:7687"
USER = "neo4j"
PASSWORD = "x7t4p2ks"  # or the password you set


class GraphDB:
    driver = None
    enable_logging = True

    def __init__(self):
        self.get_driver()

    def get_driver(self):
        if self.driver:
            return self.driver()
        else:
            try:
                self.init_driver()
                return self.driver
            except Exception as e:
                print(f"Failed to initialize driver due to {e}")
                return None

    def init_driver(self):
        self.driver = GraphDatabase.driver(URI, auth=(USER, PASSWORD))
        return

    def run_query(self, query, parameters=None, single=False):
        """
        Run a Cypher query with optional parameters.

        Args:
            query (str): The Cypher query to execute.
            parameters (dict, optional): Query parameters.
            single (bool): If True, return only the first result.

        Returns:
            list | dict | None: Query results.
        """
        with self.driver.session() as session:
            result = session.run(query, parameters or {})
            records = [r.data() for r in result]
            if single:
                return records[0] if records else None
            return records

    def create_node(self, label, properties):
        if self.node_exists(label, properties):
            if self.enable_logging:
                print("Node already exists")
            return self.get_node(label, properties)

        query = f"CREATE (n:{label} $props) RETURN n"
        return self.run_query(query, {"props": properties}, single=True)

    def get_nodes_matching_label(self, label):
        query = f"MATCH (n:{label}) RETURN n"
        return self.run_query(query)

    def get_node(self, label, properties):
        prop_keys = list(properties.keys())
        prop_str = ", ".join([f"{key}: ${key}" for key in prop_keys])

        query = f"MATCH (n:{label} {{{prop_str}}}) RETURN n LIMIT 1"
        return self.run_query(query, properties, single=True)

    def node_exists(self, label, properties):
        prop_keys = list(properties.keys())
        prop_str = ", ".join([f"{key}: ${key}" for key in prop_keys])

        query = f"MATCH (n:{label} {{{prop_str}}}) RETURN n LIMIT 1"
        result = self.run_query(query, properties, single=True)
        return result is not None

    def relationship_exists(self, label1, prop1, rel_type, label2, prop2):
        prop1_str = ", ".join([f"{key}: $prop1.{key}" for key in prop1.keys()])
        prop2_str = ", ".join([f"{key}: $prop2.{key}" for key in prop2.keys()])

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}})-[r:{rel_type}]->(b:{label2} {{{prop2_str}}})
        RETURN r LIMIT 1
        """
        result = self.run_query(query, {"prop1": prop1, "prop2": prop2}, single=True)
        return result is not None

    def get_relationship(self, label1, prop1, rel_type, label2, prop2):
        prop1_str = ", ".join([f"{key}: $prop1.{key}" for key in prop1.keys()])
        prop2_str = ", ".join([f"{key}: $prop2.{key}" for key in prop2.keys()])

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}})-[r:{rel_type}]->(b:{label2} {{{prop2_str}}})
        RETURN type(r) AS relationship, properties(r) AS edge_properties LIMIT 1
        """
        return self.run_query(query, {"prop1": prop1, "prop2": prop2}, single=True)

    def create_ingredient_nodes(self, ingredient_records):
        """
        ingredient_records should look like this:

        ingredient_records = [
            {"ingredientName": "brown sugar"},
            ...
            {"ingredientName": "butter"}
        ]
        """

        query = """
        UNWIND $ingredients AS ing
        MERGE (i:Ingredient {ingredientName: ing.ingredientName})
        """
        return self.run_query(query, {"ingredients": ingredient_records})
    
    def create_nutrient_nodes(self, nutrient_records):
        """
        nutrient_records : list[dict]
            Example:
            [
                {
                    "nutrientId": 1003,
                    "nutrientName": "Protein",
                    "unitName": "g"
                },
                {
                    "nutrientId": 1004,
                    "nutrientName": "Total Fat",
                    "unitName": "g"
                }
            ]
        """
        query = """
        UNWIND $nutrient_records AS record
        MERGE (n:Nutrient {nutrientId: record.nutrientId})
        SET n.nutrientName = record.nutrientName,
            n.unitName = record.unitName
        """
    
        self.run_query(query, {"nutrient_records":nutrient_records})
        
    def create_product_nodes(self, product_records):
        """
        product_records should look like this:
    
            product_records = [
                {
                    "productId": 100001,
                    "productDescription": "Organic Whole Milk"
                },
                {
                    "productId": 100002,
                    "productDescription": "Greek Yogurt, Vanilla"
                },
                ...
            ]
        """
    
        query = """
            UNWIND $product_records AS p
            MERGE (prod:Product {productId: p.productId})
            SET prod.productDescription = p.productDescription
        """
    
        return self.run_query(query, {"product_records": product_records})

    def create_recipe_nodes(self, recipe_records):
        """
        recipe_records should look like this:
    
            recipe_records = [
                {
                    "recipeId": 0,
                    "recipeName": "No-Bake Nut Cookies",
                    "originalIngredients": "["1 c. firmly packed brown sugar",..."2 tbsp butter"]"
                },
                ...
                {
                    "recipeId": 2,
                    "recipeName": "Creamy Corn",
                    "originalIngredients": "["2 (16 oz.) pkg. frozen corn",..."1/2 c. butter"]""
                }
            ]
    
        """
        
        query = """
            UNWIND $recipes AS r
            MERGE (rec:Recipe {recipeId: r.recipeId})
            SET rec.recipeName = r.recipeName,
                rec.originalIngredients = r.originalIngredients
            """
        return self.run_query(query, {"recipes": recipe_records})

    def create_hasIngredient_edges(self, edge_records):
        """
        edge_records should look like this:
    
            edge_records = [
                {
                    "recipeId": 0,
                    "ingredientName": "brown sugar"
                },
                {
                    "recipeId": 0,
                    "ingredientName": "evaporated milk"
                },
                {
                    "recipeId": 1,
                    "ingredientName": "chicken"
                },
                ...
            ]

        """
    
        query = """
            UNWIND $edges AS e
            MATCH (r:Recipe {recipeId: e.recipeId})
            MATCH (i:Ingredient {ingredientName: e.ingredientName})
            MERGE (r)-[:HAS_INGREDIENT]->(i)
        """
    
        return self.run_query(query, {"edges": edge_records})

    def create_relationship(self, label1, prop1, rel_type, label2, prop2, edge_prop=None):
        """
        Create a relationship between two nodes:
          1. Create nodes if they don’t exist (using existing helpers)
          2. If the relationship exists, return it and print a message
          3. Otherwise, create and return it
        """
        # If node1 doesn't exist, Create it
        if not self.node_exists(label1, prop1):
            if self.enable_logging:
                print(f"{label1} node does not exist — creating it.")
            self.create_node(label1, prop1)

        # If node2 doesn't exist, Create it
        if not self.node_exists(label2, prop2):
            if self.enable_logging:
                print(f"{label2} node does not exist — creating it.")
            self.create_node(label2, prop2)

        # Check if relationship already exists
        if self.relationship_exists(label1, prop1, rel_type, label2, prop2):
            if self.enable_logging:
                print(f"Relationship '{rel_type}' already exists between {label1} and {label2}.")
            return self.get_relationship(label1, prop1, rel_type, label2, prop2)

        # Build relationship creation query
        prop1_str = ", ".join([f"{key}: $prop1.{key}" for key in prop1.keys()])
        prop2_str = ", ".join([f"{key}: $prop2.{key}" for key in prop2.keys()])

        if edge_prop:
            edge_keys = list(edge_prop.keys())
            edge_str = ", ".join([f"{key}: $edge_prop.{key}" for key in edge_keys])
            edge_prop_clause = f"{{{edge_str}}}"
        else:
            edge_prop_clause = ""

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}}), (b:{label2} {{{prop2_str}}})
        CREATE (a)-[r:{rel_type} {edge_prop_clause}]->(b)
        RETURN type(r) AS relationship, properties(r) AS edge_properties
        """

        params = {"prop1": prop1, "prop2": prop2}
        if edge_prop:
            params["edge_prop"] = edge_prop

        result = self.run_query(query, params, single=True)
        if self.enable_logging:
            print(f"Created new relationship '{rel_type}' between {label1} and {label2}.")
        return result


In [70]:
URI = "bolt://67.58.49.84:7687"
USER_N4J = "neo4j"
PASSWORD_N4J = "x7t4p2ks"

class GraphDB:
    
    driver = None
    enable_logging = True
    
    def __init__(self):
        self.get_driver()
        
    def get_driver(self):
        if self.driver:
            return self.driver()
        else:
            try:
                self.init_driver()
                return self.driver
            except Exception as e:
                print(f"Failed to initialize driver due to {e}")
                return None
                
                
                
    def init_driver(self):
        self.driver = GraphDatabase.driver(URI, auth=(USER_N4J, PASSWORD_N4J))
        return

    def run_query(self, query, parameters=None, single=False):
        """
        Run a Cypher query with optional parameters.

        Args:
            query (str): The Cypher query to execute.
            parameters (dict, optional): Query parameters.
            single (bool): If True, return only the first result.

        Returns:
            list | dict | None: Query results.
        """
        with self.driver.session() as session:
            result = session.run(query, parameters or {})
            records = [r.data() for r in result]
            if single:
                return records[0] if records else None
            return records
        

    def create_node(self, label, properties):
        if self.node_exists(label, properties):
            if self.enable_logging:
                print("Node already exists")
            return self.get_node(label, properties)
        
        query = f"CREATE (n:{label} $props) RETURN n"
        return self.run_query(query, {"props": properties}, single=True)

    def get_nodes_matching_label(self, label):
        query = f"MATCH (n:{label}) RETURN n"
        return self.run_query(query)
    
    def get_node(self, label, properties):
        prop_keys = list(properties.keys())
        prop_str = ", ".join([f"{key}: ${key}" for key in prop_keys])

        query = f"MATCH (n:{label} {{{prop_str}}}) RETURN n LIMIT 1"
        return self.run_query(query, properties, single=True)
    
    def node_exists(self, label, properties):
        prop_keys = list(properties.keys())
        prop_str = ", ".join([f"{key}: ${key}" for key in prop_keys])

        query = f"MATCH (n:{label} {{{prop_str}}}) RETURN n LIMIT 1"
        result = self.run_query(query, properties, single=True)
        return result is not None
    
    def relationship_exists(self, label1, prop1, rel_type, label2, prop2):
        prop1_str = ", ".join([f"{key}: $prop1.{key}" for key in prop1.keys()])
        prop2_str = ", ".join([f"{key}: $prop2.{key}" for key in prop2.keys()])

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}})-[r:{rel_type}]->(b:{label2} {{{prop2_str}}})
        RETURN r LIMIT 1
        """
        result = self.run_query(query, {"prop1": prop1, "prop2": prop2}, single=True)
        return result is not None
    
    def get_relationship(self, label1, prop1, rel_type, label2, prop2):
        prop1_str = ", ".join([f"{key}: $prop1.{key}" for key in prop1.keys()])
        prop2_str = ", ".join([f"{key}: $prop2.{key}" for key in prop2.keys()])

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}})-[r:{rel_type}]->(b:{label2} {{{prop2_str}}})
        RETURN type(r) AS relationship, properties(r) AS edge_properties LIMIT 1
        """
        return self.run_query(query, {"prop1": prop1, "prop2": prop2}, single=True)
    

    def create_nutrient_nodes(self, nutrient_records):
        """
        nutrient_records : list[dict]
            Example:
            [
                {
                    "nutrientId": 1003,
                    "nutrientName": "Protein",
                    "unitName": "g"
                },
                {
                    "nutrientId": 1004,
                    "nutrientName": "Total Fat",
                    "unitName": "g"
                }
            ]
        """
        query = """
        UNWIND $nutrient_records AS record
        MERGE (n:Nutrient {nutrientId: record.nutrientId})
        SET n.nutrientName = record.nutrientName,
            n.unitName = record.unitName
        """
    
        self.run_query(query, {"nutrient_records":nutrient_records})

    def create_ingredient_nodes(self, ingredient_records):
        """
        ingredient_records should look like this:
    
            ingredient_records = [
                {"ingredientName": "brown sugar"},
                ...
                {"ingredientName": "butter"}
            ]
        """
    
        query = """
            UNWIND $ingredients AS ing
            MERGE (i:Ingredient {ingredientName: ing.ingredientName})
        """
        return self.run_query(query, {"ingredients": ingredient_records})
    
    def create_product_nodes(self, product_records):
        """
        product_records should look like this:
    
            product_records = [
                {
                    "productId": 100001,
                    "productDescription": "Organic Whole Milk"
                },
                {
                    "productId": 100002,
                    "productDescription": "Greek Yogurt, Vanilla"
                },
                ...
            ]
        """
    
        query = """
            UNWIND $product_records AS p
            MERGE (prod:Product {productId: p.productId})
            SET prod.productDescription = p.productDescription
        """
    
        return self.run_query(query, {"product_records": product_records})

    def create_recipe_nodes(self, recipe_records):
        """
        recipe_records should look like this:
    
            recipe_records = [
                {
                    "recipeId": 0,
                    "recipeName": "No-Bake Nut Cookies",
                    "originalIngredients": "["1 c. firmly packed brown sugar",..."2 tbsp butter"]"
                },
                ...
                {
                    "recipeId": 2,
                    "recipeName": "Creamy Corn",
                    "originalIngredients": "["2 (16 oz.) pkg. frozen corn",..."1/2 c. butter"]""
                }
            ]
    
        """
        
        query = """
            UNWIND $recipes AS r
            MERGE (rec:Recipe {recipeId: r.recipeId})
            SET rec.recipeName = r.recipeName,
                rec.originalIngredients = r.originalIngredients
            """
        return self.run_query(query, {"recipes": recipe_records})


    def create_hasNutrient_edges(self, edge_records):
        """ 
        Example:
            edge_records = [
                {"productId": 100001, "nutrientId": 1003, "amount": 5.0},
                {"productId": 100001, "nutrientId": 1004, "amount": 10.0},
                {"productId": 100002, "nutrientId": 1003, "amount": 8.0}
            ]
        """
    
        query = """
            UNWIND $edges AS e
            MATCH (p:Product {productId: e.productId})
            MATCH (n:Nutrient {nutrientId: e.nutrientId})
            MERGE (p)-[r:HAS_NUTRIENT]->(n)
            SET r.amount = e.amount
        """
    
        return self.run_query(query, {"edges": edge_records})

    def create_hasProduct_edges(self, edge_records):
        """
            edge_records = [
                {"ingredientName": "butter", "productId": 100001},
                {"ingredientName": "milk", "productId": 100001},
                {"ingredientName": "butter", "productId": 100002}
            ]
        """
    
        query = """
            UNWIND $edges AS e
            MATCH (ing:Ingredient {ingredientName: e.ingredientName})
            MATCH (p:Product {productId: e.productId})
            MERGE (ing)-[:HAS_PRODUCT]->(p)
        """
    
        return self.run_query(query, {"edges": edge_records})
    
    def create_hasIngredient_edges(self, edge_records):
        """
        edge_records should look like this:
    
            edge_records = [
                {
                    "recipeId": 0,
                    "ingredientName": "brown sugar"
                },
                {
                    "recipeId": 0,
                    "ingredientName": "evaporated milk"
                },
                {
                    "recipeId": 1,
                    "ingredientName": "chicken"
                },
                ...
            ]

        """
    
        query = """
            UNWIND $edges AS e
            MATCH (r:Recipe {recipeId: e.recipeId})
            MATCH (i:Ingredient {ingredientName: e.ingredientName})
            MERGE (r)-[:HAS_INGREDIENT]->(i)
        """
    
        return self.run_query(query, {"edges": edge_records})

    
    def create_relationship(self, label1, prop1, rel_type, label2, prop2, edge_prop=None):
        """
        Create a relationship between two nodes:
          1. Create nodes if they don’t exist (using existing helpers)
          2. If the relationship exists, return it and print a message
          3. Otherwise, create and return it
        """
        # If node1 doesn't exist, Create it
        if not self.node_exists(label1, prop1):
            if self.enable_logging:
                print(f"{label1} node does not exist — creating it.")
            self.create_node(label1, prop1)

        # If node2 doesn't exist, Create it
        if not self.node_exists(label2, prop2):
            if self.enable_logging:
                print(f"{label2} node does not exist — creating it.")
            self.create_node(label2, prop2)

        # Check if relationship already exists
        if self.relationship_exists(label1, prop1, rel_type, label2, prop2):
            if self.enable_logging:
                print(f"Relationship '{rel_type}' already exists between {label1} and {label2}.")
            return self.get_relationship(label1, prop1, rel_type, label2, prop2)

        # Build relationship creation query
        prop1_str = ", ".join([f"{key}: $prop1.{key}" for key in prop1.keys()])
        prop2_str = ", ".join([f"{key}: $prop2.{key}" for key in prop2.keys()])

        if edge_prop:
            edge_keys = list(edge_prop.keys())
            edge_str = ", ".join([f"{key}: $edge_prop.{key}" for key in edge_keys])
            edge_prop_clause = f"{{{edge_str}}}"
        else:
            edge_prop_clause = ""

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}}), (b:{label2} {{{prop2_str}}})
        CREATE (a)-[r:{rel_type} {edge_prop_clause}]->(b)
        RETURN type(r) AS relationship, properties(r) AS edge_properties
        """

        params = {"prop1": prop1, "prop2": prop2}
        if edge_prop:
            params["edge_prop"] = edge_prop

        result = self.run_query(query, params, single=True)
        if self.enable_logging:
            print(f"Created new relationship '{rel_type}' between {label1} and {label2}.")
        return result


In [4]:
import pandas as pd

print(pd.__file__)

/Users/Brian/anaconda3/lib/python3.7/site-packages/pandas/__init__.py


In [71]:
my_graph = GraphDB()

In [17]:
node = my_graph.create_node("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3})
print(node)

{'n': {'name': 'Shepards Pie2', 'Recipe_id': 3}}


In [18]:
print(my_graph.driver)

<neo4j._sync.driver.Neo4jDriver object at 0x7fdf1f23db00>


In [19]:
node = my_graph.create_node("Ingredient", {"name": "potato"})
print(node)



{'n': {'name': 'potato'}}


In [32]:
edge = my_graph.create_relationship("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3}, "has_ingredient", "Ingredient",
                                    {"name": "peas"})

Created new relationship 'has_ingredient' between Recipe and Ingredient.


In [33]:
edge = my_graph.create_relationship("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3}, "has_ingredient", "Ingredient",
                                    {"name": "peas"})

Relationship 'has_ingredient' already exists between Recipe and Ingredient.


In [34]:
edge = my_graph.create_relationship("Recipe", {"name": "Shepards Pie2", "Recipe_id": 3}, "has_ingredient", "Ingredient",
                                    {"name": "carrots"})

Ingredient node does not exist — creating it.
Created new relationship 'has_ingredient' between Recipe and Ingredient.


In [12]:
import pandas as pd
import ast
import os

csv_path = os.path.join(os.getcwd(), 'data/output', 'processed_ingredients.csv')

df = pd.read_csv(csv_path)



RecipeDF = df[["id", "title", "ingredients"]].drop_duplicates().rename(columns={"id": "recipeId", "title": "recipeName", "ingredients": "originalIngredients"})
    
def create_recipe_nodes(graph, recipe_df, batch_size=100000):
    n_rows = len(recipe_df)
    for i in range(0, n_rows, batch_size):
        records = recipe_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_recipe_nodes(records)
        

create_recipe_nodes(my_graph, RecipeDF)

#need to apply this twice because there is an extra layer of quotes around the lists
#df["ingredients_normalized"] = df["ingredients_normalized"].apply(ast.literal_eval)
#df["ingredients_normalized"] = df["ingredients_normalized"].apply(ast.literal_eval)

#df_exploded = df.explode("ingredients_normalized").reset_index(drop=True)
#df_exploded = df_exploded.rename(columns={"ingredients_normalized": "ingredient"})

#print(df_exploded.head(5))


In [40]:
len(df)

72000

In [31]:
csv_path = os.path.join(os.getcwd(), 'data/output', 'processed_ingredients.csv')
df = pd.read_csv(csv_path)
df["ingredients_normalized"] = df["ingredients_normalized"].apply(ast.literal_eval)
df["ingredients_normalized"] = df["ingredients_normalized"].apply(ast.literal_eval)
ingredient_df = df.explode("ingredients_normalized").reset_index(drop=True).rename(columns={"ingredients_normalized": "ingredientName"})
ingredient_df.dropna(inplace = True)
ingredient_df.drop_duplicates(subset=["ingredientName"], inplace=True)

def create_ingredient_nodes(graph, ingredient_df, batch_size=100000):
    n_rows = len(ingredient_df)
    for i in range(0, n_rows, batch_size):
        records = ingredient_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_ingredient_nodes(records)
        
        
create_ingredient_nodes(my_graph, ingredient_df)

In [37]:
csv_path = os.path.join(os.getcwd(), 'data/output', 'processed_ingredients.csv')
df = pd.read_csv(csv_path)
df["ingredients_normalized"] = df["ingredients_normalized"].apply(ast.literal_eval)
df["ingredients_normalized"] = df["ingredients_normalized"].apply(ast.literal_eval)
ingredient_df = df.explode("ingredients_normalized").reset_index(drop=True).rename(columns={"ingredients_normalized": "ingredientName"})
ingredient_df.dropna(inplace = True)
ingredient_df.drop_duplicates(subset=["ingredientName"], inplace=True)

In [42]:
    df = pd.read_csv(csv_path)
    df["ingredients_normalized"] = df["ingredients_normalized"].apply(ast.literal_eval)
    df["ingredients_normalized"] = df["ingredients_normalized"].apply(ast.literal_eval)

    #Process the df
    recipe_df = df[["id", "title", "ingredients"]].drop_duplicates().rename(columns={"id": "recipeId", "title": "recipeName", "ingredients": "originalIngredients"})
    df_exploded = df.explode("ingredients_normalized").reset_index(drop=True).rename(columns={"ingredients_normalized": "ingredientName", "id": "recipeId", "title": "recipeName", "ingredients": "originalIngredients"})
    ingredient_df = df_exploded[["ingredientName"]]
    ingredient_df.dropna(inplace = True)
    ingredient_df.drop_duplicates(subset=["ingredientName"], inplace=True)

    hasIngredient_df = df_exploded[["recipeId", "ingredientName"]]
    ingredient_df.dropna(inplace = True)
    ingredient_df.drop_duplicates(inplace=True)

In [43]:
recipe_df

Unnamed: 0,recipeId,recipeName,originalIngredients
0,0,No-Bake Nut Cookies,"['1 c. firmly packed brown sugar', '1/2 c. eva..."
1,1,Jewell Ball'S Chicken,"['1 small jar chipped beef, cut up', '4 boned ..."
2,2,Creamy Corn,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg..."
3,3,Chicken Funny,"['1 large whole chicken', '2 (10 1/2 oz.) cans..."
4,4,Reeses Cups(Candy),"['1 c. peanut butter', '3/4 c. graham cracker ..."
...,...,...,...
71995,71995,Sweet Dough,"['1/2 c. white sugar', '1/2 c. plus 1 Tbsp. sh..."
71996,71996,Philly Cream Cheese Dip,"['1 Tbsp. milk', '1 (8 oz.) pkg. cream cheese,..."
71997,71997,Chili,"['1/2 c. suet', '4 lb. beef, ground coarse', '..."
71998,71998,English Stew,"['1 head cabbage, chopped', '1 large onion, in..."


In [44]:
ingredient_df

Unnamed: 0,ingredientName
0,brown sugar
1,evaporated milk
2,vanilla
3,pecan
4,butter
...,...
541833,milk of magnesia
541902,broken tortilla chips
542031,dried sweet pepper flakes
542236,bird nest noodle


In [45]:
hasIngredient_df

Unnamed: 0,recipeId,ingredientName
0,0,brown sugar
1,0,evaporated milk
2,0,vanilla
3,0,pecan
4,0,butter
...,...,...
542717,71999,worcestershire sauce
542718,71999,cream of mushroom soup
542719,71999,cream cheese
542720,71999,white wine


In [49]:
def create_recipe_ingredient_relationship(graph, hasIngredient_df, batch_size=100000):
    n_rows = len(hasIngredient_df)
    for i in range(0, n_rows, batch_size):
        records = hasIngredient_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_hasIngredient_edges(records)
        
create_recipe_ingredient_relationship(my_graph, hasIngredient_df)

In [None]:
csv_path = os.path.join(os.getcwd(), 'data/output', 'processed_ingredients.csv')
df = pd.read_csv(csv_path)



In [3]:
df_exploded[['ingredient']].head(5)

Unnamed: 0,ingredient
0,brown sugar
1,evaporated milk
2,vanilla
3,pecan
4,butter


In [4]:
# Get unique ingredients
unique_df = df_exploded["ingredient"].dropna().unique()

# Convert back to a DataFrame
unique_df = pd.DataFrame(unique_df, columns=["ingredient"])

# Save to CSV
unique_df.to_csv("unique_ingredients.csv", index=False)

In [16]:
import time

ingredient_list = df_exploded["ingredient"].dropna().unique().tolist()

start = time.time()
my_graph.create_ingredient_nodes(ingredient_list)
end = time.time()

print(f"Time elapsed: {end - start:.4f} seconds")

Time elapsed: 2.2393 seconds


In [27]:
df_exploded['RecipeId'] = df_exploded['id'].apply(lambda x: str(x) + 'title')
#df_exploded[['RecipeId', 'id']]

unique_recipes = (
    df_exploded[["RecipeId", "id"]]
    .drop_duplicates()
    .rename(columns={"RecipeId": "title"})
    .to_dict(orient="records")
)

start = time.time()
my_graph.create_recipe_nodes(unique_recipes)
end = time.time()

print(f"Time elapsed: {end - start:.4f} seconds")

Time elapsed: 31.6044 seconds


In [28]:
print(len(unique_recipes))

12300


In [30]:
"""
Deleted all the nodes and created below indexes. Rerunning again to see diff

CREATE INDEX ingredient_name_index IF NOT EXISTS
FOR (i:Ingredient) ON (i.name);

CREATE INDEX recipe_id_index IF NOT EXISTS FOR (r:Recipe) ON (r.id);

"""

'CREATE INDEX ingredient_name_index IF NOT EXISTS\nFOR (i:Ingredient) ON (i.name);\n\nCREATE INDEX recipe_id_index IF NOT EXISTS FOR (r:Recipe) ON (r.id);\n\n'

In [32]:
start = time.time()
my_graph.create_ingredient_nodes(ingredient_list)
end = time.time()
print(f"Time elapsed: {end - start:.4f} seconds")

Time elapsed: 0.7695 seconds


In [33]:
start = time.time()
my_graph.create_recipe_nodes(unique_recipes)
end = time.time()

print(f"Time elapsed: {end - start:.4f} seconds")

Time elapsed: 2.3412 seconds


In [34]:
df_exploded[['id', 'ingredient']]

Unnamed: 0,id,ingredient
0,0,brown sugar
1,0,evaporated milk
2,0,vanilla
3,0,pecan
4,0,butter
...,...,...
91693,12299,mushroom soup
91694,12299,soy mayonnaise
91695,12299,chicken seasoning
91696,12299,turmeric


In [38]:
edges = df_exploded[['id', 'ingredient']].dropna().to_dict(orient="records")

start = time.time()
my_graph.create_hasIngredient_edges(edges, batch_size=5000)
end = time.time()
print(f"Time elapsed: {end - start:.4f} seconds")

Inserted batch 1 (5000 relationships) in 1.86s
Inserted batch 2 (5000 relationships) in 1.43s
Inserted batch 3 (5000 relationships) in 2.76s
Inserted batch 4 (5000 relationships) in 1.74s
Inserted batch 5 (5000 relationships) in 0.99s
Inserted batch 6 (5000 relationships) in 0.56s
Inserted batch 7 (5000 relationships) in 0.43s
Inserted batch 8 (5000 relationships) in 0.34s
Inserted batch 9 (5000 relationships) in 0.35s
Inserted batch 10 (5000 relationships) in 0.35s
Inserted batch 11 (5000 relationships) in 0.34s
Inserted batch 12 (5000 relationships) in 0.33s
Inserted batch 13 (5000 relationships) in 0.51s
Inserted batch 14 (5000 relationships) in 0.45s
Inserted batch 15 (5000 relationships) in 0.37s
Inserted batch 16 (5000 relationships) in 0.31s
Inserted batch 17 (5000 relationships) in 0.50s
Inserted batch 18 (5000 relationships) in 0.41s
Inserted batch 19 (1697 relationships) in 0.18s
Time elapsed: 14.2283 seconds


In [57]:
def Create_Nodes_Edges_Recipes_to_Ingredients(graph_intf, RecipeID, IngredientTitle):
    #TEMP RECIPE TITLE DUMMY NAME FOR NOW:
    RecipeTitle = f"Title_{RecipeID}"

    edge = graph_intf.create_relationship("Recipe", {"Title": RecipeTitle, "Recipe_id": RecipeID}, "has_ingredient",
                                          "Ingredient", {"name": IngredientTitle})


In [63]:
my_graph.enable_logging = False

In [67]:
#POSTGRES INFO
HOST = "awesome-hw.sdsc.edu"
PORT = 5432
DATABASE = "nourish"
USER_PG = "b6hill"
PASSWORD_PG = "dse203#2025"


class PostgresCon:

    engine = None
    def __init__(self):
        self.get_engine()

    def get_engine(self):
        if self.engine:
            return self.engine
        else:
            try:
                self.init_engine()
                return self.engine
            except Exception as e:
                print(f"Failed to initialize engine due to {e}")
                return None

    def init_engine(self):
        self.engine = create_engine(f"postgresql+psycopg2://{USER_PG}:{PASSWORD_PG}@{HOST}:{PORT}/{DATABASE}")
        return
    
postgres_conn = PostgresCon()

query = """
        SELECT
            fn.fdc_id AS productId,
            fb.description AS productDescription,
            fn.nutrient_id AS nutrientId,
            fn.amount AS amount,
            nm.name AS nutrientName,
            nm.unit_name AS unitName
        FROM
            usda_2022_branded_food_nutrients fn,
            usda_2022_nutrient_master nm,
            usda_2022_food_branded_experimental fb
        WHERE
            fn.nutrient_id = nm.id AND
            fb.fdc_id = fn.fdc_id;
    """

df = pd.read_sql(query, postgres_conn.get_engine())
df


Unnamed: 0,productid,productdescription,nutrientid,amount,nutrientname,unitname
0,599337,PURE SWEET CIDER,1093,25.00,"Sodium, Na",MG
1,599337,PURE SWEET CIDER,1008,51.00,Energy,KCAL
2,599337,PURE SWEET CIDER,2000,12.71,"Sugars, Total",G
3,599337,PURE SWEET CIDER,1004,0.00,Total lipid (fat),G
4,599337,PURE SWEET CIDER,1092,57.00,"Potassium, K",MG
...,...,...,...,...,...,...
21453544,599239,COUNTRY GRAVY MIX,2000,11.11,"Sugars, Total",G
21453545,599239,COUNTRY GRAVY MIX,1110,0.00,"Vitamin D (D2 + D3), International Units",IU
21453546,599239,COUNTRY GRAVY MIX,1253,0.00,Cholesterol,MG
21453547,599239,COUNTRY GRAVY MIX,1079,0.00,"Fiber, total dietary",G


In [74]:
df2 = df.rename(columns=dict(zip(['productid', 'productdescription', 'nutrientid', 'amount',
       'nutrientname', 'unitname'], ['productId', 'productDescription', 'nutrientId', 'amount',
       'nutrientName', 'unitName'])))

In [None]:
def create_product_nodes(graph, product_df, batch_size=100000):
    n_rows = len(product_df)
    for i in range(0, n_rows, batch_size):
        print(i,'/',n_rows)
        records = product_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_product_nodes(records)

create_product_nodes(my_graph, df2[["productId", "productDescription"]].drop_duplicates())

In [63]:
nutrient_df = df[["nutrientId", "nutrientName", "unitName"]].drop_duplicates()

def create_nutrient_nodes(graph, nutrient_df, batch_size=100000):
    n_rows = len(nutrient_df)
    for i in range(0, n_rows, batch_size):
        print(i)
        records = nutrient_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_nutrient_nodes(records)

create_nutrient_nodes(my_graph, nutrient_df)
nutrient_df

Unnamed: 0,nutrientId,nutrientName,unitName
0,1003,Protein,G
1,1004,Total lipid (fat),G
2,1005,"Carbohydrate, by difference",G
3,1110,"Vitamin D (D2 + D3), International Units",IU
4,1079,"Fiber, total dietary",G
...,...,...,...
5994941,1111,Vitamin D2 (ergocalciferol),UG
7273261,1273,SFA 22:0,G
7273582,1236,"Sugars, intrinsic",G
15600914,1080,Lignin,G


In [64]:
create_nutrient_nodes(my_graph, nutrient_df)

0


In [None]:
def create_product_nodes(graph, product_df, batch_size=100000):
    n_rows = len(product_df)
    for i in range(0, n_rows, batch_size):
        records = product_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_product_nodes(records)

In [None]:
hasNutrient_df = df[["nutrientId", "productId", "amount"]].drop_duplicates()

In [65]:
csv_path = os.path.join(os.getcwd(), 'data/output', 'processed_products.csv')
df_p2i = pd.read_csv(csv_path)

df_p2i

Unnamed: 0,fdc_id,description,mapped_ingredient
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...","""biscuit"""
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...","""cinnamon roll"""
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...","""breadcrumb"""
3,167515,"George Weston Bakeries, Thomas English Muffins","""muffin"""
4,167516,"Waffles, buttermilk, frozen, ready-to-heat","""waffle"""
...,...,...,...
23795,345245,Kellogg's Special K Crackers Sour Cream & Onio...,"""sour cream & onion"""
23796,345246,Kellogg's Krave Cereal Milk Chocolate 1.87oz,"""milk chocolate"""
23797,345247,Kellogg's Nutri-Grain Granola Bars Strawberry ...,"""nutri-grain granola"""
23798,345248,Kellogg's Krave Cereal Chocolate 35oz 4ct,"""chocolate"""
