In [30]:
!pip install neo4j -q
!pip install psycopg2-binary -q
!pip install pandas -q
!pip install numpy -q

In [31]:
from neo4j import GraphDatabase
import pandas as pd
import numpy as np
import os
import ast
from sqlalchemy import create_engine

In [32]:
URI = "bolt://67.58.49.84:7687"
USER_N4J = "neo4j"
PASSWORD_N4J = "x7t4p2ks"  # or the password you set


#POSTGRES INFO
HOST = "awesome-hw.sdsc.edu"
PORT = 5432
DATABASE = "nourish"
USER_PG = "b6hill"
PASSWORD_PG = "dse203#2025"


class PostgresCon:

    engine = None
    def __init__(self):
        self.get_engine()

    def get_engine(self):
        if self.engine:
            return self.engine
        else:
            try:
                self.init_engine()
                return self.engine
            except Exception as e:
                print(f"Failed to initialize engine due to {e}")
                return None

    def init_engine(self):
        self.engine = create_engine(f"postgresql+psycopg2://{USER_PG}:{PASSWORD_PG}@{HOST}:{PORT}/{DATABASE}")
        return


class GraphDB:
    
    driver = None
    enable_logging = True
    
    def __init__(self):
        self.get_driver()
        
    def get_driver(self):
        if self.driver:
            return self.driver()
        else:
            try:
                self.init_driver()
                return self.driver
            except Exception as e:
                print(f"Failed to initialize driver due to {e}")
                return None
                
                
                
    def init_driver(self):
        self.driver = GraphDatabase.driver(URI, auth=(USER_N4J, PASSWORD_N4J))
        return

    def run_query(self, query, parameters=None, single=False):
        """
        Run a Cypher query with optional parameters.

        Args:
            query (str): The Cypher query to execute.
            parameters (dict, optional): Query parameters.
            single (bool): If True, return only the first result.

        Returns:
            list | dict | None: Query results.
        """
        with self.driver.session() as session:
            result = session.run(query, parameters or {})
            records = [r.data() for r in result]
            if single:
                return records[0] if records else None
            return records
        

    def create_node(self, label, properties):
        if self.node_exists(label, properties):
            if self.enable_logging:
                print("Node already exists")
            return self.get_node(label, properties)
        
        query = f"CREATE (n:{label} $props) RETURN n"
        return self.run_query(query, {"props": properties}, single=True)

    def get_nodes_matching_label(self, label):
        query = f"MATCH (n:{label}) RETURN n"
        return self.run_query(query)
    
    def get_node(self, label, properties):
        prop_keys = list(properties.keys())
        prop_str = ", ".join([f"{key}: ${key}" for key in prop_keys])

        query = f"MATCH (n:{label} {{{prop_str}}}) RETURN n LIMIT 1"
        return self.run_query(query, properties, single=True)
    
    def node_exists(self, label, properties):
        prop_keys = list(properties.keys())
        prop_str = ", ".join([f"{key}: ${key}" for key in prop_keys])

        query = f"MATCH (n:{label} {{{prop_str}}}) RETURN n LIMIT 1"
        result = self.run_query(query, properties, single=True)
        return result is not None
    
    def relationship_exists(self, label1, prop1, rel_type, label2, prop2):
        prop1_str = ", ".join([f"{key}: $prop1.{key}" for key in prop1.keys()])
        prop2_str = ", ".join([f"{key}: $prop2.{key}" for key in prop2.keys()])

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}})-[r:{rel_type}]->(b:{label2} {{{prop2_str}}})
        RETURN r LIMIT 1
        """
        result = self.run_query(query, {"prop1": prop1, "prop2": prop2}, single=True)
        return result is not None
    
    def get_relationship(self, label1, prop1, rel_type, label2, prop2):
        prop1_str = ", ".join([f"{key}: $prop1.{key}" for key in prop1.keys()])
        prop2_str = ", ".join([f"{key}: $prop2.{key}" for key in prop2.keys()])

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}})-[r:{rel_type}]->(b:{label2} {{{prop2_str}}})
        RETURN type(r) AS relationship, properties(r) AS edge_properties LIMIT 1
        """
        return self.run_query(query, {"prop1": prop1, "prop2": prop2}, single=True)
    

    def create_nutrient_nodes(self, nutrient_records):
        """
        nutrient_records : list[dict]
            Example:
            [
                {
                    "nutrientId": 1003,
                    "nutrientName": "Protein",
                    "unitName": "g"
                },
                {
                    "nutrientId": 1004,
                    "nutrientName": "Total Fat",
                    "unitName": "g"
                }
            ]
        """
        query = """
        UNWIND $nutrient_records AS record
        MERGE (n:Nutrient {nutrientId: record.nutrientId})
        SET n.nutrientName = record.nutrientName,
            n.unitName = record.unitName
        """
    
        self.run_query(query, {"nutrient_records":nutrient_records})

    def create_ingredient_nodes(self, ingredient_records):
        """
        ingredient_records should look like this:
    
            ingredient_records = [
                {"ingredientName": "brown sugar"},
                ...
                {"ingredientName": "butter"}
            ]
        """
    
        query = """
            UNWIND $ingredients AS ing
            MERGE (i:Ingredient {ingredientName: ing.ingredientName})
        """
        return self.run_query(query, {"ingredients": ingredient_records})
    
    def create_product_nodes(self, product_records):
        """
        product_records should look like this:
    
            product_records = [
                {
                    "productId": 100001,
                    "productDescription": "Organic Whole Milk"
                },
                {
                    "productId": 100002,
                    "productDescription": "Greek Yogurt, Vanilla"
                },
                ...
            ]
        """
    
        query = """
            UNWIND $product_records AS p
            MERGE (prod:Product {productId: p.productId})
            SET prod.productDescription = p.productDescription
        """
    
        return self.run_query(query, {"product_records": product_records})

    def create_recipe_nodes(self, recipe_records):
        """
        recipe_records should look like this:
    
            recipe_records = [
                {
                    "recipeId": 0,
                    "recipeName": "No-Bake Nut Cookies",
                    "originalIngredients": "["1 c. firmly packed brown sugar",..."2 tbsp butter"]"
                },
                ...
                {
                    "recipeId": 2,
                    "recipeName": "Creamy Corn",
                    "originalIngredients": "["2 (16 oz.) pkg. frozen corn",..."1/2 c. butter"]""
                }
            ]
    
        """
        
        query = """
            UNWIND $recipes AS r
            MERGE (rec:Recipe {recipeId: r.recipeId})
            SET rec.recipeName = r.recipeName,
                rec.originalIngredients = r.originalIngredients
            """
        return self.run_query(query, {"recipes": recipe_records})


    def create_hasNutrient_edges(self, edge_records):
        """ 
        Example:
            edge_records = [
                {"productId": 100001, "nutrientId": 1003, "amount": 5.0},
                {"productId": 100001, "nutrientId": 1004, "amount": 10.0},
                {"productId": 100002, "nutrientId": 1003, "amount": 8.0}
            ]
        """
    
        query = """
            UNWIND $edges AS e
            MATCH (p:Product {productId: e.productId})
            MATCH (n:Nutrient {nutrientId: e.nutrientId})
            MERGE (p)-[r:HAS_NUTRIENT]->(n)
            SET r.amount = e.amount
        """
    
        return self.run_query(query, {"edges": edge_records})

    def create_hasProduct_edges(self, edge_records):
        """
            edge_records = [
                {"ingredientName": "butter", "productId": 100001},
                {"ingredientName": "milk", "productId": 100001},
                {"ingredientName": "butter", "productId": 100002}
            ]
        """
    
        query = """
            UNWIND $edges AS e
            MATCH (ing:Ingredient {ingredientName: e.ingredientName})
            MATCH (p:Product {productId: e.productId})
            MERGE (ing)-[:HAS_PRODUCT]->(p)
        """
    
        return self.run_query(query, {"edges": edge_records})
    
    def create_hasIngredient_edges(self, edge_records):
        """
        edge_records should look like this:
    
            edge_records = [
                {
                    "recipeId": 0,
                    "ingredientName": "brown sugar"
                },
                {
                    "recipeId": 0,
                    "ingredientName": "evaporated milk"
                },
                {
                    "recipeId": 1,
                    "ingredientName": "chicken"
                },
                ...
            ]

        """
    
        query = """
            UNWIND $edges AS e
            MATCH (r:Recipe {recipeId: e.recipeId})
            MATCH (i:Ingredient {ingredientName: e.ingredientName})
            MERGE (r)-[:HAS_INGREDIENT]->(i)
        """
    
        return self.run_query(query, {"edges": edge_records})

    
    def create_relationship(self, label1, prop1, rel_type, label2, prop2, edge_prop=None):
        """
        Create a relationship between two nodes:
          1. Create nodes if they don’t exist (using existing helpers)
          2. If the relationship exists, return it and print a message
          3. Otherwise, create and return it
        """
        # If node1 doesn't exist, Create it
        if not self.node_exists(label1, prop1):
            if self.enable_logging:
                print(f"{label1} node does not exist — creating it.")
            self.create_node(label1, prop1)

        # If node2 doesn't exist, Create it
        if not self.node_exists(label2, prop2):
            if self.enable_logging:
                print(f"{label2} node does not exist — creating it.")
            self.create_node(label2, prop2)

        # Check if relationship already exists
        if self.relationship_exists(label1, prop1, rel_type, label2, prop2):
            if self.enable_logging:
                print(f"Relationship '{rel_type}' already exists between {label1} and {label2}.")
            return self.get_relationship(label1, prop1, rel_type, label2, prop2)

        # Build relationship creation query
        prop1_str = ", ".join([f"{key}: $prop1.{key}" for key in prop1.keys()])
        prop2_str = ", ".join([f"{key}: $prop2.{key}" for key in prop2.keys()])

        if edge_prop:
            edge_keys = list(edge_prop.keys())
            edge_str = ", ".join([f"{key}: $edge_prop.{key}" for key in edge_keys])
            edge_prop_clause = f"{{{edge_str}}}"
        else:
            edge_prop_clause = ""

        query = f"""
        MATCH (a:{label1} {{{prop1_str}}}), (b:{label2} {{{prop2_str}}})
        CREATE (a)-[r:{rel_type} {edge_prop_clause}]->(b)
        RETURN type(r) AS relationship, properties(r) AS edge_properties
        """

        params = {"prop1": prop1, "prop2": prop2}
        if edge_prop:
            params["edge_prop"] = edge_prop

        result = self.run_query(query, params, single=True)
        if self.enable_logging:
            print(f"Created new relationship '{rel_type}' between {label1} and {label2}.")
        return result

In [39]:
#RECIPE-INGREDIENTS TABLE##################################################################################################
def create_recipe_nodes(graph, recipe_df, batch_size=100000):
    n_rows = len(recipe_df)
    for i in range(0, n_rows, batch_size):
        if(i%500000 == 0):
            print(i,'/',n_rows)
        records = recipe_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_recipe_nodes(records)

def create_ingredient_nodes(graph, ingredient_df, batch_size=100000):
    n_rows = len(ingredient_df)
    for i in range(0, n_rows, batch_size):
        if(i%500000 == 0):
            print(i,'/',n_rows)
        records = ingredient_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_ingredient_nodes(records)

def create_recipe_ingredient_relationship(graph, hasIngredient_df, batch_size=100000):
    n_rows = len(hasIngredient_df)
    for i in range(0, n_rows, batch_size):
        if(i%500000 == 0):
            print(i,'/',n_rows)
        records = hasIngredient_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_hasIngredient_edges(records)

def write_recipe_ingredient_rel(graph, csv_path):

    #Process the df
    df = pd.read_csv(csv_path)
    #df["ingredients_normalized"] = df["ingredients_normalized"].apply(ast.literal_eval)
    df["ingredients_normalized"] = df["ingredients_normalized"].apply(ast.literal_eval)

    
    recipe_df = df[["id", "title", "ingredients"]].drop_duplicates().rename(columns={"id": "recipeId", "title": "recipeName", "ingredients": "originalIngredients"})
    df_exploded = df.explode("ingredients_normalized").reset_index(drop=True).rename(columns={"ingredients_normalized": "ingredientName", "id": "recipeId"})
    ingredient_df = df_exploded[["ingredientName"]]
    ingredient_df.dropna(inplace = True)
    ingredient_df.drop_duplicates(subset=["ingredientName"], inplace=True)

    hasIngredient_df = df_exploded[["recipeId", "ingredientName"]]
    ingredient_df.dropna(inplace = True)
    ingredient_df.drop_duplicates(inplace=True)
    ################

    print('starting recipe node creation')
    create_recipe_nodes(graph, recipe_df)
    print('starting ingredient node creation')
    create_ingredient_nodes(graph, ingredient_df)
    print('starting has_ingredient node creation')
    create_recipe_ingredient_relationship(graph, hasIngredient_df[15790000:])


#PRODUCTS-INGREDIENTS TABLE################################################################################################
def create_product_nodes(graph, product_df, batch_size=100000):
    n_rows = len(product_df)
    for i in range(0, n_rows, batch_size):
        if(i%500000 == 0):
            print(i,'/',n_rows)
        records = product_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_product_nodes(records)


def create_product_ingredient_relationship(graph, hasProduct_df, batch_size=100000):
    n_rows = len(hasProduct_df)
    for i in range(0, n_rows, batch_size):
        if(i%500000 == 0):
            print(i,'/',n_rows)
        records = hasProduct_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_hasProduct_edges(records)

def write_product_ingredient_rel(graph, csv_path):

    #Process the df
    df = pd.read_csv(csv_path)
    df = df.rename(columns={"fdc_id": "productId", "description": "productDescription", "mapped_ingredient": "ingredientName"})

    product_df = df[["productId", "productDescription"]].drop_duplicates()
    ingredient_df = df[["ingredientName"]].drop_duplicates()
    hasProduct_df = df[["productId", "ingredientName"]].drop_duplicates()
    ################

    print("starting product node creation")
    create_product_nodes(graph, product_df)
    print("starting ingredient node creation")
    create_ingredient_nodes(graph, ingredient_df)
    print("starting has_product edge creation")
    create_product_ingredient_relationship(graph, hasProduct_df)

#PRODUCTS-NUTRIENTS TABLE##################################################################################################
def create_nutrient_nodes(graph, nutrient_df, batch_size=100000):
    n_rows = len(nutrient_df)
    for i in range(0, n_rows, batch_size):
        records = nutrient_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_nutrient_nodes(records)


def create_product_nutrient_relationship(graph, hasNutrient_df, batch_size=100000):
    n_rows = len(hasNutrient_df)
    for i in range(0, n_rows, batch_size):
        if(i%500000 == 0):
            print(i,'/',n_rows)
        records = hasNutrient_df[i:min(n_rows, i+batch_size)].to_dict("records")
        graph.create_hasNutrient_edges(records)


def write_product_nutrient_rel(graph, postgres_conn):

    query = """
            SELECT
                fn.fdc_id AS productId,
                fb.description AS productDescription,
                fn.nutrient_id AS nutrientId,
                fn.amount AS amount,
                nm.name AS nutrientName,
                nm.unit_name AS unitName
            FROM
                usda_2022_branded_food_nutrients fn,
                usda_2022_nutrient_master nm,
                usda_2022_food_branded_experimental fb
            WHERE
                fn.nutrient_id = nm.id AND
                fb.fdc_id = fn.fdc_id;
        """

    df = pd.read_sql(query, postgres_conn.get_engine())
    df = df.rename(columns = dict(zip(['productid', 'productdescription', 'nutrientid', 'amount',
       'nutrientname', 'unitname'], ['productId', 'productDescription', 'nutrientId', 'amount',
       'nutrientName', 'unitName'])))

    nutrient_df = df[["nutrientId", "nutrientName", "unitName"]].drop_duplicates()
    product_df = df[["productId", "productDescription"]].drop_duplicates()
    hasNutrient_df = df[["nutrientId", "productId", "amount"]].drop_duplicates()

    print("starting nutrient node creation")
    create_nutrient_nodes(graph, nutrient_df)
    print("starting product node creation")
    create_product_nodes(graph, product_df)
    print("starting has_nutrient node creation")
    create_product_nutrient_relationship(graph, hasNutrient_df)



# MAIN

In [40]:
postgres_conn = PostgresCon()
graph = GraphDB()

In [10]:
write_recipe_ingredient_rel(graph, os.path.join(os.getcwd(), 'foodkg_spacy_processed_cleaned.csv'))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ingredient_df.dropna(inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ingredient_df.drop_duplicates(subset=["ingredientName"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ingredient_df.dropna(inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ingredient_df.drop_dupl

starting recipe node creation
starting ingredient node creation
starting has_ingredient node creation
0 / 3873957
500000 / 3873957
1000000 / 3873957
1500000 / 3873957
2000000 / 3873957
2500000 / 3873957
3000000 / 3873957
3500000 / 3873957


In [11]:
write_product_nutrient_rel(graph, postgres_conn)

starting nutrient node creation
starting product node creation
0 / 1590701
500000 / 1590701
1000000 / 1590701
1500000 / 1590701
starting has_nutrient node creation
0 / 21453549
500000 / 21453549
1000000 / 21453549
1500000 / 21453549
2000000 / 21453549
2500000 / 21453549
3000000 / 21453549
3500000 / 21453549
4000000 / 21453549
4500000 / 21453549
5000000 / 21453549
5500000 / 21453549
6000000 / 21453549
6500000 / 21453549
7000000 / 21453549
7500000 / 21453549
8000000 / 21453549
8500000 / 21453549
9000000 / 21453549
9500000 / 21453549
10000000 / 21453549
10500000 / 21453549
11000000 / 21453549
11500000 / 21453549
12000000 / 21453549
12500000 / 21453549
13000000 / 21453549
13500000 / 21453549
14000000 / 21453549
14500000 / 21453549
15000000 / 21453549
15500000 / 21453549
16000000 / 21453549
16500000 / 21453549
17000000 / 21453549
17500000 / 21453549
18000000 / 21453549
18500000 / 21453549
19000000 / 21453549
19500000 / 21453549
20000000 / 21453549
20500000 / 21453549
21000000 / 21453549


In [None]:
write_product_ingredient_rel(graph, 'brian_results_async_3.csv')

Below code provided to show steps taken to process the output csv files from Spacy and OpenAI preprocessing step and ensure csv matches expected format for the node/edge writer script. The final CSVs are already saved so no need to reexecute below code blocks

## Process the Recipe Ingredients CSV

In [6]:
df = pd.read_csv(
    "foodkg_spacy_processed.csv",
    engine="python",
    on_bad_lines="warn"
)
df.tail(5)

Unnamed: 0,recipe_id,original_ingredients,processed_ingredients
2231137,2231137,['1/2 cup chocolate hazelnut spread (recommend...,"['chocolate hazelnut', 'round wheat tortilla',..."
2231138,2231138,"['1 dozen eggs', '1 paprika', '1 salt and pepp...","['dozen egg', 'paprika', 'salt pepper', 'likin..."
2231139,2231139,"['150 grams Daikon radish', '1 tbsp Sesame oil...","['daikon radish', 'sesame oil', 'sesame seed',..."
2231140,2231140,"['1 cup apple cider', '6 tablespoons sugar', '...","['apple cider', 'sugar', 'kosher salt', 'bay',..."
2231141,2231141,"['1 pound ground veal', '1/2 pound sweet Itali...","['veal', 'sweet italian sausage', 'casing', 'd..."


In [6]:
postgres_conn = PostgresCon()

query = """
            SELECT
                f.id AS recipe_id,
                f.title AS recipeName
            FROM
                public."FoodKG" f;
        """

df_recipeName = pd.read_sql(query, postgres_conn.get_engine())
df_recipeName

Unnamed: 0,recipe_id,recipename
0,715,Zucchini Stew
1,716,Party Potatoes
2,717,Gran'S Toll House Cookies
3,718,Hominy Casserole(Large Recipe; Make Half)
4,720,Poached Chicken Breast In Wine
...,...,...
2231137,710,Chocolate Macaroon Bars
2231138,711,Chicken Breasts In Lemon Sauce
2231139,712,Spaghetti Meat Sauce
2231140,713,Old Fashion Punch


In [7]:
df_joined = df.merge(df_recipeName, on="recipe_id", how="left")
df_joined

Unnamed: 0,recipe_id,original_ingredients,processed_ingredients,recipename
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","['brown sugar', 'milk', 'vanilla', 'nut', 'but...",No-Bake Nut Cookies
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","['beef', 'chicken breast', 'cream mushroom sou...",Jewell Ball'S Chicken
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","['corn', 'cream cheese', 'butter', 'garlic', '...",Creamy Corn
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","['chicken', 'chicken gravy', 'cream mushroom s...",Chicken Funny
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","['peanut butter', 'graham cracker crumb', 'but...",Reeses Cups(Candy)
...,...,...,...,...
2231137,2231137,['1/2 cup chocolate hazelnut spread (recommend...,"['chocolate hazelnut', 'round wheat tortilla',...",Sunnys Fake Crepes
2231138,2231138,"['1 dozen eggs', '1 paprika', '1 salt and pepp...","['dozen egg', 'paprika', 'salt pepper', 'likin...",Devil Eggs
2231139,2231139,"['150 grams Daikon radish', '1 tbsp Sesame oil...","['daikon radish', 'sesame oil', 'sesame seed',...",Extremely Easy and Quick - Namul Daikon Salad
2231140,2231140,"['1 cup apple cider', '6 tablespoons sugar', '...","['apple cider', 'sugar', 'kosher salt', 'bay',...",Pan-Roasted Pork Chops With Apple Fritters


In [8]:
df_joined = df_joined.rename(columns = {'recipe_id':'id', 'recipename':'title', 'original_ingredients': 'ingredients', 'processed_ingredients': 'ingredients_normalized'})
df_joined

Unnamed: 0,id,ingredients,ingredients_normalized,title
0,0,"['1 c. firmly packed brown sugar', '1/2 c. eva...","['brown sugar', 'milk', 'vanilla', 'nut', 'but...",No-Bake Nut Cookies
1,1,"['1 small jar chipped beef, cut up', '4 boned ...","['beef', 'chicken breast', 'cream mushroom sou...",Jewell Ball'S Chicken
2,2,"['2 (16 oz.) pkg. frozen corn', '1 (8 oz.) pkg...","['corn', 'cream cheese', 'butter', 'garlic', '...",Creamy Corn
3,3,"['1 large whole chicken', '2 (10 1/2 oz.) cans...","['chicken', 'chicken gravy', 'cream mushroom s...",Chicken Funny
4,4,"['1 c. peanut butter', '3/4 c. graham cracker ...","['peanut butter', 'graham cracker crumb', 'but...",Reeses Cups(Candy)
...,...,...,...,...
2231137,2231137,['1/2 cup chocolate hazelnut spread (recommend...,"['chocolate hazelnut', 'round wheat tortilla',...",Sunnys Fake Crepes
2231138,2231138,"['1 dozen eggs', '1 paprika', '1 salt and pepp...","['dozen egg', 'paprika', 'salt pepper', 'likin...",Devil Eggs
2231139,2231139,"['150 grams Daikon radish', '1 tbsp Sesame oil...","['daikon radish', 'sesame oil', 'sesame seed',...",Extremely Easy and Quick - Namul Daikon Salad
2231140,2231140,"['1 cup apple cider', '6 tablespoons sugar', '...","['apple cider', 'sugar', 'kosher salt', 'bay',...",Pan-Roasted Pork Chops With Apple Fritters


In [9]:
df_joined.to_csv('foodkg_spacy_processed_cleaned.csv', index=False)

## Process the products-ingredients mapping csv to rename cols and drop null vals

In [33]:
df = pd.read_csv('mapped_responses.csv')
df.head(5)

Unnamed: 0,fdc_id,description,response
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",biscuits
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",cinnamon
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",breadcrumb
3,167515,"George Weston Bakeries, Thomas English Muffins",muffin
4,167516,"Waffles, buttermilk, frozen, ready-to-heat",waffle


In [34]:
df = df.rename(columns={'response':'mapped_ingredient'})
df

Unnamed: 0,fdc_id,description,mapped_ingredient
0,167512,"Pillsbury Golden Layer Buttermilk Biscuits, Ar...",biscuits
1,167513,"Pillsbury, Cinnamon Rolls with Icing, refriger...",cinnamon
2,167514,"Kraft Foods, Shake N Bake Original Recipe, Coa...",breadcrumb
3,167515,"George Weston Bakeries, Thomas English Muffins",muffin
4,167516,"Waffles, buttermilk, frozen, ready-to-heat",waffle
...,...,...,...
963946,1546543,REDUCED FAT ICE CREAM,cream
963947,1546544,REDUCED FAT ICE CREAM,ice cream
963948,1546545,SHRIMP MEAT,shrimp
963949,1546546,PREMIUM ICE CREAM,ice cream


In [38]:
df.to_csv('brian_results_async_3.csv',index=False)

In [37]:
df = df.dropna()
df[df.isna().any(axis=1)]

Unnamed: 0,fdc_id,description,mapped_ingredient
