# Build a medium size KG from a CSV dataset

OH: "If you build it, they will SPARQL"

First, let's initialize the KG object the same as we did in an earlier notebook:

In [1]:
import kglab

# Prefix -> base IRI for the vocabularies used below: "wtm" supplies the
# recipe class and properties, "ind" supplies the ingredient terms.
namespaces = dict(
    wtm="http://purl.org/heals/food/",
    ind="http://purl.org/heals/ingredient/",
)

# The knowledge graph that every later cell adds triples to.
kg = kglab.KnowledgeGraph(
    name="A recipe KG example based on Food.com",
    base_uri="https://www.food.com/recipe/",
    language="en",
    namespaces=namespaces,
)

Next, we'll define a dictionary that maps (somewhat magically) from strings (i.e., "labels") to ingredients defined in the <http://purl.org/heals/ingredient/> vocabulary:

In [2]:
# Look the ingredient namespace up once and reuse it for every entry.
ind_ns = kg.get_ns("ind")

# Ingredient label as it appears in the dataset -> term in the "ind"
# vocabulary. Singular/plural and alias spellings share one term.
common_ingredient = {
    # pantry basics
    "water": ind_ns.Water,
    "salt": ind_ns.Salt,
    "pepper": ind_ns.BlackPepper,
    "black pepper": ind_ns.BlackPepper,

    # leavening
    "baking powder": ind_ns.BakingPowder,
    "baking soda": ind_ns.BakingSoda,

    # flavouring
    "vanilla": ind_ns.VanillaExtract,
    "vanilla extract": ind_ns.VanillaExtract,

    # dairy and eggs
    "butter": ind_ns.Butter,
    "milk": ind_ns.CowMilk,
    "egg": ind_ns.ChickenEgg,
    "eggs": ind_ns.ChickenEgg,

    # sweeteners
    "sugar": ind_ns.WhiteSugar,
    "brown sugar": ind_ns.BrownSugar,
    "honey": ind_ns.Honey,

    # flours
    "flour": ind_ns.AllPurposeFlour,
    "all-purpose flour": ind_ns.AllPurposeFlour,
    "whole wheat flour": ind_ns.WholeWheatFlour,

    # oil and vinegar
    "olive oil": ind_ns.OliveOil,
    "vinegar": ind_ns.AppleCiderVinegar,

    # vegetables
    "onion": ind_ns.Onion,
    "onions": ind_ns.Onion,
    "garlic": ind_ns.Garlic,
    "garlic clove": ind_ns.Garlic,
    "garlic cloves": ind_ns.Garlic,
    "cabbage": ind_ns.Cabbage,
    "carrot": ind_ns.Carrot,
    "carrots": ind_ns.Carrot,
    "celery": ind_ns.Celery,
    "potato": ind_ns.Potato,
    "potatoes": ind_ns.Potato,
    "tomato": ind_ns.Tomato,
    "tomatoes": ind_ns.Tomato,
}

Now load the dataset of recipes into a dataframe:

In [3]:
import pandas as pd

# Location of the recipes dataset, relative to the notebook.
recipes_csv = "dat/recipes.csv"

df = pd.read_csv(recipes_csv)

# Preview the first five rows (the default for head()).
df.head(5)

Unnamed: 0,id,name,minutes,tags,description,ingredients
0,35653,make it your way shortcakes,25,"['30-minutes-or-less', 'time-to-make', 'course...",top with freshly sliced strawberries and whipp...,"['flour', 'salt', 'baking powder', 'sugar', 'b..."
1,292568,better cake mix,35,"['60-minutes-or-less', 'time-to-make', 'course...",to make a boxed cake mix even better add these...,"['butter', 'flour', 'baking powder']"
2,236346,1 bowl 1 person mashed potatoes,25,"['30-minutes-or-less', 'time-to-make', 'course...",ever feel like having mashed potatoes but you ...,"['potato', 'butter', 'milk', 'salt', 'pepper']"
3,164636,1 1 1 tempura batter,5,"['15-minutes-or-less', 'time-to-make', 'course...","i use this everytime i make onion rings, hot p...","['egg', 'flour', 'water']"
4,144841,2 step pound cake for a kitchen aide mixer,110,"['time-to-make', 'course', 'preparation', 'occ...",this recipe was published in a southern living...,"['flour', 'sugar', 'butter', 'milk', 'eggs', '..."


Then iterate over the rows in the dataframe, representing a new recipe in the KG for each row:

In [6]:
import ast

import rdflib as rdf
from rdflib.namespace import RDF, XSD

# Resolve the namespaces once rather than on every row and every triple.
wtm_ns = kg.get_ns("wtm")
skos_ns = kg.get_ns("skos")

# Add one recipe node per dataframe row, with its type, name, cook time
# and one hasIngredient edge per listed ingredient.
for _index, row in df.iterrows():
    recipe_id = row["id"]
    node = rdf.URIRef(f"https://www.food.com/recipe/{recipe_id}")
    kg.add(node, RDF.type, wtm_ns.Recipe)

    recipe_name = row["name"]
    kg.add(node, skos_ns.definition, rdf.Literal(recipe_name))

    cook_time = row["minutes"]
    kg.add(node, wtm_ns.hasCookTime, rdf.Literal(cook_time, datatype=XSD.integer))

    # The "ingredients" column holds the text of a Python list literal.
    # literal_eval parses literals only, so a malicious or corrupted cell
    # cannot execute code the way eval() would.
    ind_list = ast.literal_eval(row["ingredients"])

    for ind in ind_list:
        ingredient = ind.strip()

        try:
            ingredient_obj = common_ingredient[ingredient]
        except KeyError:
            # Still a KeyError, but say which recipe and label caused it.
            raise KeyError(
                f"recipe {recipe_id}: no vocabulary mapping for ingredient {ingredient!r}"
            ) from None

        kg.add(node, wtm_ns.hasIngredient, ingredient_obj)

The graph has grown by a couple of orders of magnitude, so its visualization should be more interesting now.
Let's take a look:

In [8]:
# Node appearance for the visualization, one entry per namespace prefix.
VIS_STYLE = {
    "wtm": dict(color="orange", size=20),
    "ind": dict(color="blue", size=35),
}

# Build the visualization for notebook display, then render it to an HTML file.
g = kg.vis(style=VIS_STYLE, notebook=True)
g.show("tmp.html")

Given the defaults for this kind of visualization, you will likely see a dense, pulsing mass of orange (recipes) at the center, with a close cluster of common ingredients (dark blue), surrounded by less common ingredients and cooking times (light blue).

Let's serialize the KG to a local file:

In [10]:
# Write the graph to "tmp.ttl" in the working directory
# (presumably Turtle format, going by the method name and file extension).
kg.save_ttl("tmp.ttl")