In [1]:
# for use in tutorial and development; do not include this `sys.path` change in production:
import sys ; sys.path.insert(0, "../")
import os ; from os.path import dirname

# Produce a dataset for working with PyTorch_Geometric

### Initialize
Set up the dependencies plus a reproducible environment.

In [2]:
from collections import defaultdict
from icecream import ic
import csv
import kglab
import pandas as pd
import pickle
import rdflib
import re
import typing

pd.set_option("max_rows", None)

### Pre-defined Ingredients
Extract and reuse as many ingredients from the FoodOn ontology as possible.

In [3]:
NAMESPACES = {
    "wtm":  "http://purl.org/heals/food/",
    "ind":  "http://purl.org/heals/ingredient/",
    }

ind_kg = kglab.KnowledgeGraph(
    namespaces = NAMESPACES,
    )

In [4]:
sparql = """
    SELECT ?ind ?label ?definition
    WHERE {
        ?ind rdf:type wtm:Ingredient .
        ?ind rdfs:label ?label .
        ?ind skos:definition ?definition
    }
"""

In [5]:
#url = NAMESPACES["ind"]
ind_kg.load_rdf("ind.rdf", format="xml")

IND_DF = ind_kg.query_as_df(sparql)
IND_DF.head()

Unnamed: 0,ind,label,definition
0,ind:Thyme,thyme,any of a genus (Thymus) of Eurasian mints with...
1,ind:Cheese,cheese,the curd of milk separated from the whey and p...
2,ind:OliveOil,olive oil,a pale yellow to yellowish-green nondrying oil...
3,ind:BlackPeppercorn,black peppercorn,a dried berry of the black pepper
4,ind:VanillaExtract,vanilla extract,a solution made by macerating and percolating ...


How many unique ingredients have been pre-defined by FoodOn?

In [6]:
unique_labels = list(IND_DF.label.unique())
ic(len(unique_labels))
unique_labels

ic| len(unique_labels): 75


['thyme',
 'cheese',
 'olive oil',
 'black peppercorn',
 'vanilla extract',
 "baker's yeast",
 'mayonnaise',
 'chicken egg',
 'potato',
 'oat',
 'corned beef',
 'whole wheat flour',
 'tapioca flour',
 'green pepper',
 'beef',
 'bacon',
 'black pepper',
 'pumpkin seed',
 'peanut',
 'white sugar',
 'almond meal',
 'banana',
 'carrot',
 'canola oil',
 'chicken',
 'brown sugar',
 'cane sugar',
 'almond',
 'beef bouillon',
 'blueberry',
 'baking powder',
 'baking soda',
 'chuck roast',
 'paprika',
 'pecan',
 'basil',
 'coconut',
 'gluten free flour',
 'onion',
 'green onion',
 'coconut milk',
 'rosemary',
 'cornstarch',
 'dried cranberry',
 'cow milk',
 'corn syrup',
 'honey',
 'kamut flour',
 'salt',
 'sesame oil',
 'celery',
 'balsamic vinegar',
 'walnut',
 'soy sauce',
 'cocoa powder',
 'clove',
 'oregano',
 'parsley',
 'chayote squash',
 'lamb',
 'tomato',
 'kamut',
 'water',
 'apple cider vinegar',
 'all purpose flour',
 'lemon juice',
 'garlic',
 'butter',
 'vegetable oil',
 'whole gr

### Local Class Definitions
Define a namedtuple factor for recipes, plus a subclass of `kglab.KnowledgeGraph` specific to this application.

In [7]:
class Recipe (typing.NamedTuple):
    """simple representation for a parsed recipe from Food.com"""
    id: str
    definition: str
    cook_time: int
    ind_set: typing.Set[str]

In [8]:
class RecipeKG (kglab.KnowledgeGraph):
    _SYNONYMS = {}
    _INGREDIENTS = {}

    def add_syn (
        self,
        syn_list: list,
        targets: list,
        ) -> None:
        for syn in syn_list:
            self._SYNONYMS[syn] = set(targets)


    def map_syn (
        self,
        syn: str,
        ) -> typing.Set:
        if syn in self._SYNONYMS:
            return self._SYNONYMS[syn]
 
        return { syn }


    def add_ind (
        self,
        label: str,
        entity: str,
        *,
        descrip = "",
        ) -> None:
        self._INGREDIENTS[label] = kg.get_ns("ind")[entity]

        ind, _, descrip = IND_DF.loc[IND_DF["ind"] == f"ind:{entity}"].values[0]
        ic(entity, label, descrip)

        node = self.get_ns("ind")[entity]
        self.add( node, self.get_ns("rdf").type, self.get_ns("wtm").Ingredient )
        self.add( node, self.get_ns("skos").prefLabel, rdflib.Literal(label) )
        self.add( node, self.get_ns("skos").definition, rdflib.Literal(descrip) )


    def new_ind (
        self,
        label: str,
        entity: str,
        *,
        descrip = None,
        ) -> None:
        node = rdflib.URIRef("{}{}".format(NAMESPACES["ind"], entity))
        self._INGREDIENTS[label] = node
        
        self.add( node, self.get_ns("rdf").type, self.get_ns("wtm").Ingredient )
        self.add( node, self.get_ns("skos").prefLabel, rdflib.Literal(label) )

        if descrip:
            self.add( node, self.get_ns("skos").definition, rdflib.Literal(descrip) )

In [9]:
kg = RecipeKG(
    base_uri = "https://www.food.com/recipe/",
    namespaces = NAMESPACES,
    )

kg.load_rdf("recipe_lg.ttl")

<__main__.RecipeKG at 0x12621b5d0>

### Index the Labels

In [10]:
sparql = """
    SELECT ?ind ?label
    WHERE {
        ?ind rdf:type wtm:Ingredient .
        ?ind skos:prefLabel ?label .
    }
"""

In [11]:
df = kg.query_as_df(sparql)
df.head()

Unnamed: 0,ind,label
0,ind:Parsley,parsley
1,ind:Onion,onion
2,ind:ChickenBroth,chicken broth
3,ind:GreenPepper,green pepper
4,ind:VegetableOil,oil


In [12]:
for _, row in df.iterrows():
    entity = row.ind.split(":")[1]
    kg._INGREDIENTS[row.label] = kg.get_ns("ind")[entity]

In [13]:
kg._INGREDIENTS

{'parsley': rdflib.term.URIRef('http://purl.org/heals/ingredient/Parsley'),
 'onion': rdflib.term.URIRef('http://purl.org/heals/ingredient/Onion'),
 'chicken broth': rdflib.term.URIRef('http://purl.org/heals/ingredient/ChickenBroth'),
 'green pepper': rdflib.term.URIRef('http://purl.org/heals/ingredient/GreenPepper'),
 'oil': rdflib.term.URIRef('http://purl.org/heals/ingredient/VegetableOil'),
 'corned beef': rdflib.term.URIRef('http://purl.org/heals/ingredient/CornedBeef'),
 'brown sugar': rdflib.term.URIRef('http://purl.org/heals/ingredient/BrownSugar'),
 'bacon': rdflib.term.URIRef('http://purl.org/heals/ingredient/Bacon'),
 'paprika': rdflib.term.URIRef('http://purl.org/heals/ingredient/Paprika'),
 'lemon juice': rdflib.term.URIRef('http://purl.org/heals/ingredient/LemonJuice'),
 'egg': rdflib.term.URIRef('http://purl.org/heals/ingredient/ChickenEgg'),
 'gluten free flour': rdflib.term.URIRef('http://purl.org/heals/ingredient/GlutenFreeFlour'),
 'flour': rdflib.term.URIRef('http://

### Define Synonyms
As part of the data preparation, build a lookup table for translating ingredient names (synonyms) to the preferred labels for known ingredients:

In [14]:
kg.add_syn(["pepper"], ["black pepper"])
kg.add_syn(["fresh ground black pepper"], ["black pepper"])
kg.add_syn(["salt and pepper"], ["black pepper", "salt"])

kg.add_syn(["carrots"], ["carrot"])

kg.add_syn(["basil"], ["dried basil"])

kg.add_syn(["eggs", "chicken egg"], ["egg"])

kg.add_syn(["all-purpose flour"], ["flour"])
kg.add_syn(["all purpose flour"], ["flour"])

kg.add_syn(["garlic clove"], ["garlic"])
kg.add_syn(["garlic cloves"], ["garlic"])

kg.add_syn(["cow milk"], ["milk"])

kg.add_syn(["vegetable oil"], ["oil"])
kg.add_syn(["extra virgin olive oil"], ["olive oil"])
kg.add_syn(["onions"], ["onion"])

kg.add_syn(["potatoes"], ["potato"])

kg.add_syn(["cane sugar", "granulated sugar", "white sugar"], ["sugar"])

kg.add_syn(["fresh tomatoes", "tomatoes"], ["tomato"])

kg.add_syn(["vanilla extract"], ["vanilla"])
kg.add_syn(["apple cider vinegar"], ["vinegar"])


In [15]:
known_syn = {*{
    x 
    for l in kg._SYNONYMS.values()
    for x in l
}}

ic(len(known_syn));

ic| len(known_syn): 16


---
### Analyze Ingredients
**ITERATE HERE**:
Analyze the ingredient labels from 250K recipes.

In [16]:
MAX_ROW = 250000 # 231638

max_context = 0
min_context = 1000

RECIPES: typing.List[Recipe] = []
VOCAB: typing.Set[str] = set()

with open(dirname(os.getcwd()) + "/dat/all_ind.csv", "r") as f:
    reader = csv.reader(f)
    next(reader, None) # remove file header

    for i, row in enumerate(reader):
        # parse the recipe metadata
        id = row[0]
        definition = row[1].replace("   ", " ")
        cook_time = int(row[2])

        # substitute synonyms for ingredients
        ind_set = {
            ind
            for raw_ind in eval(row[3])
            for ind in kg.map_syn(raw_ind)
        }

        if len(ind_set) > 1:
            RECIPES.append(Recipe(id, definition, cook_time, ind_set))
            VOCAB.update(ind_set)
        
            max_context = max(max_context, len(ind_set))
            min_context = min(min_context, len(ind_set))

        if i > MAX_ROW:
            break

In [17]:
print("max context: {} unique ingredients per recipe".format(max_context))
print("min context: {} unique ingredients per recipe".format(min_context))
print("vocab size", len(list(VOCAB)))

max context: 43 unique ingredients per recipe
min context: 2 unique ingredients per recipe
vocab size 14924


Use `pickle` to save this larger superset of the recipes dataset to the `tmp.pkl` file:

In [18]:
pickle.dump(RECIPES, open("tmp.pkl", "wb"))
RECIPES[:3]

[Recipe(id='137739', definition='arriba baked winter squash mexican style', cook_time=55, ind_set={'butter', 'salt', 'winter squash', 'honey', 'mixed spice', 'mexican seasoning', 'olive oil'}),
 Recipe(id='31490', definition='a bit different  breakfast pizza', cook_time=30, ind_set={'milk', 'salt', 'egg', 'sausage patty', 'black pepper', 'prepared pizza crust', 'cheese'}),
 Recipe(id='112140', definition='all in the kitchen  chili', cook_time=130, ind_set={'water', 'tomato soup', 'kidney beans', 'salt', 'lettuce', 'ground cumin', 'yellow onions', 'tomato paste', 'chili powder', 'rotel tomatoes', 'ground beef', 'cheddar cheese', 'diced tomatoes'})]

How many of the recipes are complete in the sense that each of their ingredients is known?

In [19]:
unknown_ind = defaultdict(int)
complete_recipes = 0

for r in RECIPES:
    has_all_ind = True
    
    for ind in r.ind_set:
        if ind not in kg._INGREDIENTS:
            unknown_ind[ind] += 1
            has_all_ind = False

    if has_all_ind:
        complete_recipes += 1

ic(max(unknown_ind.values()))
ic(complete_recipes)

ic| max(unknown_ind.values()): 14807
ic| complete_recipes: 1373


1373

Which among the unknown ingredients are the most popularly used in recipes?

In [20]:
sorted(unknown_ind.items(), key=lambda item: item[1], reverse=True)[:20]

[('parmesan cheese', 14807),
 ('cinnamon', 12560),
 ('sour cream', 11779),
 ('garlic powder', 10887),
 ('cream cheese', 9827),
 ('cheddar cheese', 8969),
 ('unsalted butter', 8935),
 ('worcestershire sauce', 7832),
 ('fresh parsley', 7656),
 ('chili powder', 6984),
 ('ground cinnamon', 6864),
 ('nutmeg', 6299),
 ('cayenne pepper', 6285),
 ('ground cumin', 6169),
 ('ground beef', 5824),
 ('green onions', 5814),
 ('red onion', 5777),
 ('walnuts', 5764),
 ('pecans', 5752),
 ('dijon mustard', 5599)]

Which among the unknown ingredients did we **miss** from the pre-defined ones in FoodOn?

In [None]:
MAX_ADD = 500

pre_def = list(IND_DF.label.unique())
count = 0

for label in unknown_ind:
    if label in pre_def:
        ind, _, descrip = IND_DF.loc[IND_DF["label"] == label].values[0]
        entity = ind.split(":")[1]
        ic(entity, label, descrip)
        kg.add_ind(label, entity)
    else:
        if count < MAX_ADD:
            entity = re.sub("[^0-9a-zA-Z]+", " ", label)
            entity = re.sub("[\s]+", " ", entity).strip()
            entity = entity.replace(" ", "_")
            kg.new_ind(label, entity)

            count += 1

ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/357451')
ic| node: rdflib.term.URIRef('https://www.food.com/recipe/3574

In [54]:
ic(len(kg._INGREDIENTS))
kg._INGREDIENTS

ic| len(kg._INGREDIENTS): 574


{'parsley': rdflib.term.URIRef('http://purl.org/heals/ingredient/Parsley'),
 'onion': rdflib.term.URIRef('http://purl.org/heals/ingredient/Onion'),
 'chicken broth': rdflib.term.URIRef('http://purl.org/heals/ingredient/ChickenBroth'),
 'green pepper': rdflib.term.URIRef('http://purl.org/heals/ingredient/GreenPepper'),
 'oil': rdflib.term.URIRef('http://purl.org/heals/ingredient/VegetableOil'),
 'corned beef': rdflib.term.URIRef('http://purl.org/heals/ingredient/CornedBeef'),
 'brown sugar': rdflib.term.URIRef('http://purl.org/heals/ingredient/BrownSugar'),
 'bacon': rdflib.term.URIRef('http://purl.org/heals/ingredient/Bacon'),
 'paprika': rdflib.term.URIRef('http://purl.org/heals/ingredient/Paprika'),
 'lemon juice': rdflib.term.URIRef('http://purl.org/heals/ingredient/LemonJuice'),
 'egg': rdflib.term.URIRef('http://purl.org/heals/ingredient/ChickenEgg'),
 'gluten free flour': rdflib.term.URIRef('http://purl.org/heals/ingredient/GlutenFreeFlour'),
 'flour': rdflib.term.URIRef('http://

Which among the pre-defined ingredients did we **miss** from adding to our KG?

In [55]:
for label in pre_def:
    if label not in kg._SYNONYMS and label not in kg._INGREDIENTS:
        ind, _, descrip = IND_DF.loc[IND_DF["label"] == label].values[0]
        entity = ind.split(":")[1]
        ic(entity, label, descrip)
        kg.add_ind(label, entity)

For each identified "miss":

  * add to the synonyms list as needed
  * iterate from the `Analyze Ingredients` section forward

### Construct KG
Add the recipe entities to the KG:

In [56]:
for r in RECIPES:
    has_all_ind = True
    
    for ind in r.ind_set:
        if ind not in kg._INGREDIENTS:
            unknown_ind[ind] += 1
            has_all_ind = False

    if has_all_ind:
        node = rdflib.URIRef(f"https://www.food.com/recipe/{r.id}")
        kg.add( node, kg.get_ns("rdf").type, kg.get_ns("wtm").Recipe )
        kg.add( node, kg.get_ns("skos").definition, rdflib.Literal(r.definition) )

        code_time_node = rdflib.Literal(f"PT{r.cook_time}M", datatype=kg.get_ns("xsd").duration)
        kg.add( node, kg.get_ns("wtm").hasCookTime, code_time_node)

        for ind in r.ind_set:
            if ind == "cream of tartar":
                ic(node)
                break

            obj = kg._INGREDIENTS[ind]
            kg.add( node, kg.get_ns("wtm").hasIngredient, obj )

How many recipes dow we have now?

In [57]:
sparql = """
    SELECT ?rec
    WHERE {
        ?rec rdf:type wtm:Recipe .
    }
"""

df = kg.query_as_df(sparql)
len(df)

15407

In [58]:
measure = kglab.Measure()
measure.measure_graph(kg)

ic(measure.get_edge_count())
ic(measure.get_node_count())

ic| measure.get_edge_count(): 160980
ic| measure.get_node_count(): 15983


15983

Serialize a "checkpoint" of our larger KG:

In [59]:
kg.save_rdf("recipe_lg.ttl")

In [26]:
1 / 0 # stop execution

ZeroDivisionError: division by zero

---

## Gensim Embedding...

Now reshape this data into a vector of vectors of ingredients per recipe, to use for training a [*word2vec*](https://arxiv.org/abs/1301.3781) vector embedding model:

In [None]:
vectors = [
    [
        ind
        for ind in ind_set
    ]
    for id, ind_set in recipes
]

vectors[:3]

We'll use the [`Word2Vec`](https://radimrehurek.com/gensim/models/word2vec.html) implementation in the `gensim` library (i.e., *deep learning*) to train an embedding model.
This approach tends to work best if the training data has at least 100K rows.

Let's also show how to serialize the *word2vec* results, saving them to the `tmp.w2v` file so they could be restored later for other use cases.

NB: there is work in progress which will replace `gensim` with `pytorch` instead.

In [None]:
import gensim

MIN_COUNT = 2
model_path = "tmp.w2v"

model = gensim.models.Word2Vec(vectors, min_count=MIN_COUNT, window=max_context)
model.save(model_path)

The `get_related()` function takes any ingredient as input, using the embedding model to find the most similar other ingredients – along with calculating [`levenshtein`](https://github.com/toastdriven/pylev) edit distances (string similarity) among these labels. Then it calculates *percentiles* for both metrics in [`numpy`](https://numpy.org/) and returns the results as a [`pandas`](https://pandas.pydata.org/) DataFrame.

In [None]:
import numpy as np
import pandas as pd
import pylev

def term_ratio (target, description):
    d_set = set(description.split(" "))
    num_inter = len(d_set.intersection(target))
    return num_inter / float(len(target))


def get_related (model, query, target, n=20, granularity=100):
    """return a DataFrame of the closely related items"""
    try:
        bins = np.linspace(0, 1, num=granularity, endpoint=True)

        v = sorted(
            model.wv.most_similar(positive=[query], topn=n), 
            key=lambda x: x[1], 
            reverse=True
        )
        
        df = pd.DataFrame(v, columns=["ingredient", "similarity"])
        
        s = df["similarity"]
        quantiles = s.quantile(bins, interpolation="nearest")
        df["sim_pct"] = np.digitize(s, quantiles) - 1
        
        df["levenshtein"] = [ pylev.levenshtein(d, query) / len(query) for d in df["ingredient"] ]
        s = df["levenshtein"]
        quantiles = s.quantile(bins, interpolation="nearest")
        df["lev_pct"] = granularity - np.digitize(s, quantiles)

        df["term_ratio"] = [ term_ratio(target, d) for d in df["ingredient"] ]

        return df
    except KeyError:
        return pd.DataFrame(columns=["ingredient", "similarity", "percentile"])

Let's try this with `dried basil` as the ingredient to query, and review the top `50` most similar other ingredients returned as the DataFrame `df`:

In [None]:
pd.set_option("max_rows", None)

target = set([ "basil" ])

df = get_related(model, "dried basil", target, n=50)
df

Note how some of the most similar items, based on vector embedding, are *synonyms* or special forms of our query `dried basil` ingredient: `dried basil leaves`, `dry basil`, `dried sweet basil leaves`, etc. These tend to rank high in terms of levenshtein distance too.

Let's plot the similarity measures:

In [None]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use("ggplot")

df["similarity"].plot(alpha=0.75, rot=0)
plt.show()

Notice the inflection points at approximately `0.56` and again at `0.47` in that plot.
We could use some statistical techniques (e.g., clustering) to segment the similarities into a few groups:

  * highest similarity – potential synonyms for the query
  * mid-range similarity – potential [hypernyms and hyponyms](https://en.wikipedia.org/wiki/Hyponymy_and_hypernymy) for the query
  * long-tail similarity – other ingredients that pair well with the query

In this example, below a threshold of the 75th percentile for vector embedding similarity, the related ingredients are less about being synonyms and more about other foods that pair well with basil.

Let's define another function `rank_related()` which ranks the related ingredients based on a combination of these two metrics.
This uses a cheap approximation of a [*pareto archive*](https://www.cs.bham.ac.uk/~jdk/multi/) for the ranking -- which comes in handy for recommender systems and custom search applications that must combine multiple ranking metrics:

In [None]:
from kglab import root_mean_square

def rank_related (df):
    df2 = df.copy(deep=True)
    df2["related"] = df2.apply(lambda row: root_mean_square([ row[2], row[4] ]), axis=1)
    return df2.sort_values(by=["related"], ascending=False)

In [None]:
df = rank_related(df)
df

Notice how the "synonym" cases tend to move up to the top now?
Meanwhile while the "pairs well with" are in the lower half of the ranked list: `fresh mushrooms`, `italian turkey sausage`, `cooked spaghetti`, `white kidney beans`, etc.

In [None]:
df.loc[ (df["related"] >= 50) & (df["term_ratio"] > 0) ]

---

## Exercises

**Exercise 1:**

Build a report for a *human-in-the-loop* reviewer, using the `rank_related()` function while iterating over `vocab` to make algorithmic suggestions for possible synonyms.

**Exercise 2:**

How would you make algorithmic suggestions for a reviewer about which ingredients could be related to a query, e.g., using the `skos:broader` and `skos:narrower` relations in the [`skos`](https://www.w3.org/2004/02/skos/) vocabulary to represent *hypernyms* and *hyponyms* respectively?
This could extend the KG to provide a kind of thesaurus about recipe ingredients.