# Extraction of Relevant Action and Object Knowledge from the Web

Before starting this tutorial, make sure the necessary packages are installed (see requirements.txt). Additionally, the following files need to be downloaded / extracted into this folder:

- *foodon.owl* - FoodOn used for extracting information about fruits and vegetables (found [here](https://github.com/FoodOntology/foodon))
- *numberbatch-en.txt* - ConceptNet Numberbatch embeddings for object property extraction (download *English-only V. 19.08* from [here](https://github.com/commonsense/conceptnet-numberbatch?tab=readme-ov-file#downloads))
- *NASARI_embed_english.txt* - NASARI embeddings also used for object property extraction (download *English - Embed(Wiki)* from [here](http://lcl.uniroma1.it/nasari/#two))

## Extraction of Action Knowledge

The extraction of knowledge about different and relevant actions consists of three main steps:

1. Setting the central verb & providing an exemplary sentence (e.g. 'cut')
2. Extracting synonyms and hyponyms from WordNet & VerbNet
3. Filtering the extracted words on their relevance using a recipe and a WikiHow corpus

In [None]:
# imports
import pandas as pd

# download wordnet & verbnet corpus
import nltk
nltk.download('wordnet')
nltk.download('verbnet')
from nltk.corpus import verbnet, wordnet

### Extracting Synonyms and Hyponyms from WordNet & VerbNet

In [None]:
# setting the target action whose synonyms and hyponyms are collected
target_action = "cut"

# looking up all WordNet verb synsets that contain the target action
synsets = wordnet.synsets(target_action, pos=wordnet.VERB)
print(f"{len(synsets)} synsets found for '{target_action}'")

# collecting into a set, so duplicates are dropped while gathering
verbs = set()
for synset in synsets:
    # synonyms: the lemma names of the synset itself
    verbs.update(synset.lemma_names())

    # direct hyponyms: the lemma names of each immediate child synset
    for hyponym in synset.hyponyms():
        verbs.update(hyponym.lemma_names())

    # VerbNet: lemmas of all classes linked to the sense key of the synset's
    # first lemma (the key is passed with its "::" removed)
    sense_key = str(synset.lemmas()[0].key()).replace("::", "")
    for class_id in verbnet.classids(wordnetid=sense_key):
        verbs.update(verbnet.lemmas(class_id))

# printing results
print(f"{len(verbs)} synonyms or hyponyms found for '{target_action}'")

In [None]:
# pre-process the found synonyms and hyponyms:
# reduce every lemma to the part before its first underscore and
# keep each resulting word once, in sorted order
first_words = set()
for lemma in verbs:
    first_words.add(lemma.split('_')[0])
filtered_verbs = sorted(first_words)

print(f"{len(filtered_verbs)} remaining words:")
for word in filtered_verbs:
    print(word)

### Filtering the extracted verbs

In [None]:
# read the (extracted) occurrence data
v_occurrences = "./verb_occurrences.csv"
voc_dat = pd.read_csv(v_occurrences)

# remove all verbs with 0 occurrences
most_used = voc_dat[voc_dat['SUM'] > 0]
print(f"{len(most_used)} verbs that occur at least once")

# remove all verbs with too few available sentences (Step Desc < threshold);
# the comparison is inclusive, so a verb with exactly `thresh` entries is kept
thresh = 100
most_used = most_used[most_used['Step Desc'] >= thresh]
print(f"{len(most_used)} verbs that occur in at least {thresh} step descriptions:")
print(most_used['Verb'].to_string(index=False))

## Extraction of Object Knowledge

The extraction of knowledge about different objects and their task-specific properties consists of four main steps:

1. Choosing a group of relevant objects (e.g. 'Fruits & Vegetables')
2. Gathering all available objects of that group from a suitable taxonomy or ontology (e.g. FoodOn)
3. Filtering the extracted objects using fitting text corpora (e.g. Recipe1M+)
4. Gathering information about task-specific properties using 3 different word embeddings (GloVe, NASARI & ConceptNet Numberbatch) and a recipe corpus

In [None]:
# imports
from rdflib import Graph, Literal, Namespace, RDF, RDFS, URIRef
from rdflib.plugins.sparql import prepareQuery
import pandas as pd
import gensim
import torch
import torchtext

### Extracting fruits and vegetables from FoodOn

In [None]:
# location of the FoodOn ontology file
foodon_loc = "./foodon.owl"

# namespaces that are later bound to SPARQL prefixes
# (the RDFS assignment replaces the identically named object imported from rdflib)
FOOD = Namespace("http://purl.obolibrary.org/obo/")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")

# parse the ontology into a fresh graph
g = Graph()
g.parse(source=foodon_loc)

# SPARQL text of the fruit query:
# - takes every labelled class that is a (transitive) subclass of food:PO_0009001
# - keeps only labels containing "whole" and uses the lower-cased text before "(" as result label
# - excludes food:FOODON_03304644 and everything below food:PO_0030104
# - returns one sampled id and one sampled (optional) definition per label, ordered by label
fruit_sparql = """
    SELECT ?fruit_label (SAMPLE(?fruit_id) AS ?rndm_fruit_id) (SAMPLE(?def) AS ?rndm_def)
    WHERE {
        ?fruit_id rdfs:label ?label.
        ?fruit_id rdfs:subClassOf+ food:PO_0009001.
        OPTIONAL { ?fruit_id food:IAO_0000115 ?def. }

        BIND (LCASE(STR(?label)) AS ?str_label).
        BIND (STRBEFORE(?str_label, "(") AS ?fruit_label).
        FILTER CONTAINS(?str_label, "whole").
        FILTER NOT EXISTS { ?fruit_id rdfs:subClassOf* food:PO_0030104. }
        FILTER (?fruit_id != food:FOODON_03304644).
    }
    GROUP BY ?fruit_label
    ORDER BY ?fruit_label
    """

# prepare the query with its prefix bindings and run it on the ontology graph
query = prepareQuery(fruit_sparql, initNs=dict(food=FOOD, rdfs=RDFS))
fruits = g.query(query)

# SPARQL text of the vegetable query:
# - takes every labelled class that is a (transitive) subclass of food:FOODON_03302008
# - uses the lower-cased label text before "(" as result label
# - excludes everything below food:FOODON_03302007
# - returns one sampled id and one sampled (optional) definition per label, ordered by label
veggie_sparql = """
    SELECT ?veg_label (SAMPLE(?veg_id) AS ?rndm_veg_id) (SAMPLE(?def) AS ?rndm_def)
    WHERE {
        ?veg_id rdfs:label ?label.
        ?veg_id rdfs:subClassOf+ food:FOODON_03302008.
        OPTIONAL { ?veg_id food:IAO_0000115 ?def. }

        BIND (LCASE(STR(?label)) AS ?str_label).
        BIND (STRBEFORE(?str_label, "(") AS ?veg_label).
        FILTER NOT EXISTS { ?veg_id rdfs:subClassOf* food:FOODON_03302007. }
    }
    GROUP BY ?veg_label
    ORDER BY ?veg_label
    """

# prepare the query with its prefix bindings and run it on the ontology graph
query = prepareQuery(veggie_sparql, initNs=dict(food=FOOD, rdfs=RDFS))
veggies = g.query(query)

# convert the query results into pandas DataFrames for further analysis;
# each result row carries the three selected values, all of which are stored as strings
result_columns = ["label", "rndm_id", "rndm_def"]

fruit_list = [(str(label), str(ident), str(definition)) for label, ident, definition in fruits]
veggie_list = [(str(label), str(ident), str(definition)) for label, ident, definition in veggies]

fruit_df = pd.DataFrame(fruit_list, columns=result_columns)
veggie_df = pd.DataFrame(veggie_list, columns=result_columns)

# stack fruits and vegetables into one table with a continuous index
combined_df = pd.concat([fruit_df, veggie_df], ignore_index=True)
print(combined_df)

### Filter the fruits and vegetables using WikiHow and Recipe1M+ data

In [None]:
f_occurrences = "./fruit_occurrences.csv"

# corpus fields that come as a pair of columns: '<field>' (count) and '<field> [%]' (share)
count_cols = ['Recipes-Title', 'Recipes-Steps', 'WikiHow-Title', 'WikiHow-TitleDescription',
              'WikiHow-Method', 'WikiHow-StepHeadline', 'WikiHow-StepDescription']
pct_cols = [f'{col} [%]' for col in count_cols]

# read the (extracted) occurrence data and cast counts to int and shares to float
col_types = {col: 'int' for col in count_cols}
col_types.update({col: 'float' for col in pct_cols})
foc_dat = pd.read_csv(f_occurrences).astype(col_types)

# keep every item that reaches the threshold in at least one of the '[%]' columns
# NOTE(review): the threshold was described as "1%" - confirm the scale of the '[%]' columns
thresh = 0.01
most_used = foc_dat[(foc_dat[pct_cols] >= thresh).any(axis=1)]
print(most_used)

### Extract object properties using 3 different embeddings

In [None]:
# candidate food parts that are checked against every food
parts = ["core", "shell", "peel", "stem"]

# names of all foods that passed the occurrence filter
foods = most_used["Name"].tolist()

In [None]:
# GloVe embeddings
glove_sim = 0.5
glove = torchtext.vocab.GloVe(name="6B", dim=50)

# look up every part vector once and add the leading axis needed for the similarity call
part_vectors = [(part, glove[part].unsqueeze(0)) for part in parts]

# report each food/part pair whose cosine similarity reaches the threshold
for food in foods:
    food_vector = glove[food].unsqueeze(0)
    for part, part_vector in part_vectors:
        sim = torch.cosine_similarity(food_vector, part_vector).item()
        if sim >= glove_sim:
            print(f'[GloVe] {food} hasPart {part} (Similarity: {sim})')

In [None]:
# ConceptNet Numberbatch embeddings
numberbatch_sim = 0.3
numberbatch = gensim.models.KeyedVectors.load_word2vec_format('./numberbatch-en.txt', binary=False)

# similarity() raises a KeyError for words without an embedding,
# so only parts that are part of the vocabulary are compared
known_parts = [p for p in parts if p in numberbatch.key_to_index]

# cosine similarity between ConceptNet Numberbatch embeddings
skipped_foods = []
for f in foods:
    # foods without an embedding are collected and reported instead of aborting the loop
    if f not in numberbatch.key_to_index:
        skipped_foods.append(f)
        continue
    for p in known_parts:
        sim = numberbatch.similarity(f, p)
        if sim >= numberbatch_sim:
            print(f'[CN Numberbatch] {f} hasPart {p} (Similarity: {sim})')

if skipped_foods:
    print(f'[CN Numberbatch] skipped (no embedding found): {", ".join(map(str, skipped_foods))}')

In [None]:
# NASARI embeddings
# minimum similarity a food/part pair has to reach
nasari_sim = 0.75
# parts as they are looked up in NASARI: one BabelNet synset id and two concept names
parts_nasari = ["bn:00071005n", "peel_(fruit)", "plant_stem"]
nasari = gensim.models.KeyedVectors.load_word2vec_format("./NASARI_embed_english.txt", binary=False)
               
# define function for finding the key based on the given concept name
def find_key(concept):
    """Return the NASARI vocabulary key that belongs to the given concept.

    The keys in ``nasari.index_to_key`` are scanned in order and the first match is returned:

    - a concept containing ``"bn:"`` is treated as a synset id and matches
      any key whose lower-cased form contains it
    - any other concept matches a key whose second ``"__"``-separated segment,
      lower-cased, is equal to the concept

    Keys without a ``"__"`` separator cannot match a concept name and are skipped
    (previously they raised an ``IndexError``).

    Args:
        concept: synset id or concept name. It is compared as given against
            lower-cased keys, so it is presumably expected in lower case.

    Returns:
        The first matching key, or ``concept`` unchanged if no key matches.
    """
    concept_is_synset = "bn:" in concept
    for key in nasari.index_to_key:
        lowered = key.lower()
        # cheap pre-filter: a matching key always contains the concept
        if concept not in lowered:
            continue
        if concept_is_synset:
            return key
        segments = lowered.split('__')
        if len(segments) > 1 and segments[1] == concept:
            return key
    return concept
    
# cosine similarity between NASARI embeddings
# Sadly, the BabelNet synset for 'core' (bn:04772260n) does not exist in the NASARI embeddings and
# for 'shell' we need to look for the concrete synset (bn:00071005n) instead

# resolve every part once, since each find_key call walks through the vocabulary,
# and drop the parts for which no embedding exists
part_keys = []
for p in parts_nasari:
    p_key = find_key(p)
    if p_key in nasari.key_to_index:
        part_keys.append((p, p_key))

for f in foods:
    # the key of the food does not depend on the part, so it is resolved once per food
    f_key = find_key(f)
    if f_key not in nasari.key_to_index:
        continue
    for p, p_key in part_keys:
        sim = nasari.similarity(f_key, p_key)
        if sim >= nasari_sim:
            print(f'[NASARI] {f} hasPart {p} (Similarity: {sim})')

### Extract object properties using Recipe1M+ data

In [None]:
# location of the (extracted) part occurrence data from Recipe1M+
p_occurrences = "./part_occurrences.csv"

# load the table and show it
poc_dat = pd.read_csv(filepath_or_buffer=p_occurrences)
print(poc_dat)

In [None]:
# set the thresholds & possible parts
parts = ['core', 'shell', 'peel', 'stem']
recipe_thresh = 0.01
step_thresh = 0.01

# print object-part-relations
for _, row in poc_dat.iterrows():
    fruit = row['food']
    # presumably the total number of recipes / steps mentioning the food - verify against the CSV
    n_recipes = row['recipes']
    n_steps = row['steps']

    # without any recipe or step no ratio can be computed (division by zero), so the row is skipped
    if n_recipes == 0 or n_steps == 0:
        continue

    for p in parts:
        # share of the food's recipes ('<part>_R') and steps ('<part>_S') relative to the totals
        rec_ratio = row[f'{p}_R'] / n_recipes
        step_ratio = row[f'{p}_S'] / n_steps

        # a relation is only reported if both ratios reach their threshold
        if rec_ratio >= recipe_thresh and step_ratio >= step_thresh:
            rec_perc = f'{rec_ratio * 100:.2f}'
            step_perc = f'{step_ratio * 100:.2f}'
            print(f'{fruit} hasPart {p} ({rec_perc}% of Recipes & {step_perc}% of Steps)')