In [1]:
# NOTE(review): a bare `%time` has no statement to time, so the timing it prints measures nothing.
%time
# Presumably the extension that prints the `time: ...` line after every cell -- confirm it is installed.
%load_ext autotime
# Automatically reload imported modules before each cell runs, so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.44 µs


In [7]:
import os, tqdm, sys
parent_dir = os.path.abspath(os.getcwd()+'/../')
sys.path.append(parent_dir)

import numpy as np
import spacy
import pickle

def load_pickle(filename):
    '''
    Read a single pickled object from disk.

    Args: filename: path of the pickle file to open (binary mode)
    Returns: the object stored in the file
    '''
    # pickle.load can execute arbitrary code: only use on files you trust.
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

time: 16.8 ms


In [3]:
class spacy_extension:
    '''
    Small helper around a spaCy pipeline for finding the root noun of
    ingredient names.
    '''

    def __init__(self):
        '''
        Load the 'en_core_web_lg' pipeline and keep it as self.spacy.
        Takes no arguments.
        '''
        self.spacy = spacy.load('en_core_web_lg')

    def root_visual(self, lst):
        '''
        Args: lst: a list of ingredient names
        Returns: dict mapping ingredient name -> token taken as its root noun

        Each name is embedded in the carrier sentence 'Mix the <name> and water.'
        and parsed. Every noun chunk whose text is not exactly 'water' contributes
        its last token (index chunk.end - 1) as the root.
        If several chunks qualify, the last one wins. If none qualifies, the
        ingredient is absent from the result, so len(result) may be smaller
        than len(lst).
        '''
        roots = {}
        for name in lst:
            parsed = self.spacy('Mix the %s and water.' % name)
            for span in parsed.noun_chunks:
                # skip the filler noun of the carrier sentence
                if span.text == 'water':
                    continue
                roots[name] = parsed[span.end - 1]
        return roots
    
# Module-level helper instance; constructing it loads the spaCy model once for the whole notebook.
sp = spacy_extension()

time: 10.4 s


In [8]:
# Loaded once and used for membership tests on root-noun lemmas in `highlight`.
# NOTE(review): presumably a collection of known ingredient words -- confirm its type and contents.
database = load_pickle('../big_data/database.pickle')

time: 16.6 ms


In [9]:
def highlight(ingr, instr, generate = 'instr'):

    '''Compare the root nouns of an ingredient list with those of an instruction text.

    Args:
    ingr: list of ingredient names (strings)
    instr: str, a paragraph of cooking instructions
    generate: 'instr' (default) or 'ingr'; with 'ingr' the two returned scores
        are swapped (recall <-> precision)

    Returns: dict with keys
    'ingr': list with one dict per ingredient resolved by sp.root_visual:
        'input' (the ingredient name), 'to_highlight' (text of its root noun),
        'color' ('correct' if the root lemma is the root of some noun chunk in
        the instructions, else 'wrong')
    'instr': {'correct': [...], 'wrong': [...]} -- root-noun texts found in the
        instructions; 'wrong' holds roots that match no ingredient but are
        contained in `database`; other roots are ignored
    'recall': share of ingredients coloured 'correct'
    'precision': correct / (correct + wrong) over the instruction roots
    A score whose denominator would be zero is reported as 0.0.

    Uses the module-level `sp` (spaCy helper) and `database`.

    * this version only considers the root nouns; thus, it is fine to calculate F1
    '''
    assert generate in ['instr', 'ingr']

    roots = sp.root_visual(ingr)
    doc = sp.spacy(instr)
    hl_ingr = []
    hl_instr = {'correct': [], 'wrong': []}
    # root lemma -> positions in hl_ingr of every ingredient sharing that lemma
    lemma = {}

    for name, token in roots.items():
        lemma.setdefault(token.lemma_, []).append(len(hl_ingr))
        hl_ingr.append({'input': name, 'to_highlight': token.text, 'color': 'wrong'})

    for chunk in doc.noun_chunks:
        # the last token of the noun chunk is treated as its root noun
        root = doc[chunk.end - 1]
        if root.lemma_ in lemma:
            hl_instr['correct'].append(root.text)
            for i in lemma[root.lemma_]:
                hl_ingr[i]['color'] = 'correct'
        elif root.lemma_ in database:
            hl_instr['wrong'].append(root.text)

    # scores from the point of view of instruction generation
    n_correct = len(hl_instr['correct'])
    n_judged = n_correct + len(hl_instr['wrong'])
    # guard both denominators: no resolved ingredient / no judged instruction noun
    recall = np.mean([line['color'] == 'correct' for line in hl_ingr]) if hl_ingr else 0.0
    precision = n_correct / n_judged if n_judged else 0.0

    if generate == 'ingr':
        recall, precision = precision, recall

    return {'ingr': hl_ingr, 'instr': hl_instr, 'recall': recall, 'precision': precision}

time: 19.8 ms


In [13]:
# Hand-written example: an ingredient list and a cocktail instruction paragraph to run through `highlight`.
# The backslashes continue the string literal, so `instr` is a single line of text.
ingr = ['ice', 'jigger water','fluid pineapples','chocolate','pineapple','cherry']
instr = 'in a cocktail shaker, muddle the pineapple. \
add ice and the vodka, cream of coconut, and pineapple juice \
shake well strain into an ice filled water highball glass \
garnish with the pineapple wedge and maraschino'

time: 23 ms


In [14]:
# Score the example with the default generate='instr' setting.
output = highlight(ingr, instr)

time: 105 ms


In [15]:
# Show the per-ingredient colouring, the matched/unmatched instruction nouns and both scores.
output

{'ingr': [{'input': 'ice', 'to_highlight': 'ice', 'color': 'correct'},
  {'input': 'jigger water', 'to_highlight': 'water', 'color': 'wrong'},
  {'input': 'fluid pineapples',
   'to_highlight': 'pineapples',
   'color': 'correct'},
  {'input': 'chocolate', 'to_highlight': 'chocolate', 'color': 'wrong'},
  {'input': 'pineapple', 'to_highlight': 'pineapple', 'color': 'correct'},
  {'input': 'cherry', 'to_highlight': 'cherry', 'color': 'wrong'}],
 'instr': {'correct': ['pineapple', 'ice'],
  'wrong': ['vodka', 'cream', 'coconut', 'juice', 'wedge', 'maraschino']},
 'recall': 0.5,
 'precision': 0.25}

time: 36 ms
