In [1]:
# NOTE(review): a bare `%time` has no statement to time, so the
# "CPU times / Wall time" output of this cell measures nothing useful.
%time
# The `time: ...` line printed after each later cell appears to come from
# this extension (it is not part of core IPython) - TODO confirm it is installed.
%load_ext autotime
# Reload edited modules automatically so changes to imported project code
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 14.5 µs


In [2]:
import os, tqdm, sys
# Append the parent of the current working directory to sys.path so that
# modules located one level above the notebook can be imported.
# The path is derived from os.getcwd(), so it depends on where the kernel was started.
parent_dir = os.path.abspath(os.getcwd()+'/../')
sys.path.append(parent_dir)

import numpy as np
import spacy
import pickle

def load_pickle(filename):
    """Return the object stored in the pickle file at ``filename``.

    Args:
        filename: path of the file to open in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Note:
        Unpickling can execute arbitrary code, so only call this on
        files from a trusted source.
    """
    with open(filename, mode='rb') as pickle_file:
        return pickle.load(pickle_file)

time: 911 ms


In [3]:
# Lookup used by `highlight` for membership tests (`lemma in database`) on
# noun lemmas found in the directions that are not listed ingredients.
# NOTE(review): presumably a collection of known ingredient lemmas - confirm
# its actual type and contents against whatever produced the pickle.
database = load_pickle('../big_data/database.pickle')

time: 17.9 ms


In [4]:
class spacy_extension:
    '''
    Wrapper around a spaCy pipeline that prepares ingredient names and
    cooking directions for root-noun highlighting.
    '''

    def __init__(self, model='en_core_web_lg'):
        '''
        Args: model: name of the spaCy pipeline to load
              (defaults to the large English model used originally)
        '''
        self.spacy = spacy.load(model)

    def ingr(self, lst):
        '''
        Args: lst: a list of ingredient names

        Returns: (root_match, hl)
            root_match: one entry per ingredient, in the same order as lst:
                the lemma of the word chosen as the ingredient's root noun,
                or None when the name contains no text at all.
            hl: one list per ingredient holding a
                {'text': word, 'highlight': None or 'wrong'} dict for every
                space-separated word; the chosen root word is marked 'wrong'.

        len(root_match) always equals len(lst), so index i of root_match
        and index i of hl refer to the same ingredient.
        '''
        hl = [[{'text': x, 'highlight': None} for x in i.split(' ')] for i in lst]
        # Exactly one root per ingredient keeps root_match aligned with hl.
        root_match = [self._mark_root(name, words) for name, words in zip(lst, hl)]
        return root_match, hl

    def _mark_root(self, name, words):
        '''
        Mark the root word of one ingredient as 'wrong' and return its lemma.

        Args: name: the ingredient name
              words: the highlight dicts built from name.split(' ');
                     modified in place
        '''
        if ' ' in name:
            # Embed the name in a carrier sentence so the parser can produce
            # noun chunks; the chunk 'water' belongs to the carrier, not the name.
            doc = self.spacy('Mix the %s and water.' % name)
            for chunk in doc.noun_chunks:
                if chunk.text == 'water':
                    continue
                # The last token of the chunk is treated as its root noun.
                head = doc[chunk.end - 1]
                hits = [word for word in words if word['text'] == head.text]
                if hits:
                    for word in hits:
                        word['highlight'] = 'wrong'
                    # Stop at the first usable chunk: further chunks would add
                    # extra roots and break the one-root-per-ingredient alignment.
                    return head.lemma_
        # Single-word names, and multi-word names where no chunk matched a
        # word (e.g. attached punctuation): use the last non-empty word.
        for word in reversed(words):
            if not word['text']:
                continue
            doc = self.spacy(word['text'])
            if len(doc) > 0:
                word['highlight'] = 'wrong'
                return doc[0].lemma_
        # Nothing to analyse (empty name): keep a placeholder so indices stay aligned.
        return None

    def instr(self, directions):
        '''
        Args: directions: string with the cooking instructions

        Returns: (instr, hl_instr)
            instr: the document returned by the spaCy pipeline
            hl_instr: one {'text': token text, 'highlight': None} dict per token
        '''
        instr = self.spacy(directions)
        hl_instr = [{'text': token.text, 'highlight': None} for token in instr]
        return instr, hl_instr

time: 9.85 s


In [5]:
def highlight(ingredients, directions, generate = 'directions'):

    '''Highlight ingredient root nouns in a recipe and score their overlap.

    Args:
    ingredients: list of ingredient names, passed to sp.ingr
    directions: str, or list/tuple of str (joined with single spaces),
        a paragraph of cooking instructions
    generate: 'directions' or 'ingredients'; with 'ingredients' the
        recall and precision values are swapped before returning

    Returns: dict with keys
    'ingredients': per-ingredient word dicts from sp.ingr; a word marked
        'wrong' becomes 'correct' once its root is found in the directions
    'directions': token dicts grouped into sentences by parse_instr; a noun
        chunk's last token is 'correct' if its lemma is an ingredient root,
        'wrong' if it is not but is in `database`, otherwise None
    'recall', 'precision': overlap of the two sets of roots

    Raises: AssertionError if generate is not one of the two allowed values

    * this version only considers the root nouns; thus, it is fine to calculate F1

    Relies on the module-level names `sp`, `database` and `parse_instr`.
    NOTE(review): no visible cell creates `sp`; presumably it is a
    spacy_extension instance - confirm it is defined before this is called.
    '''

    # check the inputs
    assert generate in ['directions', 'ingredients']
    if isinstance(directions, (list, tuple)):
        directions = ' '.join(directions)

    # send to spacy
    root_ingr, hl_ingr = sp.ingr(ingredients)
    instr, hl_instr = sp.instr(directions)
    known_roots = set(root_ingr)
    root_instr = []

    # highlighting
    for chunk in instr.noun_chunks:
        # the last token of the chunk is treated as its root noun
        idx_rootnoun = chunk.end - 1
        str_rootnoun = instr[idx_rootnoun].lemma_
        if str_rootnoun in known_roots:
            root_instr.append(str_rootnoun)
            hl_instr[idx_rootnoun]['highlight'] = 'correct'

            # zip stops at the shorter list, so a roots list that is longer
            # than the highlight list cannot index past the end
            for root, words in zip(root_ingr, hl_ingr):
                if root == str_rootnoun:
                    for word in words:
                        if word['highlight'] == 'wrong':
                            word['highlight'] = 'correct'

        elif str_rootnoun in database:
            # a known item that is not among the listed ingredients
            root_instr.append(str_rootnoun)
            hl_instr[idx_rootnoun]['highlight'] = 'wrong'

    # delimit the sentences
    hl_instr = parse_instr(hl_instr)

    # calculate precision and recall
    root_ingr, root_instr = known_roots, set(root_instr)
    TP = len(root_ingr & root_instr)
    recall = TP/len(root_ingr) if len(root_ingr) >0 else 0
    precision = TP/len(root_instr) if len(root_instr) >0 else 0

    # if this is ingredients generation
    if generate == 'ingredients':
        recall, precision = precision, recall
    return {'ingredients': hl_ingr, 'directions': hl_instr, 'recall': recall, 'precision': precision}

def parse_instr(hl_instr):
    '''Group a flat list of token dicts into sentences.

    Args:
    hl_instr: list of dicts that each have a 'text' key

    Returns: list of lists; a sentence ends with the token whose text is
    '.', and that token is kept as the last item of its sentence. Tokens
    after the final '.' form one last sentence.
    '''
    sentences = []
    current = []
    for token in hl_instr:
        current.append(token)
        if token['text'] == '.':
            # close the sentence on the period and start a new one
            sentences.append(current)
            current = []
    if len(current) > 0:
        sentences.append(current)
    return sentences

time: 28.1 ms


In [6]:
# Small hand-written recipe used to try out `highlight`.
# NOTE(review): the name is spelled `ingredeints`; the later call uses the
# same spelling, so rename both together if this is corrected.
ingredeints = ['garlic','breasts','bsp. oil']
# One string literal: the trailing backslash continues it onto the next line.
directions = 'heat oil in large skillet on medium - high heat. \
Add chicken and garlic . or until chicken is done'

time: 15.9 ms


In [7]:
# Run the highlighter with the default generate='directions' mode.
output = highlight(ingredeints, directions)

time: 62.5 ms


In [8]:
# Show the result dict returned by `highlight`
# (keys: 'ingredients', 'directions', 'recall', 'precision').
output

{'ingredients': [[{'text': 'garlic', 'highlight': 'correct'}],
  [{'text': 'breasts', 'highlight': 'wrong'}],
  [{'text': 'bsp.', 'highlight': None},
   {'text': 'oil', 'highlight': 'correct'}]],
 'directions': [[{'text': 'heat', 'highlight': None},
   {'text': 'oil', 'highlight': 'correct'},
   {'text': 'in', 'highlight': None},
   {'text': 'large', 'highlight': None},
   {'text': 'skillet', 'highlight': None},
   {'text': 'on', 'highlight': None},
   {'text': 'medium', 'highlight': None},
   {'text': '-', 'highlight': None},
   {'text': 'high', 'highlight': None},
   {'text': 'heat', 'highlight': None},
   {'text': '.', 'highlight': None}],
  [{'text': 'Add', 'highlight': None},
   {'text': 'chicken', 'highlight': 'wrong'},
   {'text': 'and', 'highlight': None},
   {'text': 'garlic', 'highlight': 'correct'},
   {'text': '.', 'highlight': None}],
  [{'text': 'or', 'highlight': None},
   {'text': 'until', 'highlight': None},
   {'text': 'chicken', 'highlight': 'wrong'},
   {'text': 'is

time: 27.8 ms
