In this notebook, we build an ingredient root-noun database from food_taxonomy.txt and our own annotations. The root noun of each ingredient name is extracted with spaCy. Finally, the result is saved as database.pickle.

In [1]:
from dependency import parent_dir 
from common.basics import *
from common.save import save_pickle, load_pickle

In [2]:
import spacy
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [3]:
# Load the food taxonomy: a tab-separated file read without a header row,
# so the columns are labelled with the integers 0, 1, 2, ...
df = pd.read_csv('../big_data/food_taxonomy.txt', delimiter='\t', header = None)

In [4]:
def cleaning(df):
    '''
    Remove non-ingredient rows, lower-case the text and drop duplicates.

    Args:
      df: A DataFrame whose columns 0, 1 and 2 hold strings. Columns 0 and 1
          are compared against the non-ingredient labels below; column 2 is
          the key used for de-duplication.
    Return:
      A new DataFrame with a fresh 0..n-1 index. The input frame is not
      modified.
    '''
    print('origin: length of data: %d' % len(df))
    eliminate = ['Snack brand', 'Preparation', 'Fast food', 'Dietary Supplement', 'Dessert']
    # A row is dropped when column 0 or column 1 carries one of these labels.
    # The match is case-sensitive because it runs before the lower-casing.
    unwanted = df[0].isin(eliminate) | df[1].isin(eliminate)
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the caller's data (SettingWithCopyWarning).
    df = df[~unwanted].copy()
    # uncased -- the vectorized .str accessor also works when no rows are left,
    # whereas a row-wise apply does not return a column for an empty frame
    for i in range(3):
        df[i] = df[i].str.lower()
    print('drop some fields: length of data: %d' % len(df))

    # drop the duplicates (the first occurrence of each column-2 value is kept)
    df = df[~df[2].duplicated()]
    df = df.reset_index(drop = True)
    print('drop duplicates: length of data: %d' % len(df))

    return df
# Rebind df to the cleaned taxonomy; the cells below work on this cleaned frame.
df = cleaning(df)

origin: length of data: 4294
drop some fields: length of data: 2682
drop duplicates: length of data: 2582


In [5]:
# Salvador identified several useful ingredients that are not found in our taxonomy.
# We selected those words based on frequency and manually add them to our taxonomy here.
# Each entry has three fields, matching taxonomy columns 0, 1 and 2;
# the third field is the ingredient name.
additional_ingr=\
[
['Condiment', 'Sweet', 'sugar'],
['Condiment', 'Sweet', 'applesauce'],
['Flour', 'Flour','flour'],
['Baking powder','Baking powder','baking powder'],
['Water','Water','water'],
['Water','Water','iced'],
['Water','Water','ice'],
['Herb and spice','spices','jalapeno'],
['Condiment','Condiment', 'oil'],
['Beverage','Fruit juice', 'juice'],
['Staple','Maize','enchilada'],
['Condiment','Condiment', 'dressing mix'],
['Condiment','Condiment', 'cheese mix'],
['Baking powder','Baking powder','baking soda'],
['Condiment','Condiment', 'dry mix'],
['Condiment','Condiment','chocolate'],
['Vegetable','Bulb and stem vegetables','tapioca'],
['Flour', 'Flour','xanthan gum'],
['Flour', 'Flour','starch'],
['Egg and dairy', 'Dairy product','buttermilk'],
['Condiment','Condiment','chili'],
['Condiment','Condiment','chile'],
['Condiment','Condiment','chilis'],
['Condiment','Condiment','chiles'],
['Condiment','Condiment','chilies'],
['Flour', 'Flour','corn muffin mix'],
['Beverage','Chocolate','chocolate mix'],
['Meat','dumpling','dumpling'],
['Meat','dumpling','wonton'],
['Staple','Wheat','pizza dough'],
['Staple','Wheat','dough'],
['Condiment','Condiment', 'pizza sauce'],
['Flour', 'Flour','yeast'],
['Condiment','Sweet','cocoa'],
['Staple','Maize','chip'],
['Egg and dairy','Dairy product','ricotta'],
['Condiment','Condiment','seasoning'],
['Beverage','Alcohol','sherry'],
['Staple','Rice','grain rice'], 
['Staple','Wheat','shell'],
['Meat','Beef','fillet'],
['Staple','Maize','cornmeal'],
['Condiment','Condiment','seed oil'],
['Nut and seed','Other','seed'],
['Condiment', 'Sweet', 'sugar blend'],
['Soup','Soup','broth'],
['Soup','Soup','stock'],
['Condiment', 'Sweet', 'marshmallow'],
['Condiment', 'Dry Condiment', 'dried vegetable flakes'],
['Condiment', 'Dry Condiment', 'dried celery flakes'],
['Flour', 'Flour','cornstarch'],
['Staple','Wheat','double crust'],
['Staple','Wheat','crust'],
['Staple','Wheat','pastry crust'],
['Egg and dairy','Dairy product','gorgonzola'],
['Beverage','juice','drink mix'],
['Egg and dairy','Egg','Egg whites'],
['Baking powder','Baking powder','baking mix'],
['Staple','Rice','brown rice'],
['Condiment','Condiment','five spice'],
['Meat','Beef','tenderloin'],
['Meat','Pork','prosciutto'],
['Condiment', 'Sweet', 'whipped topping'],
['Condiment', 'Sweet', 'topping'],
['Beverage','Alcohol','cider'],
['Meat','Shellfish','crabmeat'],
['Condiment', 'Sweet', 'candy'],
['Condiment', 'Sweet', 'caramel'],
['Condiment', 'Sweet', 'molasses'],
['Vegetable','Podded vegetables','cannellini'],
['Vegetable','Fruits','fruit'],
['Staple','Wheat','saltine'],
['Condiment','Condiment', 'habanero'],
['Beverage','Juice','jell o'],
['Beverage','Juice','jelly'],
['Beverage','Soft drink','carbonated beverage'],
['Egg and dairy','Dairy product','gruyere'],
['Vegetable','Leafy and Salad','beet'],
['Water','Water','icing'],
['Egg and dairy','Dairy product','parmigiano'],
['Beverage','Alcohol','liqueur'],
['Condiment','Condiment', 'lard'],
['Staple','Wheat','crumb'],
['Herb and spice','Herb','peppermint'],
['Beverage','Alcohol','marsala'],
['Side dish','Potatoes','hash brown'],
['Meat','Beef','steak'],
['Condiment','Condiment','gelatin'],
['Meat','Beef','chuck'],
['Egg and dairy','Dairy product','colby'],
['Condiment', 'Sweet', 'jam'],
['Condiment', 'Sweet', 'cool whip'],
['Condiment', 'Sweet', 'stevia'],
['Staple','Wheat','bran'],
['Condiment','Condiment','pimento'],
['Condiment','Condiment','food coloring'],
['Meat','Meat','rib'],
['Condiment','Condiment','shortening'],
['Vegetable','Fruits','sweet pickles'],
['Condiment', 'Sweet', 'white confectioner'],
['Condiment', 'Sweet', 'confectioner'],  
['Vegetable','Root and tuberous vegetabless','rhubarb'],  # NOTE(review): 'vegetabless' looks like a typo for 'vegetables' -- confirm against the taxonomy's labels
['Condiment', 'Condiment', 'cooking spray']
]

def add_rows(df, additional_ingr):
    '''
    Append manually annotated rows to the taxonomy, normalise and de-duplicate.

    Args:
      df: A DataFrame whose columns 0, 1 and 2 hold strings.
      additional_ingr: A list of 3-item lists of strings, one per row to
          append, in the same column order as df.
    Return:
      A new DataFrame with a fresh 0..n-1 index in which columns 0-2 are
      lower-cased and stripped. When a column-2 value occurs more than once,
      the first occurrence wins, so rows already in df take precedence over
      the appended ones.
    '''
    add = pd.DataFrame(additional_ingr)
    print('before processing: length of data: %d' % len(df))
    df = pd.concat([df,add]).reset_index(drop =True)

    # uncased -- vectorized string ops instead of a row-wise apply: same result
    # on populated frames, and still valid when the frame has no rows
    for i in range(3):
        df[i] = df[i].str.lower().str.strip()
    print('add some rows: length of data: %d' % len(df))

    # drop the duplicates
    df = df[~df[2].duplicated()]
    df = df.reset_index(drop = True)
    print('drop duplicates: length of data: %d' % len(df))
    return df

# Extend the cleaned taxonomy with the manually annotated rows defined above.
df = add_rows(df, additional_ingr)


unwelcomed_ingr = ['salt and pepper', 'muffin']
def delete_rows(df, unwelcomed_ingr):
    '''
    Return the rows of df whose column-2 value is not in unwelcomed_ingr.

    The surviving rows keep their original index labels.
    '''
    is_unwelcome = df[2].isin(unwelcomed_ingr)
    return df[~is_unwelcome]

# Remove the unwelcome entries; delete_rows does not reset the index, so the
# remaining rows keep their previous index labels.
df = delete_rows(df, unwelcomed_ingr)

before processing: length of data: 2582
add some rows: length of data: 2685
drop duplicates: length of data: 2684


In [6]:
class spacy_extension(object):
    '''Wrapper around a spaCy pipeline that maps ingredient names to root nouns.'''

    # Placeholder returned for a name whose root noun could not be determined.
    CANNOT_DETECT = 'CANNOT_DETECT'

    def __init__(self):
        # Loaded once; ingr() reuses the pipeline for every name it processes.
        self.spacy = spacy.load('en_core_web_lg')

    def ingr(self, lst):
        '''Note that this one is slightly different from the one in utils.spacy_func
        This version is used when number of outputs must equal to number of inputs

        Single-word names are mapped to the lemma of their first token.
        Multi-word names are embedded in the sentence 'Mix the <name> and water.';
        the last token of the last noun chunk other than 'water' is taken as
        the root, and its lemma is returned if that token's text occurs inside
        one of the space-separated words of the name.

        Args:
          lst: A list of ingredient names
        Return:
          root_match: A list of root nouns with one entry per input, in input
            order; may contain 'CANNOT_DETECT' where no root noun was found
        '''
        root_match = []
        for ingr in lst:
            if ' ' not in ingr:
                doc = self.spacy(ingr)
                # An empty name produces a doc without tokens; report it as
                # undetectable instead of failing on doc[0].
                if len(doc) == 0:
                    root_match.append(self.CANNOT_DETECT)
                else:
                    root_match.append(doc[0].lemma_)
                continue

            doc = self.spacy('Mix the %s and water.' % ingr)

            # Keep the last noun chunk that is not the 'water' of the carrier
            # sentence.
            last_chunk = None
            for chunk in doc.noun_chunks:
                if chunk.text != 'water':
                    last_chunk = chunk
            if last_chunk is None:
                root_match.append(self.CANNOT_DETECT)
                continue

            # Accept the chunk's final token only if it can be traced back to
            # one of the words of the original name.
            root = doc[last_chunk.end - 1]
            if any(root.text in word for word in ingr.split(' ')):
                root_match.append(root.lemma_)
            else:
                root_match.append(self.CANNOT_DETECT)

        assert len(root_match) == len(lst)
        return root_match

In [7]:
sp = spacy_extension()
# One root noun (or 'CANNOT_DETECT') per ingredient name in column 2.
root_nouns = sp.ingr(df[2].values)
# Extractor outputs that are excluded from the database.
blocklist = ['-PRON-','sheet','boil','time','light','CANNOT_DETECT','cup']
# De-duplicate, drop blocklisted words and words of two characters or fewer.
# sorted() gives the saved list a stable order; iterating a set of strings can
# yield a different order on every interpreter run (hash randomization).
database = sorted({word for word in root_nouns if word not in blocklist and len(word)>2})
print(len(database))

1992


In [8]:
# Persist the root-noun list to disk. save_pickle is imported from common.save;
# overwrite=True presumably replaces an existing file -- confirm against the helper.
save_pickle(obj = database,filename= '../big_data/database.pickle',overwrite =True)