In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import ast

In [2]:
# Scripts
from scripts import scrapers, db_funcs

ModuleNotFoundError: No module named 'pymongo'

In [3]:
# Db Information
urls, recipes = db_funcs.get_scraper_dbs()

# Getting Raw Ingredient dataset
The recipes are scraped into a local MongoDB using the scrapers notebook and scripts folder. The following is my scraped library to parse together the ingredients into usable and consistent formats

In [None]:
# Logic to get it from my database 
# (reading into pandas in case I want to use other fields later)
df = pd.DataFrame(list(recipes.find({})))

idl = []
for ing_list in df.ingredients:
    if ing_list is not None:
        for ing in ing_list:
            idl.append(ing)

In [2]:
# Logic to read from flat-file
idf = pd.read_csv('./data/ingredient_list.csv')
idl = list(idf['0'])
idf = idf.rename(columns={'0':'ing'})[['ing']]
idf = idf[~idf.ing.isna()]

In [58]:
# nyt cooking training
nyt = pd.read_csv('nyt_ingredients_training.csv').drop(columns=['index'])
nyt = nyt[~nyt.input.isna()]
nyt = nyt[~nyt.name.isna()]
nyt = nyt[~(nyt.name == 'NaN')]

In [59]:
print(f"Unparsed Ingredients: {len(idf)}")
print(f"NYT Trainable Parsed Ingredients {len(nyt)}")

Unparsed Ingredients: 2321769
NYT Trainable Parsed Ingredients 178668


In [None]:
# writing to .txt files for tokenization training
with open('nyt_parsed.txt', 'w', encoding='utf-8') as file:
    for ing in nyt.input:
        file.write(ing+"\n")
        
with open('unparsed_ing_list.txt', 'w', encoding='utf-8') as file:
    for ing in idf.ing:
        file.write(ing+"\n")      

In [53]:
from simpletransformers.seq2seq import Seq2SeqModel

train_df = pd.DataFrame({'input_text':nyt.input[:10000], 'target_text':nyt.name[:10000]})
eval_df = pd.DataFrame({'input_text':nyt.input[10000:20000], 'target_text':nyt.name[10000:20000]})

In [55]:
train_df

Unnamed: 0,input_text,target_text
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts
2,"1 medium-size onion, peeled and chopped",onion
3,"2 stalks celery, chopped coarse",celery
4,1 1/2 tablespoons vegetable oil,vegetable oil
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin
7,Salt,Salt
8,1 cup canned plum tomatoes with juice,plum tomatoes
9,6 cups veal or beef stock,stock
10,1/3 cup Worcestershire sauce,Worcestershire sauce


In [54]:
model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "train_batch_size": 500,
    "num_train_epochs": 10,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "evaluate_generated_text": True,
    "evaluate_during_training_verbose": True,
    "use_multiprocessing": True,
    "manual_seed": 4,
}

encoder_type = "roberta"

model = Seq2SeqModel(
    "roberta",
    "roberta-base",
    "bert-base-uncased",
    args=model_args,
    use_cuda=False,
)

model.train_model(train_df)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer

  0%|          | 0/10000 [00:00<?, ?it/s]

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [20]:
results = model.eval_model(eval_df)

  0%|          | 0/10100 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [15]:
check = '2 tbsp corn flakes'
check2 = '1 large egg'

In [17]:
model.predict(check2)

Generating outputs:   0%|          | 0/2 [00:00<?, ?it/s]

['Salt...',
 '.....',
 's...',
 'Salt..',
 'Salt...',
 '....',
 '##um...',
 'Salt..',
 'Salt...',
 'Salt..',
 'Salt....']

In [None]:

model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 10,
    "train_batch_size": 2,
    "num_train_epochs": 10,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "evaluate_generated_text": True,
    "evaluate_during_training_verbose": True,
    "use_multiprocessing": False,
    "max_length": 15,
    "manual_seed": 4,
}

encoder_type = "roberta"

model = Seq2SeqModel(
    encoder_type,
    "roberta-base",
    "bert-base-cased",
    args=model_args,
    use_cuda=False,
)

model.train_model(train_df)

results = model.eval_model(eval_df)

print(model.predict(["five"]))


model1 = Seq2SeqModel(
    encoder_type,
    encoder_decoder_name="outputs",
    args=model_args,
    use_cuda=True,
)
print(model1.predict(["five"])

# Setup
Currently all that is available is a list of ingredients. Nothing is labeled on them, though they do follow a non-enforced structure. What we want to know about them:

    - Ingredient - What ingredient is it? This needs to be a machine-readable format where all variants of the word flour that still mean flour are captured as a single ingredient type
    - Quantity - How much of the ingredient? This requires the unit and the quantity of that unit.
    - Unit - What is the quantity measured in? Ideally this will connect many
    - Other Descriptions - Things like Chopping style, to taste, etc.
    
This is not a new problem, NYT Cooking ran into a similar problem when sifting through their recipe archives https://github.com/nytimes/ingredient-phrase-tagger. Using humans, they labeled aroud 180K ingredient phrases with their corresponding amounts, ingredients and descriptors. The method they used to model this was an NLP technique called CLF, but I will be using a language model, both custom built and pre-trained out of the huggingface transformers package.

## Step 1 - Ingredient Name

There are two objectives in this step. The first is to find, amidst a lot of informaiton, what the item name is. By comparison, there are 180K inputs and only ~16,000 names. The model must identify which name belongs to the input. This will influence how the units are tracked as well as how relevant the descriptors are.

In [7]:
nyt.name.str.lower().value_counts()[:20]

salt                     8333
garlic                   5646
olive oil                4822
sugar                    4034
butter                   3016
onion                    2864
black pepper             2621
unsalted butter          2429
pepper                   2251
water                    2130
eggs                     2070
parsley                  2001
salt and pepper          1942
lemon juice              1933
egg                      1570
heavy cream              1561
flour                    1539
tomatoes                 1429
milk                     1385
salt and black pepper    1282
Name: name, dtype: int64

In [8]:
nyt.input.str.lower()[:20]

0     1 1/4 cups cooked and pureed fresh butternut s...
1     1 cup peeled and cooked fresh chestnuts (about...
2               1 medium-size onion, peeled and chopped
3                       2 stalks celery, chopped coarse
4                       1 1/2 tablespoons vegetable oil
6     2 tablespoons unflavored gelatin, dissolved in...
7                                                  salt
8                 1 cup canned plum tomatoes with juice
9                             6 cups veal or beef stock
10                         1/3 cup worcestershire sauce
11                     1 tablespoon louisiana hot sauce
12                   1/2 teaspoon hot red pepper flakes
13                                         4 bay leaves
14                 6 cloves garlic, crushed and chopped
15                          2 carrots, peeled and diced
16                               2 medium onions, diced
17                                 6 tablespoons butter
18    1 tablespoon creole seasoning, or other se

## Tokenization Process

To prepare the dataset tokenization needs to be done:
    - Normalization Available Methods:
          BertNormalizer
          Lowercase
          NFC
          NFD
          NFKC
          NFKD
          Nmt
          Precompiled
          Replace
          Sequence
          Strip
          StripAccents
    - pre tokenization
    - Tokenization
    - Post-tokenization

### Normalization
First step in preparing the inputs, taking sentences and cleaning them of random sentence noise. Huggingface has the implementation of many normalizers, all of which can be stringed together. To keep the NYT and my own scraped ingredients consistent the normalizer will be shared for both of them. The normalization elemnts being applied:
    - NFC normalization, for unicode character cleaning, though it shouldn't affect much
    - Strip Accents, remove potential accents from ethnic cuisine foods that might have them
    - Lowercase, implementing in huggingface for consistency
    - Replacements, Fractional representations are converted to just a number with slash, slash types unified

In [9]:
fractions = {"↉": "0", "⅒": "1/10", "⅑": "1/9", "⅛": "1/8",
                     "⅐": "1/7", "⅙": "1/6", "⅕": "1/5", "¼": "1/4",
                     "⅓": "1/3", "½": "1/2", "⅖": "2/3", "⅔": "2/3",
                     "⅜": "3/8", "⅗": "3/5", "¾": "3/4", "⅘": "4/5",
                     "⅝": "5/8", "⅚": "5/6", "⅞": "7/8"}
fraction_replacers = [normalizers.Replace(pattern=key, content=item) for key, item in fractions.items()]

AttributeError: module 'tokenizers.normalizers' has no attribute 'Replace'

In [None]:
normalizer = normalizers.Sequence([normalizers.NFC(), # Unicode cleaning
                                   normalizers.StripAccents(),
                                   normalizers.Lowercase(),
                                   normalizers.Replace(pattern="⁄", content="/")] + # remove potentially odd symbols
                                   fraction_replacers)

### Pre-tokenization
This prepares sequences by determining what the splits will be on and the resulting lengths. For recipes, the right pre-tokenization pattern has to be chosen to retain as much grammatical information as possible while cutting out as much noise as possible. Depending on what information we are trying to extract the tokenizer might need to be changed slightly. When extracting the amount, for instance, the tokenizer will need to be senesitive to digits, whereas for the item digits aren't as important. For the item name, the Whitespace tokenizer seems to be sufficient

In [None]:
pt_item_name = pre_tokenizers.Sequence([pre_tokenizers.Whitespace()])

In [None]:
str_input = idf.ing[2321800]
print("Before: "+str(str_input))
print("Normalization: "+normalizer.normalize_str(str_input))
print(pt.pre_tokenize_str(normalizer.normalize_str(str_input)))

### Tokenizer Training
Takes the pre-tokens and outputs them into the tokenized vocabulary set. Here, three trainers are passed in and the results are compared. Given the pre-tokenization and normalization are equal, the results between the Word Piece and BPE are not obvious, but the Unigram model appears to be splitting apart the words to finely. Moving forward, BPE will be used

Post-processing will be skipped for this model, though a special token might be needed when obtaining amounts in order to decipher the units apart from the rest of the string since the quantity is almost always before a unit. However, it might not be necessary as the base tokenization might be enough to pick out this information on its own.

In [None]:
wpt = Tokenizer(models.WordPiece())
wpt.normalizer = normalizer
wpt.pre_tokenizer = pt_item_name
wpt.train(trainers.WordPieceTrainer(), files=["./nyt_parsed.txt",
                                              "./unparsed_ing_list.txt"])

In [None]:
ut = Tokenizer(models.Unigram())
ut.normalizer = normalizer
ut.pre_tokenizer = pt_item_name
ut.train(trainers.UnigramTrainer(), files=["./nyt_parsed.txt",
                                           "./unparsed_ing_list.txt"])

In [None]:
bpet = Tokenizer(models.BPE())
bpet.normalizer = normalizer
bpet.pre_tokenizer = pt_item_name
bpet.train(trainers.BpeTrainer(), files=["./nyt_parsed.txt",
                                         "./unparsed_ing_list.txt"])

In [None]:
str_to_encode = idf.ing[1]
print(wpt.encode(str_to_encode).tokens)
print(ut.encode(str_to_encode).tokens)
print(bpet.encode(str_to_encode).tokens)

In [13]:
nyt.head(1)

Unnamed: 0,input,name,qty,range_end,unit,comment
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."


# Modeling

In [3]:
from transformers import T5Tokenizer, TFT5Model, TFTrainer, TFTrainingArguments
import tensorflow as tf

In [4]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5Model.from_pretrained('t5-small')

All model checkpoint weights were used when initializing TFT5Model.

All the weights of TFT5Model were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5Model for predictions without further training.


In [10]:
# Converting to a tensorflow dataset
train_df = tf.data.Dataset.from_tensor_slices((tokenizer.batch_encode_plus(nyt.input[:100], return_tensors="tf", padding=True).input_ids, 
                                               tokenizer.batch_encode_plus(nyt.input[:100], return_tensors="tf", padding=True).input_ids))
eval_df = tf.data.Dataset.from_tensor_slices((tokenizer.batch_encode_plus(nyt.input[100:200], return_tensors="tf", padding=True).input_ids, 
                                               tokenizer.batch_encode_plus(nyt.input[100:200], return_tensors="tf", padding=True).input_ids))


In [11]:
training_args = TFTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer = TFTrainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_df,    # tensorflow_datasets training dataset
    eval_dataset=eval_df       # tensorflow_datasets evaluation dataset
)

In [16]:
model(input_ids=tokenizer.batch_encode_plus(nyt.input[:100], return_tensors="tf", padding=True).input_ids, 
          decoder_input_ids=tokenizer.batch_encode_plus(nyt.input[:100], return_tensors="tf", padding=True).input_ids)

ValueError: The first argument to `Layer.call` must always be passed.

In [73]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5Model.from_pretrained('t5-small')

input_ids = tokenizer(nyt.input[0], return_tensors="tf").input_ids # Batch size 1
decoder_input_ids = tokenizer(nyt.name[0], return_tensors="tf").input_ids

print(input_ids)
print(decoder_input_ids)

All model checkpoint weights were used when initializing TFT5Model.

All the weights of TFT5Model were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5Model for predictions without further training.


tf.Tensor(
[[  209 13004 12294  8311    11  4621    15    26  1434  4194  4796 21248
      6    42   209  9445  7906  2642 10451 21248     6    20  6155  6265
      1]], shape=(1, 25), dtype=int32)
tf.Tensor([[ 4194  4796 21248     1]], shape=(1, 4), dtype=int32)


In [91]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
model.fit(x=input_ids.numpy(), y=decoder_input_ids.numpy())

ValueError: ('Error when checking model target: expected no data, but got:', array([[ 4194,  4796, 21248,     1]]))

In [89]:
model.predict(input_ids)

ValueError: You have to specify either inputs or inputs_embeds