In [1]:
# Location of the raw ingredient-list CSV export.
datasetFolder = "/home/john/Downloads/food-ingredient-lists/"
datasetName = "ingredients v1.csv"
# The folder string already ends with "/", so joining the two pieces
# directly yields the full path.
datasetPath = "".join([datasetFolder, datasetName])

## Read Data

In [2]:
import csv

# Read the CSV into a list of dicts, one per data row, keyed by the header row.
dataset = []
# newline="" is what the csv module asks for: it lets the reader handle line
# endings itself, so quoted fields containing newlines are parsed correctly.
with open(datasetPath, mode="r", encoding="utf-8-sig", newline="") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # The first row holds the column names; an empty file yields no keys.
    keys = next(csv_reader, [])
    for row in csv_reader:
        # csv.reader returns [] for blank lines; skip them so no empty
        # record (without a "features.key" entry) reaches the later cells.
        if not row:
            continue
        dataset.append(dict(zip(keys, row)))
    print(f'Processed {len(dataset)} lines.')

print(dataset[0])

Processed 10000 lines.
{'id': 'AVphBRHOilAPnD_x0OrE', 'asins': 'B00HXST15C', 'brand': 'Simon Fischer', 'categories': 'Grocery & Gourmet Food,Food,Grocery', 'dateAdded': '2017-01-07T20:13:17Z', 'dateUpdated': '2017-06-30T16:48:02Z', 'ean': '41642026706', 'features.key': 'Ingredients', 'features.value': 'Dried Prunes,Water,Corn Syrup,Sugar,Pectin.', 'manufacturer': 'Sokol And Company', 'manufacturerNumber': '33829', 'name': 'Simon Fischer Fruit Bttr Prune Lekvar', 'sizes': '', 'upc': '41642026706', 'weight': '10.6 pounds', '': ''}


## Filtering out food that doesn't have ingredients

In [3]:
# Keep only the rows whose feature key is "ingredients" (case-insensitive).
dataset = list(filter(
    lambda row: row["features.key"].lower() == "ingredients",
    dataset,
))
print(dataset[0])

{'id': 'AVphBRHOilAPnD_x0OrE', 'asins': 'B00HXST15C', 'brand': 'Simon Fischer', 'categories': 'Grocery & Gourmet Food,Food,Grocery', 'dateAdded': '2017-01-07T20:13:17Z', 'dateUpdated': '2017-06-30T16:48:02Z', 'ean': '41642026706', 'features.key': 'Ingredients', 'features.value': 'Dried Prunes,Water,Corn Syrup,Sugar,Pectin.', 'manufacturer': 'Sokol And Company', 'manufacturerNumber': '33829', 'name': 'Simon Fischer Fruit Bttr Prune Lekvar', 'sizes': '', 'upc': '41642026706', 'weight': '10.6 pounds', '': ''}


## Get (food label, ingredients) pairs

In [4]:
# Build {"Name", "Ingredients"} records, keeping only products with
# 3..49 comma-separated ingredients and a name of at most 5 words.
finalDataset = []
for record in dataset:
    # Drop leading/trailing periods, then split the list on commas.
    parts = record["features.value"].strip(".").split(",")
    if not (len(parts) < 50 and 2 < len(parts)):
        continue
    if not len(record["name"].split()) < 6:
        continue
    finalDataset.append({"Name": record["name"], "Ingredients": parts})
print(finalDataset[0])

{'Name': 'Jolly Time Popcorn', 'Ingredients': ['Salt', ' Yellow 5 Lake', ' Tricalcium Phosphate And Artificial Butter Flavor']}


In [5]:
# One lower-cased, whitespace-trimmed ingredient sequence per product,
# wrapped in 'startseq' / 'endseq' markers; plus the matching product names.
ingredients = []
food = []
for rec in finalDataset:
    normalised = [ing.lower().strip() for ing in rec["Ingredients"]]
    ingredients.append(['startseq'] + normalised + ['endseq'])
for rec in finalDataset:
    food.append(rec["Name"].lower().strip())
print(food[0])
print(ingredients[0])

jolly time popcorn
['salt', 'yellow 5 lake', 'tricalcium phosphate and artificial butter flavor']


## Cleaning Data

In [6]:
import string
# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)

# Normalise every ingredient string in place.
for key in ingredients:
    for j, desc in enumerate(key):
        kept = []
        for word in desc.split():
            # lower-case the token and strip punctuation from it
            word = word.lower().translate(table)
            # drop one-character leftovers (e.g. a hanging 's' or 'a')
            if not len(word) > 1:
                continue
            # drop tokens that are not purely alphabetic (e.g. contain digits)
            if not word.isalpha():
                continue
            kept.append(word)
        # store the cleaned tokens back as a single string
        key[j] = ' '.join(kept)
print(ingredients[0])

['salt', 'yellow lake', 'tricalcium phosphate and artificial butter flavor']


## Original & Distinct Vocabulary Space

In [7]:
# Flatten every ingredient sequence into one list (duplicates kept),
# then derive the set of distinct entries.
allVocab = []
for ing in ingredients:
    allVocab.extend(ing)
vocabulary = set(allVocab)
print("All Vocabulary = %d" % len(allVocab))
print("Distinct Vocabulary = %d" % len(vocabulary))

All Vocabulary = 23385
Distinct Vocabulary = 5523


## Freq-based Filtering

In [8]:
# Every ingredient occurrence in the corpus (whole phrases, not single words).
all_train_ingreds = allVocab

# Keep only ingredients that occur at least this many times.
word_count_threshold = 10
word_counts = {}
nIngreds = 0
for w in all_train_ingreds:
    nIngreds += 1
    # Each ingredient phrase is counted as one unit; it is not split into words.
    if w in word_counts:
        word_counts[w] = word_counts[w] + 1
    else:
        word_counts[w] = 1

vocab = [w for w, seen in word_counts.items() if seen >= word_count_threshold]

print(vocab[0])
print('preprocessed words %d ' % len(vocab))
# preprocessed words 1651

salt
preprocessed words 344 


## Vocabulary Encoding

In [68]:
# Token -> integer id. Id 0 is reserved for padding and id 1 for unknown tokens;
# every other entry receives the next free id (the current dict size).
encVocab = {"<UNK>": 1, "<PAD>": 0}

# `vocab` is a list of ingredient phrases (strings).
for ingred in vocab:
    # Register each individual word of the phrase ...
    for token in ingred.split():
        if token not in encVocab:
            encVocab[token] = len(encVocab)
    # ... then the phrase itself, with its words joined by "_".
    # The membership test must use the same underscore-joined key that is
    # inserted; testing the space-separated string never matches and would
    # overwrite an existing id with a colliding / out-of-range one.
    phrase = "_".join(ingred.split())
    if phrase not in encVocab:
        encVocab[phrase] = len(encVocab)

# Food names can repeat, so the same key check matters even more here.
for f in food:
    phrase = "_".join(f.split())
    if phrase not in encVocab:
        encVocab[phrase] = len(encVocab)
    for token in f.split():
        if token not in encVocab:
            encVocab[token] = len(encVocab)
print("Number of unique tokens: {}".format(len(encVocab)))
vocab_size = len(encVocab)

Number of unique tokens: 4851


In [69]:
# Forward (token -> id) and reverse (id -> token) lookup tables built from encVocab.
wordtoix = dict(encVocab.items())
ixtoword = {ix: word for word, ix in encVocab.items()}


## Determine the maximum ingredients count per meal

In [70]:
allIngreds = ingredients

def maxCount(allIngreds):
    """Return the length of the longest sequence in ``allIngreds``.

    Raises ValueError when ``allIngreds`` is empty (propagated from ``max``).
    """
    longest = max(map(len, allIngreds))
    return longest

# Length of the longest ingredient sequence (the sequences include the
# 'startseq' / 'endseq' markers); used later as the padded input length.
max_count = maxCount(allIngreds)
print("Maximum Ingredients count = %d" % max_count)

Maximum Ingredients count = 49


## Data Generator

In [107]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
# data generator, intended to be used in a call to model.fit_generator()
def data_generator(ingredients, foods, wordtoix, max_length, num_foods_per_batch):
    """Endlessly yield training batches for ``model.fit_generator``.

    Parameters
    ----------
    ingredients : sequence
        One entry per food. Each entry is either a list of tokens or a single
        whitespace-separated string of tokens (the form of ``cleanIngredsList``).
    foods : sequence
        Feature vector for each food, aligned by index with ``ingredients``.
    wordtoix : dict
        Token -> integer id. Tokens missing from the mapping are dropped.
    max_length : int
        Length every partial input sequence is padded to.
    num_foods_per_batch : int
        Number of foods whose samples are grouped into one yielded batch.

    Yields
    ------
    [[X1, X2], y]
        ``X1``: food features, ``X2``: padded partial token-id sequences,
        ``y``: one-hot encoding of the next token. The one-hot width is read
        from the module-level ``vocab_size``.
    """
    # Local import so the generator does not rely on a later notebook cell
    # having executed `from numpy import array`.
    from numpy import array

    X1, X2, y = list(), list(), list()
    n = 0
    # loop forever over the foods; Keras decides when an epoch ends
    while True:
        for idx, ingreds in enumerate(ingredients):
            n += 1
            # retrieve the food feature
            food = foods[idx]
            # A string must be split into tokens first; iterating it directly
            # would walk over single characters instead of ingredient tokens.
            tokens = ingreds.split() if isinstance(ingreds, str) else ingreds
            # encode the sequence
            seq = [wordtoix[word] for word in tokens if word in wordtoix]

            # split one sequence into multiple (prefix -> next token) pairs
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # encode output token
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                # store
                X1.append(food)
                X2.append(in_seq)
                y.append(out_seq)
            # yield the batch data once enough foods have been collected
            if n == num_foods_per_batch:
                yield [[array(X1), array(X2)], array(y)]
                X1, X2, y = list(), list(), list()
                n = 0

## Word Embeddings

In [72]:
import os, numpy as np
# Load the GloVe vectors into a dict: word -> float32 vector.
glove_dir = '../dataset/glove'
embeddings_index = {} # empty dictionary

# `with` guarantees the file is closed even if parsing a line fails.
with open(os.path.join(glove_dir, 'glove.6B.200d.txt'), encoding="utf-8") as f:
    for line in f:
        values = line.split()
        # Skip blank lines instead of failing on values[0].
        if not values:
            continue
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [73]:
# Width of the GloVe vectors loaded above.
embedding_dim = 200
# One row per vocabulary id; rows stay all-zero when no GloVe vector is found.
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for ingred, i in wordtoix.items():
    # A vocabulary entry may be several words joined by "_"; look each word up
    # and keep only those that have a GloVe vector.
    lookups = (embeddings_index.get(piece) for piece in ingred.split("_"))
    found = [vec for vec in lookups if vec is not None]
    # Element-wise sum of the word vectors. With no vectors np.sum returns 0.0,
    # which leaves the row filled with zeros.
    embedding_vector = np.sum(found, axis = 0)
    embedding_matrix[i] = embedding_vector

# 12. Model Architecture

In [87]:
# Each multi-word ingredient becomes one "_"-joined token; the tokens of a
# food are then joined with spaces into a single string.
cleanIngredsList = []
for ingreds in ingredients:
    joined_tokens = ["_".join(ingred.split()) for ingred in ingreds]
    cleanIngredsList.append(" ".join(joined_tokens))

# Food names get the same "_" joining; food_vocab holds the distinct names.
cleanFood = []
for f in food:
    cleanFood.append("_".join(f.split()))
food_vocab = list(set(cleanFood))
food_vocab_size = len(food_vocab)

print("Food Vocab size: %d" % food_vocab_size)
print(cleanFood[0])
print(cleanIngredsList[0])

Food Vocab size: 1702
jolly_time_popcorn
salt yellow_lake tricalcium_phosphate_and_artificial_butter_flavor


In [127]:
from keras.layers import Dropout, Dense, Embedding, LSTM
from keras import Input
from keras.layers.merge import add
from keras.models import Model

# Food-feature branch: takes a 200-dimensional vector per sample (the training
# loop feeds rows of foodEmbeddings here) and projects it to 256 units.
inputs1 = Input(shape=(200,))
# fe0 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs1)
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Partial ingredient-sequence branch: token ids padded to max_count,
# embedded (id 0 is masked as padding) and summarised by an LSTM.
inputs2 = Input(shape=(max_count,))
se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Decoder (feed forward): add the two 256-unit branch outputs element-wise,
# then predict a softmax distribution over the vocabulary for the next token.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# Tie the two inputs and the softmax output together into one model.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)

In [128]:
# Print the layer-by-layer overview (output shapes, parameter counts, wiring).
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           (None, 49)           0                                            
__________________________________________________________________________________________________
input_24 (InputLayer)           (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_16 (Embedding)        (None, 49, 200)      970200      input_25[0][0]                   
__________________________________________________________________________________________________
dropout_23 (Dropout)            (None, 200)          0           input_24[0][0]                   
____________________________________________________________________________________________

### Load the GloVe weights into the embedding layer and freeze it

In [130]:
# Inspect the layer at index 2; the displayed repr confirms it is the Embedding layer.
model.layers[2]

<keras.layers.embeddings.Embedding at 0x7f569e984550>

In [131]:
# Copy the pre-computed embedding matrix into the Embedding layer (index 2 in
# model.layers, as inspected above) and exclude it from training.
# NOTE(review): the index depends on how Keras orders the layers — re-check it
# whenever the architecture changes.
model.layers[2].set_weights([embedding_matrix])
model.layers[2].trainable = False

# model.layers[3].set_weights([embedding_matrix])
# model.layers[3].trainable = False

### compile the model using adam optimizer

In [132]:
# Categorical cross-entropy matches the one-hot next-token targets produced by
# data_generator; Adam is used with its default settings.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [133]:
# Number of passes over the data made by the training loop below.
epochs = 10
# Number of foods grouped into one generator batch (passed to data_generator
# as num_foods_per_batch).
number_pics_per_bath = 3
# training_food_ingredients = dict(zip(cleanFood, cleanIngredsList))
# Generator batches that make up one epoch (integer division drops a partial batch).
steps = len(cleanIngredsList)//number_pics_per_bath

In [134]:
# One feature vector per food: the sum of the GloVe vectors of the words in
# its name. Rows stay all-zero when none of the words has a GloVe vector.
foodEmbeddings = np.zeros((len(food), embedding_dim))

for i, f in enumerate(food):
    # Entries of `food` are space-separated names, so splitting on "_" alone
    # would treat a whole multi-word name as one (unknown) word and leave its
    # row at zero. Normalise "_" to spaces so both spellings are tokenised.
    words = f.replace("_", " ").split()
    vectors = [embeddings_index[word] for word in words if word in embeddings_index]
    if vectors:
        embedding_vector = np.sum(vectors, axis = 0)
        foodEmbeddings[i] = embedding_vector

In [136]:
from numpy import array
# model.save() does not create missing directories, so make sure the
# checkpoint folder exists before the first epoch finishes.
# (`os` is imported in the word-embeddings cell above.)
os.makedirs('./model_weights', exist_ok=True)

# Train one epoch at a time with a fresh generator and write a full-model
# checkpoint after every epoch.
for i in range(epochs):
    generator = data_generator(cleanIngredsList, foodEmbeddings, wordtoix, max_count, number_pics_per_bath)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save('./model_weights/model_' + str(i) + '.h5')

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [137]:
# Second round of training on the same model. The checkpoint index is offset
# by `epochs` so these files (model_10 .. model_19 for epochs == 10) do not
# overwrite the checkpoints written by the first round.
for i in range(epochs):
    generator = data_generator(cleanIngredsList, foodEmbeddings, wordtoix, max_count, number_pics_per_bath)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    model.save('./model_weights/model_' + str(i + epochs) + '.h5')

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [140]:
from keras import backend as K

# Lower the learning rate for the remaining epochs. The value must be written
# into the optimizer's existing backend variable: the training function has
# already been built around that variable, so rebinding the attribute to a
# plain float would not change the rate actually used.
K.set_value(model.optimizer.lr, 0.0001)

In [142]:
# model.optimizer.lr = 0.0001
# Settings for the final training round: same epoch count, but twice as many
# foods per generator batch, which halves the number of steps per epoch.
epochs = 10
number_pics_per_bath = 6
steps = len(cleanIngredsList)//number_pics_per_bath

In [None]:
# Final training round with the settings from the previous cell. Per-epoch
# checkpointing is disabled here (the save call is commented out); the weights
# are saved once in the next cell.
for i in range(epochs):
    generator = data_generator(cleanIngredsList, foodEmbeddings, wordtoix, max_count, number_pics_per_bath)
    model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
    #model.save('./model_weights/model_' + str(i) + '.h5')

Epoch 1/1
Epoch 1/1

In [None]:
# Persist only the weights (not the architecture or optimizer state), so the
# model must be rebuilt with the same layers before these can be loaded.
model.save_weights('./model_weights/model_30.h5')

# Prediction

Load Model

In [None]:
# Restore the weights saved above into the already-constructed model.
model.load_weights('./model_weights/model_30.h5')

Greedy Prediction Function

In [None]:
# Padded input length expected by the sequence branch of the model
# (the same value the model was built and trained with).
max_length = max_count


def greedySearch(food):
    """Generate an ingredient sequence for one food by greedy decoding.

    Starting from 'startseq', the model is asked repeatedly for the most
    probable next token, which is appended to the running text. Decoding stops
    when 'endseq' is predicted or after ``max_length`` steps.

    Parameters
    ----------
    food : array-like
        Food feature input passed straight to ``model.predict``; it must
        already carry the batch dimension (one row per sample).

    Returns
    -------
    str
        The predicted tokens joined by spaces, without the 'startseq' and
        'endseq' markers.
    """
    in_text = 'startseq'
    for _ in range(max_length):
        # Encode the text generated so far; tokens without an id are dropped.
        sequence = [wordtoix[w] for w in in_text.split() if w in wordtoix]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([food, sequence], verbose=0)
        # Pick the id with the highest predicted probability.
        word = ixtoword[int(np.argmax(yhat))]
        # Stop before appending the end marker, so that the last real token is
        # kept even when the step limit is reached without seeing 'endseq'.
        if word == 'endseq':
            break
        in_text += ' ' + word
    # Drop the leading 'startseq' marker.
    return ' '.join(in_text.split()[1:])

In [None]:
# Index of the food to run a sample prediction for.
z = 15
# foodEmbeddings, cleanFood and cleanIngredsList are all aligned by position,
# so the example is addressed directly by its integer index.
key = z
foodEmbedding = foodEmbeddings[key]
print(foodEmbedding.shape)

print("Label: %s" % cleanFood[key])
print("Actual Ingredients: ", cleanIngredsList[key])
# model.predict expects a batch dimension, hence the reshape to (1, embedding_dim).
print("Greedy Ingredients: ", greedySearch(foodEmbedding.reshape(1, -1)))