In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import gc, sys
from tqdm import tqdm_notebook as tqdm

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Load the pre-processed recipes table and order its rows by the `i` column.
pp_recipes = pd.read_csv('../../data/PP_recipes.csv')
pp_recipes = pp_recipes.sort_values(by='i')
pp_recipes.head(3)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
46053,40893,0,"[40480, 1454, 16201, 2056, 955, 541, 11332, 82...","[[1424, 8876, 11007], [3484, 21453], [38966, 2...","[40480, 40482, 23667, 11007, 240, 21453, 240, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ...",0,"[3384, 7979, 2127, 3502, 3217, 1257, 2778, 500..."
108526,44394,1,"[40480, 34712, 22683, 11274, 5409, 29868, 40481]","[[5343, 535, 2044, 5409, 7087], [17869, 6020],...","[40480, 40482, 12172, 1281, 5409, 7087, 240, 6...","[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[912, 7557, 2499, 5382]"
41248,85009,2,"[40480, 12187, 13995, 571, 14719, 40481]","[[1061, 494, 813, 2141], [31843], [30645, 4785...","[40480, 40482, 2572, 19472, 31757, 512, 823, 4...","[1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, ...",2,"[4623, 6265, 1168, 6016, 3597, 3440, 7213, 169..."


In [3]:
# Gather every distinct ingredient id that appears in any recipe; the
# resulting collection is what the recipe vectors are built from.
set_ingredients = set()

for raw_ids in tqdm(pp_recipes['ingredient_ids'].values):
    # Each entry is a stringified list such as "[912, 7557]": drop the
    # surrounding brackets, split on commas and parse every token as an int.
    parsed_ids = [int(token.strip()) for token in raw_ids[1:-1].split(',')]
    set_ingredients |= set(parsed_ids)

print('Number of ingredients in total :', len(set_ingredients))

HBox(children=(IntProgress(value=0, max=178265), HTML(value='')))


Number of ingredients in total : 7993


In [7]:
# Sort the ids so that list_ingredients[-1] is guaranteed to be the largest
# one. A plain list(set) has no defined order, so using its last element as
# "max id" only works by accident of the set implementation.
list_ingredients = sorted(set_ingredients)

# Initialize the matrix: one row per recipe, one column per possible
# ingredient id in 0..max_id (ids are used directly as column positions).
n_columns = list_ingredients[-1] + 1
np_matrix = np.zeros((len(pp_recipes), n_columns), dtype=np.uint8)

# Populate it: set a 1 in every column whose id is listed for the recipe.
# The column values are fetched once, outside the loop.
for row, raw_ids in enumerate(tqdm(pp_recipes['ingredient_ids'].values)):
    # raw_ids is a stringified list such as "[912, 7557]".
    recipe_ids = [int(token) for token in raw_ids[1:-1].split(',')]
    np_matrix[row, recipe_ids] = 1

HBox(children=(IntProgress(value=0, max=178265), HTML(value='')))




In [8]:
# Wrap the matrix in a pandas DataFrame: rows are labelled with each recipe's
# `i` value, columns with the integer positions 0..list_ingredients[-1].
pd_matrix = pd.DataFrame(
    data=np_matrix,
    index=pp_recipes['i'].values,
    columns=range(list_ingredients[-1] + 1),
    dtype=np.uint8,
)

# Drop this cell's reference to the raw array and trigger a collection pass.
del np_matrix
gc.collect()

pd_matrix.shape

(178265, 8023)

In [33]:
def BuildAEModel(n_ingredients,
                 encoder_units=(4096, 2048),
                 embedding_dim=1024,
                 decoder_units=(2048, 4096),
                 hidden_activation=None):
    """Build an (uncompiled) dense autoencoder over ingredient vectors.

    The network is a chain of Dense layers:
    input -> Encoder_Layer_1..k -> embedder -> Decoder_Layer_1..m -> Reconstructor.
    Every layer that is created is part of the returned model; no layer is
    instantiated and then left unconnected.

    Args:
        n_ingredients: width of the input vector and of the reconstructed
            output vector.
        encoder_units: sizes of the Dense layers between the input and the
            embedding, in order.
        embedding_dim: size of the bottleneck layer named 'embedder'.
        decoder_units: sizes of the Dense layers between the embedding and
            the output, in order.
        hidden_activation: activation used by every layer except the output.
            With the default (None) these layers are purely linear, so the
            whole stack before the sigmoid is a single affine map; pass e.g.
            'relu' to make the hidden layers non-linear.

    Returns:
        A tf.keras.Model mapping (batch, n_ingredients) inputs to a
        sigmoid-activated output of the same width.
    """
    inputs = tf.keras.layers.Input((n_ingredients,))

    # Encoder: each layer consumes the output of the previous one.
    hidden = inputs
    for depth, units in enumerate(encoder_units, start=1):
        hidden = tf.keras.layers.Dense(
            units,
            activation=hidden_activation,
            name='Encoder_Layer_{}'.format(depth),
        )(hidden)

    # Bottleneck; the layer name is kept stable so it can be looked up later.
    embedded = tf.keras.layers.Dense(
        embedding_dim, activation=hidden_activation, name='embedder')(hidden)

    # Decoder: same chaining pattern, starting from the embedding.
    hidden = embedded
    for depth, units in enumerate(decoder_units, start=1):
        hidden = tf.keras.layers.Dense(
            units,
            activation=hidden_activation,
            name='Decoder_Layer_{}'.format(depth),
        )(hidden)

    # Sigmoid keeps every reconstructed entry in [0, 1].
    outputs = tf.keras.layers.Dense(
        n_ingredients, activation='sigmoid', name='Reconstructor')(hidden)

    return tf.keras.Model(inputs=inputs, outputs=[outputs])

In [34]:
# Build the autoencoder with one input unit per pd_matrix column, print its
# layer table, then attach the optimizer and the reconstruction loss.
n_features = pd_matrix.shape[1]
my_model = BuildAEModel(n_features)
my_model.summary()
my_model.compile(optimizer='adam', loss='mse')

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        (None, 8023)              0         
_________________________________________________________________
Encoder_Layer_1 (Dense)      (None, 4096)              32866304  
_________________________________________________________________
Encoder_Layer_2 (Dense)      (None, 2048)              8390656   
_________________________________________________________________
embedder (Dense)             (None, 1024)              2098176   
_________________________________________________________________
Decoder_Layer_1 (Dense)      (None, 2048)              2099200   
_________________________________________________________________
Decoder_Layer_2 (Dense)      (None, 4096)              8392704   
_________________________________________________________________
Reconstructor (Dense)        (None, 8023)              32870231  
Total para

In [35]:
# Train the autoencoder to reproduce its own input: the same recipe matrix is
# passed as both the features and the targets.
recipe_vectors = pd_matrix.values
hist = my_model.fit(
    x=recipe_vectors,
    y=recipe_vectors,
    batch_size=32,
    epochs=3,
)

Epoch 1/3
  2016/178265 [..............................] - ETA: 1:21:04 - loss: 0.0137

KeyboardInterrupt: 