In [1]:
import tensorflow as tf
from keras.utils import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
from sklearn.model_selection import train_test_split

# from tensorflow import set_random_seed
# from numpy.random import seed
# set_random_seed(2)
# seed(1)

import pandas as pd
import numpy as np
import string, os 
import re
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
current_dir = os.path.dirname(os.path.realpath("__file__"))
repo_dir = os.path.dirname(current_dir)
DATA_FILE = os.path.join(repo_dir,"data","cleaned_data.csv")
data = pd.read_csv(DATA_FILE, encoding="utf-8",sep=',')

In [3]:
recipes = data["recettes"].to_list()
recipes = [re.sub("\\n\s+","\\n",recipe) for recipe in recipes]

In [4]:
recipes = recipes[:5000]

In [5]:
tokenizer = Tokenizer()
def get_sequence_of_tokens(recipes):
    tokenizer.fit_on_texts(recipes)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for recipe in recipes:
        token_list = tokenizer.texts_to_sequences([recipe])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(recipes)

In [6]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [7]:
X_train = predictors[:int(0.8*len(predictors))]
y_train = label[:int(0.8*len(predictors))]
X_test = predictors[int(0.8*len(predictors)):]
y_test = label[int(0.8*len(predictors)):]

In [8]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics="accuracy")
    
    return model



In [31]:
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 226, 10)           14410     
                                                                 
 lstm_1 (LSTM)               (None, 100)               44400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_1 (Dense)             (None, 1441)              145541    
                                                                 
Total params: 204,351
Trainable params: 204,351
Non-trainable params: 0
_________________________________________________________________


In [32]:
model.fit(X_train,
  y_train,
  batch_size=128,
  epochs=10,
  validation_data=(X_test, y_test))

Epoch 1/10


InvalidArgumentError: ignored

In [11]:
model.save(os.path.join("drive","MyDrive","models","lstm_1hl_1do_bs128_10epc.h5"))

In [24]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)

        predicted_ind = np.argmax(predicted, axis=1)
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted_ind:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [18]:
token_list = tokenizer.texts_to_sequences([seed_text])[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
predicted = model.predict(token_list, verbose=0)

In [21]:
predicted

array([[3.7025348e-08, 6.1544780e-02, 1.2873349e-03, ..., 2.1118794e-06,
        6.1669120e-06, 8.3111502e-08]], dtype=float32)

In [29]:
seed_text = "Ingregients:\n1 pate feuilletee \n12 olives vertes denoyaute \n2 tranches jambon cru"
generated_recipe = generate_text(seed_text = seed_text, next_words=100, model=model, max_sequence_len= max_sequence_len)
print(generated_recipe)

Ingregients:
1 Pate Feuilletee 
12 Olives Vertes Denoyaute 
2 Tranches Jambon Cru Potiron Legumes 1 Branche Celeri 2 Pommes Terre 2 Tomates 1 Courgette 1 Oignon 10 Cl Creme Fraiche 1 Navet 1 Bouillon Ustensiles Louche Mixeur Plongeant Blender Chauffant Couteau Set 3 Poeles Marmite Marmiton Couvercle Cocotte Minute Mixeur Cuillere Bois Mijoteuse Electrique Balance Cuisine Instructions Peler Laver Couper Legumes Petits Morceaux Faire Revenir Poireau Beurre 10 Minutes Marmite Mettre Legumes Recouvrir Eau Ajouter Moitie Bouillon Cube Faire Cuire 30 Minutes 10 Minutes Autocuiseur Recouvrir Poireau Eau Ajouter Bouillon Cube Faire Cuire 20 Minutes Legumes Carottes Navet Tomates Poireau Cuits Mixer Assaisonner Gout Ajouter Creme Fraiche Poireau Non Mixe Melanger Deguster
