### 1.0 Importing Dependecies:

In [15]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

#configure
import nltk

#stop-words
from nltk.corpus import stopwords
stop_words=set(nltk.corpus.stopwords.words('english'))

# tokenizing
from nltk import word_tokenize, sent_tokenize

# sklearn
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

#keras
import keras
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, Input, Activation, LSTM
from keras.models import Sequential, Model, load_model

# sets matplotlib to inline and displays graphs below the corressponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

plt.rcParams['figure.figsize'] = (10.0, 7.5)
pd.set_option('display.max_colwidth', 250)

In [5]:
df = pd.read_csv('data/strings/df_clean.csv', index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59628 entries, p3pKOD6jIHEcjf20CCXohP8uqkG5dGi to 7cXA77UpdDtIfBug2v6lEVIuV3Zcvhm
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                59628 non-null  object
 1   ingredients          58153 non-null  object
 2   instructions         59612 non-null  object
 3   ingredients_vector   59628 non-null  object
 4   instructions_vector  59628 non-null  object
dtypes: object(5)
memory usage: 2.7+ MB


In [6]:
df[['title', 'ingredients']] = df[['title', 'ingredients']].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59628 entries, p3pKOD6jIHEcjf20CCXohP8uqkG5dGi to 7cXA77UpdDtIfBug2v6lEVIuV3Zcvhm
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   title                59628 non-null  object
 1   ingredients          59628 non-null  object
 2   instructions         59612 non-null  object
 3   ingredients_vector   59628 non-null  object
 4   instructions_vector  59628 non-null  object
dtypes: object(5)
memory usage: 2.7+ MB


#### 2.0 Defining and Splitting Data:

In [7]:
VOCAB_SIZE = 50000

tfidf_vec = TfidfVectorizer(max_features=VOCAB_SIZE)
label_encoder = LabelEncoder()

X = tfidf_vec.fit_transform(df['ingredients'])
y = label_encoder.fit_transform(df['title'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
print('Train: ', X_train.shape)
print('Test: ', X_test.shape)
print('DF: ', X.shape)

Train:  (39950, 44673)
Test:  (19678, 44673)
DF:  (59628, 44673)


#### Machine Learning:

In [None]:
bayes = MultinomialNB()
bayes.fit(X_train, y_train)
predictions = bayes.predict(X_test)
precision_score(predictions, y_test, average='micro')

In [None]:
classifiers = {'sgd': SGDClassifier(loss='hinge'),
               'svm': SVC(),
               'random_forest': RandomForestClassifier()}

for lbl, clf in classifiers.items():
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(lbl, precision_score(predictions, y_test, average='micro'))

In [None]:
from scipy.sparse import eye
d = eye(len(tfidf_vec.vocabulary_))
word_pred = bayes.predict_proba(d)
inverse_vocab = {idx: word for word, idx in tfidf_vec.vocabulary_.items()}

In [None]:
from collections import Counter, defaultdict
by_cls = defaultdict(Counter)
for word_idx, pred in enumerate(word_pred):
    for class_idx, score in enumerate(pred):
        cls = label_encoder.classes_[class_idx]
        by_cls[cls][inverse_vocab[word_idx]] = score

In [None]:
for k in by_cls:
    words = [x[0] for x in by_cls[k].most_common(5)]
    print(k, ':', ' '.join(words))

#### Deep Learning:

#### Prepping Non Vectorized Columns:

In [9]:
from itertools import chain
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Taking all the different characters found in the DF:
chars = list(sorted(set(chain(*df['ingredients']))))
# Giving a unique index to all characters found:
char_to_idx = {ch: idx for idx, ch in enumerate(chars)}
# Taking the longest sequence:
max_sequence_len = max(len(x) for x in df['ingredients'])

In [10]:
char_vectors = []

# Creating vectors for each title:
for txt in df['ingredients']:
    vec = np.zeros((max_sequence_len, len(char_to_idx)))
    vec[np.arange(len(txt)), [char_to_idx[ch] for ch in txt]] = 1
    char_vectors.append(vec)

In [11]:
# Transforming vectors to arrays:
char_vectors = [np.asarray(x) for x in char_vectors]
char_vectors = pad_sequences(char_vectors)

# Encoding titles:
labels = label_encoder.transform(df['title'])

In [12]:
# Splitting data into X & Y train test:
def split(lst):
    training_count = int(0.9 * len(char_vectors))
    return lst[:training_count], lst[training_count:]

training_char_vectors, test_char_vectors = split(char_vectors)
training_labels, test_labels = split(labels)

char_vectors.shape

(59628, 2311, 48)

#### Char CNN Model:

In [16]:
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, Concatenate
from keras.models import Model
from keras import regularizers

def create_char_cnn_model(num_chars, max_sequence_len, num_labels):
    char_input = Input(shape=(max_sequence_len, num_chars), name='input')
    
    conv_1x = Conv1D(128, 6, activation='relu', padding='valid')(char_input)
    max_pool_1x = MaxPooling1D(6)(conv_1x)
    conv_2x = Conv1D(256, 6, activation='relu', padding='valid')(max_pool_1x)
    max_pool_2x = MaxPooling1D(6)(conv_2x)

    flatten = Flatten()(max_pool_2x)
    dense = Dense(128, 
                  activation='relu',
                  kernel_regularizer=regularizers.l2(0.01))(flatten)
    preds = Dense(num_labels, activation='softmax')(dense)

    model = Model(char_input, preds)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model

char_cnn_model = create_char_cnn_model(len(char_to_idx), char_vectors.shape[1], len(label_encoder.classes_))
char_cnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 2311, 48)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 2306, 128)         36992     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 384, 128)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 379, 256)          196864    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 63, 256)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 16128)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               2064512   
__________

In [None]:
char_cnn_model.fit(training_char_vectors, training_labels, epochs=20, batch_size=1024)
char_cnn_model.evaluate(test_char_vectors, test_labels)

In [None]:
# save model and architecture to single file
char_cnn_model.save("char_cnn_model.h5")
print("Saved model to disk")

In [19]:
# load model
char_cnn_model_loaded = load_model('char_cnn_model.h5')
# evaluating model
char_cnn_model_loaded.evaluate(test_char_vectors, test_labels)



[14.64173422158814, 0.0001677008217340265]

#### CNN CNN Model 2:

In [20]:
from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM
from keras.models import Model
from keras.layers.merge import Concatenate

def create_char_cnn_model(num_chars, max_sequence_len, num_labels):
    char_input = Input(shape=(max_sequence_len, num_chars), name='input')
    
    layers = []
    for window in (5, 6, 7):
        conv_1x = Conv1D(128, window, activation='relu', padding='valid')(char_input)
        max_pool_1x = MaxPooling1D(window)(conv_1x)
        dropout_1x = Dropout(0.3)(max_pool_1x)
        conv_2x = Conv1D(128, window, activation='relu', padding='valid')(dropout_1x)
        max_pool_2x = MaxPooling1D(window)(conv_2x)
        dropout_2x = Dropout(0.3)(max_pool_2x)
        layers.append(dropout_2x)

    if len(layers) > 1:
        merged = Concatenate(axis=1)(layers)
    else:
        merged = layers[0]

    dropout = Dropout(0.3)(merged)
    
    flatten = Flatten()(dropout)
    dense = Dense(128, activation='relu')(flatten)
    preds = Dense(num_labels, activation='softmax')(dense)

    model = Model(char_input, preds)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    return model

char_cnn_model_2 = create_char_cnn_model(len(char_to_idx), char_vectors.shape[1], len(label_encoder.classes_))
char_cnn_model_2.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              (None, 2311, 48)     0                                            
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 2307, 128)    30848       input[0][0]                      
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 2306, 128)    36992       input[0][0]                      
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 2305, 128)    43136       input[0][0]                      
__________________________________________________________________________________________________
max_poolin

In [None]:
char_cnn_model_2.fit(training_char_vectors, training_labels, epochs=20, batch_size=1024)
char_cnn_model_2.evaluate(test_char_vectors, test_labels)

In [None]:
# save model and architecture to single file
char_cnn_model_2.save("char_cnn_model_2.h5")
print("Saved model to disk")

In [21]:
# load model
char_cnn_model_2_loaded = load_model('char_cnn_model_2.h5')
# evaluate model
char_cnn_model_2_loaded.evaluate(test_char_vectors, test_labels)



[14.684636452844554, 0.0011739057521381855]

#### Training on Tokenized Columns:

In [22]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot

VOCAB_SIZE = 50000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df['ingredients_vector'])

In [23]:
import gensim
import os
import re

CACHE_DIR = os.path.expanduser('~/.cache/dl-cookbook')

def download(url):
    filename = os.path.join(CACHE_DIR, re.sub('[^a-zA-Z0-9.]+', '_', url))
    if os.path.exists(filename):
        return filename
    else:
        os.system('mkdir -p "%s"' % CACHE_DIR)
        assert os.system('wget -O "%s" "%s"' % (filename, url)) == 0
        return filename

def load_w2v(tokenizer=None):
    word2vec_gz = download('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz')
    word2vec_vectors = word2vec_gz.replace('.gz', '')
    if not os.path.exists(word2vec_vectors):
        assert os.system('gunzip -d --keep "%s"' % word2vec_gz) == 0
        
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_vectors, binary=True)
    
    total_count = sum(tokenizer.word_counts.values())
    idf_dict = { k: np.log(total_count/v) for (k,v) in tokenizer.word_counts.items() }
    
    w2v = np.zeros((tokenizer.num_words, w2v_model.syn0.shape[1]))
    idf = np.zeros((tokenizer.num_words, 1))

    for k, v in tokenizer.word_index.items():
        if v >= tokenizer.num_words:
            continue

        if k in w2v_model:
            w2v[v] = w2v_model[k]
            idf[v] = idf_dict[k]

    del w2v_model
    return w2v, idf

In [24]:
w2v, idf = load_w2v(tokenizer)

In [25]:
tokens = tokenizer.texts_to_sequences(df['ingredients_vector'])
tokens = pad_sequences(tokens)

training_count = int(0.9 * len(tokens))

training_tokens, training_labels = tokens[:training_count], labels[:training_count]
test_tokens, test_labels = tokens[training_count:], labels[training_count:]

#### Training With Vectorized Column:

In [26]:
from keras import layers, models
import keras.backend as K


def make_embedding(name, vocab_size, embedding_size, weights=None, mask_zero=True):
    if weights is not None:
        return layers.Embedding(mask_zero=mask_zero, input_dim=vocab_size, 
                                output_dim=weights.shape[1], 
                                weights=[weights], trainable=False, 
                                name='%s/embedding' % name)
    else:
        return layers.Embedding(mask_zero=mask_zero, input_dim=vocab_size, 
                                output_dim=embedding_size,
                                name='%s/embedding' % name)

def create_unigram_model(vocab_size, embedding_size=None, embedding_weights=None, idf_weights=None):
    assert not (embedding_size is None and embedding_weights is None)
    message = layers.Input(shape=(None,), dtype='int32', name='message')
    
    embedding = make_embedding('message_vec', vocab_size, embedding_size, embedding_weights)
    idf = make_embedding('message_idf', vocab_size, embedding_size, idf_weights)

    mask = layers.Masking(mask_value=0)
    def _combine_and_sum(args):
        embedding, idf = args
        return K.sum(embedding * K.abs(idf), axis=1)

    sum_layer = layers.Lambda(_combine_and_sum, name='combine_and_sum')
    sum_msg = sum_layer([mask(embedding(message)), idf(message)])
    fc1 = layers.Dense(units=128, activation='relu')(sum_msg)
    categories = layers.Dense(units=len(label_encoder.classes_), activation='softmax')(fc1)
    
    model = models.Model(
        inputs=[message],
        outputs=categories,
    )
    
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    model.summary()
    return model

unigram_model = create_unigram_model(vocab_size=VOCAB_SIZE,
                                     embedding_weights=w2v,
                                     idf_weights=idf)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
message (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
message_vec/embedding (Embeddin (None, None, 300)    15000000    message[0][0]                    
__________________________________________________________________________________________________
masking_1 (Masking)             (None, None, 300)    0           message_vec/embedding[0][0]      
__________________________________________________________________________________________________
message_idf/embedding (Embeddin (None, None, 1)      50000       message[0][0]                    
__________________________________________________________________________________________________
combine_an

In [None]:
unigram_model.fit(training_tokens, training_labels, epochs=10)
unigram_model.evaluate(test_tokens, test_labels, verbose=2)

In [None]:
# save model and architecture to single file
unigram_model.save("unigram_model.h5")
print("Saved model to disk")

In [28]:
# load model
unigram_model_loaded = load_model('unigram_model.h5')
# evaluate model
unigram_model_loaded.evaluate(test_tokens, test_labels, verbose=2)

[11.765235011497241, 0.000670803286936106]

#### Learning Embeddings:

In [29]:
learned_embeddings_model = create_unigram_model(vocab_size=VOCAB_SIZE, embedding_size=25)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
message (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
message_vec/embedding (Embeddin (None, None, 25)     1250000     message[0][0]                    
__________________________________________________________________________________________________
masking_2 (Masking)             (None, None, 25)     0           message_vec/embedding[0][0]      
__________________________________________________________________________________________________
message_idf/embedding (Embeddin (None, None, 25)     1250000     message[0][0]                    
__________________________________________________________________________________________________
combine_an

In [None]:
learned_embeddings_model.fit(training_tokens, training_labels, epochs=10, batch_size=128)
learned_embeddings_model.evaluate(test_tokens, test_labels, verbose=2)

In [None]:
# save model and architecture to single file
learned_embeddings_model.save("learned_embeddings_model.h5")
print("Saved model to disk")

In [31]:
# load model
learned_embeddings_model_loaded = load_model('learned_embeddings_model.h5')
# evaluate model
learned_embeddings_model_loaded.evaluate(test_tokens, test_labels, verbose=2)

[11.91387734916133, 0.008049639443233272]

#### CNN Model:

In [32]:
def create_cnn_model(vocab_size, embedding_size=None, embedding_weights=None):
    message = layers.Input(shape=(None,), dtype='int32', name='title')
    
    # The convolution layer in keras does not support masking, so we just allow
    # the embedding layer to learn an explicit value.
    embedding = make_embedding('message_vec', vocab_size, embedding_size, embedding_weights,
                              mask_zero=False)

    def _combine_sum(v):
        return K.sum(v, axis=1)

    cnn_1 = layers.Convolution1D(128, 3)
    cnn_2 = layers.Convolution1D(128, 3)
    cnn_3 = layers.Convolution1D(128, 3)
    
    global_pool = layers.GlobalMaxPooling1D()
    local_pool = layers.MaxPooling1D(strides=1, pool_size=3)

    cnn_encoding = global_pool(cnn_3(local_pool(cnn_2(local_pool(cnn_1(embedding(message)))))))
    fc1 = layers.Dense(units=128, activation='elu')(cnn_encoding)
    categories = layers.Dense(units=len(label_encoder.classes_), activation='softmax')(fc1)
    model = models.Model(
        inputs=[message],
        outputs=[categories],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [33]:
cnn_model = create_cnn_model(VOCAB_SIZE, embedding_weights=w2v)
cnn_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
title (InputLayer)              (None, None)         0                                            
__________________________________________________________________________________________________
message_vec/embedding (Embeddin (None, None, 300)    15000000    title[0][0]                      
__________________________________________________________________________________________________
conv1d_11 (Conv1D)              (None, None, 128)    115328      message_vec/embedding[0][0]      
__________________________________________________________________________________________________
max_pooling1d_11 (MaxPooling1D) (None, None, 128)    0           conv1d_11[0][0]                  
                                                                 conv1d_12[0][0]                  
__________

In [None]:
cnn_model.fit(training_tokens, training_labels, epochs=10)
cnn_model.evaluate(test_tokens, test_labels)

In [None]:
# save model and architecture to single file
cnn_model.save("cnn_model.h5")
print("Saved model to disk")

In [34]:
# load model
cnn_model_loaded = load_model('cnn_model.h5')
# evaluate model
cnn_model_loaded.evaluate(test_tokens, test_labels, verbose=2)

[11.793557491075386, 0.000670803286936106]

#### LSTM Model:

In [35]:
def create_lstm_model(vocab_size, embedding_size=None, embedding_weights=None):
    message = layers.Input(shape=(None,), dtype='int32', name='title')
    embedding = make_embedding('message_vec', vocab_size, embedding_size, embedding_weights)(message)

    lstm_1 = layers.LSTM(units=128, return_sequences=False)(embedding)
#     lstm_2 = layers.LSTM(units=128, return_sequences=False)(lstm_1)
    category = layers.Dense(units=len(label_encoder.classes_), activation='softmax')(lstm_1)
    
    model = models.Model(
        inputs=[message],
        outputs=[category],
    )
    model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model

In [36]:
lstm_model = create_lstm_model(VOCAB_SIZE, embedding_weights=w2v)
lstm_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
title (InputLayer)           (None, None)              0         
_________________________________________________________________
message_vec/embedding (Embed (None, None, 300)         15000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               219648    
_________________________________________________________________
dense_13 (Dense)             (None, 52474)             6769146   
Total params: 21,988,794
Trainable params: 6,988,794
Non-trainable params: 15,000,000
_________________________________________________________________


In [37]:
lstm_model.fit(training_tokens, training_labels, epochs=10, batch_size=128)
lstm_model.evaluate(test_tokens, test_labels)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[11.46580704646324, 0.0008385041086701325]

In [38]:
# save model and architecture to single file
lstm_model.save("lstm_model.h5")
print("Saved model to disk")

Saved model to disk


#### Predictions:

In [40]:
predictions = {
    'lstm': lstm_model.predict(test_tokens[:100]),
    'char_cnn': char_cnn_model.predict(test_char_vectors[:100]),
    'cnn': cnn_model.predict(test_tokens[:100]),
    'unigram': unigram_model.predict(test_tokens[:100]),
}

In [44]:
# Make a dataframe just for test data

pd.options.display.max_colwidth = 128
test_df = df[training_count:training_count+100].reset_index()
eval_df = pd.DataFrame({
    'ingredients': test_df['ingredients'],
    'title': test_df['title'],
    'lstm': [label_encoder.classes_[np.argmax(x)] for x in predictions['lstm']],
    'cnn': [label_encoder.classes_[np.argmax(x)] for x in predictions['cnn']],
    'char_cnn': [label_encoder.classes_[np.argmax(x)] for x in predictions['char_cnn']],
    'unigram': [label_encoder.classes_[np.argmax(x)] for x in predictions['unigram']]
})
eval_df = eval_df[['ingredients', 'title', 'lstm', 'cnn', 'char_cnn', 'unigram']]
eval_df.head(10)

Unnamed: 0,ingredients,title,lstm,cnn,char_cnn,unigram
0,beer type shrimp olive oil soy sauce fresh lime juice tabasco sauce tomato peeled seeded diced peeled grated gingerroot shop...,cold beer shrimp,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
1,neelys bbq seasoning recipe follows crab boil seasoning recommended old bay tablespoons soy sauce egg lightly beaten cups ne...,bbq turkey meatloaf,caesar salad,minute chili,gravlax with dill mayonnaise,minute chili
2,pint vanilla ice cream whole milk tablespoons bourbon teaspoons cane syrup molasses honey crumbled shortbread cookies tables...,bourbon pecan pie milk shake,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
3,swiss chard stems removed leaves thinly sliced tablespoons extravirgin olive oil cups crusty bread cubeskosher salt freshly ...,seared steak with chard salad,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
4,cups smoked turkey ham ground cups colby jack mozzarella grated onion finely choppedsalt freshly ground black pepper salt cl...,junes chile rellenos,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
5,assorted mini squash patty pans andor baby zucchini tablespoons extravirgin olive oil frozen pearl onions pint cherry tomato...,sauteed mini vegetable medley,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
6,butter anchovy fillets drained chopped tablespoons chopped fresh italian parsley chopped fresh thyme leaves minced garlic mi...,cheese crostini with anchovy herb butter,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
7,shrimp peeled deveined tablespoons canola oil kosher salt teaspoons freshly ground black pepper,perfectly grilled shrimp,guacamole,minute chili,outdoor grilled striped bass with vegetable tian and basil cream,minute chili
8,thin egg noodles angel hair spaghettini cups grated carrots cups bean sprouts rinsed drained cups minced scallion greens clo...,dan dan sesame noodles,chicken cacciatore,minute chili,the ultimate breakfast for dinner sausage and spinach egg strata,minute chili
9,soy sauce enough cover ribs head garlic minced sugar racks spare ribs,al frankens spare ribs,guacamole,minute chili,thin fries,minute chili


In [46]:
eval_df[eval_df['lstm'] != eval_df['title']].head(10)

Unnamed: 0,ingredients,title,lstm,cnn,char_cnn,unigram
0,beer type shrimp olive oil soy sauce fresh lime juice tabasco sauce tomato peeled seeded diced peeled grated gingerroot shop...,cold beer shrimp,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
1,neelys bbq seasoning recipe follows crab boil seasoning recommended old bay tablespoons soy sauce egg lightly beaten cups ne...,bbq turkey meatloaf,caesar salad,minute chili,gravlax with dill mayonnaise,minute chili
2,pint vanilla ice cream whole milk tablespoons bourbon teaspoons cane syrup molasses honey crumbled shortbread cookies tables...,bourbon pecan pie milk shake,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
3,swiss chard stems removed leaves thinly sliced tablespoons extravirgin olive oil cups crusty bread cubeskosher salt freshly ...,seared steak with chard salad,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
4,cups smoked turkey ham ground cups colby jack mozzarella grated onion finely choppedsalt freshly ground black pepper salt cl...,junes chile rellenos,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
5,assorted mini squash patty pans andor baby zucchini tablespoons extravirgin olive oil frozen pearl onions pint cherry tomato...,sauteed mini vegetable medley,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
6,butter anchovy fillets drained chopped tablespoons chopped fresh italian parsley chopped fresh thyme leaves minced garlic mi...,cheese crostini with anchovy herb butter,guacamole,minute chili,spiced butternut squash stew with couscous,minute chili
7,shrimp peeled deveined tablespoons canola oil kosher salt teaspoons freshly ground black pepper,perfectly grilled shrimp,guacamole,minute chili,outdoor grilled striped bass with vegetable tian and basil cream,minute chili
8,thin egg noodles angel hair spaghettini cups grated carrots cups bean sprouts rinsed drained cups minced scallion greens clo...,dan dan sesame noodles,chicken cacciatore,minute chili,the ultimate breakfast for dinner sausage and spinach egg strata,minute chili
9,soy sauce enough cover ribs head garlic minced sugar racks spare ribs,al frankens spare ribs,guacamole,minute chili,thin fries,minute chili


In [None]:
dictionary = tokenizer.word_index

In [None]:
def convert_text_to_index_array(text):
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]

In [None]:
allWordIndices = []
for text in X_train:
    wordIndices = convert_text_to_index_array(text)
    allWordIndices.append(wordIndices)

In [None]:
allWordIndices = np.asarray(allWordIndices)

X_train = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
X_test = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(X_test, maxlen=num_classes)

In [None]:
y_train = keras.utils.to_categorical(y_train.factorize()[0], num_classes)
y_test = keras.utils.to_categorical(y_test.factorize()[0], num_classes)

In [None]:
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

In [None]:
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

## Parameters:

In [None]:
max_words = 1000
batch_size = 100
epochs = 8

## Training Models:

In [None]:
print('Building model...')
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(256))
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    shuffle=True)

score = model.evaluate(X_test, y_test,
                       batch_size=batch_size, verbose=1)

print('Test score:', score[0])
print('Test accuracy:', score[1])

In [None]:
model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('model.h5')

In [None]:
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
score = loaded_model.evaluate(X_train, y_train, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

In [None]:
import matplotlib.pyplot as plt

history = model.fit(x, y, validation_split=0.25, epochs=50, batch_size=16, verbose=1)

# Plot training & validation accuracy values
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()