In [1]:
# Importing dependencies:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os
from collections import Counter
import tqdm

from nltk.tokenize import sent_tokenize

import keras
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import text
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model, Sequential
from keras.layers import Input, Embedding, Dense, Flatten, Dropout, Activation
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate

import matplotlib.pyplot as plt
import seaborn as sns
# NOTE(review): the backend is switched to 'agg' here and `%matplotlib inline`
# is issued immediately afterwards (and once more further down) — confirm which
# backend is actually intended and drop the redundant calls.
plt.switch_backend('agg')
%matplotlib inline
# Sets the PY3 attribute on pandas.compat by hand.
# NOTE(review): presumably a workaround for a dependency that reads this flag —
# confirm it is still required.
from pandas import compat
compat.PY3 = True

# Configuring Notebook environment:
sns.set()
%matplotlib inline
# Inline figures are rendered as SVG.
%config InlineBackend.figure_format = 'svg'

# Default figure size (width, height) and how many columns/rows pandas prints
# before truncating a frame.
plt.rcParams['figure.figsize'] = (10.0, 7.5)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Suppresses every warning for the rest of the session, deprecation notices
# included; remove this while debugging.
import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


In [2]:
# Load the cleaned recipe table; the first CSV column becomes the row index.
df = pd.read_csv(
    'data/strings/df_clean.csv',
    index_col=0,
)
df.head(5)

Unnamed: 0,title,ingredients,instructions,ingredients_vector,instructions_vector
p3pKOD6jIHEcjf20CCXohP8uqkG5dGi,grammie hamblets deviled crab,celery finely chopped green pepper finely chop...,toss ingredients lightly spoon buttered baking...,"['celery', 'finely', 'chopped', 'green', 'pepp...","['toss', 'ingredients', 'lightly', 'spoon', 'b..."
S7aeOIrsrgT0jLP32jKGg4j.o9zi2DO,infineon raceway baked beans,skirt steak cut inch dicekosher salt fresh cra...,watch make recipe sprinkle steak salt pepper s...,"['skirt', 'steak', 'cut', 'inch', 'dicekosher'...","['watch', 'make', 'recipe', 'sprinkle', 'steak..."
o9MItV9txfoPsUQ4v8b0vh1.VdjwfsK,southwestern black bean dip,cups dried black beans picked rinsed cups wate...,saucepan let beans soak enough cold water cove...,"['cups', 'dried', 'black', 'beans', 'picked', ...","['saucepan', 'let', 'beans', 'soak', 'enough',..."
5l1yTSYFifF/M2dfbD6DX28WWQpLWNK,sour cream noodle bake,ground chuckone tomato sauce saltfreshly groun...,watch make recipe preheat oven degrees f brown...,"['ground', 'chuckone', 'tomato', 'sauce', 'sal...","['watch', 'make', 'recipe', 'preheat', 'oven',..."
kRBQSWtqYWqtkb34FGeenBSbC32gIdO,sushi renovation,rice brown mediumgrain cookedcup quinoacup swe...,special equipment sushi mat cook brown rice qu...,"['rice', 'brown', 'mediumgrain', 'cookedcup', ...","['special', 'equipment', 'sushi', 'mat', 'cook..."


## Tokenizing Titles:

In [3]:
# Run each title through the sentence tokenizer, then store the resulting
# list as its string representation in a new column.
title_sentences = df['title'].apply(sent_tokenize)
df['title_tokenized'] = title_sentences.astype(str).tolist()
df.head()

Unnamed: 0,title,ingredients,instructions,ingredients_vector,instructions_vector,title_tokenized
p3pKOD6jIHEcjf20CCXohP8uqkG5dGi,grammie hamblets deviled crab,celery finely chopped green pepper finely chop...,toss ingredients lightly spoon buttered baking...,"['celery', 'finely', 'chopped', 'green', 'pepp...","['toss', 'ingredients', 'lightly', 'spoon', 'b...",['grammie hamblets deviled crab']
S7aeOIrsrgT0jLP32jKGg4j.o9zi2DO,infineon raceway baked beans,skirt steak cut inch dicekosher salt fresh cra...,watch make recipe sprinkle steak salt pepper s...,"['skirt', 'steak', 'cut', 'inch', 'dicekosher'...","['watch', 'make', 'recipe', 'sprinkle', 'steak...",['infineon raceway baked beans']
o9MItV9txfoPsUQ4v8b0vh1.VdjwfsK,southwestern black bean dip,cups dried black beans picked rinsed cups wate...,saucepan let beans soak enough cold water cove...,"['cups', 'dried', 'black', 'beans', 'picked', ...","['saucepan', 'let', 'beans', 'soak', 'enough',...",['southwestern black bean dip']
5l1yTSYFifF/M2dfbD6DX28WWQpLWNK,sour cream noodle bake,ground chuckone tomato sauce saltfreshly groun...,watch make recipe preheat oven degrees f brown...,"['ground', 'chuckone', 'tomato', 'sauce', 'sal...","['watch', 'make', 'recipe', 'preheat', 'oven',...",['sour cream noodle bake']
kRBQSWtqYWqtkb34FGeenBSbC32gIdO,sushi renovation,rice brown mediumgrain cookedcup quinoacup swe...,special equipment sushi mat cook brown rice qu...,"['rice', 'brown', 'mediumgrain', 'cookedcup', ...","['special', 'equipment', 'sushi', 'mat', 'cook...",['sushi renovation']


## Defining and Splitting Data:

In [4]:
# Model inputs are the ingredient token lists as stored in the table;
# targets are the tokenized-title strings built above.
X, y = df['ingredients_vector'], df['title_tokenized']
X

p3pKOD6jIHEcjf20CCXohP8uqkG5dGi    ['celery', 'finely', 'chopped', 'green', 'pepp...
S7aeOIrsrgT0jLP32jKGg4j.o9zi2DO    ['skirt', 'steak', 'cut', 'inch', 'dicekosher'...
o9MItV9txfoPsUQ4v8b0vh1.VdjwfsK    ['cups', 'dried', 'black', 'beans', 'picked', ...
5l1yTSYFifF/M2dfbD6DX28WWQpLWNK    ['ground', 'chuckone', 'tomato', 'sauce', 'sal...
kRBQSWtqYWqtkb34FGeenBSbC32gIdO    ['rice', 'brown', 'mediumgrain', 'cookedcup', ...
                                                         ...                        
4bfMWxlbKhx/McJq/89k0SBdw.VvAzW    ['ears', 'fresh', 'corn', 'heads', 'belgian', ...
T8lWBA1fcVdjxhMSWuoAbGoy5Lj.A8m    ['plum', 'tomatoessalt', 'sugar', 'zucchini', ...
f/coffo2TMs2J2gq5nTOUIqH2TRAkui    ['tablespoons', 'olive', 'oil', 'tablespoons',...
q3aDJc4zoEF5QT4e7Mn.ieQwV.DyHwS    ['ounces', 'butter', 'ounces', 'bittersweet', ...
7cXA77UpdDtIfBug2v6lEVIuV3Zcvhm    ['cans', 'restaurantstyle', 'condensed', 'crab...
Name: ingredients_vector, Length: 59612, dtype: object

In [28]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [29]:
# Report how many rows ended up in each split.
for n_rows, caption in ((len(X_train), 'train sequences'), (len(X_test), 'test sequences')):
    print(n_rows, caption)

47689 train sequences
11923 test sequences


In [30]:
# Width of the one-hot target / softmax layer: one slot per training row plus one spare.
# NOTE(review): this is an upper bound on the number of distinct training titles,
# not the true class count — confirm the extra width is intended.
num_classes = y_train.shape[0] + 1
num_classes

47690

## Tokenizing Split Data:

In [32]:
print('Vectorizing sequence data...')
# Vocabulary cap: only the `max_words` most frequent tokens are kept.
# Defined here because this cell runs before the "Parameters" section further
# down; previously a fresh kernel raised NameError. Keep the two values in sync.
max_words = 1000
tokenizer = Tokenizer(num_words=max_words)
# Fit on the training split only. Fitting on X_test as well leaks test-set
# word frequencies into the vocabulary that the features are built from.
tokenizer.fit_on_texts(X_train)

Vectorizing sequence data...


In [33]:
# Word -> integer index lookup taken from the fitted tokenizer; used below by
# convert_text_to_index_array to encode text.
dictionary = tokenizer.word_index

In [34]:
def convert_text_to_index_array(text):
    """Encode a text as the list of vocabulary indices of its words.

    The text is split with ``kpt.text_to_word_sequence`` and every word is
    looked up in the module-level ``dictionary`` (the tokenizer's word index).

    Words that are not in ``dictionary`` are skipped instead of raising
    ``KeyError``, so the function can also be applied to text the tokenizer
    was not fitted on. An empty text yields an empty list.

    Parameters
    ----------
    text : str
        Raw text to encode.

    Returns
    -------
    list of int
        Indices of the known words, in order of appearance.
    """
    words = kpt.text_to_word_sequence(text)
    return [dictionary[word] for word in words if word in dictionary]

In [35]:
# Encode every training text as a list of word indices.
# A comprehension is used so no loop variable leaks into the notebook
# namespace: the former global loop variable `text` overwrote the `text`
# module imported from keras.preprocessing at the top of the notebook.
allWordIndices = [
    convert_text_to_index_array(recipe_text) for recipe_text in X_train
]

In [36]:
# Both splits must use the SAME feature representation. Previously X_train was
# a binary bag-of-words matrix while X_test was a padded array of word indices,
# so the model was evaluated on inputs unlike anything it was trained on.
#
# The index lists are passed as a plain list of lists: wrapping ragged lists in
# np.asarray is unnecessary and is rejected by recent NumPy versions.
X_train = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')

# texts_to_matrix tokenizes and builds the matrix in one step; its width comes
# from the tokenizer's num_words, so it matches X_train.
# NOTE: this cell overwrites X_test, so it must not be run twice without
# re-running the split.
X_test = tokenizer.texts_to_matrix(X_test, mode='binary')

In [37]:
# One-hot encode the titles using ONE shared title -> class-id mapping.
# Factorizing y_train and y_test independently gives the same title unrelated
# ids in the two splits, which makes the test targets meaningless.
train_ids, known_titles = y_train.factorize()

# Titles that never occur in training have no learned class. They are routed
# to the last column, which training ids never occupy (asserted below).
unseen_class = num_classes - 1
assert len(known_titles) <= unseen_class, \
    'num_classes must leave one spare slot for titles unseen in training'

test_ids = known_titles.get_indexer(y_test)  # -1 marks titles unseen in training
test_ids[test_ids == -1] = unseen_class

# NOTE: this cell overwrites y_train / y_test with dense arrays, so it must not
# be run twice without re-running the split.
y_train = keras.utils.to_categorical(train_ids, num_classes)
y_test = keras.utils.to_categorical(test_ids, num_classes)

In [38]:
# Show the dimensions of the encoded target arrays.
for caption, targets in (('y_train shape:', y_train), ('y_test shape:', y_test)):
    print(caption, targets.shape)

y_train shape: (47689, 47690)
y_test shape: (11923, 47690)


In [39]:
# Show the dimensions of the feature arrays.
for caption, features in (('X_train shape:', X_train), ('X_test shape:', X_test)):
    print(caption, features.shape)

X_train shape: (47689, 1000)
X_test shape: (11923, 1000)


## Parameters:

In [None]:
# Tunable settings used by the tokenizer and by model training:
#   max_words  - vocabulary size / input width of the network
#   batch_size - samples per gradient update and per evaluation batch
#   epochs     - passes over the training data
max_words, batch_size, epochs = 1000, 100, 8

## Training Models:

In [None]:
print('Building model...')
# Feed-forward classifier: one 512-unit ReLU hidden layer with dropout,
# followed by a softmax over `num_classes` outputs.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# 10% of the training rows are held back by Keras for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    shuffle=True,
    verbose=1,
)

# `score` is [loss, accuracy] on the held-out test split.
score = model.evaluate(X_test, y_test, verbose=1, batch_size=batch_size)

for caption, value in zip(('Test score:', 'Test accuracy:'), score):
    print(caption, value)

Building model...
Train on 42920 samples, validate on 4769 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8

In [None]:
# Persist the architecture (JSON) and the trained weights (HDF5) as two files.
with open('model.json', 'w') as json_file:
    json_file.write(model.to_json())

model.save_weights('model.h5')