In [1]:
import os
import pandas as pd
import numpy as np
import collections
import tensorflow as tf
import pickle

In [2]:
models_folder_name = os.path.join(os.getcwd(),'models')
path_to_preprocessed_texts = os.path.join(os.getcwd(),
                                          'texts','preprocessed_texts_for_doc2vec.pkl') 

df_preprocessed_texts = pd.read_pickle(path_to_preprocessed_texts)

preprocessed_texts = df_preprocessed_texts.preprocessed_texts.values.tolist()
labels = df_preprocessed_texts['labels'].values.tolist()

unique_labels=sorted(set(labels))
number_categories=len(unique_labels)
categories_indices=np.linspace(0,number_categories-1,number_categories,dtype=int)
labels2integers=dict(zip(unique_labels,categories_indices))

print(labels2integers)

{'sushi': 2, 'steak': 1, 'tiramisu': 3, 'sashimi': 0}


In [3]:
batch_size = 2
generations = 100000
model_learning_rate = 0.001

embedding_size = 24   #word embedding size
doc_embedding_size = 12  #document embedding size
concatenated_size = embedding_size + doc_embedding_size

save_embeddings_every = 5000
print_valid_every = 5000
print_loss_every = 50

In [4]:
def build_dictionary(preprocessed_texts):
    words=[w for words_in_recipe in preprocessed_texts for w in words_in_recipe]
    count = []
    count.extend(collections.Counter(words))
    count=sorted(count)
    word_dict = {}
    for word in count:
        word_dict[word] = len(word_dict)
    
    return (word_dict)

#replace each word in texts with integer value
def text_to_numbers(preprocessed_texts, word_dict):
    data = []
    for prepr_text in preprocessed_texts:
        text_data = []
        for word in prepr_text:
            if word in word_dict:
                word_ix = word_dict[word]
            else:
                word_ix = 0
            text_data.append(word_ix)
        data.append(text_data)
    return (data)


def create_batch_data(text_with_words_conv_to_numbers, batch_size=batch_size):
    batch_data = []
    label_data = []
    
    rand_text_ix = int(np.random.choice(len(text_with_words_conv_to_numbers), size=1))
    rand_text = text_with_words_conv_to_numbers[rand_text_ix]
    word_to_predict_label=np.random.choice(list(set(rand_text)), size=1,replace=False)
    
    while len(batch_data) < batch_size:
        item_in_batch=[]        
        
        label_words=np.random.choice(rand_text, size=1,replace=False)

        item_in_batch.extend(word_to_predict_label)
        item_in_batch.append(rand_text_ix)     
        label_data.extend(label_words)
        batch_data.append(item_in_batch)

        
    batch_data = np.array(batch_data)
    label_data = np.transpose(np.array(label_data))

    return (batch_data, label_data)

In [5]:
word_dictionary=build_dictionary(preprocessed_texts)
vocabulary_size=len(word_dictionary)
print(word_dictionary)
print(vocabulary_size)

word_dictionary_rev = dict(zip(word_dictionary.values(), word_dictionary.keys()))

{'seed': 100, 'surface': 122, 'sashimi': 96, 'starch': 115, 'press': 86, 'garnish': 47, 'stick': 117, 'soy': 108, 'onion': 75, 'tempura': 124, 'sauce': 97, 'breast': 14, 'speed': 110, 'carrot': 18, 'curl': 33, 'paper': 76, 'pinch': 81, 'noodle': 72, 'skewer': 106, 'rice': 90, 'vinegar': 138, 'cream': 31, 'quantity': 88, 'bag': 4, 'spicy': 111, 'pressure': 87, 'roe': 91, 'guacamole': 53, 'butter': 16, 'shrimp': 103, 'sesame': 101, 'thumb': 127, 'salt': 95, 'water': 141, 'confectioner': 28, 'pepper': 79, 'sirloin': 105, 'spread': 112, 'bottom': 13, 'liqueur': 60, 'leg': 58, 'preheat': 85, 'batter': 7, 'lime': 59, 'tomato': 129, 'chicken': 20, 'tuna': 135, 'yolk': 147, 'juice': 54, 'daikon': 36, 'salmon': 94, 'wafer': 139, 'chocolate': 22, 'grill': 51, 'sushi': 123, 'leaf': 57, 'strawberry': 119, 'mixture': 70, 'sugar': 121, 'teriyaki': 125, 'coffee': 26, 'mushroom': 71, 'pour': 83, 'sprinkle': 114, 'grate': 50, 'wasabi': 140, 'grain': 49, 'finger': 43, 'space': 109, 'tobikko': 128, 'powd

In [6]:
text_data = text_to_numbers(preprocessed_texts, word_dictionary)
print(text_data)

[[94, 3, 75, 140, 100, 90, 72, 80, 94, 95, 94, 141, 44, 53, 75, 129, 95, 95, 95, 95, 95, 95, 95, 95, 34, 3, 3, 3, 3, 80, 75, 70, 3, 70, 53, 92, 59, 54, 59, 59, 54, 59, 54, 3, 54, 59, 53, 70, 140, 140, 10, 140, 53, 10, 10, 101, 100, 108, 97, 129, 129, 129, 3, 29, 57, 57, 70, 29, 53, 24, 94, 95, 94, 94, 44, 96, 34, 107, 24, 10, 115, 24, 107, 94, 115, 24, 115, 94, 94, 115, 94, 94, 94, 46, 107, 46, 94, 74, 94, 94, 117, 46, 94, 94, 94, 94, 94, 94, 46, 74, 94, 74, 94, 47, 94, 90, 72, 72, 94, 53, 29, 90, 72, 53], [135, 140, 108, 97, 48, 49, 82, 43, 49, 107], [96, 94, 41, 101, 100, 74, 101, 100, 97, 48, 41, 101, 100, 74, 122, 101, 100, 74, 43, 94, 41, 101, 100, 122, 43, 41, 41, 44, 101, 100, 74, 117, 41, 41, 44, 82, 41, 43, 41, 41, 34, 41, 97, 122, 11, 94, 107, 43, 82, 94, 97, 48, 94, 96, 107, 97], [123, 41, 12, 123, 41, 12, 29, 57, 101, 100, 74, 135, 135, 41, 107, 107, 41, 34, 135, 107, 12, 12, 135, 100, 74, 43, 74, 135, 29, 41, 122, 41, 12, 11, 44, 135, 12, 44, 44, 82, 41, 12, 1, 34, 1, 11, 

In [7]:
valid_words = ['tuna', 'rice', 'sushi', 'roll', 'sashimi','steak','grill', 'sauce', 'cream']

valid_examples = [word_dictionary[x] for x in valid_words]
print(valid_examples)

[135, 90, 123, 92, 96, 116, 51, 97, 31]


In [8]:
batch_data, label_data = create_batch_data(preprocessed_texts)
print(batch_data)
print(label_data)
print(np.shape(label_data))

[['soy' '28']
 ['soy' '28']]
['rice' 'water']
(2,)


In [9]:
sess = tf.Session()

In [10]:
print('Creating Model')

embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
doc_embeddings = tf.Variable(tf.random_uniform([len(preprocessed_texts), doc_embedding_size], -1.0, 1.0))

decoder_weights = tf.Variable(tf.truncated_normal([vocabulary_size, concatenated_size],
                                               stddev=1.0 / np.sqrt(concatenated_size)))
decoder_biases = tf.Variable(tf.zeros([vocabulary_size]))


x_inputs = tf.placeholder(tf.int32, shape=[None, 2]) #1 for word index and 1 for doc index
y_target = tf.placeholder(tf.int32, shape=[batch_size])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

embed= tf.nn.embedding_lookup(embeddings, x_inputs[:, 0])
    
doc_indices = tf.slice(x_inputs, [0,1],[batch_size,1])
doc_embed = tf.nn.embedding_lookup(doc_embeddings,doc_indices)
final_embed = tf.concat([embed, tf.squeeze(doc_embed)],1)

logits = tf.matmul(final_embed, tf.transpose(decoder_weights)) + decoder_biases


loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,labels=y_target))
optimizer = tf.train.AdamOptimizer(learning_rate=model_learning_rate)
train_step = optimizer.minimize(loss)

#cosine similarity between words
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
normalized_embeddings = embeddings / norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)


saver = tf.train.Saver({"embeddings": embeddings, "doc_embeddings": doc_embeddings})

init = tf.initialize_all_variables()
sess.run(init)


print('Starting Training')

loss_vec = []
loss_x_vec = []
for i in range(generations):
    batch_inputs, batch_labels = create_batch_data(text_data)
    feed_dict = {x_inputs : batch_inputs, y_target : batch_labels}

    #run the train step
    sess.run(train_step, feed_dict=feed_dict)

    #return the loss
    if (i+1) % print_loss_every == 0:
        loss_val = sess.run(loss, feed_dict=feed_dict)
        loss_vec.append(loss_val)
        loss_x_vec.append(i+1)
        print('Loss at step {} : {}'.format(i+1, loss_val))
        
    #validation: print some random words and top 5 related words
    if (i+1) % print_valid_every == 0:
        sim = sess.run(similarity, feed_dict=feed_dict)
        for j in range(len(valid_words)):
            valid_word = word_dictionary_rev[valid_examples[j]]
            top_k = 5 # number of nearest neighbors
            nearest = (-sim[j, :]).argsort()[1:top_k+1]
            log_str = "Nearest to {}:".format(valid_word)
            for k in range(top_k):
                close_word = word_dictionary_rev[nearest[k]]
                log_str = '{} {},'.format(log_str, close_word)
            print(log_str)
            
    #save dictionary + embeddings
    if (i+1) % save_embeddings_every == 0:
        #save vocabulary dictionary
        with open(os.path.join(models_folder_name,'doc2vec_recipes_dict_words_integers.pkl'), 'wb') as f:
            pickle.dump(word_dictionary, f)
        
        #save embeddings
        model_checkpoint_path = os.path.join(os.getcwd(),models_folder_name,'doc2vec_recipes_checkpoint.ckpt')
        save_path = saver.save(sess, model_checkpoint_path)
        print('Model saved in file: {}'.format(save_path))

Creating Model
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Starting Training
Loss at step 50 : 5.709970951080322
Loss at step 100 : 4.424684047698975
Loss at step 150 : 5.260358810424805
Loss at step 200 : 5.357036590576172
Loss at step 250 : 5.188162803649902
Loss at step 300 : 5.177769660949707
Loss at step 350 : 5.223682403564453
Loss at step 400 : 5.191035270690918
Loss at step 450 : 4.744154930114746
Loss at step 500 : 5.264348030090332
Loss at step 550 : 4.773748397827148
Loss at step 600 : 4.752451419830322
Loss at step 650 : 5.3109259605407715
Loss at step 700 : 4.93099308013916
Loss at step 750 : 4.642880916595459
Loss at step 800 : 4.118608474731445
Loss at step 850 : 4.740657806396484
Loss at step 900 : 5.834427833557129
Loss at step 950 : 4.929481029510498
Loss at step 1000 : 4.926198482513428
Loss at step 1050 : 4.7685370445251465
Loss at step 1100 : 4.662642002105713
Los

Loss at step 9250 : 2.464491128921509
Loss at step 9300 : 3.1034622192382812
Loss at step 9350 : 3.695023536682129
Loss at step 9400 : 4.652491569519043
Loss at step 9450 : 3.9373040199279785
Loss at step 9500 : 5.968338966369629
Loss at step 9550 : 2.275705337524414
Loss at step 9600 : 3.187108278274536
Loss at step 9650 : 3.0416078567504883
Loss at step 9700 : 4.031565189361572
Loss at step 9750 : 4.243646144866943
Loss at step 9800 : 2.9661710262298584
Loss at step 9850 : 4.129406929016113
Loss at step 9900 : 3.2145445346832275
Loss at step 9950 : 3.6891138553619385
Loss at step 10000 : 3.98911714553833
Nearest to tuna: salmon, espresso, wafer, batter, spring,
Nearest to rice: cling, position, cucumber, space, sauce,
Nearest to sushi: towel, vinegar, flesh, zip, strip,
Nearest to roll: flesh, sesame, sauce, cone, dipping,
Nearest to sashimi: piece, leaf, torch, topping, daikon,
Nearest to steak: salt, leaf, onion, pepper, ground,
Nearest to grill: grate, worcestershire, chive, salt,

Loss at step 18100 : 2.9585819244384766
Loss at step 18150 : 3.6475372314453125
Loss at step 18200 : 5.69390344619751
Loss at step 18250 : 4.089812755584717
Loss at step 18300 : 3.378868818283081
Loss at step 18350 : 3.0530407428741455
Loss at step 18400 : 0.8624075055122375
Loss at step 18450 : 4.986883640289307
Loss at step 18500 : 4.409065246582031
Loss at step 18550 : 2.4723587036132812
Loss at step 18600 : 4.020962238311768
Loss at step 18650 : 3.7619361877441406
Loss at step 18700 : 3.6376419067382812
Loss at step 18750 : 2.9598886966705322
Loss at step 18800 : 3.5508806705474854
Loss at step 18850 : 1.8267490863800049
Loss at step 18900 : 5.214532852172852
Loss at step 18950 : 3.141719341278076
Loss at step 19000 : 4.310990810394287
Loss at step 19050 : 2.5568935871124268
Loss at step 19100 : 3.5694637298583984
Loss at step 19150 : 2.9866292476654053
Loss at step 19200 : 3.4422473907470703
Loss at step 19250 : 0.9102305769920349
Loss at step 19300 : 2.7261962890625
Loss at step 

Loss at step 27000 : 2.9443163871765137
Loss at step 27050 : 2.595182418823242
Loss at step 27100 : 3.2917532920837402
Loss at step 27150 : 2.4603588581085205
Loss at step 27200 : 3.7058727741241455
Loss at step 27250 : 2.747903823852539
Loss at step 27300 : 5.492403984069824
Loss at step 27350 : 3.084425687789917
Loss at step 27400 : 3.2286510467529297
Loss at step 27450 : 3.3954687118530273
Loss at step 27500 : 2.8226919174194336
Loss at step 27550 : 2.9183273315429688
Loss at step 27600 : 2.9342098236083984
Loss at step 27650 : 4.356785774230957
Loss at step 27700 : 4.089685440063477
Loss at step 27750 : 3.3576622009277344
Loss at step 27800 : 3.5520176887512207
Loss at step 27850 : 3.4748029708862305
Loss at step 27900 : 2.6164774894714355
Loss at step 27950 : 4.430897235870361
Loss at step 28000 : 3.0526623725891113
Loss at step 28050 : 2.715397834777832
Loss at step 28100 : 2.5369174480438232
Loss at step 28150 : 3.08416748046875
Loss at step 28200 : 2.2701871395111084
Loss at st

Loss at step 35900 : 2.661323308944702
Loss at step 35950 : 3.1783242225646973
Loss at step 36000 : 3.066302537918091
Loss at step 36050 : 5.271042823791504
Loss at step 36100 : 3.08550763130188
Loss at step 36150 : 2.9678473472595215
Loss at step 36200 : 4.599382400512695
Loss at step 36250 : 4.209259510040283
Loss at step 36300 : 2.052501916885376
Loss at step 36350 : 2.4843480587005615
Loss at step 36400 : 2.3564658164978027
Loss at step 36450 : 2.652085781097412
Loss at step 36500 : 2.2219014167785645
Loss at step 36550 : 2.2271344661712646
Loss at step 36600 : 2.946514129638672
Loss at step 36650 : 2.442829132080078
Loss at step 36700 : 2.1596226692199707
Loss at step 36750 : 2.716336727142334
Loss at step 36800 : 4.5653533935546875
Loss at step 36850 : 2.561020851135254
Loss at step 36900 : 1.828066110610962
Loss at step 36950 : 2.5727365016937256
Loss at step 37000 : 2.928762435913086
Loss at step 37050 : 3.8836286067962646
Loss at step 37100 : 3.3459279537200928
Loss at step 37

Model saved in file: /notebooks/school/text_feature_extraction/models/doc2vec_recipes_checkpoint.ckpt
Loss at step 45050 : 2.9423041343688965
Loss at step 45100 : 3.6911678314208984
Loss at step 45150 : 2.7664008140563965
Loss at step 45200 : 3.050328493118286
Loss at step 45250 : 4.157364368438721
Loss at step 45300 : 1.6424990892410278
Loss at step 45350 : 2.024791717529297
Loss at step 45400 : 4.145397186279297
Loss at step 45450 : 3.793700695037842
Loss at step 45500 : 2.9577395915985107
Loss at step 45550 : 4.773894786834717
Loss at step 45600 : 2.4538161754608154
Loss at step 45650 : 2.523203134536743
Loss at step 45700 : 3.7238528728485107
Loss at step 45750 : 3.231067419052124
Loss at step 45800 : 3.901339530944824
Loss at step 45850 : 4.41019344329834
Loss at step 45900 : 2.112452268600464
Loss at step 45950 : 2.128045082092285
Loss at step 46000 : 2.0744540691375732
Loss at step 46050 : 3.3924074172973633
Loss at step 46100 : 4.242480278015137
Loss at step 46150 : 2.474838495

Loss at step 54550 : 3.492454767227173
Loss at step 54600 : 3.438828945159912
Loss at step 54650 : 2.784482479095459
Loss at step 54700 : 2.5015997886657715
Loss at step 54750 : 1.6735605001449585
Loss at step 54800 : 2.465521812438965
Loss at step 54850 : 3.055821418762207
Loss at step 54900 : 4.563014507293701
Loss at step 54950 : 4.09375524520874
Loss at step 55000 : 2.792577028274536
Nearest to tuna: mushroom, block, spring, piece, crab,
Nearest to rice: salmon, finger, strip, cling, cucumber,
Nearest to sushi: towel, fish, strip, flesh, roll,
Nearest to roll: sesame, flesh, sushi, bamboo, bit,
Nearest to sashimi: leaf, piece, coriander, torch, angle,
Nearest to steak: onion, salt, grill, marinade, pepper,
Nearest to grill: worcestershire, marinade, preheat, grate, steak,
Nearest to sauce: chive, salmon, grill, ground, press,
Nearest to cream: liqueur, mascarpone, wafer, coffee, chocolate,
Model saved in file: /notebooks/school/text_feature_extraction/models/doc2vec_recipes_checkpo

Loss at step 63400 : 4.558870792388916
Loss at step 63450 : 2.710268974304199
Loss at step 63500 : 2.1965832710266113
Loss at step 63550 : 2.426025867462158
Loss at step 63600 : 5.615057468414307
Loss at step 63650 : 3.3772387504577637
Loss at step 63700 : 2.8770928382873535
Loss at step 63750 : 4.877509117126465
Loss at step 63800 : 2.7750353813171387
Loss at step 63850 : 2.7906055450439453
Loss at step 63900 : 2.877695083618164
Loss at step 63950 : 2.4403204917907715
Loss at step 64000 : 3.4884424209594727
Loss at step 64050 : 3.9508163928985596
Loss at step 64100 : 3.767225980758667
Loss at step 64150 : 3.4206182956695557
Loss at step 64200 : 3.000096082687378
Loss at step 64250 : 2.422187328338623
Loss at step 64300 : 2.8212547302246094
Loss at step 64350 : 3.2788143157958984
Loss at step 64400 : 3.5826988220214844
Loss at step 64450 : 3.7994251251220703
Loss at step 64500 : 2.2215359210968018
Loss at step 64550 : 3.3639817237854004
Loss at step 64600 : 2.53136944770813
Loss at ste

Loss at step 72250 : 2.9077465534210205
Loss at step 72300 : 2.947680711746216
Loss at step 72350 : 2.6043169498443604
Loss at step 72400 : 3.8368682861328125
Loss at step 72450 : 1.7162790298461914
Loss at step 72500 : 2.349435806274414
Loss at step 72550 : 3.710644006729126
Loss at step 72600 : 3.0131728649139404
Loss at step 72650 : 2.2287354469299316
Loss at step 72700 : 2.34416127204895
Loss at step 72750 : 3.5570034980773926
Loss at step 72800 : 3.4924187660217285
Loss at step 72850 : 2.4188828468322754
Loss at step 72900 : 2.94832444190979
Loss at step 72950 : 2.121757745742798
Loss at step 73000 : 2.626128673553467
Loss at step 73050 : 2.481660842895508
Loss at step 73100 : 3.5667850971221924
Loss at step 73150 : 2.682290554046631
Loss at step 73200 : 5.021601676940918
Loss at step 73250 : 3.5182547569274902
Loss at step 73300 : 2.6504740715026855
Loss at step 73350 : 3.0604021549224854
Loss at step 73400 : 3.68375301361084
Loss at step 73450 : 2.2158634662628174
Loss at step 7

Loss at step 81100 : 2.795480251312256
Loss at step 81150 : 2.957104206085205
Loss at step 81200 : 2.9193665981292725
Loss at step 81250 : 3.4734623432159424
Loss at step 81300 : 2.1194686889648438
Loss at step 81350 : 2.806828498840332
Loss at step 81400 : 2.2994589805603027
Loss at step 81450 : 2.6702280044555664
Loss at step 81500 : 4.221332550048828
Loss at step 81550 : 2.6607918739318848
Loss at step 81600 : 3.2559471130371094
Loss at step 81650 : 5.035838603973389
Loss at step 81700 : 4.774707794189453
Loss at step 81750 : 2.2532453536987305
Loss at step 81800 : 1.7159794569015503
Loss at step 81850 : 4.329127311706543
Loss at step 81900 : 2.3767411708831787
Loss at step 81950 : 2.6047966480255127
Loss at step 82000 : 3.6123411655426025
Loss at step 82050 : 3.1980667114257812
Loss at step 82100 : 4.820920944213867
Loss at step 82150 : 2.9532103538513184
Loss at step 82200 : 2.420614004135132
Loss at step 82250 : 2.4464306831359863
Loss at step 82300 : 2.5578763484954834
Loss at s

Loss at step 90100 : 3.094653606414795
Loss at step 90150 : 2.389223575592041
Loss at step 90200 : 2.0408384799957275
Loss at step 90250 : 3.503093719482422
Loss at step 90300 : 2.556767463684082
Loss at step 90350 : 2.6688218116760254
Loss at step 90400 : 4.00308895111084
Loss at step 90450 : 1.9277732372283936
Loss at step 90500 : 2.606478691101074
Loss at step 90550 : 2.5991709232330322
Loss at step 90600 : 2.5254175662994385
Loss at step 90650 : 2.7527029514312744
Loss at step 90700 : 2.5497875213623047
Loss at step 90750 : 2.834559202194214
Loss at step 90800 : 3.051604747772217
Loss at step 90850 : 3.081176280975342
Loss at step 90900 : 3.527125358581543
Loss at step 90950 : 1.9408667087554932
Loss at step 91000 : 3.209052801132202
Loss at step 91050 : 2.760664224624634
Loss at step 91100 : 5.228216648101807
Loss at step 91150 : 2.953228712081909
Loss at step 91200 : 2.3559389114379883
Loss at step 91250 : 3.352815628051758
Loss at step 91300 : 1.5593271255493164
Loss at step 913

Loss at step 99750 : 1.6886131763458252
Loss at step 99800 : 2.5632266998291016
Loss at step 99850 : 2.465719699859619
Loss at step 99900 : 1.6104168891906738
Loss at step 99950 : 4.316152572631836
Loss at step 100000 : 3.9104461669921875
Nearest to tuna: mushroom, spring, block, crab, cucumber,
Nearest to rice: cucumber, finger, bit, spicy, salmon,
Nearest to sushi: fish, towel, roe, zip, mat,
Nearest to roll: flesh, piece, sesame, daikon, strip,
Nearest to sashimi: coriander, leaf, piece, strip, sesame,
Nearest to steak: grill, grate, pepper, marinade, onion,
Nearest to grill: grate, worcestershire, marinade, preheat, pepper,
Nearest to sauce: finger, filet, soy, salmon, slice,
Nearest to cream: liqueur, mascarpone, coffee, yolk, cocoa,
Model saved in file: /notebooks/school/text_feature_extraction/models/doc2vec_recipes_checkpoint.ckpt
