In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os
import pandas as pd 
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import requests
nltk.download('punkt')
import gensim
import re
import sys
import random
import numpy as np
from collections import OrderedDict
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline
import json

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Prepare Data

### Prepare query words for querying the server

In [None]:
# Load the food table and choose the words that will be sent to the recipe server.
foods = pd.read_csv("foods.csv", delimiter=",", skipinitialspace=True)

# Get unique words. Missing names are dropped and the rest coerced to str so
# that every query word supports the string operations applied to it later.
unique_words = foods["food_name"].dropna().astype(str).unique()
query_words = list(unique_words) # list of words we will query

# Only a random subset is queried. The sample size is capped at the number of
# available words because random.sample raises ValueError when asked for more
# items than the population contains.
query_words = random.sample(query_words, min(200, len(query_words)))

def make_query_string(blim, tlim, words=None):
    """Join a slice of the query words into one comma-separated string.

    Args:
        blim: Index of the first word to include.
        tlim: Index one past the last word to include.
        words: Optional sequence of words to slice. Defaults to the
            module-level ``query_words`` list.

    Returns:
        The words in ``[blim, tlim)`` joined by ``","`` with no trailing
        separator; an empty string when the slice is empty.
    """
    if words is None:
        words = query_words
    # The comma is the separator of the query, so commas inside a word are
    # removed. str.replace returns a new string, so its result must be used.
    return ",".join(word.replace(",", "") for word in words[blim:tlim])

def make_all_query_strings(chunk_size=100):
    """Split the query words into consecutive batches and build one query string per batch.

    Args:
        chunk_size: Maximum number of words per query string (default 100).

    Returns:
        A list of comma-separated query strings covering every word in
        ``query_words`` exactly once. The last string may hold fewer than
        ``chunk_size`` words; the list is empty when there are no words.

    Raises:
        ValueError: If ``chunk_size`` is not a positive integer.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    total = len(query_words)
    # Stepping through the start offsets handles every length uniformly:
    # fewer words than one chunk, an exact multiple, or a short final chunk.
    return [
        make_query_string(start, min(start + chunk_size, total))
        for start in range(0, total, chunk_size)
    ]
        

# Base URL that get_corpus() prefixes to the "/recipe" endpoint.
server_name = "http://localhost:5000"

def get_corpus(strings=None):
    """Fetch recipe text from the server for every query string.

    Args:
        strings: Optional iterable of comma-separated query strings. When
            omitted, the strings are built with ``make_all_query_strings()``
            so the function works on a fresh kernel.

    Returns:
        A list holding the decoded JSON body of every successful (HTTP 200)
        response, in request order. Failed requests are reported and skipped.
    """
    if strings is None:
        # Fall back to an empty list if no query strings could be built.
        strings = make_all_query_strings() or []

    responses = []
    for a, string in enumerate(strings):
        if not string:
            # Nothing to ask the server for.
            continue
        # Let requests build the query string so that spaces, '&' and other
        # reserved characters inside food names are escaped correctly.
        r = requests.get(server_name + "/recipe",
                         params={"query": string, "number": 10})
        if r.status_code != 200:
            print("Error: {} - {}".format(r.status_code, r.text))
        else:
            print("Success")
            # Only successful responses are decoded and kept; an error body
            # would otherwise end up in the corpus (or fail to decode).
            responses.append(r.json())
        print(a)
    return responses

### Get and Save Corpus

In [None]:
# Query the server (one HTTP request per query string) and keep the responses.
corpus = get_corpus()

In [None]:
# Save to JSON so that we don't have to query again.
# The corpus is serialised twice (a JSON document wrapped in a JSON string),
# so whoever reads the file back has to decode it twice as well.
my_json = json.dumps(corpus)
with open("corpus0-150.json", "w") as f:
    f.write(json.dumps(my_json))

### Load the saved corpus JSON (if get_corpus() has already been run) and build the vocabulary

In [None]:
# The file is expected to hold a JSON string that itself contains the
# JSON-encoded corpus: decode the file first, then decode the resulting string.
with open("corpus0-50_3.json", "r") as f:
    corpus = json.loads(json.load(f))

In [None]:
# Flatten every non-empty recipe text into one long list of lower-case tokens.
words = []
for response in corpus:  # one response per query string
    for query in response:
        text = query["corpus"]
        if text == "":
            continue
        # Split into rough sentences on end-of-sentence punctuation.
        for sentence in re.split(r"[.!?]", text.lower()):
            tokens = word_tokenize(sentence.replace(",", ""))
            # Keep purely alphabetic tokens longer than one character.
            words.extend(tok for tok in tokens if len(tok) > 1 and tok.isalpha())

In [None]:
# Vocabulary: the distinct tokens of the corpus.
vocab = set(words)

# Word <-> index lookup tables. The indices are assigned in sorted order
# rather than set-iteration order: string hashing is randomised per process,
# so set order differs between kernel runs and trained weights could not be
# matched back to words after a restart.
char_to_int = {word: index for index, word in enumerate(sorted(vocab))}
int_to_char = {index: word for word, index in char_to_int.items()}
print(len(vocab))

### Context

In [None]:
# Build (centre word, context word) training pairs: every word is paired with
# up to `window_size` neighbours on each side.
window_size = 10
temp_dict = []  # list of (centre, context) tuples, despite the name
n_words = len(words)
for i, curr_word in enumerate(words):
    # Clamp the window to the token list. The bound is the number of tokens
    # (not the vocabulary size), and the right edge is inclusive so both
    # sides of the window have the same width.
    first = max(0, i - window_size)
    last = min(n_words, i + window_size + 1)
    for z in range(first, last):
        if z != i:
            temp_dict.append((curr_word, words[z]))

# One-hot encode both members of every pair.
# NOTE: each pair costs two dense vectors of len(vocab) floats, so memory
# grows with (number of pairs) x (vocabulary size).
vocab_size = len(vocab)
X = []
Y = []
for center, context in temp_dict:
    tempx = np.zeros(vocab_size)
    tempy = np.zeros(vocab_size)
    tempx[char_to_int[center]] = 1
    tempy[char_to_int[context]] = 1
    X.append(tempx)
    Y.append(tempy)

### Embeddings and Weight Updating

In [None]:
# NOTE(review): tf.placeholder / tf.Session / tf.train.* are TensorFlow 1.x
# graph-mode APIs -- confirm the installed TensorFlow version provides them.

# Hyper-parameters.
embedding_size = 100
batch_size = 64
epochs = 100 
n_batches = len(X) // batch_size  # number of full batches
learning_rate= 0.001

# Inputs: one-hot centre word (x) and one-hot context word (y).
x = tf.placeholder(tf.float32,shape = (None,len(vocab)))
y = tf.placeholder(tf.float32,shape = (None,len(vocab)))

# Two linear layers: vocabulary -> embedding -> vocabulary.
w1 = tf.Variable(tf.random_normal([len(vocab),embedding_size]),dtype = tf.float32)
b1 = tf.Variable(tf.random_normal([embedding_size]),dtype = tf.float32)
w2 = tf.Variable(tf.random_normal([embedding_size,len(vocab)]),dtype = tf.float32)
b2 = tf.Variable(tf.random_normal([len(vocab)]),dtype = tf.float32)
hidden_y = tf.matmul(x,w1) + b1
y_pred = tf.matmul(hidden_y,w2) + b2

# The loss takes (labels, predictions); pass them by keyword so the roles
# cannot be swapped silently.
cost = tf.reduce_mean(tf.losses.mean_squared_error(labels=y, predictions=y_pred))
optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

init = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
saver = tf.train.Saver()

# Cap this process at a third of the GPU memory, then initialise the variables.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.33)
sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
sess.run(init)

### Train

In [None]:
# NOTE(review): the number of passes is hard-coded to 5; the `epochs`
# hyper-parameter is not used here -- confirm which value is intended.
for epoch in range(5):
    total_cost = 0.0
    # Visit every full batch, including the last one, so that the average
    # below really is the mean over `n_batches` batches. Samples beyond the
    # last full batch are not used.
    for i in range(n_batches):
        batch_x = X[i*batch_size:(i+1)*batch_size]
        batch_y = Y[i*batch_size:(i+1)*batch_size]
        _,c = sess.run([optimizer,cost],feed_dict = {x:batch_x,y:batch_y})
        total_cost += c
    # Guard against an empty training set (no full batch available).
    avg_cost = total_cost / n_batches if n_batches else 0.0
    print('Epoch',epoch,' - ',avg_cost)

# NOTE(review): absolute, machine-specific checkpoint path.
save_path = saver.save(sess,'/Users/Owner/repos/food2vec-text-api/weights.ckpt')

### Get all the words embedded

In [None]:
# A word's embedding is its hidden-layer activation (`hidden_y`), a vector of
# `embedding_size` values -- not the vocabulary-sized output layer.
# For a one-hot input, x @ w1 + b1 is simply the word's row of w1 plus b1, so
# the trained weights are fetched once instead of running the session once
# per word.
w1_values, b1_values = sess.run([w1, b1])
embedding_matrix = w1_values + b1_values  # shape: (len(vocab), embedding_size)

embeddings = {word: embedding_matrix[char_to_int[word]] for word in vocab}

### Find the closest words

In [None]:
def closest(word, n, vectors=None):
    """Print and return the ``n`` words whose embeddings are most similar to ``word``.

    Similarity is the cosine of the angle between two embedding vectors. The
    query word itself takes part in the ranking (its similarity is 1).

    Args:
        word: Word to look up.
        n: Maximum number of words to report.
        vectors: Optional mapping of word -> 1-D vector. Defaults to the
            module-level ``embeddings`` dictionary.

    Returns:
        A list of at most ``n`` words, most similar first.

    Raises:
        KeyError: If ``word`` has no embedding.
    """
    if vectors is None:
        vectors = embeddings

    target = np.asarray(vectors[word], dtype=float).ravel()
    target_norm = np.linalg.norm(target)

    distances = {}
    for w, vec in vectors.items():
        vec = np.asarray(vec, dtype=float).ravel()
        denom = target_norm * np.linalg.norm(vec)
        # A zero vector has no direction; treat its similarity as 0.
        distances[w] = float(np.dot(vec, target) / denom) if denom else 0.0

    # dict views cannot be sliced, so rank into a list before truncating.
    s_words = sorted(distances, key=distances.get, reverse=True)[:n]
    print(s_words)
    return s_words

### Visualization

In [None]:
# Project the embeddings to 2-D with t-SNE and draw a labelled scatter plot.
labels = []
tokens = []
for w in embeddings.keys():
    labels.append(w)
    tokens.append(embeddings[w])

# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` --
# confirm against the installed version.
tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
# Hand t-SNE a single 2-D array rather than a list of vectors.
new_values = tsne_model.fit_transform(np.array(tokens))

# Use dedicated names for the coordinates: `x` and `y` are the TensorFlow
# placeholders, and rebinding them here would break any re-run of the
# training or embedding cells.
x_coords = [value[0] for value in new_values]
y_coords = [value[1] for value in new_values]

plt.figure(figsize=(16, 16)) 
for i in range(len(x_coords)):
    # One scatter call per point keeps a distinct colour for every word.
    plt.scatter(x_coords[i], y_coords[i])
    plt.annotate(labels[i],
                     xy=(x_coords[i], y_coords[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
plt.show()