Training a 4-layer neural network using a bag-of-words model. Framework: TensorFlow

In [1]:
import json
import re
from random import shuffle
import pandas as pd
import numpy as np
import tensorflow as tf

# Reading Data

In [2]:
def open_file(file_name):
    """Read `file_name` as JSON and return the decoded Python object."""
    # Returning from inside the `with` block still closes the handle first.
    with open(file_name, 'r') as handle:
        return json.load(handle)

In [4]:
# Load the raw recipe records. Both paths are relative, so the files are
# looked up in the notebook's working directory.
train_data = open_file(file_name="train.json")
test_data = open_file(file_name="test.json")

# Cleaning Data and Creating a Bag of Words

In [9]:
def get_label_set(train_data):
    """Collect the distinct cuisine labels and save them to "Labels.npy".

    Parameters
    ----------
    train_data : iterable of dict
        Records that each provide a "cuisine" entry.

    Returns
    -------
    list of str
        The unique, lower-cased labels in ascending order.
    """
    # Sort the labels so the label -> column mapping is reproducible.
    # Iterating a bare set of strings can give a different order on each
    # interpreter run, which would silently re-map the one-hot columns.
    label_set = sorted(set(str(row["cuisine"]).lower() for row in train_data))
    np.save("Labels.npy", np.array(label_set))
    return label_set

def get_encoded_label(label,label_set):
    """Return the one-hot encoding of `label` as a nested list of shape 1 x len(label_set).

    `label_set.index` raises ValueError when `label` is not in `label_set`.
    """
    one_hot = np.zeros(shape=(1, len(label_set)))
    # The position of the label inside label_set selects the hot column.
    one_hot[0, label_set.index(label)] = 1
    return one_hot.tolist()

def clean_data(ingredient):
    """Lower-case `ingredient` and replace every non-word character with a space."""
    lowered = ingredient.lower()
    # Each matched character becomes exactly one space, so length is preserved.
    return re.sub(r'[\W]', ' ', lowered)

def create_bag_of_words(data,file_out="lexicon.npy"):
    """Build the vocabulary of all ingredient words and save it with `np.save`.

    Parameters
    ----------
    data : iterable of dict
        Records that each provide an "ingredients" list of strings.
    file_out : str
        Destination passed to `np.save`.

    Returns
    -------
    list
        The unique cleaned words in ascending order.
    """
    # Collect straight into a set so duplicate tokens are never stored.
    vocabulary = set()
    for row in data:
        for ingredient in row["ingredients"]:
            vocabulary.update(clean_data(ingredient).split())

    # Sort the words so a word's feature column is the same on every run.
    # A bare set of strings has no stable iteration order across runs.
    lexicon = sorted(vocabulary)
    np.save(file_out, np.array(lexicon))
    return lexicon


In [10]:
# The vocabulary is built from the training recipes only.
bag_of_words = create_bag_of_words(data=train_data, file_out="lexicon.npy")

# Preprocess Data

In [13]:
def process_data(in_data,bag_of_words,file_out="processed_train_data.npy",flag=0):
    """Convert raw recipes into bag-of-words count vectors and save them.

    Parameters
    ----------
    in_data : iterable of dict
        Records with an "ingredients" list. A "cuisine" entry is also read
        when `flag` is 0.
    bag_of_words : list
        Vocabulary; a word's position in this list is its feature column.
    file_out : str
        Destination passed to `np.save`.
    flag : int
        0 -> labelled data: rows of [features, one-hot label], shuffled.
        1 -> unlabelled data: rows of features, kept in input order.
        Any other value stores no rows.

    Returns
    -------
    None
    """
    # Compare by value; `is` on integers only works by implementation accident.
    labelled = flag == 0
    if labelled:
        label_set = get_label_set(in_data)

    # One dict lookup per word replaces a linear `in` test plus a linear
    # `.index()` call. setdefault keeps the first position of a repeated
    # word, which is what `.index()` returned.
    word_to_index = {}
    for position, word in enumerate(bag_of_words):
        word_to_index.setdefault(word, position)

    rows = []
    for row in in_data:
        features = [0.0] * len(bag_of_words)
        for ingredient in row["ingredients"]:
            for word in clean_data(ingredient).split():
                position = word_to_index.get(word)
                if position is not None:
                    features[position] += 1
        if labelled:
            encoded_label = get_encoded_label(str(row['cuisine']).lower(), label_set)
            rows.append((features, np.array(encoded_label)))
        elif flag == 1:
            rows.append(features)

    if labelled:
        # A feature list and a label array have different shapes. Fill an
        # explicit object array rather than letting np.array() guess a
        # layout for ragged input.
        data = np.empty((len(rows), 2), dtype=object)
        for i, (features, encoded_label) in enumerate(rows):
            data[i, 0] = features
            data[i, 1] = encoded_label
        # random.shuffle swaps rows through views on a 2-D array, which
        # duplicates some rows and drops others. np.random.shuffle permutes
        # along the first axis correctly.
        np.random.shuffle(data)
    else:
        # Unlabelled rows keep their input order so each prediction can be
        # matched back to its source record.
        data = np.array(rows)
    np.save(file_out, data)

In [14]:
# flag=0 stores [features, label] rows; flag=1 stores feature rows only.
process_data(in_data=train_data, bag_of_words=bag_of_words,
             file_out="processed_train_data.npy", flag=0)
process_data(in_data=test_data, bag_of_words=bag_of_words,
             file_out="processed_test_data.npy", flag=1)

In [15]:
def get_train_data(train_file="processed_train_data.npy",label_file="Labels.npy"):
    """Load the processed training rows and the label vocabulary.

    Parameters
    ----------
    train_file : str
        Path of the array of training rows written with `np.save`.
    label_file : str
        Path of the saved label array.

    Returns
    -------
    tuple
        (train_data, labels) as NumPy arrays.
    """
    # Each training row pairs a feature list with a label array, so the file
    # holds an object array. NumPy pickles those, and np.load refuses them
    # unless allow_pickle=True.
    # Only load files produced by this notebook: unpickling an untrusted
    # file can execute arbitrary code.
    train_data = np.load(train_file, allow_pickle=True)
    labels = np.load(label_file)
    return train_data, labels

In [16]:
# NOTE: `train_data` is rebound here, from the raw JSON records to the
# processed rows. Cells above that expect the raw records cannot be re-run
# after this cell without reloading the data.
train_data, labels = get_train_data()
train_data = pd.DataFrame(data=train_data, columns=["feature_set", "labels"])

## Separate Training and Cross-Validation Data

In [17]:
# Convert every stored feature list / label array into a float32 row and
# stack the rows into dense matrices.
X = np.array([np.array(row_features, dtype=np.float32)
              for row_features in train_data["feature_set"]])
Y = np.array([np.array(row_label, dtype=np.float32)
              for row_label in train_data["labels"]])
# Flatten the labels to one row per sample and one column per class.
Y = Y.reshape((len(X), len(labels)))

In [18]:
# The last 5000 rows are held out for validation; everything before them
# is used for training.
train_X, test_X = X[:-5000], X[-5000:]
train_y, test_y = Y[:-5000], Y[-5000:]

# Model

In [19]:
# Network dimensions: input width, three hidden widths, output width.
input_nodes = len(X[0])
n_nodes_1, n_nodes_2, n_nodes_3 = 1200, 1500, 1000
n_classes = len(labels)

# Training schedule.
hm_epochs = 1
batch_size = 128
# Whole batches only: a trailing partial batch is not counted.
n_batches = len(train_X) // batch_size


In [20]:
def _dense_layer(n_inputs, n_outputs):
    """Create the weight matrix and bias vector for one fully connected layer."""
    # NOTE(review): tf.random_normal is called with its default arguments;
    # the very large first-batch losses in the training log may be related
    # to this initialisation. Confirm before changing.
    weights = tf.Variable(tf.random_normal([n_inputs, n_outputs]))
    bias = tf.Variable(tf.random_normal([n_outputs]))
    return {'f_num': n_outputs, "weights": weights, "bias": bias}


# Graph inputs: feature rows and their targets.
x = tf.placeholder('float', [None, input_nodes])
y = tf.placeholder('float')

# Layers are created in network order, weights before bias for each layer.
hidden_layer_1 = _dense_layer(input_nodes, n_nodes_1)
hidden_layer_2 = _dense_layer(n_nodes_1, n_nodes_2)
hidden_layer_3 = _dense_layer(n_nodes_2, n_nodes_3)
output_layer = _dense_layer(n_nodes_3, n_classes)


In [21]:
def neural_network_model(data):
    """Apply the three ReLU hidden layers and the linear output layer to `data`.

    Returns the output layer's pre-activation values, with no softmax applied.
    """
    activations = data
    # Every hidden layer is an affine transform followed by ReLU.
    for layer in (hidden_layer_1, hidden_layer_2, hidden_layer_3):
        pre_activation = tf.add(tf.matmul(activations, layer['weights']), layer['bias'])
        activations = tf.nn.relu(pre_activation)

    # The output layer stays linear.
    return tf.matmul(activations, output_layer['weights']) + output_layer['bias']


# Training

In [22]:
def train_neural_network(X,Y,saver):
    """Train the network on (X, Y) in mini-batches and save a checkpoint.

    Parameters
    ----------
    X, Y : array-like
        Training features and one-hot targets, sliced in step by `batch_size`.
    saver : tf.train.Saver
        Used to write the checkpoint to "What's Cooking/model.ckpt".

    Notes
    -----
    Reads the module-level `x`, `y`, `hm_epochs`, `batch_size`, `test_X` and
    `test_y`. Accuracy printed after every batch and every epoch is measured
    on `test_X` / `test_y`. A trailing partial batch is skipped.
    """
    import os  # local import: only needed to prepare the checkpoint directory

    prediction = neural_network_model(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=prediction))
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

    # Build the accuracy ops once. Creating them inside the batch loop added
    # new nodes to the graph on every step, so the graph grew without bound.
    correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
    validation_feed = {x: test_X, y: test_y}

    # Derive the batch count from the data actually passed in, not from a
    # global computed on some other array.
    n_batches = len(X) // batch_size

    # Create the checkpoint directory before training starts, so a missing
    # directory cannot make the save fail after all the work is done.
    checkpoint_path = "What's Cooking/model.ckpt"
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if not os.path.isdir(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(hm_epochs):
            epoch_loss = 0
            print(str(epoch)+"/"+str(hm_epochs))
            for batch in range(n_batches):
                start = batch * batch_size
                end = start + batch_size
                _, c = sess.run([optimizer, cost],
                                feed_dict={x: X[start:end], y: Y[start:end]})
                epoch_loss += c
                print("Batch: "+str(batch)+" Epoch Loss: "+str(epoch_loss)+" Batch Accuaracy: "+str(accuracy.eval(validation_feed)))
            print("Epoch: "+str(epoch)+" Epoch Loss: " + str(epoch_loss) + " Validation Accuaracy: " + str(accuracy.eval(validation_feed)))
        saver.save(sess, checkpoint_path)


In [23]:
# The Saver is created before the training function builds its optimizer.
saver = tf.train.Saver()
train_neural_network(X=train_X, Y=train_y, saver=saver)

0/1
Batch: 0 Epoch Loss: 150944.90625 Batch Accuaracy: 0.0318
Batch: 1 Epoch Loss: 259233.015625 Batch Accuaracy: 0.0492
Batch: 2 Epoch Loss: 364763.945313 Batch Accuaracy: 0.0756
Batch: 3 Epoch Loss: 450674.945313 Batch Accuaracy: 0.1072
Batch: 4 Epoch Loss: 530328.835938 Batch Accuaracy: 0.1412
Batch: 5 Epoch Loss: 604813.40625 Batch Accuaracy: 0.165
Batch: 6 Epoch Loss: 671933.640625 Batch Accuaracy: 0.179
Batch: 7 Epoch Loss: 738488.546875 Batch Accuaracy: 0.1932
Batch: 8 Epoch Loss: 807042.398438 Batch Accuaracy: 0.2062
Batch: 9 Epoch Loss: 871345.210938 Batch Accuaracy: 0.2118
Batch: 10 Epoch Loss: 922815.429688 Batch Accuaracy: 0.2118
Batch: 11 Epoch Loss: 983135.234375 Batch Accuaracy: 0.2112
Batch: 12 Epoch Loss: 1034310.30469 Batch Accuaracy: 0.2026
Batch: 13 Epoch Loss: 1081541.03906 Batch Accuaracy: 0.2022
Batch: 14 Epoch Loss: 1132952.52344 Batch Accuaracy: 0.2058
Batch: 15 Epoch Loss: 1187424.63281 Batch Accuaracy: 0.2108
Batch: 16 Epoch Loss: 1226438.55469 Batch Accuarac

Batch: 137 Epoch Loss: 4064508.48926 Batch Accuaracy: 0.5314
Batch: 138 Epoch Loss: 4080937.00293 Batch Accuaracy: 0.5298
Batch: 139 Epoch Loss: 4094773.12793 Batch Accuaracy: 0.528
Batch: 140 Epoch Loss: 4113931.89746 Batch Accuaracy: 0.519
Batch: 141 Epoch Loss: 4130301.47363 Batch Accuaracy: 0.5176
Batch: 142 Epoch Loss: 4144932.16309 Batch Accuaracy: 0.5176
Batch: 143 Epoch Loss: 4162509.60059 Batch Accuaracy: 0.5156
Batch: 144 Epoch Loss: 4178182.65723 Batch Accuaracy: 0.5228
Batch: 145 Epoch Loss: 4191007.98145 Batch Accuaracy: 0.527
Batch: 146 Epoch Loss: 4207344.81738 Batch Accuaracy: 0.5306
Batch: 147 Epoch Loss: 4220916.86523 Batch Accuaracy: 0.534
Batch: 148 Epoch Loss: 4235732.12109 Batch Accuaracy: 0.536
Batch: 149 Epoch Loss: 4250475.67969 Batch Accuaracy: 0.537
Batch: 150 Epoch Loss: 4266647.25586 Batch Accuaracy: 0.538
Batch: 151 Epoch Loss: 4281300.73242 Batch Accuaracy: 0.5388
Batch: 152 Epoch Loss: 4295471.49609 Batch Accuaracy: 0.5432
Batch: 153 Epoch Loss: 4310266.