In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.model_selection import train_test_split

In [0]:
data = pd.read_csv("mrdata.tsv", sep="\t")

In [0]:
data

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [0]:
data['Length'] = data['Phrase'].apply(lambda x:len(x.split()))
data['Length'].max()

52

In [0]:
words = set({})
for i in range(data.shape[0]):
    words = words.union(set(data['Phrase'][i].split()))

In [0]:
words = np.array([" "] + sorted(words))

In [0]:
print(words.shape)

(18227,)


In [0]:
wordsDict = defaultdict(lambda:0)
count = 0
for i in words:
    wordsDict[i] = count
    count += 1
print(wordsDict)



In [0]:
x = np.zeros((len(data), data['Length'].max()))
y = np.zeros((len(data), 5))
max_length=data['Length'].max()
for i in range(len(data)):
    x[i] = np.array(list(map(lambda row:wordsDict[row], data['Phrase'][i].split())) + [0]*(max_length - data['Length'][i]))
    y[i][data['Sentiment'][i]]=1

In [0]:
embedding_matrix = np.identity((len(wordsDict)))
embedding_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [0]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [0]:
batch_size = 512
epochs = 1
no_of_classes = 5
hiddenLayers = 3

In [0]:
tf.reset_default_graph()
input_data = tf.placeholder(tf.int32, [None, max_length])
labels = tf.placeholder(tf.float32, [None, no_of_classes])
embed = tf.placeholder(tf.float32, [len(wordsDict), len(wordsDict)])
embedded_vector = tf.Variable(tf.zeros([batch_size, max_length, len(wordsDict)]), dtype=tf.float32)
embedded_vector = tf.nn.embedding_lookup(embed, input_data)
lstmCell = tf.contrib.rnn.BasicLSTMCell(hiddenLayers)
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstmCell, embedded_vector, dtype=tf.float32)
value = tf.transpose(value, [1, 0, 2])
result = tf.gather(value, int(value.get_shape()[0]) - 1)
weight = tf.Variable(tf.truncated_normal([hiddenLayers, no_of_classes]))
bias = tf.Variable(tf.constant(0.1, shape=[no_of_classes]))
prediction = (tf.matmul(result, weight) + bias)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=prediction, labels=labels))
optimizer = tf.train.AdamOptimizer().minimize(loss)
accuracy = tf.reduce_sum(tf.cast(tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1)), tf.float32))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    no_of_batches = len(xTrain)//batch_size
    print("Train:")
    for epoch in range(epochs):
        avg_cost = 0
        for i in range(no_of_batches):
            _, c = (sess.run([optimizer, loss], feed_dict={embed:embedding_matrix, input_data:xTrain[i*batch_size:(i+1)*batch_size], labels:yTrain[i*batch_size:(i+1)*batch_size]}))
            avg_cost += c/no_of_batches
            print(epoch, i)
        print("Average Cost:", avg_cost)
    print("Train Accuracy Testing:")
    for epoch in range(epochs):
        avg_acc = 0
        for i in range(no_of_batches):
            acc = (sess.run(accuracy, feed_dict={embed:embedding_matrix, input_data:xTrain[i*batch_size:(i+1)*batch_size], labels:yTrain[i*batch_size:(i+1)*batch_size]}))
            avg_acc += acc/len(xTrain)
            print(epoch, i)
        print("Train accuracy:", avg_acc*100)
    no_of_batches = len(xTest)//batch_size
    print("Test:")
    for epoch in range(epochs):
        avg_acc = 0
        for i in range(no_of_batches):
            acc = (sess.run(accuracy, feed_dict={embed:embedding_matrix, input_data:xTest[i*batch_size:(i+1)*batch_size], labels:yTest[i*batch_size:(i+1)*batch_size]}))
            avg_acc += acc/len(xTest)
            print(epoch, i)
        print("Test accuracy:", avg_acc*100)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train:
0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
0 19
0 20
0 21
0 22
0 23
0 24
0 25
0 26
0 27
0 28
0 29
0 30
0 31
0 32
0 33
0 34
0 35
0 36
0 37
0 38
0 39
0 40
0 41
0 42
0 43
0 44
0 45
0 46
0 47
0 48
0 49
0 50
0 51
0 52
0 53
0 54
0 55
0 56
0 57
0 58
0 59
0 60
0 61
0 62
0 63
0 64
0 65
0 66
0 67
0 68
0 69
0 70
0 71
0 72
0 73
0 74
0 75
0 76
0 77
0 78
0 79
0 80
0 81
0 82
0 83
0 84
0 85
0 86
0 87
0 88
0 89
0 90
0 91
0 92
0 93
0 94
0 95
0 96
0 97
0 98
0 99
0 100
0 101
0 102
0 103
0 104
0 105
0 106
0 107
0 108
0 109
0 110
0 111
0 112
0 113
0 114
0 115
0 116
0 117
0 118
0 119
0 120
0 121
0 122
0 123
0 124
0 125
0 126
0 127
0 128
0 129
0 130
0 131
0 132
0 133
0 134
0 135
0 136
0 137
0 138
0 139
0 140
0 141
0 142
0 143
0 144
0 145
0 146
0 147
0 148
0 149
0 150
0 151
0 152
0 153
0 154
0 155
0 156
0 157
0 158
0 159
0 160
0 161
0 162
0 163
0 164
0 165
0 166
0 167
0 168
0 169
0 170
0 171
0 172
0 173
0 174
0 175
0 176
0 177
0 178
0 179
0 180
0 181
0 182
0 183