__author__ = 'david_torrejon'
"""
This module implements a simple RNN model for the SNLI paper:
http://nlp.stanford.edu/pubs/snli_paper.pdf
"""
from keras.layers import recurrent
from keras.models import Sequential
from keras.layers.core import Activation, Merge, Dense
import numpy as np
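# NOTE: this module targets the legacy Keras 0.x API (e.g. Merge, nb_epoch,
# show_accuracy, the init= keyword); it will not run unchanged on Keras 2.x.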

class paper_model(object):
    # parameters
    def __init__(self, number_stacked_layers=3, vocabulary_size=300, is_tbir=False):
        self.RNN = recurrent.LSTM
        self.stacked_layers = number_stacked_layers
        self.vocab_size = vocabulary_size  # dimensionality of the embeddings
        self.weights_path = "./weights.hdf5"
        self.nli_model = None
        if is_tbir:
            self.filename_output = 'predictions_tbir.txt'
        else:
            self.filename_output = 'predictions.txt'
        open(self.filename_output, 'w').close()  # truncate any previous predictions file
        # NN parameters, quoting the paper:
"""
simply a stack of three 200d
tanh layers, with the bottom layer taking the concatenated
sentence representations as input and the
top layer feeding a softmax classifier, all trained
jointly with the sentence embedding model itself.
"""
"""
models are randomly
initialized using standard techniques and trained using AdaDelta (Zeiler, 2012) minibatch SGD until
performance on the development set stops improving.
We applied L2 regularization to all models, manually tuning the strength coefficient (lambda) for
each, and additionally applied dropout (Srivastava et al., 2014) to the inputs
and outputs of the sentence embedding models (though not to its internal
connections) with a fixed dropout rate. All models were implemented in a common framework for this paper.
"""
    def data_preparation_nn(self, sentences, differences=3):
        premises_encoded = []
        hypothesis_encoded = []
        expected_output = []
        for data in sentences:
            # keep only pairs whose difference score is within the threshold
            if data[2] <= differences:
                premises_encoded.append(data[0][0])
                hypothesis_encoded.append(data[0][1])
                expected_output.append(data[1])
        return np.asarray(premises_encoded), np.asarray(hypothesis_encoded), np.asarray(expected_output)
    def data_tbir_preparation(self, sentences):
        premises_encoded = []
        hypothesis_encoded = []
        expected_output = []
        id_query = []
        id_premises = []
        for data in sentences:
            premises_encoded.append(data[0][0])
            hypothesis_encoded.append(data[0][1])
            expected_output.append(data[1])
            id_query.append(data[2])
            id_premises.append(data[3])
        return np.asarray(premises_encoded), np.asarray(hypothesis_encoded), expected_output, id_query, id_premises
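    # NOTE: both preparation helpers above assume each element of `sentences`
    # is a tuple ((premise_matrix, hypothesis_matrix), one_hot_label, ...),
    # where the matrices are (timesteps, vocab_size) encodings, followed by a
    # difference score or by the tbir query/premise ids. This layout is
    # inferred from the indexing above, not documented upstream.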
    def build_model(self, LOAD_W=True):
        # TODO: apply dropout to the inputs and outputs of the sentence
        # embeddings, as described in the paper quote above
        print('Build embeddings model...')
        # maximum sentence length in timesteps; check this against the data
        maxlen = 45
        premise_model = Sequential()
        hypothesis_model = Sequential()
        # two sentence encoders, one for the premise and one for the hypothesis
        # (an Embedding layer could feed each RNN instead of one-hot input)
        premise_model.add(self.RNN(100, init='normal', activation='tanh', input_shape=(maxlen, self.vocab_size)))
        hypothesis_model.add(self.RNN(100, init='normal', activation='tanh', input_shape=(maxlen, self.vocab_size)))
        print('Concat premise + hypothesis...')
        self.nli_model = Sequential()
        # concatenating the two 100d sentence vectors gives a 200d input
        self.nli_model.add(Merge([premise_model, hypothesis_model], mode='concat', concat_axis=1))
        for i in range(1, self.stacked_layers):
            print('stacking layer %d' % i)
            self.nli_model.add(Dense(input_dim=200, output_dim=200, init='normal', activation='tanh'))
        print('stacking last layer')
        self.nli_model.add(Dense(input_dim=200, output_dim=200, init='normal', activation='tanh'))
        print('Softmax layer...')
        # 3-way softmax (entailment, neutral, contradiction)
        self.nli_model.add(Dense(input_dim=200, output_dim=3, init='uniform'))
        self.nli_model.add(Activation('softmax'))
        print('Compiling model...')
        self.nli_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
        print('Model Compiled')
        if LOAD_W:
            print('loading weights...')
            self.nli_model.load_weights(self.weights_path)
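    # A minimal sketch of the paper's dropout on the sentence encoder outputs,
    # which build_model above does not apply; the 0.5 rate is an assumption
    # (the paper only says "a fixed dropout rate"):
    #
    #   from keras.layers.core import Dropout
    #   premise_model.add(Dropout(0.5))     # after the LSTM, on its output
    #   hypothesis_model.add(Dropout(0.5))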
    def train_model(self, data_train):
        premises_encoded, hypothesis_encoded, expected_output = self.data_preparation_nn(data_train, 3)
        """
        The RNN input has shape (nb_samples, timesteps, input_dim):
        - nb_samples samples (examples)
        - for each sample, a number of timesteps (the same for all samples in the batch)
        - for each timestep of each sample, input_dim features
        The data must already be converted to sparse (one-hot) vectors.
        """
        # TODO: prepare the data outside this method so the same code generates
        # both the train and the test sets
        print('premises shape and sample....')
        print(premises_encoded.shape)
        print('hypothesis shape and sample....')
        print(hypothesis_encoded.shape)
        print('output shape and sample....')
        print(expected_output.shape)
        print(expected_output[0])
        # here (nb_samples, timesteps, input_dim) == (n_examples, maxlen, vocab_size)
        X = [premises_encoded, hypothesis_encoded]
        # the one-hot conversion is done elsewhere, to keep this method clean
        print('training....')
        self.nli_model.fit(X, expected_output, batch_size=64, nb_epoch=1, verbose=1, sample_weight=None, show_accuracy=True)
        print('saving weights')
        self.nli_model.save_weights(self.weights_path, overwrite=True)
    def test_model(self, data_test, is_tbir=False):
        print('testing....')
        if is_tbir is False:
            premises_encoded_t, hypothesis_encoded_t, expected_output_t = self.data_preparation_nn(data_test, 3)
            # no query/premise ids outside the tbir setting; use placeholders
            # so the output loop below works for both cases
            img_query_t = [''] * len(expected_output_t)
            id_querys = [''] * len(expected_output_t)
        else:
            premises_encoded_t, hypothesis_encoded_t, expected_output_t, img_query_t, id_querys = self.data_tbir_preparation(data_test)
        print('premises shape and sample....')
        print(premises_encoded_t.shape)
        print('hypothesis shape and sample....')
        print(hypothesis_encoded_t.shape)
        X_t = [premises_encoded_t, hypothesis_encoded_t]
        predictions = self.nli_model.predict(X_t, batch_size=128, verbose=1)
        # store the results and count correct argmax predictions
        correct = 0
        f = open(self.filename_output, 'a')
        for pred, e_out, id_query, idq in zip(predictions, expected_output_t, img_query_t, id_querys):
            if np.argmax(pred) == np.argmax(e_out):  # both are np arrays
                correct += 1
            sup = str(pred) + " " + str(e_out) + " " + str(id_query) + " " + str(idq)
            f.write(sup)
            f.write('\n')
        f.close()
        acc = float(correct) / float(len(predictions))
        print('Predictions correct %d out of %d, acc: %.4f' % (correct, len(predictions), acc))
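
# A minimal smoke test, assuming the legacy Keras API used above and the tuple
# layout inferred for data_preparation_nn; the shapes and random dummy data
# are illustrative assumptions, not part of the original pipeline.
if __name__ == '__main__':
    maxlen, vocab_size, n = 45, 300, 8
    dummy_data = []
    for _ in range(n):
        premise = np.random.randint(0, 2, (maxlen, vocab_size)).astype('float32')
        hypothesis = np.random.randint(0, 2, (maxlen, vocab_size)).astype('float32')
        label = np.zeros(3, dtype='float32')
        label[np.random.randint(3)] = 1.0  # one-hot over the 3 classes
        dummy_data.append(((premise, hypothesis), label, 0))  # 0 passes the filter
    model = paper_model(vocabulary_size=vocab_size)
    model.build_model(LOAD_W=False)  # skip loading saved weights for the smoke test
    model.train_model(dummy_data)
    model.test_model(dummy_data)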