In [100]:
import logging
import os
import queue
import random
import re
from args import get_setup_args
import shutil
import string
#import setup
import torch
import torch.nn.functional as F
import torch.utils.data as data
from collections import Counter
import numpy as np
import ujson as json
#import spacy
import json
import tensorflow as tf

from tqdm import tqdm_notebook as tqdm

In [101]:
# Do word embeddings instead of sentence embedding for LSTM
from tensorflow.keras.preprocessing.text import hashing_trick, one_hot

# Upper bound on how many examples are taken from each dataset file.
MAX_EXAMPLES_PER_DATASET = 50000


def _load_json(path):
    """Return the parsed JSON document stored at `path`."""
    with open(path) as f:
        return json.load(f)


def _collect_examples(dataset, limit):
    """Gather at most `limit` examples from `dataset`.

    `dataset` maps an id to a record that is read through its 'question',
    'sentence' and 'contains_answer' keys.

    Returns a tuple (questions, sentences, labels, tokens, longest) where
    `tokens` is the set of whitespace-separated tokens seen in the collected
    questions and sentences, and `longest` is the largest whitespace-token
    count of any single collected question or sentence.
    """
    questions, sentences, labels = [], [], []
    tokens = set()
    longest = 0
    for seen, record in enumerate(dataset.values()):
        if seen >= limit:
            break
        question_tokens = record['question'].split()
        sentence_tokens = record['sentence'].split()

        tokens.update(question_tokens)
        tokens.update(sentence_tokens)
        longest = max(longest, len(question_tokens), len(sentence_tokens))

        questions.append(record['question'])
        sentences.append(record['sentence'])
        labels.append(record['contains_answer'])
    return questions, sentences, labels, tokens, longest


adversarial_data = _load_json('./sentence_selection_data/train_sent_sel_adv.json')
squad_data = _load_json('./sentence_selection_data/train_sent_sel_squad.json')

vocab = set()
train_q, train_s, train_val_test_labels = [], [], []
vocab_max_sent_len = 0

# Adversarial examples are appended first, SQuAD examples second.
for dataset in (adversarial_data, squad_data):
    questions, sentences, labels, tokens, longest = _collect_examples(
        dataset, MAX_EXAMPLES_PER_DATASET
    )
    train_q.extend(questions)
    train_s.extend(sentences)
    train_val_test_labels.extend(labels)
    vocab |= tokens
    vocab_max_sent_len = max(vocab_max_sent_len, longest)

# Number of hash buckets used for the token ids below (and, further down,
# the Embedding input size). Distinct tokens may still share an id.
vocab_size = len(vocab)


def _encode(text):
    """Hash every token of `text` to an integer id.

    `one_hot` relies on Python's built-in `hash`, which the Keras docs flag as
    not stable across runs; 'md5' yields the same ids in every session, so
    weights saved from one session stay meaningful in the next.
    """
    return hashing_trick(text, vocab_size, hash_function='md5')


encoded_train_val_test_q = [_encode(q) for q in train_q]
encoded_train_val_test_s = [_encode(s) for s in train_s]
# Train labels don't need to be encoded



In [102]:
from tensorflow.keras.preprocessing import sequence

#674131 examples? in total
# Bring every encoded question and every encoded sentence to one common
# length (vocab_max_sent_len) so each collection becomes a single 2-D array.
encoded_train_val_test_q_padded, encoded_train_val_test_s_padded = [
    sequence.pad_sequences(encoded_texts, maxlen=vocab_max_sent_len)
    for encoded_texts in (encoded_train_val_test_q, encoded_train_val_test_s)
]

In [103]:
# Do 60, 20, 20 split
#404479 134826 134826
# Fixed seed so the same examples land in the same split on every run.
SPLIT_SEED = 0

full_sz = len(encoded_train_val_test_q)
train_sz = round(full_sz*0.60)
val_sz = round((full_sz - train_sz) / 2)
# The test split takes whatever remains after train and validation.
test_sz = full_sz - train_sz - val_sz

# The examples are stored as one dataset followed by the other, so slicing
# them in order would draw validation/test only from the tail of the list.
# Shuffle the indices once and apply the same permutation to questions,
# sentences and labels so the three stay aligned.
shuffled_idx = np.random.RandomState(SPLIT_SEED).permutation(full_sz)
train_idx = shuffled_idx[:train_sz]
val_idx = shuffled_idx[train_sz:train_sz+val_sz]
test_idx = shuffled_idx[train_sz+val_sz:]
assert len(train_idx) + len(val_idx) + len(test_idx) == full_sz

encoded_train_q_padded = encoded_train_val_test_q_padded[train_idx]
encoded_val_q_padded = encoded_train_val_test_q_padded[val_idx]
encoded_test_q_padded = encoded_train_val_test_q_padded[test_idx]

encoded_train_s_padded = encoded_train_val_test_s_padded[train_idx]
encoded_val_s_padded = encoded_train_val_test_s_padded[val_idx]
encoded_test_s_padded = encoded_train_val_test_s_padded[test_idx]

# Labels are converted to an array so they can be indexed by the permutation.
all_labels = np.asarray(train_val_test_labels)
train_labels = all_labels[train_idx]
val_labels = all_labels[val_idx]
test_labels = all_labels[test_idx]


In [104]:
# adversarial_data = np.load('./sentence_selection_data/train_sent_sel_adv.npz')
# squad_data = np.load('./sentence_selection_data/train_sent_sel_squad.npz')
# train_q = np.append(adversarial_data['train_q'][:50], squad_data['train_q'][:50], axis = 0)
# train_s = np.append(adversarial_data['train_s'][:50], squad_data['train_s'][:50], axis = 0)
# train_labels = np.append(adversarial_data['train_labels'][:50], squad_data['train_labels'][:50], axis = 0) 

# train_q = np.append(adversarial_data['train_q'][:50], squad_data['train_q'][:50], axis = 0)
# train_s = np.append(adversarial_data['train_s'][:50], squad_data['train_s'][:50], axis = 0)
# train_labels = np.append(adversarial_data['train_labels'][:50], squad_data['train_labels'][:50], axis = 0)

#save 100 example debug set
# np.savez('./100_ex', train_q=train_q,train_s=train_s,train_labels=train_labels)

In [105]:
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, concatenate

# Width of each token embedding vector.
embed_len = 32


class QUA_Net(tf.keras.Model):
    """Two-branch question/sentence classifier.

    Question ids and sentence ids are embedded separately, each embedding is
    fed to its own LSTM, and the two LSTM outputs are concatenated, passed
    through dropout and a ReLU dense layer, then reduced to one sigmoid unit.
    """

    def __init__(self):
        super().__init__()
        # Separate embedding tables and recurrent layers per input branch.
        self.q_embed = Embedding(input_dim=vocab_size, output_dim=embed_len)
        self.s_embed = Embedding(input_dim=vocab_size, output_dim=embed_len)
        self.q_lstm = LSTM(units=128)
        self.s_lstm = LSTM(units=128)
        self.dropout = Dropout(rate=0.33) #0.3
        self.dense = Dense(units=64, activation='relu')
        self.dense2 = Dense(units=1, activation='sigmoid')

    def call(self, inputs):
        """Run the forward pass.

        `inputs` is indexed as inputs[0] (question ids) and inputs[1]
        (sentence ids). Returns the output of the final sigmoid layer.
        """
        question_vectors = self.q_embed(inputs[0])
        sentence_vectors = self.s_embed(inputs[1])

        question_state = self.q_lstm(question_vectors)
        sentence_state = self.s_lstm(sentence_vectors)

        # Join both branch outputs along the feature axis.
        joined = concatenate([question_state, sentence_state], axis=-1)
        hidden = self.dense(self.dropout(joined))
        return self.dense2(hidden)


model = QUA_Net()

In [106]:
from tensorflow.keras import optimizers

# Adam with step size 1e-2 (1e-3 is noted as the alternative value).
opt = optimizers.Adam(lr=1e-2)

# What optimizer should we use/ metrics?
# Single sigmoid output -> binary cross-entropy loss, tracked with accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [107]:
# pos = np.where(train_labels == True)
# print(len(pos[0]))
# pos_idxs = pos[0]

# negs = np.where(train_labels == False)
# print(len(negs[0]))

# neg_idxs = np.random.choice(negs[0], len(pos[0]))
# print(len(neg_idxs), neg_idxs)
# print(train_q[0].shape)
# train_balanced_q = np.take(train_q, np.append(pos_idxs, neg_idxs), axis=0)
# train_balanced_s = np.take(train_s, np.append(pos_idxs, neg_idxs), axis=0)
# train_balanced_labels = np.take(train_labels, np.append(pos_idxs, neg_idxs), axis=0)
# print(train_balanced_q.shape)
# print(len(train_balanced_q), len(train_balanced_s), len(train_balanced_labels))

In [None]:
from sklearn.utils import class_weight

EPOCHS = 20
BATCH_SIZE = 64

# Keras is fed arrays, so make sure the label containers are arrays too.
train_labels = np.asarray(train_labels)
test_labels = np.asarray(test_labels)
val_labels = np.asarray(val_labels)

print(train_labels.shape)
print(encoded_train_q_padded.shape) #first dim is num examples, 2nd dim is longest string in qa dataset
print(encoded_train_s_padded.shape)

print(encoded_val_q_padded.shape)
print(encoded_val_s_padded.shape)

# Share of positive / negative labels within the training split. Both the
# count and the denominator come from train_labels so the two values really
# are fractions of the same population.
true_amount = np.count_nonzero(train_labels == True) / len(train_labels)
false_amount = 1. - true_amount

# Floating-point arithmetic: compare with a tolerance, not with ==.
assert np.isclose(false_amount + true_amount, 1.0)

# Not passed to fit() yet.
# NOTE(review): these are class frequencies, not inverse-frequency weights,
# and Keras documents class_weight keys as integer class indices - revisit
# both points before wiring this into fit(class_weight=...).
class_imb_dict = {False: false_amount, True: true_amount}

hist = model.fit(
    x=[encoded_train_q_padded, encoded_train_s_padded],
    y=train_labels,
    validation_data=([encoded_val_q_padded, encoded_val_s_padded], val_labels),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)
# TODO: add the class-imbalance weights once the baseline works; they are
# reported to help only a little.

# serialize weights to HDF5
model.save_weights("model_weights.h5")

(37435,)
(37435, 161)
(37435, 161)
(12478, 161)
(12478, 161)
Train on 37435 samples, validate on 12478 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

In [None]:
from sklearn.metrics import confusion_matrix

# Score every test pair, show the raw model outputs, then threshold them at
# 0.5 to obtain boolean predictions for the confusion matrix.
predictions = model.predict(x=[encoded_test_q_padded, encoded_test_s_padded])

print(predictions)
thresholded = []
for score in predictions:
    thresholded.append(bool(score >= 0.5))
predictions = thresholded

cm = confusion_matrix(test_labels, predictions)
print(cm)


In [None]:
# Evaluate on the held-out test split; evaluate() needs the inputs and the
# labels to compute the compiled loss and accuracy.
model.evaluate(
    x=[encoded_test_q_padded, encoded_test_s_padded],
    y=np.asarray(test_labels),
    batch_size=64,
)