In [1]:
OUTPUT_PATH = 'C:\\Users\\dmytro.rybalko\\Documents\\impementations\Keras_PairCNN\\jacana-qa-naacl2013-data-results\\preprocessed_data\\'
VOCAB_PATH = OUTPUT_PATH + 'vocab.json'
EMBEDDING_PATH = OUTPUT_PATH + 'aquaint+wiki.txt.gz.ndim=50.bin'

BATCH_SIZE = 50
EPOCHS = 10
RANDOM_STATE =  42
PATIENCE = 20
ATOL = 0.01
%cd scripts

C:\Users\dmytro.rybalko\Documents\impementations\Keras_PairCNN\scripts


In [2]:
import evaluation_metrics as em

def roc_auc_score_avg(qids_test, y_test, probs):
    scores = 0
    count = 0
    for i in np.unique(qids_test):
        weights = np.array(i == qids_test)
        if (sum(y_test[weights]) == 0 or sum(y_test[weights]) == y_test[weights].shape[0]): continue
        score = sklearn.metrics.roc_auc_score(y_test[weights], probs[weights])
        scores += score
        count += 1
    return scores/count

def get_metrics(qids, y_true, y_pred, text):
    
    #train_acc = sklearn.metrics.roc_auc_score(y_train, y_pred)
    map_score = em.map_score(qids, y_true, y_pred)
    mrr_score = em.mrr_score(qids, y_true, y_pred)
    roc_auc_score = roc_auc_score_avg(qids, y_true, y_pred)
    print(text + '   MAP: %f, MRR: %f, AUC: %f' %(map_score, mrr_score, roc_auc_score))
    return map_score, mrr_score

In [3]:
'''Main file to run the setup.'''
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
import sys

from keras.callbacks import TensorBoard
import numpy as np
import pandas as pd
import sklearn
import subprocess
import tensorflow as tf
import bm25
from model import cnn_model
from utils import batch_gen, load_embeddings
import json

# import tqdm

sys.path.insert(0, '../')

def write_log(callback, names, logs, batch_no):
    for name, value in zip(names, logs):
        summary = tf.Summary()
        summary_value = summary.value.add()
        summary_value.simple_value = value
        summary_value.tag = name
        callback.writer.add_summary(summary, batch_no)
        callback.writer.flush()
        
def load_json(file_path):
    return json.load(open(file_path, 'r'))
        
def generate_weights(qids, questions, answers, labels, lagrange_mult = 0.9, variance = 0.001, epsilon = 1e-3):
    print('Variance is', variance)
    probs, relevances = bm25.predict_relevances(qids, questions, answers)
    probs[relevances==0] = 1 - probs[relevances==0]
    probs = np.clip(probs, epsilon, 1)
    delta = (1 + np.array(labels)) * np.random.normal(loc = labels - relevances,
                                                     scale = variance,
                                                     size = np.array(qids).shape[0])
    delta_langrange = np.power(delta, 2) - lagrange_mult
    control_policy_weights = delta_langrange/probs
    return control_policy_weights, relevances, probs

def train_model(mode):
    '''Train the model.
    1. Read numpy arrays for input data
    2. Batch train the model
    3. Calculate map scores using our method.
    4. Dump predicted values in csv format for evaluation using Trec-eval
    '''
    if mode not in ['TRAIN-ALL', 'TRAIN']:
        print('Invalid mode')
        return

    data_dir = os.path.join(OUTPUT_PATH, mode)

    # Load train set.
    q_train = np.load(os.path.join(data_dir, '%s.questions.npy' %(mode.lower())))
    a_train = np.load(os.path.join(data_dir, '%s.answers.npy' %(mode.lower())))
    y_train = np.load(os.path.join(data_dir, '%s.labels.npy' %(mode.lower())))
    qids_train = np.load(os.path.join(data_dir, '%s.qids.npy' %(mode.lower())))
    addn_feat_train = np.zeros(y_train.shape)
    
    #weights, relevances = generate_weights(qids_train, q_train, a_train, y_train, variance = 0.000001)
    
    print('''q_train.shape, a_train.shape, y_train.shape, qids_train.shape,
             addn_feat_train.shape: ''')
    print(q_train.shape, q_train.shape, y_train.shape, qids_train.shape,
          addn_feat_train.shape)

    # Load dev and test sets.
    q_dev = np.load(os.path.join(data_dir, 'dev.questions.npy'))
    a_dev = np.load(os.path.join(data_dir, 'dev.answers.npy'))
    y_dev = np.load(os.path.join(data_dir, 'dev.labels.npy'))
    qids_dev = np.load(os.path.join(data_dir, 'dev.qids.npy'))
    addn_feat_dev = np.zeros(y_dev.shape)

    q_test = np.load(os.path.join(data_dir, 'test.questions.npy'))
    a_test = np.load(os.path.join(data_dir, 'test.answers.npy'))
    y_test = np.load(os.path.join(data_dir, 'test.labels.npy'))
    qids_test = np.load(os.path.join(data_dir, 'test.qids.npy'))
    addn_feat_test = np.zeros(y_test.shape)

    vocab = load_json(VOCAB_PATH)
    
    max_ques_len = q_train.shape[1]
    max_ans_len = a_train.shape[1]
    embedding, embed_dim, _ = load_embeddings(EMBEDDING_PATH, OUTPUT_PATH, vocab)

    addit_feat_len = 1
    if addn_feat_train.ndim > 1:
        addit_feat_len = addn_feat_train.shape[1]

    # Get model
    cnn_model_instance = cnn_model(embed_dim, max_ques_len, max_ans_len,
                                len(vocab), embedding,
                                addit_feat_len=addit_feat_len)
    
    bs = BATCH_SIZE
    np.set_printoptions(threshold=np.nan)
    # np.seterr(divide='ignore', invalid='ignore')
    # Train manually, epoch by epoch
    # TODO: Add tqdm
    log_path = './logs'
    callback = TensorBoard(log_path)
    callback.set_model(cnn_model_instance)
    train_names = ['train_loss', 'train_acc']
    dev_names = ['dev_loss', 'dev_acc']

    y_pred_train = cnn_model_instance.predict([q_train, a_train, addn_feat_train, np.ones(shape = len(q_train))])
    get_metrics(qids_train, y_train, y_pred_train, 'Train initial ')
    y_pred_dev = cnn_model_instance.predict([q_dev, a_dev, addn_feat_dev, np.ones(shape = len(q_dev))])
    map_overall, mrr_dev = get_metrics(qids_dev, y_dev, y_pred_dev, 'Validation initial ')
    y_pred_test = cnn_model_instance.predict([q_test, a_test, addn_feat_test, np.ones(shape = len(q_test))])
    get_metrics(qids_test, y_test, y_pred_test, 'Test initial ')
    
    patience = 1
    best_model_weights = cnn_model_instance.get_weights()
    
    weights, relevances, probs = generate_weights(qids_train, q_train, a_train, y_train, lagrange_mult = 0.9,
                                              variance = 0.01)
    
    for epoch in range(EPOCHS):
        print('Epoch:', epoch)

        #weights, relevances, probs = generate_weights(qids_train, q_train, a_train, y_train, lagrange_mult = 0.9,
        #                                      variance = 0.01)
            
        q_train_rand, a_train_rand, y_train_rand, addn_feat_train_rand, weights_rand, relevances_rand = sklearn.utils.shuffle(
            q_train, a_train, y_train, addn_feat_train, weights, relevances, random_state = RANDOM_STATE)
          
        batch_no = 0
                
        for b_q_train, b_a_train, b_y_train, b_addn_feat_train, b_weights, b_relevances in zip(
                batch_gen(q_train_rand, bs), batch_gen(a_train_rand, bs),
                batch_gen(y_train_rand, bs), batch_gen(addn_feat_train_rand, bs),
                batch_gen(weights_rand, bs), batch_gen(relevances_rand, bs)):
        
            loss_current = cnn_model_instance.train_on_batch(
                [b_q_train, b_a_train, b_addn_feat_train, b_weights], b_relevances)
            
            
            if batch_no%100 == 0 and batch_no != 0:
                #write_log(callback, train_names, logs, batch_no*(epoch+1))
                
                y_pred_train = cnn_model_instance.predict([q_train, a_train, addn_feat_train, np.ones(shape = len(q_train))])
                get_metrics(qids_train, y_train, y_pred_train, 'Batch {} train: '.format(batch_no))
                y_pred_dev = cnn_model_instance.predict([q_dev, a_dev, addn_feat_dev, np.ones(shape = len(q_dev))])
                get_metrics(qids_dev, y_dev, y_pred_dev, 'Batch {} validation: '.format(batch_no))
              
            batch_no += 1
        
        y_pred_train = cnn_model_instance.predict([q_train, a_train, addn_feat_train, np.ones(shape = len(q_train))])
        get_metrics(qids_train, y_train, y_pred_train, 'Epoch {} train: '.format(epoch))
        y_pred_dev = cnn_model_instance.predict([q_dev, a_dev, addn_feat_dev, np.ones(shape = len(q_dev))])
        map_current, mrr_current = get_metrics(qids_dev, y_dev, y_pred_dev, 'Epoch {} validation: '.format(epoch))
        
        print(y_pred_train[0:10])
        
        S_train = np.mean(y_pred_train/probs)
        print('S is', S_train)

        if map_current > map_overall:
            map_overall = map_current
            best_model_weights = cnn_model_instance.get_weights()
        elif patience < PATIENCE:
            patience += 1
        else: break
        
        print('Loss is ', loss_current)
 
    y_pred_train = cnn_model_instance.predict([q_train, a_train, addn_feat_train, np.ones(shape = len(q_train))])
    get_metrics(qids_train, y_train, y_pred_train, 'Train final ')
    y_pred_dev = cnn_model_instance.predict([q_dev, a_dev, addn_feat_dev, np.ones(shape = len(q_dev))])
    map_dev, mrr_dev = get_metrics(qids_dev, y_dev, y_pred_dev, 'Validation final ')
    y_pred_test = cnn_model_instance.predict([q_test, a_test, addn_feat_test, np.ones(shape = len(q_test))])
    get_metrics(qids_test, y_test, y_pred_test, 'Test final ')

    #cnn_model_instance.set_weights(best_model_weights)
    return cnn_model_instance, weights, relevances
    
    """
    # Dump data for trec eval
    N = len(y_pred_test)
    nnet_outdir = OUTPUT_PATH + 'output\\'

    df_submission = pd.DataFrame(index=np.arange(N), columns=['qid', 'iter', 'docno', 'rank', 'sim', 'run_id'])
    df_submission['qid'] = qids_test
    df_submission['iter'] = 0
    df_submission['docno'] = np.arange(N)
    df_submission['rank'] = 0
    df_submission['sim'] = y_pred
    df_submission['run_id'] = 'nnet'
    df_submission.to_csv(os.path.join(nnet_outdir, 'submission.txt'), header=False, index=False, sep=' ')

    df_gold = pd.DataFrame(index=np.arange(N), columns=['qid', 'iter', 'docno', 'rel'])
    df_gold['qid'] = qids_test
    df_gold['iter'] = 0
    df_gold['docno'] = np.arange(N)
    df_gold['rel'] = y_test
    df_gold.to_csv(os.path.join(nnet_outdir, 'gold.txt'), header=False, index=False, sep=' ')

    #subprocess.call("/bin/sh eval/run_eval.sh '{}'".format(nnet_outdir), shell=True)
    return cnn_model_instance, models
    """
   

  return f(*args, **kwds)
Using TensorFlow backend.


In [4]:
best_model, weights, relevances = train_model(mode = 'TRAIN')

q_train.shape, a_train.shape, y_train.shape, qids_train.shape,
             addn_feat_train.shape: 
(4718, 33) (4718, 33) (4718,) (4718,) (4718,)
Loading word vectors...
Trying to load from npy dump.
Preparing model with the following parameters: 
embed_dim, max_ques_len, max_ans_len, vocab_size, embedding,
              addit_feat_len, no_conv_filters: 
50 33 40 52051 (52051, 50) 1 100
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 40)           0                                            
__________________________________________________________________________________________________
ques_input (InputLayer)         (None, 33)           0                                            
__________________________________________________________________________________________________
embedding_2 (Emb

In [None]:
data_dir = os.path.join(OUTPUT_PATH, 'TRAIN')
dataset = 'train'
q_test = np.load(os.path.join(data_dir, dataset + '.questions.npy'))
a_test = np.load(os.path.join(data_dir, dataset + '.answers.npy'))
y_test = np.load(os.path.join(data_dir, dataset + '.labels.npy'))
qids_test = np.load(os.path.join(data_dir, dataset + '.qids.npy'))
addn_feat_test = np.zeros(y_test.shape)
y_pred = best_model.predict([q_test, a_test, addn_feat_test, np.ones(shape = len(q_test))])
map, mrr = get_metrics(qids_test, y_test, y_pred, '')

### Loss distribution

In [None]:
import matplotlib.pyplot as plt

In [None]:
mode = 'TRAIN'
dataset = 'train'
data_dir = os.path.join(OUTPUT_PATH, mode)
q_train = np.load(os.path.join(data_dir, '%s.questions.npy' %(dataset)))
a_train = np.load(os.path.join(data_dir, '%s.answers.npy' %(dataset)))
y_train = np.load(os.path.join(data_dir, '%s.labels.npy' %(dataset)))
qids_train = np.load(os.path.join(data_dir, '%s.qids.npy' %(dataset)))
    
variance = 0.01
    
probs, relevances = bm25.predict_relevances(qids_train, q_train, a_train, top_k = 3)
probs[relevances==0] = 1 - probs[relevances==0]
probs = np.clip(probs, 1e-3, 1)
loss = (1 + np.array(y_train)) * np.random.normal(loc = y_train - relevances,
                                                    scale = 0.0001,
                                                    size = np.array(qids_train).shape[0])
loss = np.power(loss, 2) - 0.9

In [None]:
y_pred_copy = y_pred.copy()
y_pred_copy = y_pred_copy.reshape(-1)
y_pred_copy[relevances==0] = 1 - y_pred_copy[relevances==0]

In [None]:
np.mean(weights * y_pred_copy) + 0.9

In [None]:
loss/probs

In [None]:
get_metrics(qids_train, y_train, probs, '')

In [None]:
weights = loss/probs

In [None]:
plt.hist(weights, range = (-10 , 10))

In [None]:
probs[qids_train == '21']

In [None]:
relevances[qids_train == '21']

### Removing no relevance queries
Should be cleaned up!

In [None]:
mode = 'TRAIN-ALL'
data_dir = os.path.join(OUTPUT_PATH, mode)
dataset = 'dev'
q_test = np.load(os.path.join(data_dir, dataset + '.questions.npy'))
a_test = np.load(os.path.join(data_dir, dataset + '.answers.npy'))
y_test = np.load(os.path.join(data_dir, dataset + '.labels.npy'))
qids_test = np.load(os.path.join(data_dir, dataset + '.qids.npy'))

In [None]:
bad_labels = []
for i in np.unique(qids_test):
    mask = qids_test == i
    labels = y_test[mask]
    if (labels.sum() == 0) or (labels.sum() == labels.shape[0]):
        bad_labels += [i]

In [None]:
def f(x):
    return x not in bad_labels

f = np.vectorize(f)  # or use a different name if you want to keep the original f

mask = f(qids_test)

In [None]:
outdir = "C:\\Users\\dmytro.rybalko\\Documents\\impementations\Keras_PairCNN\\jacana-qa-naacl2013-data-results\\preprocessed_data\\TRAIN-ALL2\\"
np.save(os.path.join(outdir, 'dev.questions.npy'), q_test[mask])
np.save(os.path.join(outdir, 'dev.answers.npy'), a_test[mask])
np.save(os.path.join(outdir, 'dev.labels.npy'), y_test[mask])
np.save(os.path.join(outdir, 'dev.qids.npy'), qids_test[mask])

In [None]:
mode = 'TRAIN'
data_dir = os.path.join(OUTPUT_PATH, mode)
dataset = 'train'
q_test = np.load(os.path.join(data_dir, dataset + '.questions.npy'))
a_test = np.load(os.path.join(data_dir, dataset + '.answers.npy'))
y_test = np.load(os.path.join(data_dir, dataset + '.labels.npy'))
qids_test = np.load(os.path.join(data_dir, dataset + '.qids.npy'))
    
variance = 0.10
    
probs, relevances = bm25.predict_relevances(qids_test, q_test, a_test, top_k = 3)
loss = (1 + np.array(y_test)) * np.random.normal(loc = y_test - probs,
                                                    scale = variance,
                                                    size = np.array(qids_test).shape[0])
loss = np.power(loss, 2)

In [None]:
em.map_score(qids_test, y_test, relevances)

In [None]:
em.mrr_score(qids_test, y_test, relevances)

In [None]:
def roc_auc_score_avg(qids_test, y_test, probs):
    scores = 0
    count = 0
    for i in np.unique(qids_test):
        weights = np.array(i == qids_test)
        if (sum(y_test[weights]) == 0 or sum(y_test[weights]) == y_test[weights].shape[0]): continue
        score = sklearn.metrics.roc_auc_score(y_test[weights], probs[weights])
        scores += score
        count += 1
    return scores/count

In [None]:
roc_auc_score_avg(qids_test, y_test, probs)

In [None]:
roc_auc_score_avg(qids_test, y_test, probs)

In [None]:
roc_auc_score_avg(qids_test, y_test, probs)

In [None]:
plt.hist(loss)
plt.show()