In [26]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

from scipy.spatial.distance import cosine

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Input, Concatenate, Dropout, Lambda
from keras.layers.merge import concatenate
from keras import backend as K

import pickle

from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

# Import data

In [2]:
# Get intros

with open('../data/sample_labels.pkl', 'rb') as f:
    intros = pickle.load(f)

In [3]:
#get entity embeddings for each entity ID

with open('../data/knowledge_graph_data/wiki_DistMult_entity.npy', 'rb') as f:
    e = np.load(f)

kg_emb_size = e[0].shape

In [4]:
#get entity id to embedding mapping. This is so we can retrieve the 
#entity embeddings when we know the index of the entity

with open('../data/knowledge_graph_data/idx2id_entity_full_no_text.pickle', 'rb') as f:
    idx2id = pickle.load(f)
id2idx = {v: k for k, v in idx2id.items()}

# Data Preprocessing

In [5]:
#Break intros into lists of the text, entity locations, and entity IDs

num_entities = 0
full_text = []
entity_locations = []
entity_id = []

for intro in intros:
    if intro[1]:
        full_text.append(intro[0])
        
        temp = []
        temp1 = []
        
        for idx,entity_key in enumerate(intro[1]):

            temp.append(entity_key[3])
            loc = np.argwhere(intro[1][entity_key]==1)
            temp1.append((loc.min(),loc.max()))
            num_entities+=1
                
        entity_id.append(temp)
        entity_locations.append(temp1)

full_text = np.asarray(full_text)
entity_locations = np.asarray(entity_locations)
entity_id = np.asarray(entity_id)

In [6]:
#create paragraphs array.
#The paragraph array is a list of sublists. Each sublist is a list of words contained in the paragraph.


paragraphs = []
for paragraph in full_text:
    temp = []
    for sentence in sent_tokenize(paragraph):
        for word in word_tokenize(sentence):
            temp.append(word)
    paragraphs.append(temp)

In [7]:
#train w2v model and create intro array
w2v_size = 100
w2v_model = Word2Vec(paragraphs, min_count = 1, size = w2v_size, window = 5, sg=1)
vocab_size = len(w2v_model.wv.vocab)

intro_vectors = []
for sentence in paragraphs:
    temp = []
    for word in sentence:
        temp.append(w2v_model[word])
    intro_vectors.append(temp)

intro_vectors = [[l.tolist() for l in vectors] for vectors in intro_vectors]

  # Remove the CWD from sys.path while we load stuff.


In [8]:
#get training data to be used for LSTM
#X will be a list of sublists. Each sublist contains the vectors of words in the context window for each entity.

context_window = 10 #number of words with entity centered for input to LSTM model

#using text sequences
X_words = []
X_w2v = []
X_comparisons = []
count_fail=0
count_success=0

for idx,locations in enumerate(entity_locations):
    for idx2,loc in enumerate(locations):
        low = max(loc[0]-context_window//2,0)
        r_extra = max(0,context_window//2-loc[0])
        high = min(loc[0]+context_window//2,len(paragraphs[idx]))
        l_extra = max(loc[0]+context_window//2-len(paragraphs[idx]),0)
        try:
            X_comparisons.append(e[id2idx[entity_id[idx][idx2]]])
            X_words.append(paragraphs[idx][low-l_extra:high+r_extra])
            X_w2v.append(intro_vectors[idx][low-l_extra:high+r_extra])
            count_success+=1
        except:
            count_fail+=1
print('Percent success: {}'.format(100*(count_success/(count_success+count_fail))))

Percent success: 88.41346378914845


In [9]:
#We cannot feed words into the LSTM. So we need to tokenize the words
t = Tokenizer()
t.fit_on_texts(X_words)
X_token_words = np.zeros((len(X_words),context_window), dtype=int)
for idx,window in enumerate(X_words):
    for idx2,word in enumerate(window):
        X_token_words[idx][idx2] = t.word_counts[word]
num_unique_words = X_token_words.max()+1

#Convert X_w2v list into array
X_w2v_new = np.zeros((len(X_w2v),context_window,w2v_size))
for idx,window in enumerate(X_w2v):
    for idx2,word in enumerate(window):
        for idx3,emb in enumerate(word):
            X_w2v_new[idx][idx2][idx3] = emb
X_w2v = X_w2v_new

#convert comparisons into array
X_comparisons = np.array(X_comparisons)

now that we have correct training labels, let's re-organize our training data such that the problem can be posed as a binary classification problem, rather than forcing the model to learn the actual KG embeddings. For each example that we currently have (positive labels), let's give the model a few negative examples. We want negative examples to be KG embeddings that are relatively close to the positive examples.

In [14]:
num_incorrect = 10
    
def get_X_incorrect():

    closest_indices = np.zeros((len(Y),num_incorrect), dtype = int)

    for idx,y in enumerate(Y):
        distances = dict()
        y_size = np.dot(y,y)
        for idx1,emb in enumerate(Y):
            if idx1 != idx:
                distances[np.dot(y,emb)/(y_size*np.dot(emb,emb))] = idx1
        for idx2,val in enumerate(sorted(distances)[-num_incorrect:]):
            closest_indices[idx][idx2] = distances[val]

    X_incorrect = []
    for idx, entity in enumerate(closest_indices):
        for idx1,incorrect_entity in enumerate(entity):
            X_incorrect.append(e[incorrect_entity])
    X_incorrect = np.array(X_incorrect)

    np.save('X_incorrect.npy',X_incorrect)

In [12]:
X_comparisons= np.append(X_comparisons,np.load('X_incorrect.npy'),axis = 0)

In [15]:
X_w2v_all = []
X_token_all = []
for w2v_window,token_window in zip(X_w2v,X_token_words):
    for i in range(num_incorrect):
        X_w2v_all.append(w2v_window)
        X_token_all.append(token_window)
        
X_w2v_all=np.append(X_w2v,np.array(X_w2v_all),axis = 0)
X_token_all = np.append(X_token_words,np.array(X_token_all),axis = 0)

In [16]:
Y = np.zeros(X_comparisons.shape[0])
for idx,y in enumerate(Y):
    if idx < X_w2v.shape[0]:
        Y[idx] = 1

# Models

In [17]:
X_train_words,X_test_words,X_train_w2v,X_test_w2v,X_train_comparisons,X_test_comparisons,Y_train,Y_test = train_test_split(
    X_token_all,X_w2v_all,X_comparisons,Y)
X_train = [X_train_words,X_train_w2v,X_train_comparisons]
X_test = [X_test_words,X_test_w2v,X_test_comparisons]

In [29]:
#create custom loss function for cosine distance (for binary classification)

def cosine_distance(vals):
    ytrue = vals[0]
    ypred = vals[1]
    return -K.mean(ytrue * ypred, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

In [30]:
def create_model(optimizer,loss,metrics):
    
    #inputs
    inp_context_words = Input(shape = (context_window,), name='inp_context_words')
    inp_w2v = Input(shape = (context_window,w2v_size), name = 'inp_w2v')
    inp_comparisons = Input(shape = kg_emb_size,name = 'inp_comparisons')
    
    #embed the context words
    emb = Embedding(output_dim = 100, input_dim = num_unique_words, input_length = context_window,
                   name = 'emb1')(inp_context_words)
    emb = Dropout(0.2, name = 'emb2')(emb)
    
    #LSTM input
    lstm_inp = concatenate([inp_w2v,emb], axis = 2, name = 'lstm_inp')
    
    lstm_1 = Bidirectional(LSTM(500,name = 'lstm_layer'))(lstm_inp)
    hidden_1 = Dropout(0.2, name = 'hidden_2')(lstm_1)
    
    lstm_output = Dense(e[0].shape[0],activation='linear',name = 'lstm_output')(hidden_1)
    
    distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([lstm_output, inp_comparisons])
    
    model = Model(inputs=[inp_context_words,inp_w2v,inp_comparisons],outputs = distance)
    
    model.compile(optimizer = optimizer, loss = loss, metrics = metrics)
    
    return model

In [31]:
optimizer = "adam"
loss = 'binary_crossentropy'
metrics = ['accuracy']

batch_size = 64
epochs = 10
validation_split = 0.1
verbose = 1

model = create_model(optimizer,loss,metrics)
model.summary()

W1120 18:22:46.366942 139836567152448 deprecation_wrapper.py:119] From /home/matteo/.local/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W1120 18:22:46.379953 139836567152448 deprecation_wrapper.py:119] From /home/matteo/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W1120 18:22:46.383372 139836567152448 deprecation.py:323] From /home/matteo/.conda/envs/capstone/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inp_context_words (InputLayer)  (None, 10)           0                                            
__________________________________________________________________________________________________
emb1 (Embedding)                (None, 10, 100)      2248300     inp_context_words[0][0]          
__________________________________________________________________________________________________
inp_w2v (InputLayer)            (None, 10, 100)      0                                            
__________________________________________________________________________________________________
emb2 (Dropout)                  (None, 10, 100)      0           emb1[0][0]                       
__________________________________________________________________________________________________
lstm_inp (

In [32]:
# Train model

LSTM_history = model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, 
                    validation_split=validation_split, verbose=verbose)

Train on 140228 samples, validate on 15581 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [33]:
#test set accuracy
y_test_emb = model.predict(X_test)