In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

from scipy.spatial.distance import cosine

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Input, Concatenate, Dropout
from keras.layers.merge import concatenate
from keras import backend as K

import pickle

from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

Using TensorFlow backend.


# Import data

In [2]:
data = pd.read_csv('../data/sample_data_1percent.csv')

In [3]:
# Get intros

with open('../data/sample_labels.pkl', 'rb') as f:
    intros = pickle.load(f)

In [6]:
intros[0]

['an abugida from geez abugida or alphasyllabary is a segmental in which consonantvowel sequences are written as a unit each unit is based on a consonant letter and vowel notation is secondary this contrasts with a full alphabet in which vowels have status equal to consonants and with an abjad in which vowel marking is absent or optional although in less formal contexts all three types of script may be termed alphabets the terms also contrast them with a syllabary in which the symbols cannot be split into separate consonants and vowels abugidas include the extensive brahmic family of scripts of south and southeast asia semitic ethiopic scripts and canadian aboriginal syllabics which are themselves based in part on brahmic scripts as is the case for syllabaries the units of the writing system may consist of the representations both of syllables and of consonants for scripts of the brahmic family the term akshara is used for the units abugida as a term in linguistics was proposed by pete

In [4]:
data.head()

Unnamed: 0,text,link_anchor,link_start,link_end,target_wikidata,tokenized_vector,candidates
0,the academy awards also officially and popular...,film industry,120,133,1415395,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,"[93196.0, 2973146.0, 2973208.0, 2973157.0, 328..."
1,the academy awards also officially and popular...,academy of motion picture arts and sciences,156,199,212329,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,"[212329.0, 212329.0]"
2,the academy awards also officially and popular...,art deco,551,559,173782,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,"[173782.0, 4824850.0, 4796770.0, 23307613.0, 2..."
3,the academy awards also officially and popular...,george stanley,603,617,5544783,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,"[3101928.0, 5539480.0, 5544780.0, 5544779.0, 5..."
4,the academy awards also officially and popular...,cedric gibbons,642,656,727904,[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ...,"[727904.0, 5057103.0]"


In [3]:
#get entity embeddings for each entity ID

with open('../data/knowledge_graph_data/wiki_DistMult_entity.npy', 'rb') as f:
    e = np.load(f)

In [4]:
#get entity id to embedding mapping. This is so we can retrieve the 
#entity embeddings when we know the index of the entity

with open('../data/knowledge_graph_data/idx2id_entity_full_no_text.pickle', 'rb') as f:
    idx2id = pickle.load(f)
id2idx = {v: k for k, v in idx2id.items()}

# Data Preprocessing

In [5]:
#Break intros into lists of the text, entity locations, and entity IDs

num_entities = 0
full_text = []
entity_locations = []
entity_id = []

for intro in intros:
    if intro[1]:
        full_text.append(intro[0])
        
        temp = []
        temp1 = []
        
        for idx,entity_key in enumerate(intro[1]):

            temp.append(entity_key[3])
            loc = np.argwhere(intro[1][entity_key]==1)
            temp1.append((loc.min(),loc.max()))
            num_entities+=1
                
        entity_id.append(temp)
        entity_locations.append(temp1)

full_text = np.asarray(full_text)
entity_locations = np.asarray(entity_locations)
entity_id = np.asarray(entity_id)

In [6]:
#create paragraphs array.
#The paragraph array is a list of sublists. Each sublist is a list of words contained in the paragraph.


paragraphs = []
for paragraph in full_text:
    temp = []
    for sentence in sent_tokenize(paragraph):
        for word in word_tokenize(sentence):
            temp.append(word)
    paragraphs.append(temp)

In [7]:
#train w2v model and create intro array
w2v_size = 100
w2v_model = Word2Vec(paragraphs, min_count = 1, size = w2v_size, window = 5, sg=1)
vocab_size = len(w2v_model.wv.vocab)

intro_vectors = []
for sentence in paragraphs:
    temp = []
    for word in sentence:
        temp.append(w2v_model[word])
    intro_vectors.append(temp)

intro_vectors = [[l.tolist() for l in vectors] for vectors in intro_vectors]

  # Remove the CWD from sys.path while we load stuff.


In [8]:
#get training data to be used for LSTM
#X will be a list of sublists. Each sublist contains the vectors of words in the context window for each entity.

context_window = 10 #number of words with entity centered for input to LSTM model

#using text sequences
X_words = []
X_w2v = []
Y = []
count_fail=0
count_success=0

for idx,locations in enumerate(entity_locations):
    for idx2,loc in enumerate(locations):
        low = max(loc[0]-context_window//2,0)
        r_extra = max(0,context_window//2-loc[0])
        high = min(loc[0]+context_window//2,len(paragraphs[idx]))
        l_extra = max(loc[0]+context_window//2-len(paragraphs[idx]),0)
        try:
            Y.append(e[id2idx[entity_id[idx][idx2]]])
            X_words.append(paragraphs[idx][low-l_extra:high+r_extra])
            X_w2v.append(intro_vectors[idx][low-l_extra:high+r_extra])
            count_success+=1
        except:
            count_fail+=1
Y_array = np.zeros((len(Y),Y[0].shape[0]))
for idx,y in enumerate(Y):
    for idx2,y2 in enumerate(y):
        Y_array[idx][idx2] = y2
Y = Y_array
print('Percent success: {}'.format(100*(count_success/(count_success+count_fail))))

Percent success: 88.41346378914845


In [9]:
#We cannot feed words into the LSTM. So we need to tokenize the words
t = Tokenizer()
t.fit_on_texts(X_words)
X_token_words = np.zeros((len(X_words),context_window), dtype=int)
for idx,window in enumerate(X_words):
    for idx2,word in enumerate(window):
        X_token_words[idx][idx2] = t.word_counts[word]
num_unique_words = X_token_words.max()+1

#Convert X_w2v list into array
X_w2v_new = np.zeros((len(X_w2v),context_window,w2v_size))
for idx,window in enumerate(X_w2v):
    for idx2,word in enumerate(window):
        for idx3,emb in enumerate(word):
            X_w2v_new[idx][idx2][idx3] = emb
X_w2v = X_w2v_new

# Models

In [10]:
X_train_words,X_test_words,X_train_w2v,X_test_w2v,Y_train,Y_test = train_test_split(X_token_words,X_w2v,Y)
X_train = [X_train_words,X_train_w2v]
X_test = [X_test_words,X_test_w2v]

In [5]:
#create custom loss function for cosine distance (for binary classification)

def cosine_distance(ytrue,ypred):
    return -K.mean(ytrue * ypred, axis=-1, keepdims=True)

In [34]:
def create_model(optimizer,loss,metrics):
    
    #inputs
    inp_context_words = Input(shape = (context_window,), name='inp_context_words')
    inp_w2v = Input(shape = (context_window,w2v_size), name = 'inp_w2v')
    
    #embed the context words
    emb = Embedding(output_dim = 100, input_dim = num_unique_words, input_length = context_window,
                   name = 'emb1')(inp_context_words)
    emb = Dropout(0.2, name = 'emb2')(emb)
    
    #LSTM input
    lstm_inp = concatenate([inp_w2v,emb], axis = 2, name = 'lstm_inp')
    
    lstm_1 = Bidirectional(LSTM(500,name = 'lstm_layer'))(lstm_inp)
    hidden_1 = Dropout(0.2, name = 'hidden_2')(lstm_1)
    
    output = Dense(e[0].shape[0],activation='linear',name = 'output')(hidden_1)
    
    distance = Lambda(cosine_distance, output_shape=cos_dist_output_shape)([processed_a, processed_b]
    
    model = Model(inputs=[inp_context_words,inp_w2v],outputs = output)
    
    model.compile(optimizer = optimizer, loss = loss, metrics = metrics)
    
    return model

In [37]:
optimizer = "adam"
loss = cosine_distance
metrics = costine_distance

batch_size = 64
epochs = 10
validation_split = 0.1
verbose = 1

model = create_model(optimizer,loss,metrics)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inp_context_words (InputLayer)  (None, 10)           0                                            
__________________________________________________________________________________________________
emb1 (Embedding)                (None, 10, 100)      2248300     inp_context_words[0][0]          
__________________________________________________________________________________________________
inp_w2v (InputLayer)            (None, 10, 100)      0                                            
__________________________________________________________________________________________________
emb2 (Dropout)                  (None, 10, 100)      0           emb1[0][0]                       
__________________________________________________________________________________________________
lstm_inp (

In [None]:
# Train model

LSTM_history = model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs, 
                    validation_split=validation_split, verbose=verbose)

Train on 12747 samples, validate on 1417 samples
Epoch 1/10


In [18]:
#test set accuracy
y_test_emb = model.predict(X_test)