In [16]:
import numpy as np
import pandas as pd

from scipy.spatial.distance import cosine

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import LSTM, Dense, Bidirectional, Embedding, Input, Concatenate
from keras.layers.merge import concatenate

import pickle

from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

# Import data

In [2]:
# Load the pickled intro records that the preprocessing cells iterate over.

with open('../data/sample_labels.pkl', mode='rb') as intros_file:
    intros = pickle.load(intros_file)

# Alternative input file, kept disabled:
#with open('../data/sample_data.pkl', 'rb') as f:
    #intros = pickle.load(f)

In [3]:
# Read the entity-embedding array from disk; rows are later selected through
# the id -> index mapping.

e = np.load('../data/knowledge_graph_data/wiki_DistMult_entity.npy')

In [4]:
# Load the index -> entity-id table, then derive the reverse (id -> index) lookup.
with open('../data/knowledge_graph_data/idx2id_entity_full_no_text.pickle', 'rb') as mapping_file:
    idx2id = pickle.load(mapping_file)

# values() and keys() iterate in the same order, so zipping them swaps each pair.
id2idx = dict(zip(idx2id.values(), idx2id.keys()))

# Data Preprocessing

In [5]:
# Break intros into parallel collections: the text, the (first, last) position of
# every entity mention, and the matching entity IDs. Intros without any entity
# annotation are skipped.

def _as_object_array(items):
    """Pack ``items`` into a 1-D object array holding one element per item.

    ``np.asarray`` on nested lists of differing lengths raises on recent NumPy
    releases, and silently produces a higher-rank array when the lengths happen
    to match. Filling an object array element by element always yields the same
    layout: ``result[i]`` is exactly ``items[i]``.
    """
    packed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        packed[position] = item
    return packed


full_text = []
entity_locations = []
entity_id = []

for intro in intros:
    # intro[0] is the text, intro[1] maps an entity key to a position mask.
    if not intro[1]:
        continue

    ids = []
    spans = []
    for entity_key in intro[1]:
        loc = np.argwhere(intro[1][entity_key] == 1)
        if loc.size == 0:
            # No marked position for this entity: min()/max() would raise, and
            # there is no span to centre a context window on. Skipping both the
            # id and the span keeps the two lists aligned.
            continue
        ids.append(entity_key[3])  # NOTE(review): presumably the entity ID field of the key - confirm
        spans.append((loc.min(), loc.max()))

    full_text.append(intro[0])
    entity_id.append(ids)
    entity_locations.append(spans)

full_text = np.asarray(full_text)
entity_locations = _as_object_array(entity_locations)
entity_id = _as_object_array(entity_id)

In [6]:
# Tokenise every intro twice over:
#   words      - one token list per sentence (all intros pooled together)
#   paragraphs - one flat token list per intro

words = []
paragraphs = []
for paragraph in full_text:
    sentence_tokens = [list(word_tokenize(sentence)) for sentence in sent_tokenize(paragraph)]
    words.extend(sentence_tokens)
    paragraphs.append([token for tokens in sentence_tokens for token in tokens])

In [7]:
# Train a Word2Vec model (sg=1, 100 dimensions, window of 5) on the tokenised
# sentences. min_count=1 keeps every token, so each lookup below succeeds.
w2v_model = Word2Vec(words, min_count = 1, size = 100, window = 5, sg=1)

# For every intro, the list of its token vectors as plain Python lists.
# Vectors are read through `w2v_model.wv[...]`: indexing the model object itself
# is deprecated in gensim, while the `.wv` accessor is the supported lookup.
intro_vectors = [
    [w2v_model.wv[token].tolist() for token in paragraph_tokens]
    for paragraph_tokens in paragraphs
]

  


In [8]:
#get training data
#X will be a list of sublists. Each sublist contains the vectors of words in the context window for each entity.
#X_words holds the matching tokens and Y the entity embedding of the same mention.

context_window = 40 #number of words with entity centered for input to LSTM model
#note: also try including all of intro up to context word (and perhaps few words to the right)


def _context_bounds(center, length, window):
    """Return ``(start, stop)`` slice bounds for a window of tokens around ``center``.

    The window covers ``window // 2`` tokens on each side of ``center``. Room that
    is missing on one side (mention near the start or end of the text) is added
    to the other side. Both bounds are clamped to ``[0, length]``: for texts
    shorter than the window the shifted start would otherwise become negative,
    and a negative slice start wraps around to the tail of the list.
    """
    half = window // 2
    low = max(center - half, 0)
    high = min(center + half, length)
    r_extra = max(0, half - center)           # unused room on the left, moved right
    l_extra = max(center + half - length, 0)  # unused room on the right, moved left
    return max(low - l_extra, 0), min(high + r_extra, length)


#using text sequences
X_words = []
X_w2v = []
Y = []
count_fail=0
count_success=0

for idx,locations in enumerate(entity_locations):
    tokens = paragraphs[idx]
    for idx2,loc in enumerate(locations):
        # Resolve the target first so that a failed lookup never leaves X and Y
        # with different lengths.
        try:
            target = e[id2idx[entity_id[idx][idx2]]]
        except (KeyError, IndexError):
            # KeyError: entity ID missing from the id -> index mapping.
            # IndexError: mapped index outside the embedding array.
            # Any other error is unexpected and is allowed to propagate.
            count_fail+=1
            continue
        start, stop = _context_bounds(loc[0], len(tokens), context_window)
        Y.append(target)
        X_words.append(tokens[start:stop])
        X_w2v.append(intro_vectors[idx][start:stop])
        count_success+=1

# Guard the ratio: with no entity mentions at all the denominator is zero.
count_total = count_success+count_fail
percent_success = 100*(count_success/count_total) if count_total else 0.0
print('Percent success: {}'.format(percent_success))

Percent success: 88.41346378914845


In [24]:
# NOTE(review): this cell was labelled "tokenize X_words", but it does not touch
# X_words - it only builds synthetic data for the cosine loop in the next cell.
# One 100-element vector [0, 1, ..., 99].
xi = [i for i in range(100)]
# 1,000,000 copies of that same vector as a nested Python list. The inner
# comprehension has its own `i`, so the outer loop variable does not affect the rows.
yi = [[i for i in range(100)] for i in range(1000000)]

In [25]:
# Compute scipy's cosine distance between `xi` and every row of `yi`. The return
# value is discarded, so nothing is stored beyond the loop variable `i`.
# NOTE(review): presumably a timing experiment - confirm whether it is still needed.
for i in yi:
    cosine(xi,i)

# Models

In [9]:
def create_model():
    """Build the two-input LSTM model that maps a context window to an entity embedding.

    The model is wired with the functional API: a ``Concatenate`` layer has to be
    *called* on the list of tensors it merges. Passing the branches to its
    constructor inside a ``Sequential`` hands them to the ``axis`` argument and
    never connects them.

    Inputs, in order:
        1. integer word ids of shape ``(context_window,)``, fed to a trainable
           100-dimensional ``Embedding`` sized by the Word2Vec vocabulary;
        2. pre-trained word vectors of shape ``(context_window, w2v_dim)``.

    Both branches use ``context_window`` time steps so they can be concatenated
    along the feature axis. The length of the first sample is not a safe
    stand-in, because windows taken from short intros are shorter.

    Reads the notebook globals ``w2v_model``, ``context_window``, ``X_w2v`` and ``Y``.

    Returns:
        An uncompiled ``keras.models.Model`` taking ``[word_ids, w2v_vectors]``.
    """
    vocab_size = len(w2v_model.wv.vocab)
    w2v_dim = len(X_w2v[0][0])

    # Branch 1: trainable embedding of the word ids.
    word_ids = Input(shape=(context_window,))
    learned_vectors = Embedding(vocab_size, 100, input_length=context_window)(word_ids)

    # Branch 2: the pre-computed Word2Vec vectors, passed through unchanged.
    w2v_vectors = Input(shape=(context_window, w2v_dim))

    # Per time step, stack the two feature vectors side by side.
    merged = Concatenate(axis=-1)([learned_vectors, w2v_vectors])
    hidden = LSTM(10)(merged)

    # Y holds rows of the entity-embedding array, so the output layer needs one
    # unit per embedding dimension; fall back to a single unit for scalar targets.
    target_shape = np.shape(Y[0])
    output_dim = target_shape[-1] if target_shape else 1
    prediction = Dense(output_dim)(hidden)

    return Model(inputs=[word_ids, w2v_vectors], outputs=prediction)

In [10]:
# Training configuration.
optimizer = "adam"
# The targets in Y are rows of the entity-embedding array, i.e. continuous
# vectors. "sparse_categorical_crossentropy" expects integer class labels and
# "accuracy" is not meaningful for real-valued targets, so use a regression
# loss and an error metric instead.
loss = "mean_squared_error"
metrics = ["mae"]

batch_size = 16
epochs = 1
validation_split = 0.1  # fraction of the samples held out for validation
verbose = 1

model = create_model()
model.compile(optimizer=optimizer, loss = loss, metrics = metrics)

W1102 15:05:27.136999 139724238182208 deprecation_wrapper.py:119] From /home/matteo/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1102 15:05:27.145656 139724238182208 deprecation_wrapper.py:119] From /home/matteo/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1102 15:05:27.147412 139724238182208 deprecation_wrapper.py:119] From /home/matteo/.local/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1102 15:05:27.155192 139724238182208 deprecation_wrapper.py:119] From /home/matteo/.local/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



In [None]:
# Train model
LSTM_history = model.fit([X_words,X_w2v], Y, batch_size=batch_size, epochs=epochs, 
                    validation_split=validation_split, verbose=verbose)

Exception ignored in: 'zmq.backend.cython.socket.nbytes'
OverflowError: value too large to convert to int


In [None]:
# TODO: add a preprocessing step that hashes (or, failing that, tokenizes) the input text (X_words) before it is fed to the model