In [31]:
import json
import torch
import nltk
from InferSent.models import InferSent
import numpy as np
import pandas as pd

In [16]:
def get_infersent(V=2):
    '''
    Builds the InferSent sentence encoder and points it at its word vectors.

    V selects the InferSent version and the matching embedding file:
        1 -> GloVe    ('GloVe/glove.840B.300d.txt')
        2 -> fastText ('fastText/crawl-300d-2M.vec')
    The encoder weights are read from 'encoder/infersent<V>.pkl'.

    Returns the InferSent model with its weights loaded and its w2v path set.
    This function does not build the vocabulary.

    Raises ValueError if V is neither 1 nor 2.
    '''
    w2v_paths = {
        1: 'GloVe/glove.840B.300d.txt',
        2: 'fastText/crawl-300d-2M.vec',
    }
    # Reject unsupported versions before touching the disk so the caller gets
    # a clear message instead of a failure caused by an unbound W2V_PATH.
    if V not in w2v_paths:
        raise ValueError(
            'Unsupported InferSent version %r; expected 1 (GloVe) or 2 (fastText)' % (V,)
        )

    MODEL_PATH = 'encoder/infersent%s.pkl' % V
    W2V_PATH = w2v_paths[V]

    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    infersent = InferSent(params_model)
    # NOTE: torch.load unpickles the file; only load checkpoints from a
    # trusted source.
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent.set_w2v_path(W2V_PATH)

    return infersent

In [5]:
def get_dataset(loc: str):
    '''
    Get the dataset from file location 'loc'.

    loc: path to a JSON file.

    Returns the parsed JSON content exactly as json.load produces it.
    Raises OSError if the file cannot be opened and json.JSONDecodeError if
    the content is not valid JSON.
    '''
    # Decode explicitly as UTF-8 rather than relying on the platform default
    # encoding, which would mis-read non-ASCII text on some systems.
    with open(loc, encoding='utf-8') as infile:
        return json.load(infile)

In [7]:
def get_embedding(infersent, sentences: list):
    '''
    Use sentences to build sentence embeddings for each context using infersent.

    infersent: object exposing encode(list_of_sentences, tokenize=True).
    sentences: list of contexts; each context is itself a list of sentences.

    Returns a numpy array with one entry per context, where entry i is
    whatever infersent.encode returned for context i. When all contexts
    yield the same shape the result is a regular stacked array; otherwise
    it is a 1-D object array holding the per-context arrays.
    '''
    # Apply the format operator so the count is actually substituted.
    print("Getting Sentence Embeddings for %d sentences" % len(sentences))

    # Each element of `sentences` is the list of sentences for one context.
    context_embeddings = [
        infersent.encode(context_sentences, tokenize=True)
        for context_sentences in sentences
    ]

    # Uniform shapes (or no contexts at all): plain conversion is well defined.
    if len({np.shape(emb) for emb in context_embeddings}) <= 1:
        return np.asarray(context_embeddings)

    # Contexts usually differ in sentence count, which makes the list ragged.
    # Recent NumPy releases refuse to convert ragged input implicitly, so
    # fill an object array element by element instead.
    ragged = np.empty(len(context_embeddings), dtype=object)
    for idx, emb in enumerate(context_embeddings):
        ragged[idx] = emb
    return ragged

In [76]:
# Whitespace-tokenise every context and every question; each token list is
# wrapped in its own one-element outer list.
# NOTE(review): `context`, `questions` and `answers` are not defined above this
# cell; they appear to come from the retrieve_data(...) call further down, so
# confirm the intended run order. Also verify the element types: .split()
# needs strings, while retrieve_data builds lists per paragraph.
contextWords = [[passage.split()] for passage in context]
questionWords = [[query.split()] for query in questions]

answerWords = []
for i in range(len(answers)):
    # Tokens of the first answer when one exists, otherwise an empty string.
    current = answers[i][0].split() if len(answers[i]) > 0 else ""
    answerWords.append(current)
        
        
    

In [8]:
def _answer_sentence_index(context, cont_sents, ans_pos):
    '''
    Return the index of the sentence in cont_sents that covers character
    offset ans_pos of context.

    Each sentence is located in the original context text, so whitespace
    between sentences (which the sentence list does not contain) is
    accounted for. Falls back to the last sentence (0 if there are none)
    when ans_pos lies beyond every sentence.
    '''
    cursor = 0
    for idx, sent in enumerate(cont_sents):
        start = context.find(sent, cursor)
        if start < 0:
            # Sentence text not found verbatim; assume it starts at the cursor.
            start = cursor
        end = start + len(sent)
        if ans_pos < end:
            return idx
        cursor = end
    return max(len(cont_sents) - 1, 0)


def retrieve_data(dataset: dict):
    '''
    Retrieves context, questions, answers, and targets from the data.

    dataset: dict with a 'data' list of topics; each topic has 'paragraphs',
    each paragraph has a 'context' string and a 'qas' list.

    Returns (ctx, questions, answers, target):
      ctx       - one list of sentences per paragraph (nltk.sent_tokenize)
      questions - one list of question strings per paragraph
      answers   - one list per paragraph holding the first answer text of
                  each kept question
      target    - flat list with one entry per kept question, in traversal
                  order: the index of the context sentence that contains
                  the start of the answer

    Questions marked 'is_impossible', or with no answers, are skipped.
    '''
    ctx = []
    questions = []
    answers = []
    target = []
    for topic in dataset['data']:
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            cont_sents = nltk.sent_tokenize(context)
            ctx.append(cont_sents)

            c_question = []
            c_answer = []
            for qas in paragraph['qas']:
                # Treat a missing 'is_impossible' key as answerable, and skip
                # anything that has no answer to point at.
                if qas.get('is_impossible', False) or not qas['answers']:
                    continue

                first_answer = qas['answers'][0]
                c_question.append(qas['question'])
                c_answer.append(first_answer['text'])
                # Exactly one target per kept question keeps `target`
                # aligned with the flattened questions.
                target.append(
                    _answer_sentence_index(context, cont_sents, first_answer['answer_start'])
                )

            questions.append(c_question)
            answers.append(c_answer)

    return ctx, questions, answers, target

In [9]:
def build_vocab(infersent, context: list):
    '''
    Gather the sentences of every context into one flat list and pass that
    list to infersent.build_vocab with tokenize=True.

    Returns the same infersent object that was passed in.
    '''
    all_sentences = []
    for context_sentences in context:
        all_sentences.extend(context_sentences)

    infersent.build_vocab(all_sentences, tokenize=True)
    return infersent

In [10]:
def cos_similarity(a,b):
    '''
    Calculate the cosine similarity between a and b
    cos_sim = a.b / |a||b|

    Returns 0.0 when either input has zero norm, since the similarity is
    undefined there and the division would otherwise yield NaN with a
    runtime warning.
    '''
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

In [17]:
## load the pre-trained InferSent model (get_infersent defaults to V=2)
infersent = get_infersent()

In [27]:
## load dataset
# The relative path is resolved against the kernel's working directory.
dataset = get_dataset("train-v2.0.json")

In [28]:
## parse the dataset
# context, questions and answers hold one list per paragraph; target is a
# flat list of sentence indices for the kept questions.
context, questions, answers, target = retrieve_data(dataset)

In [30]:
# Build the encoder vocabulary from every context sentence. The call returns
# the model, which is this cell's displayed value.
build_vocab(infersent, context)

Found 89670(/110274) words with w2v vectors
Vocab size : 89670


InferSent(
  (enc_lstm): LSTM(300, 2048, bidirectional=True)
)

In [112]:
# Embed only the first 22 paragraphs: their context sentences and questions.
ctx_embed = get_embedding(infersent, context[:22])
q_embed = get_embedding(infersent, questions[:22])
# Answer embeddings are currently disabled.
# a_embed = get_embedding(infersent, answers[:22])

Getting Sentence Embeddings for %d sentences 22
Getting Sentence Embeddings for %d sentences 22


In [116]:
## finding max sentences in any paragraph

# Use a dedicated name so the builtin max() stays usable in later cells.
# default=0 covers the case of no paragraphs.
max_sentences = max((len(paragraph) for paragraph in ctx_embed), default=0)
print(max_sentences)

## build one fixed-length similarity row per question
n = len(ctx_embed)
feature_vectors = []
for i in range(n):
    quests = q_embed[i]
    cntxs = ctx_embed[i]
    for j in range(len(quests)):
        # Cosine similarity between question j and every sentence of its context.
        similarities = [cos_similarity(quests[j], cntxs[k]) for k in range(len(cntxs))]
        # Right-pad so every row has max_sentences entries.
        # NOTE(review): the pad value 1.0 equals the maximum cosine
        # similarity, so padded slots look like perfect matches; confirm this
        # is intended before using the rows with a similarity-based selector.
        similarities.extend([1.0] * (max_sentences - len(similarities)))
        feature_vectors.append(similarities)

12


In [119]:
# Inspect the first similarity row: one cosine score per context sentence,
# followed by the 1.0 padding.
print(feature_vectors[0])

[0.39177325, 0.2917924, 0.33045757, 0.3469124, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
