In [35]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

In [43]:
# Identifiers handed to ``from_pretrained``: a checkpoint name for the tokenizer
# and a relative directory path for the question-answering weights.
TOKENIZER_CHECKPOINT = "distilbert-base-uncased-distilled-squad"
QA_MODEL_DIR = "../models/SQuAD2_trained_model"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(QA_MODEL_DIR)

In [44]:
def answer_questions(questions, text):
    """Print the most likely answer span in ``text`` for each question.

    Uses the notebook-level ``tokenizer`` and ``model``. For every question the
    (question, text) pair is encoded, the model scores each token as a possible
    answer start / end, and the tokens between the two arg-max positions are
    decoded and printed.

    Args:
        questions: iterable of question strings.
        text: the context passage the answers are extracted from.

    Returns:
        None. Results are written to stdout as ``Question: ...`` / ``Answer: ...``.
    """
    for question in questions:
        inputs = tokenizer.encode_plus(question, text, return_tensors="pt")
        # Need to pop token type ids when using distilbert because this model does not
        # handle them, but the encoder still sets them for some reason.
        inputs.pop('token_type_ids', None)
        input_ids = inputs["input_ids"].tolist()[0]

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(**inputs)

        # Positional access works both for a plain tuple return value and for a
        # dict-like model output; iterating/unpacking a dict-like output would
        # yield its keys instead of the score tensors.
        answer_start_scores, answer_end_scores = outputs[0], outputs[1]

        # Get the most likely beginning of answer with the argmax of the score
        answer_start = int(torch.argmax(answer_start_scores))
        # Get the most likely end of answer with the argmax of the score
        # (+1 because the slice end is exclusive).
        answer_end = int(torch.argmax(answer_end_scores)) + 1

        answer_tokens = tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
        answer = tokenizer.convert_tokens_to_string(answer_tokens)

        print(f"Question: {question}")
        print(f"Answer: {answer}\n")

In [45]:
# Context passage the question-answering model reads its answers from.
text = '''My favorite color is red. My favorite color is NOT blue. There are 100 billion neurons in the brain. 
There are infinitely many prime numbers. 
The Cure are an English rock band formed in Crawley, West Sussex, in 1978.'''

# The final question is about something the passage never mentions.
questions = [
    "When did The Cure form?",
    "How many prime numbers are there?",
    "What is my favorite color?",
    "how many neurons are there in my brain?",
    "When did Lincoln die?",
]

answer_questions(questions, text)

Question: When did The Cure form?
Answer: 1978

Question: How many prime numbers are there?
Answer: infinitely many

Question: What is my favorite color?
Answer: red

Question: how many neurons are there in my brain?
Answer: 100 billion

Question: When did Lincoln die?
Answer: [CLS]



In [17]:
class AnswerJoiner(nn.Module):
    """Two-layer feed-forward classifier over a question and its candidate answers.

    The network expects one flat feature vector per example of width
    ``embedding_dim * (num_answers + 1)`` and produces ``num_answers`` logits,
    i.e. one score per candidate answer.

    Args:
        embeddings: array/tensor whose last dimension is the embedding width;
            only its ``shape`` is read, the values are not stored.
        hidden_dim: width of the hidden layer.
        num_answers: number of candidate answers (and therefore output logits).
    """

    def __init__(self, embeddings, hidden_dim, num_answers):
        super().__init__()

        # Only the embedding width is needed to size the first layer.
        self.embedding_dim = embeddings.shape[-1]

        self.hidden_dim = hidden_dim
        self.num_answers = num_answers
        self.input_size = self.embedding_dim * (self.num_answers + 1)

        self.tanh = nn.Tanh()
        self.W1 = nn.Linear(self.input_size, self.hidden_dim)
        # One logit per candidate answer, which is the layout
        # ``nn.CrossEntropyLoss`` expects for class-index targets.
        self.W2 = nn.Linear(self.hidden_dim, self.num_answers)

        self.dropout_layer = nn.Dropout(p=0.25)

    def forward(self, embedded_q_and_a_s, correct_answer=None):
        """Score the candidate answers, or compute the training loss.

        Args:
            embedded_q_and_a_s: float tensor with last dimension ``input_size``.
            correct_answer: optional tensor of target class indices. When given,
                the cross-entropy loss against it is returned instead of logits.

        Returns:
            The scalar loss if ``correct_answer`` is provided, otherwise the
            logits tensor with last dimension ``num_answers``.
        """
        embeds = self.dropout_layer(embedded_q_and_a_s)
        pred_answer = self.W2(self.tanh(self.W1(embeds)))

        if correct_answer is not None:
            loss_fct = nn.CrossEntropyLoss()
            return loss_fct(pred_answer, correct_answer)
        return pred_answer

In [19]:
def train(embeddings, pred_answers, num_answers, correct_answers, hidden_dim, epochs, learning_rate):
    """Fit an ``AnswerJoiner`` with Adam and return the trained model.

    Every epoch performs a single full-batch step: zero the gradients, compute
    the loss on all examples, back-propagate, and update the weights. The loss
    value is printed after each step.

    Args:
        embeddings: forwarded to ``AnswerJoiner`` to determine the embedding width.
        pred_answers: model inputs, convertible to a float tensor.
            NOTE(review): assumed to be the concatenated question/answer
            embeddings with last dimension ``embedding_dim * (num_answers + 1)``
            -- confirm against the data-preparation code.
        num_answers: number of candidate answers per example.
        correct_answers: target class indices, convertible to a long tensor.
        hidden_dim: hidden layer width of the ``AnswerJoiner``.
        epochs: number of optimisation steps to run.
        learning_rate: Adam learning rate.

    Returns:
        The trained ``AnswerJoiner`` (left in training mode, on the device used).
    """
    # Use the GPU when one is present, but keep the function usable on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AnswerJoiner(embeddings, hidden_dim, num_answers)
    model.to(device)

    inputs = torch.as_tensor(pred_answers, dtype=torch.float32, device=device)
    targets = torch.as_tensor(correct_answers, dtype=torch.long, device=device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(epochs):
        # Gradients accumulate across backward() calls unless cleared each step.
        optimizer.zero_grad()

        loss = model(inputs, targets)

        loss.backward()
        optimizer.step()

        print("loss: ", loss.item())

    return model

In [32]:
from models import InferSent
# InferSent release to load; the checkpoint file name is derived from it.
model_version = 2
MODEL_PATH = f"encoder/infersent{model_version}.pkl"

# Settings handed to the InferSent constructor.
params_model = {
    'bsize': 64,
    'word_emb_dim': 300,
    'enc_lstm_dim': 2048,
    'pool_type': 'max',
    'dpout_model': 0.0,
    'version': model_version,
}

# Build the encoder, then restore its weights from the checkpoint on disk.
Infermodel = InferSent(params_model)
Infermodel.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [33]:
# Pick the word-vector file for the chosen encoder version:
# version 1 -> GloVe vectors, any other version -> fastText crawl vectors.
if model_version == 1:
    W2V_PATH = 'GloVe/glove.840B.300d.txt'
else:
    W2V_PATH = 'fastText/crawl-300d-2M.vec'
Infermodel.set_w2v_path(W2V_PATH)

In [34]:
# Build the encoder vocabulary from the embeddings of the K most frequent words.
Infermodel.build_vocab_k_words(K=100_000)

Vocab size : 100000


In [None]:
# Encode a list of sentences with the InferSent encoder and keep the result.
# NOTE(review): the sentence list passed here is empty, so nothing is actually
# encoded yet -- presumably a placeholder for the question/answer sentences.
# Confirm how InferSent.encode behaves on empty input before running this cell.
embeddings = Infermodel.encode([], bsize=128, tokenize=False, verbose=True)