In [1]:
import json
import torch
import nltk
from InferSent.models import InferSent
import numpy as np
import pandas as pd
from textblob import TextBlob
from sklearn.linear_model import LogisticRegression 
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from sklearn.metrics import accuracy_score
import warnings
warnings.simplefilter("ignore")
import pickle
import spacy as sc

In [19]:
def get_infersent(V=2):
    '''
    Builds the infersent model using either GloVe or fastText
    '''
    MODEL_PATH = 'encoder/infersent%s.pkl' %V
    if V == 2:
        W2V_PATH = 'fastText/crawl-300d-2M.vec'
    elif V == 1:
        W2V_PATH = 'GloVe/glove.840B.300d.txt'
    
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, \
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
    infersent = InferSent(params_model)
    infersent.load_state_dict(torch.load(MODEL_PATH))
    infersent.set_w2v_path(W2V_PATH)

    return infersent

In [20]:
def get_dataset(loc: str):
    '''
    Get the dataset from file location 'loc'
    '''
    with open(loc) as infile:
        dataset = json.load(infile)
    
    return dataset

In [21]:
def get_embedding(infersent, sentences: list):
    '''
    Use sentences to build a sentence embedding for each context using infersent.
    Returns a list of sentence embeddings
    '''

    print("Getting Sentence Embeddings for %d sentences", len(sentences))
    # outputs a numpy array with n vectors of dimension 4096
    context_embeddings = []
    for sentence in sentences:
        # sentence is actually a list of sentences for context_i
        embeddings = infersent.encode(sentence, tokenize=True)
        context_embeddings.append(embeddings)
    
    return np.asarray(context_embeddings)

In [22]:
def retrieve_data(dataset: dict):
    '''
    Retrieves context, questions, and targets from the data
    Context will return a list of lists for each sentence in a given context
    Questions will return a list of lists of questions for each context
    Targets will return a list of target values that correspond to each question.
    Target values are equivalent to the sentence number within the context that contains the answer to the question
    '''
    data = dataset['data']
    target = []
    ctx = [] 
    questions = [] 
    answers = []
    for topic in data:
        sentences = []
        for paragraph in topic['paragraphs']:
            context = paragraph['context']
            cont_sents = nltk.sent_tokenize(context)

            c_question = []
            c_answer = []
            c_target = []
            for qas in paragraph['qas']:
                if qas['is_impossible']:
                    # skip impossible questions
                    continue
                question = qas['question']
                answer = qas['answers'][0]['text']
                c_question.append(question)
                c_answer.append(answer)

                ans_pos = qas['answers'][0]['answer_start']
                if ans_pos == 0: ## finding in the test set
                    # find which sentence the answer is part of
                    for i, sent in enumerate(cont_sents):
                        if answer in sent:
                            c_target.append(i)
                            break
                else:
                    acc = 0
                    # find which sentence the answer is part of
                    for i, sent in enumerate(cont_sents):
                        acc += len(sent) + 1
                        if acc > ans_pos:
                            # answer is in sentence i
                            c_target.append(i)
                            break
            if len(c_question) > 0:
                if len(c_target) != len(c_question):
                    c_target.append(3)
                ctx.append(cont_sents)
                target.append(c_target)
                questions.append(c_question)
                answers.append(c_answer)
    
    return ctx, questions, answers, target

In [23]:
def build_vocab(infersent, context: list):
    '''
    Flattens the context and then builds the vocab
    '''
    flat_context = [sentence for c in context for sentence in c] 
    infersent.build_vocab(flat_context, tokenize=True)

    return infersent

In [24]:
def cos_similarity(a,b):
    '''
    Calculate the cosine similiarity between a and b
    cos_sim = a.b / |a||b|
    '''
    return np.dot(a,b) / (np.linalg.norm(a) * np.linalg.norm(b))

In [25]:
## load inferscent pre-trained model
infersent = get_infersent()

In [8]:
## load dataset
dataset = get_dataset("train-v2.0.json")

In [9]:
## parse the dataset
context, questions, answers, target = retrieve_data(dataset)

In [27]:
print(target[0])

[1, 1, 3, 1, 1, 1, 3, 2, 1, 1, 3, 3, 3, 1, 3]


In [28]:
build_vocab(infersent, context)

Found 89181(/109646) words with w2v vectors
Vocab size : 89181


InferSent(
  (enc_lstm): LSTM(300, 2048, bidirectional=True)
)

In [29]:
ctx_embed = get_embedding(infersent, context[:20])
q_embed = get_embedding(infersent, questions[:20])
# a_embed = get_embedding(infersent, answers[:22])

Getting Sentence Embeddings for %d sentences 20
Getting Sentence Embeddings for %d sentences 20


In [13]:
f = open('questions_embeddings.txt', 'rb')
q_embed = pickle.load(f)
f.close()

In [14]:
f = open('context_embeddings.txt', 'rb')
ctx_embed = pickle.load(f)
f.close()

In [3]:
## finding max sentences in any paragraph

max = 0
for i in range(len(ctx_embed)):
    temp = len(ctx_embed[i])
    if temp > max:
        max = temp

n = len(ctx_embed)
feature_vectors = []
for i in range(n):
    targ = target[i]
    quests = q_embed[i]
    cntxs = ctx_embed[i]
    for j in range(len(quests)):
        similarities = []
        for k in range(len(cntxs)):
            a = cos_similarity(quests[j], cntxs[k])
            similarities.append(a)
        if len(similarities) < max:
            diff = max - len(similarities)
            for i in range(diff):
                similarities.append(1.0)
        similarities.append(targ[j])
        feature_vectors.append(similarities)

NameError: name 'ctx_embed' is not defined

In [31]:
print(len(feature_vectors))

251


In [16]:
feature_frame = pd.DataFrame.from_records(feature_vectors)

In [17]:
feature_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.391773,0.291792,0.330458,0.346912,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
1,0.474997,0.489037,0.375269,0.450412,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
2,0.580804,0.525381,0.415801,0.538072,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3
3,0.436262,0.405446,0.34835,0.394128,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
4,0.535396,0.401412,0.410652,0.464892,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1


In [18]:
# for i in range(len(context)):
from nltk.tree import Tree
nlp = sc.load("en_core_web_sm")
max_l = 10
root_word_features = []
for i in range(len(context)):
    current_context = context[i]
    current_context_questions = questions[i]
    context_roots = []
    for sent in current_context:
        doc = nlp(str(sent))
        context_roots.append([st.stem(chunk.root.head.text.lower()) for chunk in doc.noun_chunks])
    for ques in current_context_questions:
        q_doc = nlp(ques)
        ques_roots = [st.stem(chunk.root.head.text.lower()) for chunk in q_doc.noun_chunks]
        temp_arr = []
        for r in ques_roots:
            for i in range(len(context_roots)):
                if r in context_roots[i]:
                    temp_arr.append(1)
                    break
                else:
                    temp_arr.append(0)
            if len(temp_arr) < max_l:
                for i in range(max_l - len(temp_arr)):
                    temp_arr.append(0)
            elif len(temp_arr) > max_l:
                temp_arr = temp_arr[:10]
        root_word_features.append(temp_arr)
print(root_word_features)

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 

In [19]:
print(len(root_word_features))

86821


In [25]:
feature_frame_2 = pd.DataFrame.from_records(root_word_features)

In [26]:
feature_frame_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# f = open('root_wf_frame.txt', 'wb')
# pickle.dump(feature_frame_2, f)
# f.close()

In [33]:
feature_frame_2.rename(columns = {0:'column_root_0', 1:'column_root_1',2:'column_root_2', 3:'column_root_3',
                                 4:'column_root_4',5:'column_root_5',6:'column_root_6',7:'column_root_7',
                                 8:'column_root_8',9:'column_root_9'}, inplace = True) 

In [34]:
feature_frame_2.head()

Unnamed: 0,column_root_0,column_root_1,column_root_2,column_root_3,column_root_4,column_root_5,column_root_6,column_root_7,column_root_8,column_root_9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# f = open('root_wf_frame_final.txt', 'wb')
# pickle.dump(feature_frame_2, f)
# f.close()

In [36]:
f = open('d_full.txt', 'rb')
feature_frame = pickle.load(f)
f.close()

In [37]:
feature_frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,0.391773,0.291792,0.330458,0.346912,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
1,0.474997,0.489037,0.375269,0.450412,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
2,0.580804,0.525381,0.415801,0.538072,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3
3,0.436262,0.405446,0.34835,0.394128,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
4,0.535396,0.401412,0.410652,0.464892,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1


In [39]:
Y = feature_frame.iloc[:, -1]
X = feature_frame.iloc[:, 0:11]

In [40]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.391773,0.291792,0.330458,0.346912,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.474997,0.489037,0.375269,0.450412,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.580804,0.525381,0.415801,0.538072,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.436262,0.405446,0.34835,0.394128,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.535396,0.401412,0.410652,0.464892,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [44]:
X.rename(columns = {0:'sentence_0', 1:'sentence_1',2:'sentence_2', 3:'sentence_3',
                                 4:'sentence_4',5:'sentence_5',6:'sentence_6',7:'sentence_7',
                                 8:'sentence_8',9:'sentence_9', 10:'sentence_10'}, inplace = True) 

In [45]:
X.head()

Unnamed: 0,sentence_0,sentence_1,sentence_2,sentence_3,sentence_4,sentence_5,sentence_6,sentence_7,sentence_8,sentence_9,sentence_10
0,0.391773,0.291792,0.330458,0.346912,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.474997,0.489037,0.375269,0.450412,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.580804,0.525381,0.415801,0.538072,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.436262,0.405446,0.34835,0.394128,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.535396,0.401412,0.410652,0.464892,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [47]:
final_features = X.join(feature_frame_2)

In [48]:
final_features.head()

Unnamed: 0,sentence_0,sentence_1,sentence_2,sentence_3,sentence_4,sentence_5,sentence_6,sentence_7,sentence_8,sentence_9,...,column_root_0,column_root_1,column_root_2,column_root_3,column_root_4,column_root_5,column_root_6,column_root_7,column_root_8,column_root_9
0,0.391773,0.291792,0.330458,0.346912,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.474997,0.489037,0.375269,0.450412,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.580804,0.525381,0.415801,0.538072,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.436262,0.405446,0.34835,0.394128,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.535396,0.401412,0.410652,0.464892,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
r,c = final_features.shape
print(r)

86821


In [52]:
f = open('final_target_dataframe.txt', 'wb')
pickle.dump(Y, f)
f.close()

In [51]:
Y.head()

0    1
1    1
2    3
3    1
4    1
Name: 27, dtype: int64

In [20]:
f = open('final_features_dataframe.txt', 'rb')
final_features = pickle.load(f)
f.close()

In [23]:
f = open('final_target_dataframe.txt', 'rb')
Y = pickle.load(f)
f.close()

In [24]:
final_features = final_features.fillna(0)

In [25]:
clf = LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial')
clf.fit(final_features, Y)
Y_pred = clf.predict(final_features)
print(accuracy_score(Y, Y_pred))

0.47843263726517776


In [27]:
nn = MLPClassifier(hidden_layer_sizes=(20,20), max_iter=1000, alpha=1e-4,
                    solver='sgd', verbose=10, tol=1e-4, random_state=1,
                    learning_rate_init=.01)
nn.fit(final_features, Y)
Y_pred = nn.predict(final_features)
print(accuracy_score(Y, Y_pred))

Iteration 1, loss = 1.69021036
Iteration 2, loss = 1.51174722
Iteration 3, loss = 1.43498972
Iteration 4, loss = 1.34745293
Iteration 5, loss = 1.25596052
Iteration 6, loss = 1.18559785
Iteration 7, loss = 1.14209901
Iteration 8, loss = 1.11905521
Iteration 9, loss = 1.10182137
Iteration 10, loss = 1.08864683
Iteration 11, loss = 1.07779918
Iteration 12, loss = 1.06790958
Iteration 13, loss = 1.05921232
Iteration 14, loss = 1.04901931
Iteration 15, loss = 1.04269697
Iteration 16, loss = 1.03676960
Iteration 17, loss = 1.03115220
Iteration 18, loss = 1.02699883
Iteration 19, loss = 1.02040058
Iteration 20, loss = 1.01200624
Iteration 21, loss = 1.00411045
Iteration 22, loss = 0.99537330
Iteration 23, loss = 0.98550922
Iteration 24, loss = 0.97927048
Iteration 25, loss = 0.97319982
Iteration 26, loss = 0.96789596
Iteration 27, loss = 0.96367253
Iteration 28, loss = 0.95919488
Iteration 29, loss = 0.95574752
Iteration 30, loss = 0.95300944
Iteration 31, loss = 0.94961441
Iteration 32, los

Iteration 253, loss = 0.89420573
Iteration 254, loss = 0.89391909
Iteration 255, loss = 0.89353006
Iteration 256, loss = 0.89374030
Iteration 257, loss = 0.89389850
Iteration 258, loss = 0.89389430
Iteration 259, loss = 0.89315451
Iteration 260, loss = 0.89363950
Iteration 261, loss = 0.89318166
Iteration 262, loss = 0.89354230
Iteration 263, loss = 0.89338698
Iteration 264, loss = 0.89391961
Iteration 265, loss = 0.89324314
Iteration 266, loss = 0.89327519
Iteration 267, loss = 0.89369150
Iteration 268, loss = 0.89264965
Iteration 269, loss = 0.89335560
Iteration 270, loss = 0.89332628
Iteration 271, loss = 0.89310874
Iteration 272, loss = 0.89281244
Iteration 273, loss = 0.89269277
Iteration 274, loss = 0.89226599
Iteration 275, loss = 0.89296454
Iteration 276, loss = 0.89279945
Iteration 277, loss = 0.89249070
Iteration 278, loss = 0.89282429
Iteration 279, loss = 0.89244174
Iteration 280, loss = 0.89255154
Iteration 281, loss = 0.89333910
Iteration 282, loss = 0.89262257
Iteration 

In [74]:
svc = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
svc.fit(X, Y)
Y_pred = svc.predict(X)
print(accuracy_score(Y, Y_pred))

0.5580216767832668


In [26]:
test_dataset = get_dataset("dev-v2.0.json")

In [27]:
test_context, test_questions, test_answers, test_target = retrieve_data(test_dataset)

In [11]:
print(test_target[0])

[0, 0, 1, 1, 3]


In [12]:
build_vocab(infersent, test_context)

Found 18479(/19812) words with w2v vectors
Vocab size : 18479


InferSent(
  (enc_lstm): LSTM(300, 2048, bidirectional=True)
)

In [13]:
test_ctx_embed = get_embedding(infersent, test_context)
test_q_embed = get_embedding(infersent, test_questions)

Getting Sentence Embeddings for %d sentences 1204
Getting Sentence Embeddings for %d sentences 1204


In [13]:
# for i in range(20):
#     test_current_context = test_context[i]
#     test_current_questions = test_questions[i]
    
#     test_ctx_embed = get_embedding(infersent, test_current_context)
#     test_q_embed = get_embedding(infersent, test_current_questions)
    
#     similarity_vectors = getSimilarityVector(test_ctx_embed,test_q_embed)
#     features = pd.DataFrame.from_records(similarity_vectors)
#     feat
#     root_match_vectors = getRootMatchVector()
    

In [14]:
max = 0
for i in range(len(test_ctx_embed)):
    temp = len(test_ctx_embed[i])
    if temp > max:
        max = temp

n = len(test_ctx_embed)
test_feature_vectors = []
for i in range(n):
    targ = test_target[i]
    quests = test_q_embed[i]
    cntxs = test_ctx_embed[i]
    for j in range(len(quests)):
        similarities = []
        for k in range(len(cntxs)):
            a = cos_similarity(quests[j], cntxs[k])
            similarities.append(a)
        if len(similarities) < max:
            diff = max - len(similarities)
            for i in range(diff):
                similarities.append(1.0)
        similarities.append(targ[j])
        test_feature_vectors.append(similarities)

In [15]:
test_features = pd.DataFrame.from_records(test_feature_vectors)

In [16]:
test_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.387645,0.249265,0.228349,0.222138,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
1,0.48867,0.309683,0.308471,0.276238,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0
2,0.339817,0.328267,0.355208,0.275369,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
3,0.1996,0.241099,0.163828,0.132389,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
4,0.577559,0.472216,0.52645,0.665151,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3


In [17]:
test_Y = test_features.iloc[:, -1]
test_X = test_features.iloc[:, 0:11]

In [18]:
test_X.rename(columns = {0:'sentence_0', 1:'sentence_1',2:'sentence_2', 3:'sentence_3',
                                 4:'sentence_4',5:'sentence_5',6:'sentence_6',7:'sentence_7',
                                 8:'sentence_8',9:'sentence_9', 10:'sentence_10'}, inplace = True) 

In [20]:
test_X.head()

Unnamed: 0,sentence_0,sentence_1,sentence_2,sentence_3,sentence_4,sentence_5,sentence_6,sentence_7,sentence_8,sentence_9,sentence_10
0,0.387645,0.249265,0.228349,0.222138,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.48867,0.309683,0.308471,0.276238,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,0.339817,0.328267,0.355208,0.275369,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.1996,0.241099,0.163828,0.132389,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.577559,0.472216,0.52645,0.665151,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
f = open('test_similarity_features.txt', 'wb')
pickle.dump(test_X, f)
f.close()

In [22]:
f = open('test_target.txt', 'wb')
pickle.dump(test_Y, f)
f.close()

In [11]:
nlp = sc.load("en_core_web_sm")
max_l = 10
test_root_word_features = []
for i in range(len(test_context)):
    current_context = test_context[i]
    current_context_questions = test_questions[i]
    context_roots = []
    for sent in current_context:
        doc = nlp(str(sent))
        context_roots.append([st.stem(chunk.root.head.text.lower()) for chunk in doc.noun_chunks])
    for ques in current_context_questions:
        q_doc = nlp(ques)
        ques_roots = [st.stem(chunk.root.head.text.lower()) for chunk in q_doc.noun_chunks]
        temp_arr = []
        for r in ques_roots:
            for i in range(len(context_roots)):
                if r in context_roots[i]:
                    temp_arr.append(1)
                    break
                else:
                    temp_arr.append(0)
            if len(temp_arr) < max_l:
                for i in range(max_l - len(temp_arr)):
                    temp_arr.append(0)
            elif len(temp_arr) > max_l:
                temp_arr = temp_arr[:10]
        test_root_word_features.append(temp_arr)
print(len(test_root_word_features))

5928


In [12]:
test_root_features = pd.DataFrame.from_records(test_root_word_features)

In [13]:
test_root_features.rename(columns = {0:'column_root_0', 1:'column_root_1',2:'column_root_2', 3:'column_root_3',
                                 4:'column_root_4',5:'column_root_5',6:'column_root_6',7:'column_root_7',
                                 8:'column_root_8',9:'column_root_9'}, inplace = True)

In [14]:
test_root_features.head()

Unnamed: 0,column_root_0,column_root_1,column_root_2,column_root_3,column_root_4,column_root_5,column_root_6,column_root_7,column_root_8,column_root_9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
f = open('test_similarity_features.txt', 'rb')
test_X = pickle.load(f)
f.close()

In [17]:
test_final_features = test_X.join(test_root_features)

In [18]:
test_final_features.head()

Unnamed: 0,sentence_0,sentence_1,sentence_2,sentence_3,sentence_4,sentence_5,sentence_6,sentence_7,sentence_8,sentence_9,...,column_root_0,column_root_1,column_root_2,column_root_3,column_root_4,column_root_5,column_root_6,column_root_7,column_root_8,column_root_9
0,0.387645,0.249265,0.228349,0.222138,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.48867,0.309683,0.308471,0.276238,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.339817,0.328267,0.355208,0.275369,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.1996,0.241099,0.163828,0.132389,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.577559,0.472216,0.52645,0.665151,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
f = open('test_final_features.txt', 'wb')
pickle.dump(test_final_features, f)
f.close()

In [31]:
f = open('test_target.txt', 'rb')
test_Y = pickle.load(f)
f.close()

In [29]:
test_final_features = test_final_features.fillna(0)

In [32]:
test_Y_pred = nn.predict(test_final_features)
print(accuracy_score(test_Y, test_Y_pred))

0.6781376518218624


In [48]:
f = open('test_set_predictions.txt', 'wb')
pickle.dump(test_Y_pred, f)
f.close()

In [33]:
print(len(test_Y_pred))

5928


In [35]:
r = test_Y.shape
print(r)

(5928,)


In [15]:
f = open('test_set_predictions.txt', 'rb')
test_Y_pred = pickle.load(f)
f.close()

f = open('test_target.txt', 'rb')
test_Y = pickle.load(f)
f.close()

In [16]:
indexes = []
for i in range(5928):
    if test_Y[i] == test_Y_pred[i]:
        indexes.append(i)

In [38]:
top_indexes = indexes
print(len(top_indexes))

4020


In [39]:
questions = []
answers = []
q_num = 0
for i in range(len(test_context)):
    c_test = test_context[i]
    q_test = test_questions[i]
    t_test = test_target[i]
    for j in range(len(q_test)):
        if q_num in top_indexes:
            questions.append(q_test[j])
            answers.append(c_test[t_test[j]])
        q_num += 1
print(questions[2:3])
print(answers[2:3])
    

['What century did the Normans first gain their separate identity?']
['The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries.']


In [41]:
print(len(questions))

4020


In [42]:
f = open('questions.txt', 'wb')
pickle.dump(questions, f)
f.close()

In [43]:
f = open('answers.txt', 'wb')
pickle.dump(answers, f)
f.close()

In [44]:
f = open('questions.txt', 'rb')
new_questions = pickle.load(f)
f.close()

In [45]:
f = open('answers.txt', 'rb')
new_answers = pickle.load(f)
f.close()

In [53]:
print(new_questions[8])

Who did Rollo sign the treaty of Saint-Clair-sur-Epte with?


In [54]:
print(new_answers[8])

The Duchy of Normandy, which began in 911 as a fiefdom, was established by the treaty of Saint-Clair-sur-Epte between King Charles III of West Francia and the famed Viking ruler Rollo, and was situated in the former Frankish kingdom of Neustria.


In [2]:
nlp = sc.load("en_core_web_sm")

In [88]:
quest = questions[486]
ans_sentence = answers[486]

doc = nlp(ans_sentence)
print(quest)
print(ans_sentence)
for ent in doc.ents:
    print(ent.text, ent.label_)


Where were the narrow gauge rail lines built in Victoria?
Two tourist railways operate over 760 mm (2 ft 6 in) narrow gauge lines, which are the remnants of five formerly government-owned lines which were built in mountainous areas.
Two CARDINAL
760 mm QUANTITY
2 ft 6 CARDINAL
five CARDINAL


In [87]:
for i in range(500):
    quest = questions[i]
    ans_sentence = answers[i]

    if quest.startswith('Who'):
        doc = nlp(ans_sentence)
        print(i)
        print("Question : " + quest)
        print("Answer : " + ans_sentence )
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                if str(ent) not in quest:
                    print("Selected ans : " + str(ent))
                    break
    elif quest.startswith("Where"):
        doc = nlp(ans_sentence)
        print(i)
        print("Question : " + quest)
        print("Answer : " + ans_sentence )
        for ent in doc.ents:
            if ent.label_ == "EVENT":
                print("Selected ans : " + str(ent))
                break
            elif ent.label_ == "EVENT": 
                print("Selected ans : " + str(ent))
                break
            elif ent.label_ == "ORDINAL": 
                print("Selected ans : " + str(ent))
                break

3
Question : Who ruled the duchy of Normandy
Answer : The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure.
Selected ans : Richard I of Normandy
8
Question : Who did Rollo sign the treaty of Saint-Clair-sur-Epte with?
Answer : The Duchy of Normandy, which began in 911 as a fiefdom, was established by the treaty of Saint-Clair-sur-Epte between King Charles III of West Francia and the famed Viking ruler Rollo, and was situated in the former Frankish kingdom of Neustria.
Selected ans : Charles III
11
Question : Who was the Normans' main enemy in Italy, the Byzantine Empire and Armenia?
Answer : Soon after the Normans began to enter Italy, they entered the Byzantine Empire and then Armenia, fighting against the Pechenegs, the Bulgars, and especially the Seljuk Turks.
14
Question : Who ruined Roussel de Bailleul's plans for an inde

398
Question : Who was given the highlights of most of the matches?
Answer : The BBC was given the highlights of most of the matches, while BSkyB paying £304m for the Premier League rights, would give them a monopoly of all live matches, up to 60 per year from the 1992 season.
407
Question : Who did BSkyB compete with initially?
Answer : Key selling points were the improvement in picture and sound quality, increased number of channels and an interactive service branded Open.... now called Sky Active, BSkyB competed with the ONdigital (later ITV Digital) terrestrial offering and cable services.
426
Question : Where according to gross state product does Victoria rank in Australia?
Answer : Victoria's total gross state product (GSP) is ranked second in Australia, although Victoria is ranked fourth in terms of GSP per capita because of its limited mining activity.
Selected ans : second
429
Question : Where is the Asian influence strongest in Victoria?
Answer : Many Chinese miners worked in