In [1]:
# Widen the notebook container to 80% of the browser window so long lines stay readable.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; } </style>"))

### Convert HotpotQA to SQuAD format

Following the Longformer paper, the input uses the following format with special tokens: “[CLS] [q] question [/q] [p] sent1,1 [s] sent1,2 [s] ... [p] sent2,1 [s] sent2,2 [s] ...”,
where [s] and [p] are special tokens marking sentences and paragraphs, respectively. The special tokens are added to the RoBERTa vocabulary and randomly initialized before task fine-tuning.

In [2]:
import tqdm 
from datetime import datetime 
import pytz 
# US Mountain time zone object (not referenced in the cells visible here).
timeZ_Az = pytz.timezone('US/Mountain') 

# Marker strings inserted into the text when an example is serialised into one sequence.
# NOTE(review): only TITLE_START, TITLE_END and SENT_MARKER_END are used in the cells
# visible here; the others are presumably consumed elsewhere -- confirm.
QUESTION_START = '[question]'
QUESTION_END = '[/question]' 
TITLE_START = '<t> , '  # marks the start of a paragraph title (also used for the loss over paragraphs)
TITLE_END = ', </t> . '   # marks the end of a paragraph title; the extra ',' keeps the title from being recognized as part of the first entity of the sentence that follows
SENT_MARKER_END = ', </sent> , '  # marks the end of a sentence (used for the loss over sentences)
PAR = '[/par]'  # marks the end of the regular context and the beginning of the `yes/no/null` answers
EXTRA_ANSWERS = " yes no null </s>"

In [3]:
import sys
# Make packages from the user's miniconda site-packages importable.
# Note: list.insert(-1, x) places x just before the current last entry of sys.path,
# not at the very end.
sys.path.insert(-1, '/xdisk/msurdeanu/fanluo/miniconda3/lib/python3.7/site-packages')
sys.path.insert(-1, '/xdisk/msurdeanu/fanluo/miniconda3/lib/python3.8/site-packages')
 
from prettytable import PrettyTable
import spacy   
print(spacy.__version__)
import en_core_web_lg          
# Two independent pipelines are loaded from the same model:
#   nlp1 -- gets the marker special cases and the neuralcoref component (coreference resolution)
#   nlp2 -- gets the TextRank component (phrase extraction and ranking)
nlp1 = en_core_web_lg.load() 
nlp2 = en_core_web_lg.load() 

from spacy.symbols import ORTH, LEMMA, POS
# Register the sentence/title markers as tokenizer special cases so that each marker
# is kept as one token instead of being split on '<', '/', '>'.
nlp1.tokenizer.add_special_case('</sent>', [{ ORTH: '</sent>', LEMMA: '</sent>', POS: 'SYM'}]) 
nlp1.tokenizer.add_special_case('</t>', [{ORTH: '</t>', LEMMA: '</t>', POS: 'SYM'}]) 
nlp1.tokenizer.add_special_case('<t>', [{ORTH: '<t>', LEMMA: '<t>', POS: 'SYM'}])  
import neuralcoref
# Adds the coreference component to nlp1; resolved text is read later via doc._.coref_resolved.
neuralcoref.add_to_pipe(nlp1)


#!python -m pip install pytextrank
# Fan: this notebook relies on a locally modified pytextrank.py with 3 changes:
# 1. phrase_text = ' '.join(key[0] for key in phrase_key)
#    i.e. p.text is the join of the lemma tokens whose pos_ is in pos_kept, in their original order
# 2. added argument 'chunk_type' to consider only named entities ('ner') or only noun chunks ('noun'),
#    besides the default ('both')
# 3. replaced token.lemma_ with token.lemma_.lower().strip()
# NOTE(review): the stock pytextrank package presumably does not accept `chunk_type` -- confirm
# before running against an unmodified install.
import pytextrank
tr = pytextrank.TextRank(pos_kept=["ADJ", "NOUN", "PROPN", "VERB", "NUM", "ADV"], chunk_type='both')  
# Append TextRank as the last component of nlp2; ranked phrases are read later via doc._.phrases.
nlp2.add_pipe(tr.PipelineComponent, name='textrank', last=True)



%matplotlib inline
import matplotlib.pyplot as plt
# `flatten` is used below to flatten nested lists of (phrase text, rank) tuples.
from matplotlib.cbook import flatten

#!conda install networkx --yes
import networkx as nx
import itertools 

2.3.5


### Create phrases graph  

In [4]:
def create_para_graph(paras_phrases):
    """Build an undirected graph that links the phrases of all paragraphs.

    Nodes are phrase texts. Edges are added at three levels:

    * every pair of phrases from the same sentence is connected (a complete
      sub-graph per sentence);
    * within a paragraph, the first phrase of each sentence is connected to
      the first phrase of every earlier non-empty sentence;
    * across paragraphs, the highest-ranked of those first phrases is
      connected to the corresponding phrase of every earlier paragraph.

    Args:
        paras_phrases: list with one entry per paragraph; each entry is a list
            with one entry per sentence; each sentence entry is a list of
            ``(phrase_text, phrase_rank)`` tuples. The first tuple of a
            sentence is treated as its top-ranked phrase.

    Returns:
        networkx.Graph: the combined phrase graph (empty if there are no phrases).
    """
    G = nx.Graph()
    top_para_phrases = []                     # top phrase (text, rank) of each paragraph seen so far
    for para_phrases in paras_phrases:        # each para
        top_sent_phrases = []                 # first phrase (text, rank) of each non-empty sentence
        for sent_phrases in para_phrases:     # each sent
            if not sent_phrases:
                continue                      # nothing to add for a sentence without phrases

            # Complete graph over the phrases of this sentence, added directly to G.
            # (Composing a fresh per-sentence graph into G copied the whole graph
            # once per sentence; in-place insertion yields the same nodes and edges.)
            phrase_texts = [phrase[0] for phrase in sent_phrases]
            G.add_nodes_from(phrase_texts)
            G.add_edges_from(itertools.combinations(phrase_texts, 2))

            # Bridge sentences: connect this sentence's first phrase to the first
            # phrase of every earlier sentence in the paragraph.
            for top_sent_phrase in top_sent_phrases:
                G.add_edge(top_sent_phrase[0], phrase_texts[0])
            top_sent_phrases.append(sent_phrases[0])

        # x[0]: phrase text, x[1]: phrase rank -- highest rank first
        top_sent_phrases = sorted(top_sent_phrases, key=lambda x: x[1], reverse=True)

        # Bridge paragraphs: connect this paragraph's top phrase to the top
        # phrase of every earlier paragraph.
        if top_sent_phrases:
            for top_para_phrase in top_para_phrases:
                G.add_edge(top_para_phrase[0], top_sent_phrases[0][0])
            top_para_phrases.append(top_sent_phrases[0])

    # Optional visualisation (kept disabled):
#     pos = nx.spring_layout(G)
#     plt.figure(figsize=(20,10))
#     nx.draw(G, pos, with_labels=True, edge_color='black', width=1, linewidths=1,
#             node_size=500, node_color='orange', alpha=0.9
#             )
    return G


### Compute the reduced context with phrase graph

In [5]:
import re
import string

from networkx.algorithms import approximation as approx
def reduce_context_with_phares_graph(example, q_id, gold_paras_only=False):
    """Reduce an example's context to the sentences that share a phrase with the question.

    Steps:
      1. Normalize every paragraph title and sentence, try to resolve
         coreferences with ``nlp1`` and extract ranked phrases with ``nlp2``.
      2. Build the phrase graph with ``create_para_graph``.
      3. Extract the question phrases. When more than one of them also occurs
         in the context, extend them with the nodes of an approximate Steiner
         tree connecting them in the phrase graph.
      4. Keep each sentence that has at least one of the extended phrases;
         keep a paragraph with no kept sentence only if its title has one.
      5. Re-index the supporting facts to sentence positions in the reduced context.

    Progress information is printed along the way.

    Args:
        example: one example dict, mutated in place. Reads ``example["context"]``
            (a list of ``[title, [sentence, ...]]`` pairs), ``example["question"]``
            and ``example["supporting_facts"]`` (``[title, sent_id]`` pairs).
        q_id: id of the example. Accepted for interface compatibility; not used.
        gold_paras_only: accepted for interface compatibility; currently ignored,
            all paragraphs of the context are always used.

    Returns:
        The same ``example`` dict with these keys added: ``question_phrases_text``,
        ``question_only_phrase``, ``question_phrases``, ``paras_phrases``,
        ``common_phrases``, ``path_phrases``, ``extended_phrases``,
        ``reduced_context`` (list of ``[title, kept_sentences]``) and
        ``kept_para_sent`` (original sentence ids kept per reduced paragraph).
        ``supporting_facts`` is overwritten with ``[title, sent_id]`` pairs whose
        ``sent_id`` indexes into the reduced paragraph.
    """
    raw_contexts = example["context"]

    # ---- 1. ranked phrases for the title and every sentence of every paragraph ----
    paras_phrases = []                                   # paras_phrases[para][0] holds the title phrases
    for para_context in raw_contexts:                    # each para
        title = _normalize_text(para_context[0])
        sents = [_normalize_text(sent) for sent in para_context[1]]
        print("numbe of sents before coref: ", len(sents))

        # Join the sentences with the sentence marker so coreference is resolved
        # over the whole paragraph, then split the resolved text back on the marker.
        sents_joint = (' ' + SENT_MARKER_END + ' ').join(sents)
        print("sents: ", sents_joint)
        resolved_text = nlp1(sents_joint)._.coref_resolved
        print("resolved_sents: ", resolved_text)
        sents_coref_resolved = resolved_text.split(SENT_MARKER_END)
        print("numbe of sents after coref: ", len(sents_coref_resolved))

        # A resolved mention can span a sentence marker, which changes the number
        # of pieces; in that case fall back to the unresolved sentences so that
        # phrase lists stay aligned with the original sentence indices.
        if len(sents_coref_resolved) != len(sents):
            sents_coref_resolved = sents

        sent_docs = list(nlp2.pipe([title] + sents_coref_resolved))
        paras_phrases.append([
            [(p.text, p.rank) for p in sent_doc._.phrases if p.text != '']
            for sent_doc in sent_docs
        ])

    # ---- 2. phrase graph over the whole context ----
    paras_phrases_graph = create_para_graph(paras_phrases)

    # ---- 3. question phrases and their extension through the graph ----
    question = _normalize_text(example["question"])
    question_phrases = [(p.text, p.rank) for p in nlp2(question)._.phrases if p.text != '']
    question_phrases_text = [p[0] for p in question_phrases]

    # Flattening gives text, rank, text, rank, ...; every other element is a text.
    context_phrase_texts = set(list(flatten(paras_phrases))[::2])
    # De-duplicate while keeping the question's own phrase order, so the results
    # are the same on every run (set iteration order of strings is not stable).
    unique_question_texts = list(dict.fromkeys(question_phrases_text))
    common_phrases = [text for text in unique_question_texts if text in context_phrase_texts]
    question_only_phrase = [text for text in unique_question_texts if text not in context_phrase_texts]

    example["question_phrases_text"] = question_phrases_text
    example["question_only_phrase"] = question_only_phrase

    if len(common_phrases) > 1:
        # Nodes of an approximate Steiner tree that connects all common phrases.
        path_phrases = list(approx.steinertree.steiner_tree(paras_phrases_graph, common_phrases).nodes)
        extended_phrases = path_phrases + question_only_phrase
    else:  # 0 or 1 common phrases: nothing to connect
        path_phrases = common_phrases
        extended_phrases = question_phrases_text

    print("question: ", question)
    print("question_phrases: ", question_phrases)
    print("common_phrases: ", common_phrases)
    print("extended_phrases: ", extended_phrases)

    example["question_phrases"] = question_phrases
    example["paras_phrases"] = paras_phrases
    example["common_phrases"] = common_phrases
    example["path_phrases"] = path_phrases
    example["extended_phrases"] = extended_phrases

    print("paras_phrases")
    for paras_phrase in paras_phrases:
        print(list(flatten(paras_phrase))[::2])

    # ---- 4. keep the sentences (or bare titles) that contain an extended phrase ----
    extended_set = set(extended_phrases)
    raw_reduced_contexts = []     # [title, kept sentences] per kept paragraph
    kept_para_sent = []           # original sentence ids kept, aligned with raw_reduced_contexts
    for para_id, (para_title, para_lines) in enumerate(raw_contexts):
        para_phrases = paras_phrases[para_id]
        reduced_para = []
        kept_sent = []
        for sent_id, sent in enumerate(para_lines):
            # +1 because para_phrases[0] holds the phrases of the title
            sent_texts = list(flatten(para_phrases[sent_id + 1]))[::2]
            if not extended_set.isdisjoint(sent_texts):
                reduced_para.append(sent)
                kept_sent.append(sent_id)

        if reduced_para:
            raw_reduced_contexts.append([para_title, reduced_para])
            kept_para_sent.append(kept_sent)
        else:
            # No sentence matched: keep the paragraph (without sentences) only
            # when its title contains one of the extended phrases.
            title_texts = list(flatten(para_phrases[0]))[::2]
            if not extended_set.isdisjoint(title_texts):
                raw_reduced_contexts.append([para_title, []])
                kept_para_sent.append(kept_sent)

    # ---- 5. re-index supporting facts to positions inside the reduced paragraphs ----
    sp_set = set(map(tuple, example['supporting_facts']))    # (title, sent_id in original context)
    supporting_facts = []
    for (para_title, _), kept_sent in zip(raw_reduced_contexts, kept_para_sent):
        for sent_id, orig_sent_id in enumerate(kept_sent):
            if (para_title, orig_sent_id) in sp_set:
                supporting_facts.append([para_title, sent_id])

    example['reduced_context'] = raw_reduced_contexts
    example['supporting_facts'] = supporting_facts
    example['kept_para_sent'] = kept_para_sent

    return example

In [6]:
def _normalize_text(s):

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [7]:
import os
# Switch to the project directory so that relative file names (e.g. the default
# 'small.json' of --datafile) are resolved against it.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
os.chdir('/xdisk/msurdeanu/fanluo/hotpotQA/')
# Disabled one-off shell commands that slice the training file into small sample files with jq:
#!cat /xdisk/msurdeanu/fanluo/hotpotQA/hotpot_train_v1.1.json | ../jq-linux64 -c '.[2:16]' > small.json
#!cat /xdisk/msurdeanu/fanluo/hotpotQA/hotpot_train_v1.1.json | ../jq-linux64 -c '.[380:400]' > small_dev.json
#!cat /xdisk/msurdeanu/fanluo/hotpotQA/hotpot_train_v1.1.json | ../jq-linux64 -c '.[31:50]' > sample.json

In [8]:
# debug: check args
# `argString` stands in for a command line: main() splits it with shlex and parses
# the result with argparse, so edit this string to choose the data file and question id.
import shlex
argString ='--datafile /xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_dev_distractor_v1.json --qid 5a75e05c55429976ec32bc5f'  # --outfile /xdisk/msurdeanu/fanluo/hotpotQA/small_out.json'
# The result of this call is not stored; main() splits the string again.
shlex.split(argString)

import json
import argparse 
def main():
    """Run the context reduction for one question selected by id.

    Arguments are parsed from the module-level ``argString`` (the notebook has
    no real command line):

    * ``--datafile``: path of a JSON file holding a list of examples.
    * ``--qid``: question id; the first example whose ``_id`` contains this
      string is used.

    The data file is read with ``json.load`` and the selected example is passed
    to ``reduce_context_with_phares_graph``.

    Raises:
        IndexError: if no example's ``_id`` contains the requested id.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--datafile", type=str, default='small.json')
    parser.add_argument("--qid", type=str, default='5ae73acb5542991e8301cc07')
#    parser.add_argument("--outfile", type=str, default='small_out.json')
    args = parser.parse_args(shlex.split(argString))

    print(args.datafile)
    print(args.qid)

    # Pure-Python lookup: no external jq binary and no shell interpolation of the arguments.
    with open(args.datafile, "r", encoding='utf-8') as f:
        examples = json.load(f)

    # Substring match on `_id`, first hit wins.
    question = next((ex for ex in examples if args.qid in ex["_id"]), None)
    if question is None:
        raise IndexError("no example with _id containing %r in %s" % (args.qid, args.datafile))

    reduce_context_with_phares_graph(question, args.qid)

In [9]:
# Run the single-question debug entry point; its printed output follows this cell.
if __name__ == "__main__":
    main()

/xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_dev_distractor_v1.json
5a75e05c55429976ec32bc5f
numbe of sents before coref:  5
sents:  fishing lake is lake in canadian province of saskatchewan , </sent> ,  lake is located between highway 5 and highway 16 22 km north of town of foam lake saskatchewan and 24 km east of town of wadena saskatchewan , </sent> ,  lake does not have effective outlet channel and so is prone to flooding , </sent> ,  record floods in 2007 resulted in plan by government of saskatchewan to lower level of lake by digging drainage channel , </sent> ,  fishing lake first nation opposed this plan and instead flood control berms were constructed
resolved_sents:  fishing lake is lake in canadian province of saskatchewan , </sent> ,  lake is located between highway 5 and highway 16 22 km north of town of foam lake saskatchewan and 24 km east of town of wadena saskatchewan , </sent> ,  lake does not have effective outlet channel and so is prone to flooding , </sent> ,  reco

numbe of sents before coref:  10
sents:  higgins lake is large recreational and fishing lake in roscommon county in us state of michigan , </sent> ,  9900 acre 40 km² lake is known for its deep clear waters and is 10th largest in michigan with shoreline of 21 mi , </sent> ,  it is named after sylvester higgins first chief of topographical department of michigan geological survey , </sent> ,  it has maximum width of 4 mi and length of 7 mi with maximum depth of 135 ft , </sent> ,  mean depth is 44 ft and lake contains almost 20 billion cubic feet 570 million m³ of water , </sent> ,  its retention time is about 125 years , </sent> ,  lakes watershed covers 19000 acres 77 km² , </sent> ,  twinlobed lake receives half of its water from submerged springs six percent from incoming streams and remainder from direct rainfall and runoff , </sent> ,  it drains into marl lake by cut river which runs into houghton lake and eventually to lake michigan , </sent> ,  mile north of lake water flows int

In [10]:
### Run all examples from the json file

# with open("/xdisk/msurdeanu/fanluo/hotpotQA/Data/hotpot_dev_distractor_v1.json", "r", encoding='utf-8') as f:  
#     json_dict = json.load(f)
#     for e_id, example in enumerate(json_dict):
#         print("e_id: ", e_id, "_id: ", example["_id"])
#         reduce_context_with_phares_graph(example, example["_id"])  

### Sandbox

In [11]:
# Sandbox input: one paragraph (title + sentences) used in the following cells to try
# out coreference resolution. The text appears to be already normalized (lower-case,
# no punctuation or articles).
title = "chris jones footballer born 1985"
sents =  ['chris is welsh semiprofessional footballer currently playing for cymru alliance side porthmadog', 'former professional with leeds united jones is currently in his fourth season with city', 'he made several appearances for leeds and was heavily involved with first team', 'chris was playing with likes of aaron lennon james milner rio ferdinand scott carson and alan smith during his spell with yorkshire outfit']

In [12]:
# Serialise the paragraph into one string: title, TITLE_END marker, then the sentences
# joined by SENT_MARKER_END (no TITLE_START is prepended here).
context = title  + ' ' + TITLE_END + ' ' + (' ' + SENT_MARKER_END +' ').join(sents)

In [13]:
# Resolve coreferences over the whole paragraph, then split the resolved text back
# into title and sentences and run the phrase-extraction pipeline on each piece.
context_doc = nlp1(context)
# The two-name unpacking requires TITLE_END to occur exactly once in the resolved
# text; otherwise it raises ValueError.
title, sents = context_doc._.coref_resolved.split(TITLE_END)  
sents = sents.split(SENT_MARKER_END)
sent_docs = list(nlp2.pipe([title] + sents))     

In [14]:
# Display the coreference-resolved text as the cell output.
context_doc._.coref_resolved

'chris born 1985 , </t> .  chris is welsh semiprofessional footballer currently playing for cymru alliance side porthmadog , </sent> ,  former professional with leeds united jones is currently in welsh semiprofessional footballer currently playing for cymru alliance side porthmadog , </sent> ,  former professional with leeds united jones fourth season with city , </sent> ,  welsh semiprofessional footballer currently playing for cymru alliance side porthmadog , </sent> ,  former professional with leeds united jones made several appearances for leeds and was heavily involved with first team , </sent> ,  chris was playing with likes of aaron lennon james milner rio ferdinand scott carson and alan smith during welsh semiprofessional footballer currently playing for cymru alliance side porthmadog , </sent> ,  former professional with leeds united jones spell with yorkshire outfit'

In [15]:
# Sandbox input: an already serialised paragraph (title, TITLE_END marker, sentences
# separated by SENT_MARKER_END) used to inspect the coreference clusters in the next cell.
context = "brown state fishing lake , </t> .  brown state fishing lake sometimes also known as brown state fishing lake and wildlife area is protected area in brown county kansas in united states , </sent> ,  lake is 62 acres 025 km² in area and up to 13 feet 4 m deep , </sent> ,  area was formerly known as brown county state park and is 8 miles 13 km east of hiawatha kansas"
 

In [16]:
# Inspect what neuralcoref does with the serialised paragraph: the resolved text,
# all clusters, and -- for the cluster at hard-coded index 1 -- its mentions, its
# last mention, and the main mention that replaces it.
# The output recorded with this cell shows a mention spanning a '</sent>' marker.
context_doc = nlp1(context)
print(context_doc._.coref_resolved)
print(context_doc._.coref_clusters)
print(context_doc._.coref_clusters[1].mentions)
print(context_doc._.coref_clusters[1].mentions[-1])
print(context_doc._.coref_clusters[1].mentions[-1]._.coref_cluster.main)

brown state fishing lake , </t> .  brown state fishing lake sometimes also known as brown state fishing lake and wildlife area is protected area in brown county kansas in united states , </sent> ,  lake is 62 acres 025 km² in area and up brown state fishing lake and wildlife area was formerly known as brown county state park and is 8 miles 13 km east of hiawatha kansas
[brown: [brown, brown], brown state fishing lake and wildlife area: [brown state fishing lake and wildlife area, to 13 feet 4 m deep , </sent> ,  area], brown county: [brown county, brown county], kansas: [kansas, kansas]]
[brown state fishing lake and wildlife area, to 13 feet 4 m deep , </sent> ,  area]
to 13 feet 4 m deep , </sent> ,  area
brown state fishing lake and wildlife area


In [17]:
# Sandbox: run coreference resolution and phrase extraction end to end on one
# serialised paragraph and print the ranked phrases per title/sentence.
context = "fishing lake , </t> .  fishing lake is lake in canadian province of saskatchewan , </sent> ,  lake is located between highway 5 and highway 16 22 km north of town of foam lake saskatchewan and 24 km east of town of wadena saskatchewan , </sent> ,  lake does not have effective outlet channel and so is prone to flooding , </sent> ,  record floods in 2007 resulted in plan by government of saskatchewan to lower level of lake by digging drainage channel , </sent> ,  fishing lake first nation opposed this plan and instead flood control berms were constructed"
context_resolved = nlp1(context)._.coref_resolved

print(context_resolved)
# The two-name unpacking requires TITLE_END to occur exactly once in the resolved text.
title_coref_resolved, sents_coref_resolved = context_resolved.split(TITLE_END)  
sents_coref_resolved = sents_coref_resolved.split(SENT_MARKER_END) 
sent_docs = list(nlp2.pipe([title_coref_resolved] + sents_coref_resolved))     
para_phrases = []                                        
for sent_doc in sent_docs:                                    # each sent in a para (index 0 is the title)
    sent_phrases = [(p.text, p.rank) for p in sent_doc._.phrases if(p.text != '')]  # (text, rank) of the non-empty phrases of this sentence
    para_phrases.append(sent_phrases)     
print(para_phrases)
print("\n")  

fishing lake , </t> .  fishing lake is lake in canadian province of saskatchewan , </sent> ,  lake is located between highway 5 and highway 16 22 km north of town of foam lake saskatchewan and 24 km east of town of wadena saskatchewan , </sent> ,  lake does not have effective outlet channel and so is prone to flooding , </sent> ,  record floods in 2007 resulted in plan by government of saskatchewan to lower level of lake by digging drainage channel , </sent> ,  fishing lake first nation opposed this plan and instead flood control berms were constructed
[[('fishing lake', 0.4330127018922193), ('lake', 0.3333333333333333)], [('canadian province', 0.2616585242462876), ('lake', 0.23150520948869216), ('province', 0.20142482587913918), ('canadian', 0.20142482587913918), ('saskatchewan', 0.16895618538601082), ('fishing lake', 0.13160696215647302)], [('16 22 km', 0.2026452065409755), ('24 km', 0.18671728907198992), ('foam lake saskatchewan', 0.1852953057904101), ('lake saskatchewan', 0.1764118

In [20]:
# Sandbox: list the noun chunks spaCy finds in one sentence together with each
# chunk's syntactic root token. `Span` is only needed by the disabled lines below.
from spacy.tokens import Span
text = 'valhalla highlands historic district also known as lake valhalla is national historic district located near cold spring in putnam county new york'
doc = nlp2(text)
for chunk in doc.noun_chunks:
    print('chunk: ', chunk.text) 
    print('root: ',  chunk.root.text)
    # Disabled: build a one-token Span covering just the chunk root.
#     span = Span(doc, chunk.root.i, chunk.root.i+1)
#     print(span)
#     print(span.start)

chunk:  valhalla highlands historic district
root:  district
chunk:  lake valhalla
root:  valhalla
chunk:  national historic district
root:  district
chunk:  cold spring
root:  spring
chunk:  putnam county new york
root:  york


In [19]:
# Sandbox: resolve coreferences on a paragraph serialised without a title and
# split the result back into sentences on the sentence marker.
context = "this list provides guide to opera composers as determined by their presence on majority of compiled lists of significant opera composers , </sent> ,  see lists consulted section for full details , </sent> ,  composers run from jacopo peri who wrote first ever opera in late 16th century italy to john adams one of leading figures in contemporary operatic world , </sent> ,  brief accompanying notes offer explanation as to why each composer has been considered major , </sent> ,  also included is section about major women opera composers compiled from same lists , </sent> ,  for introduction to operatic history see opera , </sent> ,  organisation of list is by birthdate"
context_resolved = nlp1(context)._.coref_resolved

print(context_resolved)
sents_coref_resolved = context_resolved.split(SENT_MARKER_END)  

this list provides guide to opera composers as determined by opera composers presence on majority of compiled lists of significant opera composers , </sent> ,  see lists consulted section for full details , </sent> ,  composers run from jacopo peri who wrote first ever opera in late 16th century italy to john adams one of leading figures in contemporary operatic world , </sent> ,  brief accompanying notes offer explanation as to why each composer has been considered major , </sent> ,  also included is section about major women opera composers compiled from same lists , </sent> ,  for introduction to operatic history see opera , </sent> ,  organisation of list is by birthdate
