# Construct Dataset for Manual Evaluation

## Import Packages

In [38]:
import Config as CON
import os
import sys
import random
import json
import time
import numpy as np
import math
import csv
import matplotlib.pyplot as plt
import string

import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import conll2000
from nltk.chunk.util import tree2conlltags,conlltags2tree
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI
from nltk.tokenize import sent_tokenize

#Declare some necessary global functions and objects
def current_time_ms():
    """Return the current wall-clock time in whole milliseconds since the epoch."""
    millis = time.time() * 1000
    return int(round(millis))
# Shared WordNet lemmatizer instance; used further down to reduce verb phrases
# to their root form before matching them against relations.
wordnet_lemmatizer = WordNetLemmatizer()

In [39]:
# Location of the entity/relation mapping inside the configured output directory.
nere_file = os.path.join(CON.OUTPUT_DATA_DIRECTORY, "nere.json")
assert os.path.exists(nere_file)

# nere is indexed below as nere[e1][e2] -> r, i.e. first entity -> second entity -> relation.
with open(nere_file, encoding='utf-8') as f:
    nere = json.load(f)

In [40]:
# Draw up to 2000 distinct first entities (top-level keys of nere) at random.
# The key list is materialised once: rebuilding list(nere.keys()) for every
# sampled index made this step O(len(nere) * 2000).
nere_keys = list(nere)
# Cap the sample size so a mapping with fewer than 2000 keys does not make
# random.sample raise ValueError.
I = random.sample(range(len(nere_keys)), min(2000, len(nere_keys)))
E1 = [nere_keys[i] for i in I]

In [41]:
# Build up to 1000 (e1, e2, r) tuples: for every sampled first entity pick one
# random second entity whose relation entry is non-empty.
tuples = []
count = 0
for e1 in E1:
    relations_of_e1 = nere[e1]
    partners = [other for other, rel in relations_of_e1.items() if len(rel) > 0]
    if not partners:
        # Nothing usable for this entity; try the next one.
        continue
    [pick] = random.sample(range(len(partners)), 1)
    e2 = partners[pick]
    tuples.append((e1, e2, relations_of_e1[e2]))
    count += 1
    if count == 1000:
        break
        
#print(tuples)

## NLTK Chunking

In [42]:
# Chunked sentences of the CoNLL-2000 corpus; the first 10900 are used for
# training the chunker and the remainder for evaluating it.
data = conll2000.chunked_sents()
split_at = 10900
train_data, test_data = data[:split_at], data[split_at:]

def conll_tag_chunks(chunk_sents):
    """Convert chunk trees into per-sentence lists of (pos_tag, chunk_tag) pairs.

    Each tree is flattened with tree2conlltags into (word, pos_tag, chunk_tag)
    triples; the word is dropped so only the tag pairs remain.
    """
    pos_chunk_sents = []
    for tree in chunk_sents:
        triples = tree2conlltags(tree)
        pos_chunk_sents.append([(pos, chunk) for (_word, pos, chunk) in triples])
    return pos_chunk_sents

def combined_tagger(train_data, taggers, backoff=None):
    """Chain tagger classes so that each one backs off to the previous one.

    Every entry of `taggers` is called as tagger(train_data, backoff=previous),
    starting with the supplied `backoff`. The last tagger built is returned;
    with an empty `taggers` the given `backoff` is returned unchanged.
    """
    chain = backoff
    for tagger_cls in taggers:
        chain = tagger_cls(train_data, backoff=chain)
    return chain

#Define the chunker class
class NGramTagChunker(ChunkParserI):
    """Chunk parser that assigns chunk tags to POS tags with a chain of n-gram taggers."""

    def __init__(self, train_sentences, tagger_classes=(UnigramTagger, BigramTagger)):
        """Train the chunk tagger.

        train_sentences: chunk trees, converted to (pos_tag, chunk_tag)
            sequences via conll_tag_chunks.
        tagger_classes: tagger classes chained by combined_tagger, each one
            backing off to the one before it. The default is an immutable
            tuple so it cannot be altered between instantiations.
        """
        train_sent_tags = conll_tag_chunks(train_sentences)
        self.chunk_tagger = combined_tagger(train_sent_tags, tagger_classes)

    def parse(self, tagged_sentence):
        """Chunk a POS-tagged sentence.

        tagged_sentence: sequence of (word, pos_tag) pairs.
        Returns the tree built by conlltags2tree, or None when the sentence
        is empty.
        """
        if not tagged_sentence:
            return None
        # The tagger treats the POS tags as its "words" and predicts a chunk tag for each.
        pos_tags = [tag for word, tag in tagged_sentence]
        chunk_pos_tags = self.chunk_tagger.tag(pos_tags)
        chunk_tags = [chunk_tag for (pos_tag, chunk_tag) in chunk_pos_tags]
        # Re-attach the words to obtain (word, pos_tag, chunk_tag) triples.
        wpc_tags = [(word, pos_tag, chunk_tag)
                    for ((word, pos_tag), chunk_tag) in zip(tagged_sentence, chunk_tags)]
        return conlltags2tree(wpc_tags)

#train chunker model on the training split
ntc = NGramTagChunker(train_data)
#evaluate chunker model performance on the held-out split and show the score
chunk_score = ntc.evaluate(test_data)
print(chunk_score)

ChunkParse score:
    IOB Accuracy:  90.0%%
    Precision:     82.1%%
    Recall:        86.3%%
    F-Measure:     84.1%%


In [43]:
def print_tree(t):
    """Return the text covered by a chunk-tree node (nothing is printed).

    A node without a label() method is treated as a leaf and its first
    element is returned as-is. Otherwise the stripped texts of all children
    are joined with single spaces and the result is stripped.
    """
    try:
        t.label()
    except AttributeError:
        # Leaf: no label() available, so take the first element (the word).
        return t[0]
    pieces = (print_tree(child).strip() for child in t)
    return " ".join(pieces).strip()

In [44]:
def traverse(t):
    '''
    Collect the text of every NP and VP node in the subtree rooted at t.

    Returns:
    Noun phrases -- are possible entities
    Verb phrases -- are possible relations

    A node without a label() method (a leaf, or None) yields two empty lists.
    '''
    try:
        label = t.label()
    except AttributeError:
        return ([], [])

    # The node itself contributes first, then its children in order.
    noun_phrases = [print_tree(t)] if label == 'NP' else []
    verb_phrases = [print_tree(t)] if label == 'VP' else []

    for child in t:
        child_nps, child_vps = traverse(child)
        noun_phrases += child_nps
        verb_phrases += child_vps

    return noun_phrases, verb_phrases

In [45]:
# These appear too frequently, in almost every sentence, and are identified as verb phrases.
verb_stopwords = ['to', 'be']

top_keywords_file = os.path.join(CON.OUTPUT_DATA_DIRECTORY, "top_keywords.txt")
assert os.path.exists(top_keywords_file)

# One keyword per line. Keywords with a single or a couple of letters are
# almost meaningless, so only those with at least 3 characters are kept.
with open(top_keywords_file, encoding='utf-8') as f:
    top_keywords = [kw for kw in (line.strip() for line in f) if len(kw) >= 3]

In [46]:
abstract_filename = os.path.join(CON.OUTPUT_DATA_DIRECTORY, "all_abstracts_with_keywords.json")
assert os.path.exists(abstract_filename)

# all_abstracts is indexed by position below; each entry is read via its "abstract" field.
with open(abstract_filename, encoding='utf-8') as f:
    all_abstracts = json.load(f)

# Random indices of the abstracts to scan. The sample size is capped at the
# number of abstracts so a smaller corpus does not make random.sample raise
# ValueError.
I = random.sample(range(len(all_abstracts)), min(10000, len(all_abstracts)))

In [47]:
# Maps (e1, e2, r) -> list of sentences in which both entities and the relation occur.
dataset = {}

# A tuple stops being searched for once it has collected this many sentences.
MAX_SENTENCES_PER_TUPLE = 3


def _keyword_matches_phrase(keyword, phrase):
    """Return True if keyword equals phrase or occurs in it next to a space.

    Equivalent to the nested test
    `k in e and (e in k or (k+" ") in e or (" "+k) in e)`:
    mutual containment means equality, and the padded forms imply `k in e`.
    """
    return (
        keyword == phrase
        or (keyword + " ") in phrase
        or (" " + keyword) in phrase
    )


for i in I:
    abstract = all_abstracts[i]["abstract"]

    # Skip entries without text (empty string or null).
    if not abstract:
        continue

    #Analyze sentence by sentence
    for sentence in sent_tokenize(abstract):
        #Extract parts of speech of words and chunk them into phrases
        nltk_pos_tagged = nltk.pos_tag(sentence.split())
        chunk_tree = ntc.parse(nltk_pos_tagged)
        #Extract noun and verb phrases from the sentence
        nps, vps = traverse(chunk_tree)

        #Convert a verb to its root form (e.g., is/are -> be) and
        #exclude the too frequent ones, defined as stopwords
        vps = [
            v for v in (wordnet_lemmatizer.lemmatize(vp, 'v') for vp in vps)
            if v not in verb_stopwords
        ]

        # A match needs a relation in vps and two entities found in nps, so
        # the keyword scan below is pointless when either list is empty.
        if not nps or not vps:
            continue

        #Candidate entities are the top keywords that are present in a noun phrase of the sentence.
        #Only membership and size are used afterwards, so a set gives cheap lookups.
        candidate_entities = {
            k for k in top_keywords
            if any(_keyword_matches_phrase(k, e) for e in nps)
        }

        #A relation needs a pair of candidate entities
        if len(candidate_entities) < 2:
            continue

        # Record the sentence for the first tuple it matches, and only for that one.
        for key in tuples:
            e1, e2, r = key
            if e1 in candidate_entities and e2 in candidate_entities and r in vps:
                sentences = dataset.setdefault(key, [])
                sentences.append(sentence)
                if len(sentences) == MAX_SENTENCES_PER_TUPLE:
                    # Enough examples: stop searching for this tuple.
                    # Removing during iteration is safe because of the break below.
                    tuples.remove(key)
                    print(len(tuples))
                break
            #end-if
        #end-for

        if not tuples:
            break
    #end-for
    if not tuples:
        break
#end-for

660
659
658
657
656
655
654
653
652
651
650
649
648
647
646
645
644
643
642
641
640
639
638
637
636
635
634
633
632
631
630
629
628
627
626
625
624
623
622
621
620
619
618
617
616
615
614
613
612
611
610
609
608
607
606
605
604
603
602
601
600
599
598
597
596
595
594
593
592
591
590
589
588
587
586
585
584
583
582
581
580
579
578
577
576
575
574
573
572
571
570
569
568
567
566
565
564
563
562
561
560
559
558
557
556
555
554
553
552
551
550
549
548
547
546
545
544
543
542
541
540
539
538
537
536
535
534
533
532
531
530
529
528
527
526
525
524
523
522
521
520
519


In [48]:
# Show the (e1, e2, r) tuples for which at least one sentence was collected.
print(dataset.keys())

dict_keys([('pollution level', 'pollution', 'show'), ('heavy metal contamination', 'metals', 'show'), ('rural area', 'rural', 'have'), ('reliability', 'method', 'use'), ('wet season', 'seas', 'show'), ('crop', 'water', 'show'), ('ecological environment', 'protection', 'show'), ('elements', 'oil', 'show'), ('numerical model', 'model', 'use'), ('environmental problem', 'pollution', 'has become'), ('air pollution', 'risk', 'associate'), ('lung cancer', 'lung', '='), ('conversion', 'energy', 'use'), ('sensitivity analysis', 'sensitivity', 'show'), ('river water', 'pollution', 'show'), ('linear model', 'model', 'use'), ('child', 'effect', 'have'), ('war', 'scenarios', 'warm'), ('interviews', 'survey', 'use'), ('meteorology', 'temperature', 'show'), ('winter wheat', 'yield', 'show'), ('control group', 'control', 'were significantly'), ('wind', 'region', 'show'), ('stable isotope', 'isotope', 'use'), ('evaluation method', 'valuation', 'show'), ('seedling', 'drought stress', 'show'), ('disaste

In [55]:
import csv

# Write one '^'-separated row per collected sentence: key^sentence^flag, where
# flag is 1 for the first sentence of a key and 0 for the following ones.
# The encoding is given explicitly (the inputs are read as UTF-8) so the
# output does not depend on the platform's default encoding.
with open('dataset.csv', 'w', encoding='utf-8') as f:
    for key, sentences in dataset.items():
        for idx, s in enumerate(sentences):
            first_flag = 1 if idx == 0 else 0
            f.write("%s^%s^%d\n" % (key, s, first_flag))

In [51]:
# Number of distinct (e1, e2, r) tuples that have at least one example sentence.
print(len(dataset))

458


In [53]:
# Dump every key on one line, followed by the list of its sentences on the next.
# The encoding is given explicitly (the inputs are read as UTF-8) so the
# output does not depend on the platform's default encoding.
with open('dataset.txt', 'w', encoding='utf-8') as f:
    for t, sentences in dataset.items():
        f.write(str(t) + "\n")
        f.write(str(sentences) + "\n")