107 changes: 107 additions & 0 deletions .gitignore
@@ -0,0 +1,107 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/


*.bin
2 changes: 1 addition & 1 deletion OpenGNN
2 changes: 1 addition & 1 deletion README.md
@@ -27,7 +27,7 @@ ognn-build-vocab --no_pad_token --field_name edges --string_index 0 \
/data/naturallanguage/cnn_dailymail/split/train/inputs.jsonl.gz
ognn-build-vocab --with_sequence_tokens \
--save_vocab /data/naturallanguage/cnn_dailymail/output.vocab \
-  /data/naturallanguage/cnn_dailymail/split/train/inputs.jsonl.gz
+  /data/naturallanguage/cnn_dailymail/split/train/targets.jsonl.gz
```

Then run
Empty file added parsers/__init__.py
145 changes: 143 additions & 2 deletions parsers/naturallanguage/dmcnn/convert2graph.py
@@ -15,13 +15,154 @@
import pdb
import codecs
import traceback
-from typing import Callable, List
from collections import OrderedDict
from typing import Any, List, Optional, Tuple, Callable

from docopt import docopt

from data.utils import load_xml
-from parsers.naturallanguage.gigaword.loadgigacorpus import parse_sample

from nltk.tree import Tree

from parsers.naturallanguage.graphtextrepr import (DependencyEdge,
                                                   GraphTextRepresentation,
                                                   Token)
from parsers.naturallanguage.textsummary import TextSummary
from data.utils import load_xml_gz


def parse_tree_to_sentence(parse_tree: str) -> List[str]:
    return Tree.fromstring(parse_tree).leaves()

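# Find a -LRB- ... -RRB- (parenthesized) span in a token list; e.g. the
# illustrative headline ['Stocks', 'fall', '-LRB-', 'update', '-RRB-'] yields
# (2, 5). Returns None when the brackets are absent or out of order.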
def try_find_RB_span(tokens: List[str]) -> Optional[Tuple[int, int]]:
    try:
        lrb_idx = tokens.index('-LRB-')
        rrb_idx = tokens.index('-RRB-')
        if lrb_idx > rrb_idx:
            return None  # Malformed title, parentheses misplaced
        return (lrb_idx, rrb_idx + 1)
    except ValueError:
        return None


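# Turn one CoreNLP-style annotated datapoint into a TextSummary: the parsed
# HEADLINE becomes the summary sentence, and the article sentences (with named
# entities, dependencies and co-references) become the graph text
# representation. Returns None for samples that cannot be used.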
def parse_sample(datapoint,
                 provenance: str,
                 headline_getter: Optional[Callable[[Any], List[str]]] = None) -> Optional[TextSummary]:
    if headline_getter is None and (datapoint.get('HEADLINE') is None or len(datapoint['HEADLINE']) == 0):
        return None
    try:
        if headline_getter is None:
            headline_tokens = parse_tree_to_sentence(datapoint['HEADLINE'])
        else:
            headline_tokens = headline_getter(datapoint)
        # Remove LRB-RRB chunks
        rb_span = try_find_RB_span(headline_tokens)
        while rb_span is not None:
            headline_tokens = headline_tokens[:rb_span[0]] + headline_tokens[rb_span[1]:]
            rb_span = try_find_RB_span(headline_tokens)
        if len(headline_tokens) <= 1:
            return None
    except Exception as e:
        print('Could not parse %s. Ignoring sample.' % datapoint.get('HEADLINE'))
        print(e)
        return None

    if 'sentences' not in datapoint or datapoint['sentences'] is None:
        return None

    all_sentences = datapoint['sentences']['sentence']
    if not isinstance(all_sentences, list):
        all_sentences = [all_sentences]

    tokenized_sentences = []  # type: List[List[Token]]
    for sentence in all_sentences:
        sentence_tokens = []
        if not isinstance(sentence['tokens']['token'], list):
            # May happen in single-word sentences
            sentence['tokens']['token'] = [sentence['tokens']['token']]
        for i, token in enumerate(sentence['tokens']['token']):
            assert int(token['@id']) == i + 1
            sentence_tokens.append(Token(word=token['word'], lemma=token['lemma'], pos_tag=token['POS']))
        tokenized_sentences.append(sentence_tokens)

    graph_text_representation = GraphTextRepresentation(tokenized_sentences, provenance=provenance)

    # Add named entities, by finding consecutive annotations
    for sentence_idx, sentence in enumerate(all_sentences):
        sentence_tokens = sentence['tokens']['token']
        for token_idx, token in enumerate(sentence_tokens):
            if 'NER' not in token:
                return None  # Ignore samples that don't have NER output.
            if token['NER'] == 'O':
                continue
            # Close the entity when the tag run ends, i.e. this is the last
            # token of the sentence or the next token carries a different tag.
            if token_idx + 1 == len(sentence_tokens) or sentence_tokens[token_idx + 1]['NER'] != token['NER']:
                # Create an entity that includes this token as the last one
                before_start_token_idx = token_idx - 1
                while before_start_token_idx >= 0 and sentence_tokens[before_start_token_idx]['NER'] == token['NER']:
                    before_start_token_idx -= 1
                graph_text_representation.add_entity(token['NER'], sentence_idx, before_start_token_idx + 1, token_idx + 1)

    def get_collapsed_dependencies(sentence):
        if 'dependencies' not in sentence or sentence['dependencies'] is None:
            return None
        for dependencies in sentence['dependencies']:
            if dependencies['@type'] == 'collapsed-dependencies':
                return dependencies
        return None

    # Add dependencies
    for sentence_idx, sentence in enumerate(all_sentences):
        if 'collapsed-dependencies' in sentence and sentence['collapsed-dependencies'] is not None:
            collapsed_deps = sentence['collapsed-dependencies']
        else:
            collapsed_deps = get_collapsed_dependencies(sentence)
        if collapsed_deps is None:
            continue

        if not isinstance(collapsed_deps['dep'], list):
            collapsed_deps['dep'] = [collapsed_deps['dep']]
        for dependency in collapsed_deps['dep']:
            if dependency['@type'] == 'root':
                continue  # Root is not useful for us
            dependency_type = dependency['@type']
            underscore_location = dependency_type.find('_')
            if underscore_location != -1:
                dependency_type = dependency_type[:underscore_location]
            if isinstance(dependency['dependent'], OrderedDict):
                dependency['dependent'] = dependency['dependent']['@idx']
            if isinstance(dependency['governor'], OrderedDict):
                dependency['governor'] = dependency['governor']['@idx']

            graph_text_representation.add_dependency_edge(DependencyEdge(
                dependency_type=dependency_type,
                sentence_idx=sentence_idx,
                from_idx=int(dependency['dependent']) - 1,
                to_idx=int(dependency['governor']) - 1
            ))

    # Add co-references
    coreferences = None
    if 'coreferences' in datapoint and datapoint['coreferences'] is not None:
        coreferences = datapoint['coreferences']
    elif 'coreference' in datapoint and datapoint['coreference'] is not None:
        coreferences = datapoint['coreference']

    if coreferences is not None:
        if not isinstance(coreferences['coreference'], list):
            coreferences['coreference'] = [coreferences['coreference']]
        for coreference in coreferences['coreference']:
            all_mentions = coreference['mention']
            representative = [m for m in all_mentions if m.get('@representative') == 'true'][0]

            for mention in all_mentions:
                if mention.get('@representative') == 'true' or (mention['sentence'] == representative['sentence']
                                                                and mention['head'] == representative['head']):
                    continue
                graph_text_representation.add_coreference(int(mention['sentence']) - 1, int(mention['head']) - 1,
                                                          int(representative['sentence']) - 1, int(representative['head']) - 1)

    return TextSummary(
        summary_sentence=headline_tokens,
        main_text=graph_text_representation
    )

def parse_cnndm_file(filename: str, write_sample_callback: Callable, summaries_folder: str) -> None:
    def process_sample(location, sample):
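For orientation, a minimal sketch of the datapoint shape parse_sample consumes. The field names mirror the accesses in the function above; the values are illustrative and the call assumes the module's imports are available, so treat it as a sketch rather than corpus data:

```
# Illustrative datapoint; real inputs come from the annotated XML corpus.
datapoint = {
    'HEADLINE': '(S (NP (NNS Stocks)) (VP (VBD fell)))',  # parse-tree string
    'sentences': {'sentence': [{
        'tokens': {'token': [
            {'@id': '1', 'word': 'Stocks', 'lemma': 'stock', 'POS': 'NNS', 'NER': 'O'},
            {'@id': '2', 'word': 'fell', 'lemma': 'fall', 'POS': 'VBD', 'NER': 'O'},
        ]},
    }]},
}

summary = parse_sample(datapoint, provenance='example')
# summary.summary_sentence == ['Stocks', 'fell']; entities, dependencies and
# co-references are absent here, so the graph carries only the token nodes.
```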
4 changes: 0 additions & 4 deletions parsers/sourcecode/barone/ast_graph_generator.py
@@ -638,7 +638,6 @@ def visit_Raise(self, node):

        self.terminal('raise')
        if hasattr(node, 'exc') and node.exc is not None:
-           self.terminal(' ')
            self.visit(node.exc)
        if node.cause is not None:
            self.terminal('from')
@@ -792,8 +791,6 @@ def visit_UnaryOp(self, node):
        self.terminal('(')
        op = UNARYOP_SYMBOLS[type(node.op)]
        self.terminal(op)
-       if op == 'not':
-           self.terminal(' ')
        self.visit(node.operand)
        self.terminal(')')
        self.parent = gparent
@@ -938,7 +935,6 @@ def visit_excepthandler(self, node):

        self.terminal('except')
        if node.type is not None:
-           self.terminal(' ')
            self.visit(node.type)
        if node.name is not None:
            self.terminal(' as ')
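All four deletions in this file remove standalone whitespace terminals from the emitted token stream. A before/after sketch (the token sequences are illustrative, not actual generator output):

```
# For `raise ValueError()` the visitor previously emitted something like
#   ['raise', ' ', 'ValueError', '(', ')']
# After this change whitespace is no longer a terminal of its own:
#   ['raise', 'ValueError', '(', ')']
```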
11 changes: 6 additions & 5 deletions train_and_eval.py
@@ -286,7 +286,9 @@ def train_and_eval(model, args):
            worse_epochs = 0
            print("saving best model...")
            saver.save(session, os.path.join(args.checkpoint_dir, "best.ckpt"))
-           worse_epochs += 1
+       else:
+           worse_epochs += 1

        # and stop training once patience runs out
        if worse_epochs >= args.patience:
            print("early stopping triggered...")
@@ -424,8 +426,7 @@ def infer(model, args):
        mode=tf.estimator.ModeKeys.PREDICT,
        batch_size=args.batch_size,
        metadata=metadata,
-       features_file=args.train_source_file,
-       labels_file=args.train_target_file,
+       features_file=args.infer_source_file,
        features_bucket_width=args.bucket_width,
        sample_buffer_size=args.sample_buffer_size)
    session_config = tf.ConfigProto(
@@ -509,11 +510,11 @@ def build_optimizer(args):
    optimizer_class = getattr(tf.train, optimizer, None)
    if optimizer_class is None:
        raise ValueError("Unsupported optimizer %s" % optimizer)

+   kwargs = {}
    # TODO: optimizer params
    # optimizer_params = params.get("optimizer_params", {})

-   def optimizer(lr): return optimizer_class(lr)  # **optimizer_params)
+   def optimizer(lr): return optimizer_class(lr, **kwargs)

    learning_rate = args.learning_rate
    if args.lr_decay_rate is not None:
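The last hunk threads a (for now empty) kwargs dict through the name-based optimizer lookup, leaving room for optimizer-specific parameters later. The same getattr pattern in a self-contained sketch; SGD here is a toy stand-in, not a TensorFlow class:

```
import sys

class SGD:
    # Toy optimizer class standing in for the tf.train classes.
    def __init__(self, lr, momentum=0.0):
        self.lr, self.momentum = lr, momentum

def build_optimizer(name, **kwargs):
    # Resolve the class by name, mirroring getattr(tf.train, optimizer, None).
    optimizer_class = getattr(sys.modules[__name__], name, None)
    if optimizer_class is None:
        raise ValueError("Unsupported optimizer %s" % name)
    return lambda lr: optimizer_class(lr, **kwargs)

make_opt = build_optimizer("SGD", momentum=0.9)  # extra args travel in kwargs
opt = make_opt(0.01)
print(opt.lr, opt.momentum)  # 0.01 0.9
```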