In [1]:
import os
# Expose only GPU index 8 to this kernel (machine-specific value).
# This runs in the first cell, before stanza is imported in the next one.
os.environ.update({'CUDA_VISIBLE_DEVICES': "8"})

In [2]:
import stanza
from stanza import DownloadMethod
# English pipeline: tokenizer, POS tagger and the `wsj_bert` constituency parser.
# Model files are looked up in `model_dir`; REUSE_RESOURCES is passed as download method.
pipe = stanza.Pipeline(
    lang='en',
    processors='tokenize,pos,constituency',
    package={'constituency': 'wsj_bert'},
    model_dir="/mount/arbeitsdaten/asr-2/vaethdk/resources/weights/",
    download_method=DownloadMethod.REUSE_RESOURCES,
)

  from .autonotebook import tqdm as notebook_tqdm
2023-09-07 16:30:03 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj_bert |

2023-09-07 16:30:03 INFO: Using device: cuda
2023-09-07 16:30:03 INFO: Loading: tokenize
2023-09-07 16:30:09 INFO: Loading: pos
2023-09-07 16:30:10 INFO: Loading: constituency
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to

## Code

In [3]:
from collections import defaultdict
from typing import Dict, Iterable, Set
from stanza.models.constituency.parse_tree import Tree

In [4]:
def count_labels(tree: Tree, labels:  Set[str]) -> Dict[str, int]:
    """Count how often each of ``labels`` occurs among the descendants of ``tree``.

    The root node itself is never counted; only its children and everything
    below them are inspected.

    Args:
        tree: node whose descendants are inspected.
        labels: the node labels to count.

    Returns:
        A defaultdict (missing keys read as 0) mapping each label that was
        seen at least once to its number of occurrences.
    """
    tally = defaultdict(lambda: 0)
    # Explicit stack instead of recursion; pushing children reversed keeps the
    # visit order depth-first, left to right.
    pending = list(reversed(tree.children))
    while pending:
        node = pending.pop()
        if node.label in labels:
            tally[node.label] += 1
        if not node.is_leaf():
            pending.extend(reversed(node.children))
    return tally

In [58]:
from typing import List, Tuple

def _prune(tree: Tree, label: str, instance: int = -1, instance_count: int = 0) -> Tuple[int, List[str]]:
    """Collect the words below ``tree`` while leaving out pruned ``label`` subtrees.

    Children are visited depth-first, left to right. Every child whose label
    equals ``label`` consumes one instance index, whether it is pruned or kept,
    so indices refer to the pre-order position among all ``label`` nodes.

    Args:
        tree: node whose children are processed (the node itself is never pruned).
        label: label of the nodes to remove.
        instance: if < 0 every ``label`` node is removed, otherwise only the
            node with this zero-based index.
        instance_count: number of ``label`` nodes already seen by the caller.

    Returns:
        ``(count, words)``: the updated number of ``label`` nodes seen so far
        and the leaf labels that survive pruning, in sentence order.
    """
    kept_words: List[str] = []
    seen = instance_count
    for child in tree.children:
        if child.label == label:
            is_target = instance < 0 or seen == instance
            seen += 1
            if is_target:
                # drop the whole subtree; nothing below it is visited
                continue
        if child.is_leaf():
            # a leaf's label is the word itself
            kept_words.append(child.label)
        else:
            seen, child_words = _prune(child, label, instance, seen)
            kept_words.extend(child_words)
    return seen, kept_words


def prune(tree: Tree, label: str, instance: int = -1) -> List[str]:
    """ Remove "label" nodes from the tree and return the remaining words.
    Args:
        tree (Tree): tree to prune (its root is never removed)
        label (str): label of the nodes to remove
        instance (int): if < 0, removes ALL instances of 'label',
                        else only the i-th given instance (zero-based,
                        counted depth-first from left to right)
    Returns:
        leaf labels (words) of the tree without the pruned subtrees
    """
    return _prune(tree, label, instance=instance, instance_count=0)[1]

def extract_subtree(tree: Tree, label: str, instance: int = 0, instance_count: int = 0) -> List[str]:
    """Return the words covered by the i-th ``label`` node below ``tree``.

    Nodes are numbered depth-first from left to right (pre-order) with a single
    running counter, so matches found in earlier sibling subtrees are taken
    into account and exactly one subtree is returned. The root node itself is
    never a candidate.

    Args:
        tree (Tree): tree to search
        label (str): label of the subtree to extract
        instance (int): i-th subtree with 'label' to extract (zero-based)
        instance_count (int): IGNORE, internal use only (number of matches
                              treated as already seen)

    Returns:
        The leaf labels of the selected subtree, or an empty list if the tree
        holds no such instance.
    """
    seen = instance_count
    # Explicit stack; pushing children reversed yields left-to-right pre-order.
    pending = list(reversed(tree.children))
    while pending:
        node = pending.pop()
        if node.label == label:
            if seen == instance:
                return node.leaf_labels()
            seen += 1
        pending.extend(reversed(node.children))
    return []



In [94]:
def leafs_to_string(leafs: List[str]) -> str:
    """Join tokens into a sentence and tidy up the punctuation.

    Two clean-up passes are applied, each repeated until nothing changes:
      1. the space in front of a punctuation mark is removed, so the mark
         attaches to the preceding token ("miles , or" -> "miles, or");
      2. doubled punctuation marks, which pruning can leave behind, are
         collapsed into one (",," -> ",").

    Args:
        leafs: tokens (leaf labels) in sentence order.

    Returns:
        The detokenized string without leading/trailing whitespace.
    """
    # "." is attached but deliberately not collapsed, so "..." stays intact.
    attach_to_previous = (",", "?", ";", "!", ":", "`", "'", '"', "$", ".")
    collapse_repeated = (",", "?", ";", "!", ":", "`", "'", '"', "$")

    result = " ".join(leafs)

    # One pass can expose new matches (e.g. two spaces before a comma), hence the loops.
    changed = True
    while changed:
        before = result
        for mark in attach_to_previous:
            result = result.replace(" " + mark, mark)
        changed = result != before

    changed = True
    while changed:
        before = result
        for mark in collapse_repeated:
            result = result.replace(mark * 2, mark)
        changed = result != before

    return result.strip()
    

def realize_all_options(tree: Tree, label: str) -> Set[str]:
    """Build one sentence variant per ``label`` node, each with that node pruned.

    Args:
        tree: constituency tree of a sentence.
        label: label of the nodes to prune, one at a time.

    Returns:
        The set of resulting sentences; variants of at most 4 characters
        (empty or punctuation-only leftovers) are discarded.
    """
    occurrences = count_labels(tree, labels={label}).get(label, 0)
    variants = (
        leafs_to_string(prune(tree, label, instance=index))
        for index in range(occurrences)
    )
    # filter empty clauses (or clauses that contain only punctuation after pruning)
    return {variant for variant in variants if len(variant) > 4}


def extract_single_option(tree: Tree, label: str, instance: int = 0) -> Set[str]:
    """Reduce a sentence to the words of one of its ``label`` subtrees.

    Extraction only happens when the tree contains more than one ``label``
    node; otherwise there is nothing to cut away and the result is empty.

    Args:
        tree: constituency tree of a sentence.
        label: label of the subtree to keep.
        instance: zero-based index of the ``label`` subtree to keep.

    Returns:
        A set holding the extracted text, or an empty set if the tree has at
        most one ``label`` node or the requested instance yields no text.
    """
    realizations = set()
    num_labels = count_labels(tree, labels={label}).get(label, 0)
    if num_labels > 1:
        realization = leafs_to_string(extract_subtree(tree, label, instance=instance))
        # never report an empty string as a sentence variant
        if realization:
            realizations.add(realization)
    return realizations

### Tests

In [46]:
def print_all_options(tree: Tree, label: str):
    """Print every variant of ``tree`` with one ``label`` node pruned, one per line."""
    options = realize_all_options(tree, label)
    if options:
        print("\n".join(options))

In [90]:
# Example questions used by the demo cells below.
sentence1 = "Can I use more than two private cars for stretches longer than 100 miles, or is that usually not reimbursable?"
sentence2 = "Should I use two cars to drive to the venue?"
sentence3 = "Can I, using a brush, carefully glue and paint my pretty miniatures, or is this impossible?"
sentence4 = "If I stay for longer than 5 days in Germany, am I still eligible for 500$ daily allowance?"

# Parse every example once; the demo cells reuse these documents.
doc1, doc2, doc3, doc4 = [pipe(text) for text in (sentence1, sentence2, sentence3, sentence4)]

In [100]:
# Show the constituency parse of the first example question.
for sentence in doc1.sentences:
    print(sentence.constituency.pretty_print())

(ROOT
  (S
    (SQ
      (MD Can)
      (NP (PRP I))
      (VP
        (VB use)
        (NP
          (QP (JJR more) (IN than) (CD two))
          (JJ private)
          (NNS cars))
        (PP
          (IN for)
          (NP
            (NP (NNS stretches))
            (ADJP
              (ADJP (RBR longer))
              (PP
                (IN than)
                (NP (CD 100) (NNS miles))))))))
    (, ,)
    (CC or)
    (SQ
      (VBZ is)
      (NP (DT that))
      (ADVP (RB usually))
      (RB not)
      (ADJP (JJ reimbursable)))
    (. ?)))



In [101]:
# Show the constituency parse of the second example question.
for sentence in doc2.sentences:
    print(sentence.constituency.pretty_print())

(ROOT
  (SQ
    (MD Should)
    (NP (PRP I))
    (VP
      (VB use)
      (NP (CD two) (NNS cars))
      (S
        (VP
          (TO to)
          (VP
            (VB drive)
            (PP
              (IN to)
              (NP (DT the) (NN venue)))))))
    (. ?)))



In [102]:
# Show the constituency parse of the third example question.
for sentence in doc3.sentences:
    print(sentence.constituency.pretty_print())

(ROOT
  (S
    (SQ
      (MD Can)
      (NP (PRP I))
      (, ,)
      (S
        (VP
          (VBG using)
          (NP (DT a) (NN brush))))
      (, ,)
      (ADVP (RB carefully))
      (VP
        (NN glue)
        (CC and)
        (VB paint)
        (NP (PRP$ my) (JJ pretty) (NNS miniatures))))
    (, ,)
    (CC or)
    (SQ
      (VBZ is)
      (NP (DT this))
      (ADJP (JJ impossible)))
    (. ?)))



In [91]:
# Show the constituency parse of the fourth example question.
for sentence in doc4.sentences:
    print(sentence.constituency.pretty_print())

(ROOT
  (SBARQ
    (SBAR
      (IN If)
      (S
        (NP (PRP I))
        (VP
          (VBP stay)
          (PP
            (IN for)
            (NP
              (QP (JJR longer) (IN than) (CD 5))
              (NNS days)))
          (PP
            (IN in)
            (NP (NNP Germany))))))
    (, ,)
    (VBP am)
    (NP (PRP I))
    (ADVP (RB still))
    (ADJP
      (JJ eligible)
      (PP
        (IN for)
        (NP
          (NML (CD 500) ($ $))
          (JJ daily)
          (NN allowance))))
    (. ?)))



In [86]:
# Remove adjectives (JJ), one at a time, from each example question.
for question, parsed in ((sentence1, doc1), (sentence2, doc2), (sentence3, doc3), (sentence4, doc4)):
    print(question)
    print_all_options(parsed.sentences[0].constituency, "JJ")

Can I use more than two private cars for stretches longer than 100 miles, or is that usually not reimbursable?
Can I use more than two cars for stretches longer than 100 miles, or is that usually not reimbursable?
Can I use more than two private cars for stretches longer than 100 miles, or is that usually not?
Should I use two cars to drive to the venue?
Can I, using a brush, carefully glue and paint my pretty miniatures, or is this impossible?
Can I, using a brush, carefully glue and paint my pretty miniatures, or is this?
Can I, using a brush, carefully glue and paint my miniatures, or is this impossible?
If I stay for longer than 5 days in Germany, am I still eligible for daily allowance?
If I stay for longer than 5 days in Germany, am I still for daily allowance?
If I stay for longer than 5 days in Germany, am I still eligible for allowance?


In [87]:
# Remove prepositional phrases (PP), one at a time, from each example question.
for question, parsed in ((sentence1, doc1), (sentence2, doc2), (sentence3, doc3), (sentence4, doc4)):
    print(question)
    print_all_options(parsed.sentences[0].constituency, "PP")

Can I use more than two private cars for stretches longer than 100 miles, or is that usually not reimbursable?
Can I use more than two private cars, or is that usually not reimbursable?
Can I use more than two private cars for stretches longer, or is that usually not reimbursable?
Should I use two cars to drive to the venue?
Should I use two cars to drive?
Can I, using a brush, carefully glue and paint my pretty miniatures, or is this impossible?
If I stay for longer than 5 days in Germany, am I still eligible for daily allowance?
If I stay for longer than 5 days in Germany, am I still eligible?
If I stay in Germany, am I still eligible for daily allowance?
If I stay for longer than 5 days, am I still eligible for daily allowance?


In [88]:
# Remove adverb phrases (ADVP), one at a time, from each example question.
for question, parsed in ((sentence1, doc1), (sentence2, doc2), (sentence3, doc3), (sentence4, doc4)):
    print(question)
    print_all_options(parsed.sentences[0].constituency, "ADVP")

Can I use more than two private cars for stretches longer than 100 miles, or is that usually not reimbursable?
Can I use more than two private cars for stretches longer than 100 miles, or is that not reimbursable?
Should I use two cars to drive to the venue?
Can I, using a brush, carefully glue and paint my pretty miniatures, or is this impossible?
Can I, using a brush, glue and paint my pretty miniatures, or is this impossible?
If I stay for longer than 5 days in Germany, am I still eligible for daily allowance?
If I stay for longer than 5 days in Germany, am I eligible for daily allowance?


In [89]:
# Remove quantifier phrases (QP), one at a time, from each example question.
for question, parsed in ((sentence1, doc1), (sentence2, doc2), (sentence3, doc3), (sentence4, doc4)):
    print(question)
    print_all_options(parsed.sentences[0].constituency, "QP")

Can I use more than two private cars for stretches longer than 100 miles, or is that usually not reimbursable?
Can I use private cars for stretches longer than 100 miles, or is that usually not reimbursable?
Should I use two cars to drive to the venue?
Can I, using a brush, carefully glue and paint my pretty miniatures, or is this impossible?
If I stay for longer than 5 days in Germany, am I still eligible for daily allowance?
If I stay for days in Germany, am I still eligible for daily allowance?


In [96]:
# Remove single adverbs (RB), one at a time, from each example question.
for question, parsed in ((sentence1, doc1), (sentence2, doc2), (sentence3, doc3), (sentence4, doc4)):
    print(question)
    print_all_options(parsed.sentences[0].constituency, "RB")

Can I use more than two private cars for stretches longer than 100 miles, or is that usually not reimbursable?
Can I use more than two private cars for stretches longer than 100 miles, or is that not reimbursable?
Can I use more than two private cars for stretches longer than 100 miles, or is that usually reimbursable?
Should I use two cars to drive to the venue?
Can I, using a brush, carefully glue and paint my pretty miniatures, or is this impossible?
Can I, using a brush, glue and paint my pretty miniatures, or is this impossible?
If I stay for longer than 5 days in Germany, am I still eligible for 500$ daily allowance?
If I stay for longer than 5 days in Germany, am I eligible for 500$ daily allowance?


In [97]:
# Remove cardinal numbers (CD), one at a time, from each example question.
for question, parsed in ((sentence1, doc1), (sentence2, doc2), (sentence3, doc3), (sentence4, doc4)):
    print(question)
    print_all_options(parsed.sentences[0].constituency, "CD")

Can I use more than two private cars for stretches longer than 100 miles, or is that usually not reimbursable?
Can I use more than two private cars for stretches longer than miles, or is that usually not reimbursable?
Can I use more than private cars for stretches longer than 100 miles, or is that usually not reimbursable?
Should I use two cars to drive to the venue?
Should I use cars to drive to the venue?
Can I, using a brush, carefully glue and paint my pretty miniatures, or is this impossible?
If I stay for longer than 5 days in Germany, am I still eligible for 500$ daily allowance?
If I stay for longer than days in Germany, am I still eligible for 500$ daily allowance?
If I stay for longer than 5 days in Germany, am I still eligible for$ daily allowance?


In [95]:
# Remove nominal modifiers (NML), one at a time, from each example question.
for question, parsed in ((sentence1, doc1), (sentence2, doc2), (sentence3, doc3), (sentence4, doc4)):
    print(question)
    print_all_options(parsed.sentences[0].constituency, "NML")

Can I use more than two private cars for stretches longer than 100 miles, or is that usually not reimbursable?
Should I use two cars to drive to the venue?
Can I, using a brush, carefully glue and paint my pretty miniatures, or is this impossible?
If I stay for longer than 5 days in Germany, am I still eligible for 500$ daily allowance?
If I stay for longer than 5 days in Germany, am I still eligible for daily allowance?


In [98]:
# Remove simple declarative clauses (S), one at a time, from each example question.
for question, parsed in ((sentence1, doc1), (sentence2, doc2), (sentence3, doc3), (sentence4, doc4)):
    print(question)
    print_all_options(parsed.sentences[0].constituency, "S")

Can I use more than two private cars for stretches longer than 100 miles, or is that usually not reimbursable?
Should I use two cars to drive to the venue?
Should I use two cars?
Can I, using a brush, carefully glue and paint my pretty miniatures, or is this impossible?
Can I, carefully glue and paint my pretty miniatures, or is this impossible?
If I stay for longer than 5 days in Germany, am I still eligible for 500$ daily allowance?
If, am I still eligible for 500$ daily allowance?


In [101]:
# Keep only the first question clause (SQ) of examples that contain more than one.
for question, parsed in ((sentence1, doc1), (sentence2, doc2), (sentence3, doc3), (sentence4, doc4)):
    print(question)
    for sentence in extract_single_option(parsed.sentences[0].constituency, "SQ", instance=0):
        print(sentence)

Can I use more than two private cars for stretches longer than 100 miles, or is that usually not reimbursable?
Can I use more than two private cars for stretches longer than 100 miles
Should I use two cars to drive to the venue?
Can I, using a brush, carefully glue and paint my pretty miniatures, or is this impossible?
Can I, using a brush, carefully glue and paint my pretty miniatures
If I stay for longer than 5 days in Germany, am I still eligible for 500$ daily allowance?


# Real Data

In [109]:
import json
from tqdm.auto import tqdm
import time
from copy import deepcopy


def augment_question_data(json_question_filepath: str, output_filename: str,
                          prune_labels: Iterable[str] = ("JJ", "PP", "ADVP", "QP", "RB", "CD", "NML", "S")):
    """Create pruned variants of every question in a JSON file and save the enlarged data.

    The input file maps keys to entries that carry the question under 'text'.
    Each question is parsed with the module-level ``pipe``. For every sentence
    the variants are: one per prunable node of each label in ``prune_labels``,
    plus the first "SQ" clause on its own if the sentence has several. The
    variants of consecutive sentences are combined as a cross-product. A
    sentence without any variant contributes its own (detokenized) text, so it
    is never dropped from a multi-sentence question. Questions in which no
    sentence has a variant get no new entries.

    Every augmented text is stored as a copy of the original entry under a new
    time-derived key, and the result is written to ``output_filename``.

    Args:
        json_question_filepath: path of the JSON file with the original questions.
        output_filename: path of the JSON file to write (originals + augmentations).
        prune_labels: constituent / POS labels whose nodes are pruned one at a time.

    Returns:
        defaultdict mapping each original key to the set of augmented texts
        created for it.
    """
    # Read everything up front so the input file is not held open while parsing.
    with open(json_question_filepath, "r") as f:
        data = json.load(f)
    new_data = deepcopy(data)
    all_augmentations = defaultdict(lambda: set([]))
    prune_labels = tuple(prune_labels)  # reused for every sentence

    for key in tqdm(data):
        # analyze question
        doc = pipe(data[key]['text'])

        question_augmentations = set([])
        any_sentence_augmented = False
        for sentence_idx, sentence in enumerate(doc.sentences):
            tree = sentence.constituency

            # apply augmentations per sentence
            sentence_augmentations = set([])
            for label in prune_labels:
                sentence_augmentations |= realize_all_options(tree, label)
            sentence_augmentations |= extract_single_option(tree, "SQ", instance=0)

            if sentence_augmentations:
                any_sentence_augmented = True
            else:
                # Nothing to prune here: keep the sentence as it is, otherwise the
                # cross-product below would become empty and lose earlier sentences.
                sentence_augmentations = {leafs_to_string(tree.leaf_labels())}

            if sentence_idx == 0:
                # initialize question list (this is the first sentence)
                question_augmentations = sentence_augmentations
            else:
                # cross-product between all sentence realisations until now and next sentence realizations
                question_augmentations = {
                    sentences_so_far + " " + next_sentence
                    for sentences_so_far in question_augmentations
                    for next_sentence in sentence_augmentations
                }

        if not any_sentence_augmented:
            # only the unchanged question would be produced -> nothing to add
            continue

        # create new question entries in data with final augmentations
        for augmentation in question_augmentations:
            new_key = str(time.time()).replace(".", "")
            while new_key in new_data:
                # the clock has not advanced enough yet; never overwrite an existing entry
                new_key = str(time.time()).replace(".", "")
            new_data[new_key] = deepcopy(data[key])
            new_data[new_key]['text'] = augmentation
            all_augmentations[key].add(augmentation)

    with open(output_filename, "w") as f:
        json.dump(new_data, f)

    return all_augmentations

In [110]:
# Augment the v1 training questions and write originals + augmentations to a new JSON file
# in the working directory; the returned mapping (original key -> augmented texts) is kept
# for inspection.
v1_augmentations = augment_question_data("../resources/en/generated/train_questions_v1.json", "./train_questions_v1_ling.json")

100%|██████████| 800/800 [02:58<00:00,  4.49it/s]

TOTAL Augmentations 2881
MAX Augmentations 10
MIN Augmentations 1



