From f00c62c230fe28bb5b498f41264db14c14ee31e5 Mon Sep 17 00:00:00 2001 From: MukundVarmaT Date: Fri, 16 Jul 2021 13:55:41 +0530 Subject: [PATCH 1/2] add tense tense transform --- transformations/tense/README.md | 53 ++++++++ transformations/tense/__init__.py | 1 + transformations/tense/requirements.txt | 1 + transformations/tense/test.json | 75 +++++++++++ transformations/tense/transformation.py | 167 ++++++++++++++++++++++++ 5 files changed, 297 insertions(+) create mode 100644 transformations/tense/README.md create mode 100644 transformations/tense/__init__.py create mode 100644 transformations/tense/requirements.txt create mode 100644 transformations/tense/test.json create mode 100644 transformations/tense/transformation.py diff --git a/transformations/tense/README.md b/transformations/tense/README.md new file mode 100644 index 000000000..ccd08d489 --- /dev/null +++ b/transformations/tense/README.md @@ -0,0 +1,53 @@ +# Tense Transformation 🦎 + ⌨️ → 🐍 +This transformation converts sentences from one tense to another, for example from simple present to simple past. + +Author name: Tanay Dixit, Mukund Varma T + +## What type of a transformation is this? + +In this transformation, we convert a sentence into the target tense by conjugating each verb to agree with its subject. +This ensures that the context of the given sentence remains the same while the attribute of time changes. + +The following are some representative examples: + + Input: I can come to the party + Target Tense: past + Transformed Text: I can came to the party + + Input: I went to the park + Target Tense: future + Transformed Text: I will go to the park + + Input: I will go to the park. + Target Tense: present + Transformed Text: I go to the park. + +## What tasks does it intend to benefit? + +The task is designed to measure the capacity of language understanding in language models, specifically to understand the given tense of a sentence. 
+This task is nominally simple for humans, since we have an understanding of time and of sequences of events, but it is difficult for language models, as they do not have any prior information about time. +There have been a couple of attempts to perform controlled attribute text transformation (Logeswaran et al.), but this has yet to be tested on language models trained in a general setting. + +## Citations + +```bibtex +@article{DBLP:journals/corr/abs-1811-01135, + author = {Lajanugen Logeswaran and + Honglak Lee and + Samy Bengio}, + title = {Content preserving text generation with attribute controls}, + journal = {CoRR}, + volume = {abs/1811.01135}, + year = {2018}, + url = {http://arxiv.org/abs/1811.01135}, + archivePrefix = {arXiv}, + eprint = {1811.01135}, + timestamp = {Thu, 22 Nov 2018 17:58:30 +0100}, + biburl = {https://dblp.org/rec/journals/corr/abs-1811-01135.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} +``` + +## What are the limitations of this transformation? + +The transformation is not robust to all complex cases and is limited to only simple past/present/future tense conversions. 
diff --git a/transformations/tense/__init__.py b/transformations/tense/__init__.py new file mode 100644 index 000000000..0a79241bb --- /dev/null +++ b/transformations/tense/__init__.py @@ -0,0 +1 @@ +from .transformation import * \ No newline at end of file diff --git a/transformations/tense/requirements.txt b/transformations/tense/requirements.txt new file mode 100644 index 000000000..2a1eb2862 --- /dev/null +++ b/transformations/tense/requirements.txt @@ -0,0 +1 @@ +pattern @ git+https://github.com/tanay2001/pattern.git \ No newline at end of file diff --git a/transformations/tense/test.json b/transformations/tense/test.json new file mode 100644 index 000000000..d0cf429b0 --- /dev/null +++ b/transformations/tense/test.json @@ -0,0 +1,75 @@ +{ + "type": "tense_transformation", + "test_cases": [ + { + "class": "TenseTransformation", + "args": { + "to_tense": "past" + }, + "inputs": { + "sentence": "I will go to the park." + }, + "outputs": [ + { + "sentence": "I went to the park." + } + ] + }, + { + "class": "TenseTransformation", + "args": { + "to_tense": "past" + }, + "inputs": { + "sentence": "I can come to the party" + }, + "outputs": [ + { + "sentence": "I can came to the party" + } + ] + }, + { + "class": "TenseTransformation", + "args": { + "to_tense": "past" + }, + "inputs": { + "sentence": "I will go to the park" + }, + "outputs": [ + { + "sentence": "I went to the park" + } + ] + }, + { + "class": "TenseTransformation", + "args": { + "to_tense": "past" + }, + "inputs": { + "sentence": "I go to the park." + }, + "outputs": [ + { + "sentence": "I went to the park." 
+ } + ] + }, + { + "class": "TenseTransformation", + "args": { + "to_tense": "past" + }, + "inputs": { + "sentence": "I visit the hospital" + }, + "outputs": [ + { + "sentence": "I visited the hospital" + } + ] + } + ] +} \ No newline at end of file diff --git a/transformations/tense/transformation.py b/transformations/tense/transformation.py new file mode 100644 index 000000000..d11c30268 --- /dev/null +++ b/transformations/tense/transformation.py @@ -0,0 +1,167 @@ +from interfaces.SentenceOperation import SentenceOperation +from tasks.TaskTypes import TaskType +import string +from pattern.en import conjugate, PAST, PRESENT, SINGULAR, PLURAL +import spacy +from spacy.symbols import NOUN +import random + +SUBJ_DEPS = {'agent', 'csubj', 'csubjpass', 'expl', 'nsubj', 'nsubjpass'} + +def _get_conjuncts(tok): + """ + Return conjunct dependents of the leftmost conjunct in a coordinated phrase, + e.g. "Burton, [Dan], and [Josh] ...". + """ + return [right for right in tok.rights + if right.dep_ == 'conj'] + + +def is_plural_noun(token): + """ + Returns True if token is a plural noun, False otherwise. 
+ Args: + token (``spacy.Token``): parent document must have POS information + Returns: + bool + """ + if token.doc.is_tagged is False: + raise ValueError('token is not POS-tagged') + return True if token.pos == NOUN and token.lemma != token.lower else False + + +def get_subjects_of_verb(verb): + if verb.dep_ == "aux" and list(verb.ancestors): + return get_subjects_of_verb(list(verb.ancestors)[0]) + """Return all subjects of a verb according to the dependency parse.""" + subjs = [tok for tok in verb.lefts if tok.dep_ in SUBJ_DEPS] + # get additional conjunct subjects + subjs.extend(tok for subj in subjs for tok in _get_conjuncts(subj)) + if not len(subjs): + ancestors = list(verb.ancestors) + if len(ancestors) > 0: + return get_subjects_of_verb(ancestors[0]) + return subjs + + +def is_plural_verb(token): + if token.doc.is_tagged is False: + raise ValueError('token is not POS-tagged') + subjects = get_subjects_of_verb(token) + if not len(subjects): + return False + plural_score = sum([is_plural_noun(x) for x in subjects])/len(subjects) + + return plural_score > .5 + +def preserve_caps(word, newWord): + """Returns newWord, capitalizing it if word is capitalized.""" + if word[0] >= 'A' and word[0] <= 'Z': + newWord = newWord.capitalize() + return newWord + +''' +change tense function borrowed from https://github.com/bendichter/tenseflow/blob/master/tenseflow/change_tense.py +''' + +class TenseTransformation(SentenceOperation): + tasks = [ + TaskType.TEXT_CLASSIFICATION, + TaskType.TEXT_TO_TEXT_GENERATION, + TaskType.TEXT_TAGGING, + ] + languages = ["en"] + + def __init__(self, to_tense): + super().__init__() + assert to_tense in ['past', 'present', 'future', 'random'] + self.to_tense = to_tense + self.nlp = spacy.load('en_core_web_sm') + + def change_tense(self, text, to_tense): + """Change the tense of text. + Args: + text (str): text to change. + to_tense (str): 'present','past', or 'future' + npl (SpaCy model, optional): + Returns: + str: changed text. 
+ """ + tense_lookup = {'future': 'inf', 'present': PRESENT, 'past': PAST} + tense = tense_lookup[to_tense] + + doc = self.nlp(text) + + out = list() + out.append(doc[0].text) + words = [] + for word in doc: + words.append(word) + if len(words) == 1: + continue + if (words[-2].text == 'will' and words[-2].tag_ == 'MD' and words[-1].tag_ == 'VB') or \ + words[-1].tag_ in ('VBD', 'VBP', 'VBZ', 'VBN') or \ + (not words[-2].text in ('to', 'not') and words[-1].tag_ == 'VB'): + + if words[-2].text in ('were', 'am', 'is', 'are', 'was') or \ + (words[-2].text == 'be' and len(words) > 2 and words[-3].text == 'will'): + this_tense = tense_lookup['past'] + else: + this_tense = tense + + subjects = [x.text for x in get_subjects_of_verb(words[-1])] + if ('I' in subjects) or ('we' in subjects) or ('We' in subjects): + person = 1 + elif ('you' in subjects) or ('You' in subjects): + person = 2 + else: + person = 3 + if is_plural_verb(words[-1]): + number = PLURAL + else: + number = SINGULAR + if (words[-2].text == 'will' and words[-2].tag_ == 'MD') or words[-2].text == 'had': + out.pop(-1) + if to_tense == 'future': + if not (out[-1] == 'will' or out[-1] == 'be'): + out.append('will') + # handle will as a noun in future tense + if words[-2].text == 'will' and words[-2].tag_ == 'NN': + out.append('will') + oldWord = words[-1].text + out.append(preserve_caps(oldWord, conjugate(oldWord, tense=this_tense, person=person, number=number))) + else: + out.append(words[-1].text) + + # negation + if words[-2].text + words[-1].text in ('didnot', 'donot', 'willnot', "didn't", "don't", "won't"): + if tense == PAST: + out[-2] = 'did' + elif tense == PRESENT: + out[-2] = 'do' + else: + out.pop(-2) + + # future perfect + if words[-1].text in ('have', 'has') and len(list(words[-1].ancestors)) and words[-1].dep_ == 'aux': + out.pop(-1) + + text_out = ' '.join(out) + + # Remove spaces before/after punctuation: + for char in string.punctuation: + if char in """(<['""": + text_out = 
text_out.replace(char+' ', char) + else: + text_out = text_out.replace(' '+char, char) + + for char in ["-", "“", "‘"]: + text_out = text_out.replace(char+' ', char) + for char in ["…", "”", "'s", "n't"]: + text_out = text_out.replace(' '+char, char) + + return text_out + + def generate(self, sentence: str): + perturbed_texts = self.change_tense(sentence, to_tense = random.choice(['past', 'present', 'future']) if self.to_tense == 'random' else self.to_tense) + return [perturbed_texts] \ No newline at end of file From eca93fdfe6ae9b4cf6213fdf7b3b89bafd34d860 Mon Sep 17 00:00:00 2001 From: tanay2001 Date: Wed, 28 Jul 2021 21:56:14 +0530 Subject: [PATCH 2/2] addressed issues --- transformations/tense/README.md | 10 ++++-- transformations/tense/test.json | 42 +++++++++++++++++++++++++ transformations/tense/transformation.py | 13 +++++--- 3 files changed, 58 insertions(+), 7 deletions(-) diff --git a/transformations/tense/README.md b/transformations/tense/README.md index ccd08d489..ecec749dc 100644 --- a/transformations/tense/README.md +++ b/transformations/tense/README.md @@ -10,9 +10,9 @@ This ensures that the context of the given sentence remains the same while the a The following are some representative examples: - Input: I can come to the party + Input: My father goes to gym every day Target Tense: past - Transformed Text: I can came to the party + Transformed Text: My father went to gym every day Input: I went to the park Target Tense: future @@ -47,7 +47,13 @@ There have been a couple of attempts to perform controlled attribute text transf bibsource = {dblp computer science bibliography, https://dblp.org} } ``` +### Data and Source Code +Change-tense and verb-inflection logic borrowed from https://github.com/bendichter/tenseflow ## What are the limitations of this transformation? The transformation is not robust to all complex cases and is limited to only simple past/present/future tense conversions. +Examples where it fails:
+Input: I will go for dinner after I am done playing tennis. +to_tense: past +Output: I went for dinner after I was did playing tennis. \ No newline at end of file diff --git a/transformations/tense/test.json b/transformations/tense/test.json index d0cf429b0..8651468b8 100644 --- a/transformations/tense/test.json +++ b/transformations/tense/test.json @@ -15,6 +15,20 @@ } ] }, + { + "class": "TenseTransformation", + "args": { + "to_tense": "past" + }, + "inputs": { + "sentence": "It smells very delicious in the kitchen, what are you cooking?" + }, + "outputs": [ + { + "sentence": "It smelt very delicious in the kitchen, what were you cooking?" + } + ] + }, { "class": "TenseTransformation", "args": { @@ -70,6 +84,34 @@ "sentence": "I visited the hospital" } ] + }, + { + "class": "TenseTransformation", + "args": { + "to_tense": "past" + }, + "inputs": { + "sentence": "I will go for dinner after I am done playing tennis" + }, + "outputs": [ + { + "sentence": "I went for dinner after I was did playing tennis" + } + ] + }, + { + "class": "TenseTransformation", + "args": { + "to_tense": "past" + }, + "inputs": { + "sentence": "My father goes to gym every day" + }, + "outputs": [ + { + "sentence": "My father went to gym every day" + } + ] } ] } \ No newline at end of file diff --git a/transformations/tense/transformation.py b/transformations/tense/transformation.py index d11c30268..b22630287 100644 --- a/transformations/tense/transformation.py +++ b/transformations/tense/transformation.py @@ -5,6 +5,7 @@ import spacy from spacy.symbols import NOUN import random +from initialize import spacy_nlp SUBJ_DEPS = {'agent', 'csubj', 'csubjpass', 'expl', 'nsubj', 'nsubjpass'} @@ -67,8 +68,7 @@ def preserve_caps(word, newWord): class TenseTransformation(SentenceOperation): tasks = [ TaskType.TEXT_CLASSIFICATION, - TaskType.TEXT_TO_TEXT_GENERATION, - TaskType.TEXT_TAGGING, + TaskType.TEXT_TO_TEXT_GENERATION ] languages = ["en"] @@ -76,7 +76,7 @@ def __init__(self, to_tense): 
super().__init__() assert to_tense in ['past', 'present', 'future', 'random'] self.to_tense = to_tense - self.nlp = spacy.load('en_core_web_sm') + self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm") def change_tense(self, text, to_tense): """Change the tense of text. @@ -91,7 +91,7 @@ def change_tense(self, text, to_tense): tense = tense_lookup[to_tense] doc = self.nlp(text) - + print(doc[0], doc) out = list() out.append(doc[0].text) words = [] @@ -163,5 +163,8 @@ def change_tense(self, text, to_tense): return text_out def generate(self, sentence: str): + """ + takes in a input sentence and transforms it's tense to the target tense + """ perturbed_texts = self.change_tense(sentence, to_tense = random.choice(['past', 'present', 'future']) if self.to_tense == 'random' else self.to_tense) - return [perturbed_texts] \ No newline at end of file + return [perturbed_texts]