-
Notifications
You must be signed in to change notification settings - Fork 195
/
transformation.py
170 lines (146 loc) Β· 6.15 KB
/
transformation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
from interfaces.SentenceOperation import SentenceOperation
from tasks.TaskTypes import TaskType
import string
from pattern.en import conjugate, PAST, PRESENT, SINGULAR, PLURAL
import spacy
from spacy.symbols import NOUN
import random
from initialize import spacy_nlp
SUBJ_DEPS = {'agent', 'csubj', 'csubjpass', 'expl', 'nsubj', 'nsubjpass'}
def _get_conjuncts(tok):
"""
Return conjunct dependents of the leftmost conjunct in a coordinated phrase,
e.g. "Burton, [Dan], and [Josh] ...".
"""
return [right for right in tok.rights
if right.dep_ == 'conj']
def is_plural_noun(token):
"""
Returns True if token is a plural noun, False otherwise.
Args:
token (``spacy.Token``): parent document must have POS information
Returns:
bool
"""
if token.doc.is_tagged is False:
raise ValueError('token is not POS-tagged')
return True if token.pos == NOUN and token.lemma != token.lower else False
def get_subjects_of_verb(verb):
if verb.dep_ == "aux" and list(verb.ancestors):
return get_subjects_of_verb(list(verb.ancestors)[0])
"""Return all subjects of a verb according to the dependency parse."""
subjs = [tok for tok in verb.lefts if tok.dep_ in SUBJ_DEPS]
# get additional conjunct subjects
subjs.extend(tok for subj in subjs for tok in _get_conjuncts(subj))
if not len(subjs):
ancestors = list(verb.ancestors)
if len(ancestors) > 0:
return get_subjects_of_verb(ancestors[0])
return subjs
def is_plural_verb(token):
if token.doc.is_tagged is False:
raise ValueError('token is not POS-tagged')
subjects = get_subjects_of_verb(token)
if not len(subjects):
return False
plural_score = sum([is_plural_noun(x) for x in subjects])/len(subjects)
return plural_score > .5
def preserve_caps(word, newWord):
"""Returns newWord, capitalizing it if word is capitalized."""
if word[0] >= 'A' and word[0] <= 'Z':
newWord = newWord.capitalize()
return newWord
'''
change tense function borrowed from https://github.com/bendichter/tenseflow/blob/master/tenseflow/change_tense.py
'''
class TenseTransformation(SentenceOperation):
tasks = [
TaskType.TEXT_CLASSIFICATION,
TaskType.TEXT_TO_TEXT_GENERATION
]
languages = ["en"]
def __init__(self, to_tense):
super().__init__()
assert to_tense in ['past', 'present', 'future', 'random']
self.to_tense = to_tense
self.nlp = spacy_nlp if spacy_nlp else spacy.load("en_core_web_sm")
def change_tense(self, text, to_tense):
"""Change the tense of text.
Args:
text (str): text to change.
to_tense (str): 'present','past', or 'future'
npl (SpaCy model, optional):
Returns:
str: changed text.
"""
tense_lookup = {'future': 'inf', 'present': PRESENT, 'past': PAST}
tense = tense_lookup[to_tense]
doc = self.nlp(text)
print(doc[0], doc)
out = list()
out.append(doc[0].text)
words = []
for word in doc:
words.append(word)
if len(words) == 1:
continue
if (words[-2].text == 'will' and words[-2].tag_ == 'MD' and words[-1].tag_ == 'VB') or \
words[-1].tag_ in ('VBD', 'VBP', 'VBZ', 'VBN') or \
(not words[-2].text in ('to', 'not') and words[-1].tag_ == 'VB'):
if words[-2].text in ('were', 'am', 'is', 'are', 'was') or \
(words[-2].text == 'be' and len(words) > 2 and words[-3].text == 'will'):
this_tense = tense_lookup['past']
else:
this_tense = tense
subjects = [x.text for x in get_subjects_of_verb(words[-1])]
if ('I' in subjects) or ('we' in subjects) or ('We' in subjects):
person = 1
elif ('you' in subjects) or ('You' in subjects):
person = 2
else:
person = 3
if is_plural_verb(words[-1]):
number = PLURAL
else:
number = SINGULAR
if (words[-2].text == 'will' and words[-2].tag_ == 'MD') or words[-2].text == 'had':
out.pop(-1)
if to_tense == 'future':
if not (out[-1] == 'will' or out[-1] == 'be'):
out.append('will')
# handle will as a noun in future tense
if words[-2].text == 'will' and words[-2].tag_ == 'NN':
out.append('will')
oldWord = words[-1].text
out.append(preserve_caps(oldWord, conjugate(oldWord, tense=this_tense, person=person, number=number)))
else:
out.append(words[-1].text)
# negation
if words[-2].text + words[-1].text in ('didnot', 'donot', 'willnot', "didn't", "don't", "won't"):
if tense == PAST:
out[-2] = 'did'
elif tense == PRESENT:
out[-2] = 'do'
else:
out.pop(-2)
# future perfect
if words[-1].text in ('have', 'has') and len(list(words[-1].ancestors)) and words[-1].dep_ == 'aux':
out.pop(-1)
text_out = ' '.join(out)
# Remove spaces before/after punctuation:
for char in string.punctuation:
if char in """(<['""":
text_out = text_out.replace(char+' ', char)
else:
text_out = text_out.replace(' '+char, char)
for char in ["-", "β", "β"]:
text_out = text_out.replace(char+' ', char)
for char in ["β¦", "β", "'s", "n't"]:
text_out = text_out.replace(' '+char, char)
return text_out
def generate(self, sentence: str):
"""
takes in a input sentence and transforms it's tense to the target tense
"""
perturbed_texts = self.change_tense(sentence, to_tense = random.choice(['past', 'present', 'future']) if self.to_tense == 'random' else self.to_tense)
return [perturbed_texts]