In [9]:
from abc import ABC, abstractmethod
import os
import re
import numpy as np
import random
import spacy
import pandas as pd
import pickle

In [2]:
class Operation(ABC):
    """
    Abstract class for augmenting a given text.
    """

    @abstractmethod
    def generate(self, text, **kwargs):
        """
        Corrupts the given text.
        """
        pass

In [3]:
class ReplaceNamedEntities(Operation):
    def __init__(self, resource_dir) -> None:
        super().__init__()
        self.nlp = spacy.load("en_core_web_sm")
        fpaths = [os.path.join(resource_dir, fpath) for fpath in os.listdir(resource_dir) if fpath.endswith(".txt")]
        self.resources = [open(fpath, "r").read().splitlines() for fpath in fpaths]

    def get_replacement(self, entity):
        possibilities = []
        for resource in self.resources:
            if entity.lower() in list(map(lambda x: x.lower(), resource)):
                possibilities.extend(random.sample(resource, 5))
                break
        possibilities = [" {} ".format(x) for x in possibilities]
        possibilities.append(" ")
        return random.choice(possibilities)

    def replace_named_entities(self, text, soften=False):
        """
        loses persons, organizations, products and places
        """
        doc = self.nlp(text)
        named_entities = set(["PERSON", "ORG", "PRODUCT", "EVENT", "GPE", "GEO"])
        ne = []
        for x in doc.ents:
            if x.label_ in named_entities:
                ne.append((x.text, x.start_char, x.end_char))
        if len(ne) == 0:
            return text
        ne_new = random.sample(ne, np.random.randint(1, min(len(ne), 3)+1))
        ne_new = sorted(ne_new, key=lambda x: x[1])
        if soften:
            ne_new = ne_new[:1]
        shift = 0
        for (entity, start, end) in ne_new:
            replacemnt = self.get_replacement(entity)
            text = text[:start-shift] + replacemnt + text[end-shift:]
            shift += end - start - len(replacemnt)
        text = re.sub("\s+", " ", text)
        return text.strip()

    def generate(self, text, **kwargs):
        soften = kwargs.get("soften", False)
        return self.replace_named_entities(text, soften)

In [10]:
class TF_IDF_Replacement(Operation):
    def __init__(self, resource_dir) -> None:
        super().__init__()
        tfidf_path = os.path.join(resource_dir, "tfidf_aqua.pkl")
        self.tfidf = pickle.load(open(tfidf_path, "rb"))
        self.words = self.tfidf.get_feature_names()

    def __sample(self, n=3):
        return random.sample(self.words, n)

    def generate(self, text, **kwargs):
        soften = kwargs.get("soften", False)
        
        transformed = self.tfidf.transform([text]).toarray()
        most_imp = np.argpartition(transformed, -4)[:, -4:]
        array = most_imp[0]
        question = text
        vals = []
        num_replace = np.random.randint(1, 3)
        if soften:
            num_replace = 1
        replacements = self.__sample(num_replace)
        for idx in array:
            val = transformed[0][idx]
            word = self.words[idx]
            vals.append((val, word))
        vals.sort(reverse = True)
        replaced = list(map(lambda x: x[1], vals))[:num_replace]
        for replaced_, replacement in zip(replaced, replacements):
            question = question.replace(replaced_, replacement, 1)
        return question

In [4]:
resource_dir = "../operators/resources"

In [6]:
df = pd.read_csv("../data/aqua_train_augmented.csv")
df.head()

Unnamed: 0,question,positive1,positive2,positive3,positive4,neg_0,neg_1,neg_2,neg_3,neg_4,neg_5,neg_6
0,"Two friends plan to walk along a 43-km trail, ...","Two friends plan to walk a 43 km long trail, w...","Two friends plan to walk along a 43-km trail, ...",Two friends plan to walk along a forty-three-k...,,"Two friends plan to walk along a 43-km , start...","Two to walk along a 43-km trail, starting at o...","Two friends plan to walk along a 43-km trail, ...","Two friends plan to walk along a 43-km trail, ...","2 friends plan to walk along a -km trail, star...","Two focis plan to walk along a 43-km 9kg, star...","Two friends plan to walk along a 43-km trail, ..."
1,"In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, the points (x, 1) and...","In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, one) and (...",,"In the , points (x, 1) and (5, y) are on line ...","In the , points (x, 1) and (5, y) are on k. If...","In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, ) and ( , ...","In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, 1) and (5,..."
2,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,is facing very difficult financial times and c...,Carl is facing very difficult financial times ...,Carl is facing very difficult y20 times and ca...,Carl is facing very difficult financial times ...
3,The speed at which a man can row a boat in sti...,The speed with which a man can row a boat is 2...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,The at which a man can row a boat in still wat...,The speed at which a can row a boat in still w...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,Teconomists speed at which a man can row a boa...,The speed at which a man can row a boat in sti...
4,"There are k-2 members in a certain band, inclu...","There are k-2 members in a particular band, in...","There are k-2 members in a certain band, inclu...","There are k-two members in a certain band, inc...",,"There are k-2 members in a , including Jim and...","There are in a certain band, including Jim and...","There are k-2 members in a certain band, inclu...","There are k-2 members in a certain band, inclu...","There are k- members in a certain band, includ...","There are k-2 members in a certain band, inclu...","There are k-2 members in a certain band, inclu..."


In [14]:
rne = ReplaceNamedEntities(resource_dir)
tfidf = TF_IDF_Replacement(resource_dir)



In [15]:
c = 0
for idx, row in df.iterrows():
    qn = row["question"]
    text = rne.generate(qn)
    if text == qn:
        text = tfidf.generate(qn)
    row["neg_3"] = text


In [16]:
df.head()

Unnamed: 0,question,positive1,positive2,positive3,positive4,neg_0,neg_1,neg_2,neg_3,neg_4,neg_5,neg_6
0,"Two friends plan to walk along a 43-km trail, ...","Two friends plan to walk a 43 km long trail, w...","Two friends plan to walk along a 43-km trail, ...",Two friends plan to walk along a forty-three-k...,,"Two friends plan to walk along a 43-km , start...","Two to walk along a 43-km trail, starting at o...","Two friends plan to walk along a 43-km trail, ...","Two friends plan to walk along a 43-km trail, ...","2 friends plan to walk along a -km trail, star...","Two focis plan to walk along a 43-km 9kg, star...","Two friends plan to walk along a 43-km trail, ..."
1,"In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, the points (x, 1) and...","In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, one) and (...",,"In the , points (x, 1) and (5, y) are on line ...","In the , points (x, 1) and (5, y) are on k. If...","In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, ) and ( , ...","In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, 1) and (5,..."
2,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Pavak is facing very difficult financial times...,Carl is facing very difficult financial times ...,Carl is facing very difficult y20 times and ca...,Carl is facing very difficult financial times ...
3,The speed at which a man can row a boat in sti...,The speed with which a man can row a boat is 2...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,The at which a man can row a boat in still wat...,The speed at which a can row a boat in still w...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,Teconomists speed at which a man can row a boa...,The speed at which a man can row a boat in sti...
4,"There are k-2 members in a certain band, inclu...","There are k-2 members in a particular band, in...","There are k-2 members in a certain band, inclu...","There are k-two members in a certain band, inc...",,"There are k-2 members in a , including Jim and...","There are in a certain band, including Jim and...","There are k-2 members in a certain band, inclu...","There are k-2 members in a certain band, inclu...","There are k- members in a certain band, includ...","There are k-2 members in a certain band, inclu...","There are k-2 members in a certain band, inclu..."


In [21]:
df = df.drop(columns=["neg_1"])
df.rename(columns={"neg_0": "neg_1"}, inplace=True)
df.head()

Unnamed: 0,question,positive1,positive2,positive3,positive4,neg_2,neg_3,neg_4,neg_5,neg_6
0,"Two friends plan to walk along a 43-km trail, ...","Two friends plan to walk a 43 km long trail, w...","Two friends plan to walk along a 43-km trail, ...",Two friends plan to walk along a forty-three-k...,,"Two friends plan to walk along a 43-km trail, ...","Two friends plan to walk along a 43-km trail, ...","2 friends plan to walk along a -km trail, star...","Two focis plan to walk along a 43-km 9kg, star...","Two friends plan to walk along a 43-km trail, ..."
1,"In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, the points (x, 1) and...","In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, one) and (...",,"In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, ) and ( , ...","In the coordinate plane, points (x, 1) and (5,...","In the coordinate plane, points (x, 1) and (5,..."
2,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Carl is facing very difficult financial times ...,Pavak is facing very difficult financial times...,Carl is facing very difficult financial times ...,Carl is facing very difficult y20 times and ca...,Carl is facing very difficult financial times ...
3,The speed at which a man can row a boat in sti...,The speed with which a man can row a boat is 2...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,The speed at which a man can row a boat in sti...,Teconomists speed at which a man can row a boa...,The speed at which a man can row a boat in sti...
4,"There are k-2 members in a certain band, inclu...","There are k-2 members in a particular band, in...","There are k-2 members in a certain band, inclu...","There are k-two members in a certain band, inc...",,"There are k-2 members in a certain band, inclu...","There are k-2 members in a certain band, inclu...","There are k- members in a certain band, includ...","There are k-2 members in a certain band, inclu...","There are k-2 members in a certain band, inclu..."


In [18]:
df.to_csv("../data/aqua_train.csv", index=False)

In [19]:
df_val = pd.read_csv("../data/aqua_val_augmented.csv")

In [20]:
df_val.head()

Unnamed: 0,question,positive1,positive2,positive3,positive4,neg_0,neg_1,neg_2,neg_3,neg_4,neg_5,neg_6
0,Three birds are flying at a fast rate of 900 k...,Three birds fly at a speed of 900 kilometres p...,Three birds are flying at a fast rate of 900 k...,Three birds are flying at a fast rate of 9zero...,Three birds are flying at a fast rate of 900 k...,Three birds are flying at a fast rate of 900 k...,Three are flying at a of 900 kilometers per ho...,Three birds are flying at a fast rate of 900 k...,Three birds are flying at a fast rate of 900 k...,birds are flying at a fast rate of kilometers ...,Three birds are flying at a fast rate of 900 k...,Three birds are flying at a fast rate of 900 m...
1,A ship is leaving a port. It takes 240 seconds...,A ship leaves a harbor.A 750-metre canal takes...,A ship is leaving a port. It takes 240 seconds...,A ship is leaving a port. It takes two hundred...,A ship is leaving a port. It takes 240 seconds...,A ship is leaving a . It takes 240 seconds to ...,A ship is leaving a port. It takes 240 seconds...,A ship is leaving a port. It takes 240 seconds...,A ship is leaving a subscription. It takes 240...,A ship is leaving a port. It takes seconds to ...,A ship is leaving a unique. It takes 240 secon...,A ship is leaving a port. It takes 240 seconds...
2,A rectangular piece of cloth 2 feet wide was c...,"A rectangular piece of fabric, 2 feet wide, wa...",A rectangular piece of cloth 2 feet wide was c...,A rectangular piece of cloth two feet wide was...,,A rectangular piece of 2 feet wide was cut len...,A of cloth 2 feet wide was into two smaller s....,A rectangular piece of cloth 2 feet wide was c...,A rectangular accelerated of cloth 2 mismatche...,A rectangular piece of cloth feet wide was cut...,A rectangular pity of cloth 2 wbc wide was cut...,A rectangular piece of cloth 2 kilometres wide...
3,"In the xy-coordinate plane, which of the follo...",Which of the following points must be in the x...,"In the xy-coordinate plane, which of the follo...","In the xy-coordinate plane, which of the follo...",,"In the xy- , which of the following points mus...","In the , which of the following must lie on th...","In the xy-coordinate plane, which of the follo...","In the xy-coordinate plane, which of the follo...","In the xy-coordinate plane, which of the follo...",The points must lie on the line kx + 2y for ev...,"In the xy-coordinate plane, which of the follo..."
4,A travel company wants to charter a plane to t...,A travel company wants to charter an airplane ...,A travel company wants to charter a plane to t...,A travel company wants to charter a plane to t...,A travel company wants to charter a plane to t...,A travel company wants to charter a to the Bah...,A wants to charter a plane to the Bahamas. Cha...,A travel company wants to charter a plane to t...,A travel company wants to charter a plane to t...,A travel company wants to charter a plane to t...,A travel complete wants to charter a liz to th...,A travel company wants to charter a plane to t...


In [22]:
c = 0
for idx, row in df_val.iterrows():
    qn = row["question"]
    text = rne.generate(qn)
    if text == qn:
        text = tfidf.generate(qn)
    row["neg_3"] = text

In [23]:
df_val = df_val.drop(columns=["neg_1"])
df_val.rename(columns={"neg_0": "neg_1"}, inplace=True)
df_val.head()

Unnamed: 0,question,positive1,positive2,positive3,positive4,neg_1,neg_2,neg_3,neg_4,neg_5,neg_6
0,Three birds are flying at a fast rate of 900 k...,Three birds fly at a speed of 900 kilometres p...,Three birds are flying at a fast rate of 900 k...,Three birds are flying at a fast rate of 9zero...,Three birds are flying at a fast rate of 900 k...,Three birds are flying at a fast rate of 900 k...,Three birds are flying at a fast rate of 900 k...,Three birds are flying at a fast rate of 900 k...,birds are flying at a fast rate of kilometers ...,Three birds are flying at a fast rate of 900 k...,Three birds are flying at a fast rate of 900 m...
1,A ship is leaving a port. It takes 240 seconds...,A ship leaves a harbor.A 750-metre canal takes...,A ship is leaving a port. It takes 240 seconds...,A ship is leaving a port. It takes two hundred...,A ship is leaving a port. It takes 240 seconds...,A ship is leaving a . It takes 240 seconds to ...,A ship is leaving a port. It takes 240 seconds...,A ship is leaving a arethe. It takes 240 secon...,A ship is leaving a port. It takes seconds to ...,A ship is leaving a unique. It takes 240 secon...,A ship is leaving a port. It takes 240 seconds...
2,A rectangular piece of cloth 2 feet wide was c...,"A rectangular piece of fabric, 2 feet wide, wa...",A rectangular piece of cloth 2 feet wide was c...,A rectangular piece of cloth two feet wide was...,,A rectangular piece of 2 feet wide was cut len...,A rectangular piece of cloth 2 feet wide was c...,A rectangular piece of cloth 2 kh wide was cut...,A rectangular piece of cloth feet wide was cut...,A rectangular pity of cloth 2 wbc wide was cut...,A rectangular piece of cloth 2 kilometres wide...
3,"In the xy-coordinate plane, which of the follo...",Which of the following points must be in the x...,"In the xy-coordinate plane, which of the follo...","In the xy-coordinate plane, which of the follo...",,"In the xy- , which of the following points mus...","In the xy-coordinate plane, which of the follo...","In the xy-coordinate plane, which of the follo...","In the xy-coordinate plane, which of the follo...",The points must lie on the line kx + 2y for ev...,"In the xy-coordinate plane, which of the follo..."
4,A travel company wants to charter a plane to t...,A travel company wants to charter an airplane ...,A travel company wants to charter a plane to t...,A travel company wants to charter a plane to t...,A travel company wants to charter a plane to t...,A travel company wants to charter a to the Bah...,A travel company wants to charter a plane to t...,A travel company wants to charter a plane to t...,A travel company wants to charter a plane to t...,A travel complete wants to charter a liz to th...,A travel company wants to charter a plane to t...


In [24]:
df_val.to_csv("../data/aqua_val.csv", index=False)