In [10]:
from abc import ABC, abstractmethod
import os
import re
import numpy as np
import random
import spacy
import pandas as pd
import pickle
import json

In [2]:
class Operation(ABC):
    """
    Base interface shared by every text-augmentation operator.
    """

    @abstractmethod
    def generate(self, text, **kwargs):
        """
        Produce a corrupted variant of ``text``.

        Subclasses must override this method; the base implementation does
        nothing and returns ``None``.
        """
        return None

In [4]:
class ReplaceNamedEntities(Operation):
    """
    Corrupts a text by swapping some of its named entities for other entries
    drawn from word lists, or by dropping them altogether.

    The word lists are the ``*.txt`` files found in ``resource_dir``; each
    file is read as one list with one entry per line.
    """

    # spaCy entity labels that are eligible for replacement.
    # NOTE(review): "GEO" does not look like a label produced by
    # en_core_web_sm (locations are usually "GPE"/"LOC") - confirm.
    NAMED_ENTITY_LABELS = frozenset(["PERSON", "ORG", "PRODUCT", "EVENT", "GPE", "GEO"])

    # Upper bound on how many candidates are drawn from a matching word list.
    MAX_CANDIDATES = 5

    def __init__(self, resource_dir) -> None:
        """
        Loads the spaCy pipeline and every ``*.txt`` word list.

        Args:
            resource_dir: directory containing the ``*.txt`` word lists.
        """
        super().__init__()
        self.nlp = spacy.load("en_core_web_sm")
        # Sort the listing so the order of the resources (and therefore which
        # list wins when an entity occurs in several of them) does not depend
        # on the file system.
        fnames = sorted(fname for fname in os.listdir(resource_dir) if fname.endswith(".txt"))
        self.resources = []
        for fname in fnames:
            # The context manager closes each handle instead of leaking it.
            with open(os.path.join(resource_dir, fname), "r") as handle:
                self.resources.append(handle.read().splitlines())

    def get_replacement(self, entity):
        """
        Picks a replacement string for ``entity``.

        The first word list that contains ``entity`` (case-insensitively)
        contributes up to ``MAX_CANDIDATES`` random entries, each padded with
        one space on both sides. A single space is always added as a further
        candidate, so the entity may simply be deleted; it is the only
        candidate when no list contains the entity.

        Args:
            entity: surface text of the entity to replace.

        Returns:
            One randomly chosen candidate string.
        """
        needle = entity.lower()
        possibilities = []
        for resource in self.resources:
            if any(needle == item.lower() for item in resource):
                # random.sample raises ValueError when asked for more items
                # than the list holds, so cap the sample size.
                sample_size = min(self.MAX_CANDIDATES, len(resource))
                possibilities.extend(random.sample(resource, sample_size))
                break
        possibilities = [" {} ".format(x) for x in possibilities]
        possibilities.append(" ")
        return random.choice(possibilities)

    def replace_named_entities(self, text, soften=False):
        """
        Replaces (or removes) persons, organizations, products, events and
        places found in ``text``.

        Between one and three of the detected entities are chosen at random.
        With ``soften=True`` only the earliest of the chosen entities is
        altered.

        Args:
            text: the text to corrupt.
            soften: when true, alter a single entity only.

        Returns:
            ``text`` unchanged when no eligible entity is found; otherwise the
            altered text with whitespace runs collapsed to one space and the
            ends stripped.
        """
        doc = self.nlp(text)
        ne = [
            (ent.text, ent.start_char, ent.end_char)
            for ent in doc.ents
            if ent.label_ in self.NAMED_ENTITY_LABELS
        ]
        if len(ne) == 0:
            return text
        ne_new = random.sample(ne, np.random.randint(1, min(len(ne), 3) + 1))
        # Process entities left to right so one running offset is enough.
        ne_new = sorted(ne_new, key=lambda x: x[1])
        if soften:
            ne_new = ne_new[:1]
        # `shift` is how many characters shorter the edited text has become
        # compared with the original, i.e. the correction to apply to the
        # character offsets reported by spaCy.
        shift = 0
        for (entity, start, end) in ne_new:
            replacement = self.get_replacement(entity)
            text = text[:start - shift] + replacement + text[end - shift:]
            shift += end - start - len(replacement)
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    def generate(self, text, **kwargs):
        """
        Corrupts ``text`` by replacing named entities.

        Keyword Args:
            soften: forwarded to ``replace_named_entities`` (default False).
        """
        soften = kwargs.get("soften", False)
        return self.replace_named_entities(text, soften)

In [15]:
class TF_IDF_Replacement(Operation):
    """
    Corrupts a text by replacing its highest-weighted TF-IDF terms with
    random words from the vectorizer vocabulary.
    """

    # How many of the highest-weighted terms are considered as targets.
    TOP_K = 4

    def __init__(self, resource_dir) -> None:
        """
        Loads the pickled vectorizer stored as ``tfidf_em.pkl``.

        Args:
            resource_dir: directory containing ``tfidf_em.pkl``.
        """
        super().__init__()
        tfidf_path = os.path.join(resource_dir, "tfidf_em.pkl")
        # NOTE(security): pickle.load can execute arbitrary code contained in
        # the file; only load vectorizers that come from a trusted source.
        with open(tfidf_path, "rb") as handle:
            self.tfidf = pickle.load(handle)
        # Presumably a scikit-learn TfidfVectorizer: get_feature_names() was
        # removed in scikit-learn 1.2, so prefer get_feature_names_out() and
        # keep the old accessor as a fallback for older versions.
        if hasattr(self.tfidf, "get_feature_names_out"):
            names = self.tfidf.get_feature_names_out()
        else:
            names = self.tfidf.get_feature_names()
        # random.sample needs a real sequence of plain strings, not an array.
        self.words = [str(name) for name in names]

    def __sample(self, n=3):
        """Returns up to ``n`` distinct random vocabulary words."""
        return random.sample(self.words, min(n, len(self.words)))

    def generate(self, text, **kwargs):
        """
        Replaces one or two of the most important terms of ``text``.

        Keyword Args:
            soften: when true, exactly one term is replaced (default False).

        Returns:
            The text with the selected terms replaced; ``text`` unchanged when
            none of its terms carries a TF-IDF weight.
        """
        soften = kwargs.get("soften", False)

        weights = self.tfidf.transform([text]).toarray()[0]
        # argpartition raises when asked for more entries than exist.
        top_k = min(self.TOP_K, weights.shape[0])
        if top_k == 0:
            return text
        candidates = np.argpartition(weights, -top_k)[-top_k:]

        num_replace = np.random.randint(1, 3)
        if soften:
            num_replace = 1
        replacements = self.__sample(num_replace)

        # Keep only terms that actually occur in the text (weight > 0); a
        # zero-weight term could otherwise match inside an unrelated word.
        vals = sorted(
            ((weights[idx], self.words[idx]) for idx in candidates if weights[idx] > 0),
            reverse=True,
        )
        replaced = [word for _, word in vals][:num_replace]

        question = text
        for replaced_, replacement in zip(replaced, replacements):
            # Match whole words, ignoring case: the vocabulary is typically
            # lower-cased, and a bare substring match can hit the inside of
            # another word. A callable avoids backslash handling in the
            # replacement text.
            pattern = r"\b{}\b".format(re.escape(replaced_))
            question, hits = re.subn(
                pattern, lambda _match: replacement, question, count=1, flags=re.IGNORECASE
            )
            if hits == 0:
                # Fall back to the plain substring replacement for terms that
                # do not sit on word boundaries.
                question = question.replace(replaced_, replacement, 1)
        return question

In [6]:
# Directory passed to both operators below; relative to the notebook's
# working directory.
resource_dir = "../operators/resources"

In [17]:
# Load the EM training CSV, then show its shape and first rows.
df = pd.read_csv("../data/EM/train_em_ours.csv")
print(df.shape)
df.head()

(10085, 12)


Unnamed: 0,question,positive1,positive2,positive3,positive4,neg_0,neg_1,neg_2,neg_3,neg_4,neg_5,neg_6
0,A circle has an area of 36 square unit. The le...,One circle has an area of 36 square units.The ...,A circle has an area of 36 square unit. The le...,A circle has an area of thirty-six square unit...,,A has an area of 36 square unit. The length of...,A has an area of 36 square unit. The of the it...,A circle has an area of 36 square unit.,A circle has an area of basketball square away...,A circle has an area of square unit. The lengt...,36 square unit is the area of a circle,A circle has an area of entry square pictograp...
1,"In the figure given below, find the value of z.",Below you will find the value of e.g.,"In the figure given below, find the value of z.","In the figure given below, find the value of z.",,"In the figure given below, find the of z.","In the given below, find the value of z.","In the figure given below, find the","In the figure given moves, find the value of z.","In the figure given convex, find the value of z.",The value of Z is given in the figure below.,"In the figure given sugar, find the value of z."
2,The number whch is not irrational will be,"The number, which is not irrational, is",The number whch is not irrational will be,The number whch is not irrational will be,,The is not irrational will be,The is not will be,The number whch is not,The number wolud is not parrallel will be,The number improper is not irrational will be,"No, the number is not irrational",The number combined is not irrational will be
3,How many rational numbers we can find between ...,How many rational numbers can we find among tw...,How many rational numbers we can find between ...,How many rational numbers we can find between ...,,How we can find between two rational numbers?,How many we can between two ?,How many rational numbers we can find between,How many compact square we can find between tw...,How many rational numbers we can find between ...,Can we find between two and three rational num...,How many 115 numbers we can find between two r...
4,The number of circle passing through a given p...,The number of circles passing a certain point is,The number of circle passing through a given p...,The number of circle passing through a given p...,,The number of passing through a given point is,The of through a given point is,The number of circle passing through a,The number of circle acd 68x a given point is,The number of circle public bag a given point is,There is a number of circles passing through a...,The number of circle 941 through a given point is


In [16]:
# Build both corruption operators from the shared resource directory.
# ReplaceNamedEntities loads a spaCy pipeline and the *.txt word lists;
# TF_IDF_Replacement unpickles "tfidf_em.pkl".
rne = ReplaceNamedEntities(resource_dir)
tfidf = TF_IDF_Replacement(resource_dir)



In [18]:
# Regenerate the "neg_3" negatives: try named-entity replacement first and
# fall back to TF-IDF keyword replacement when that leaves the question
# unchanged.
neg_3_values = []
for qn in df["question"]:
    text = rne.generate(qn)
    if text == qn:
        text = tfidf.generate(qn)
    neg_3_values.append(text)
# Assign the whole column at once: the `row` yielded by DataFrame.iterrows()
# may be a copy, so writing to it is not guaranteed to update `df`.
df["neg_3"] = neg_3_values


In [19]:
# Inspect the first rows after the "neg_3" column has been regenerated.
df.head()

Unnamed: 0,question,positive1,positive2,positive3,positive4,neg_0,neg_1,neg_2,neg_3,neg_4,neg_5,neg_6
0,A circle has an area of 36 square unit. The le...,One circle has an area of 36 square units.The ...,A circle has an area of 36 square unit. The le...,A circle has an area of thirty-six square unit...,,A has an area of 36 square unit. The length of...,A has an area of 36 square unit. The of the it...,A circle has an area of 36 square unit.,A circle has an area of 36 square km. The leng...,A circle has an area of square unit. The lengt...,36 square unit is the area of a circle,A circle has an area of entry square pictograp...
1,"In the figure given below, find the value of z.",Below you will find the value of e.g.,"In the figure given below, find the value of z.","In the figure given below, find the value of z.",,"In the figure given below, find the of z.","In the given below, find the value of z.","In the figure given below, find the","In the figure given below, find the value of","In the figure given convex, find the value of z.",The value of Z is given in the figure below.,"In the figure given sugar, find the value of z."
2,The number whch is not irrational will be,"The number, which is not irrational, is",The number whch is not irrational will be,The number whch is not irrational will be,,The is not irrational will be,The is not will be,The number whch is not,The number wrist is not irrational will be,The number improper is not irrational will be,"No, the number is not irrational",The number combined is not irrational will be
3,How many rational numbers we can find between ...,How many rational numbers can we find among tw...,How many rational numbers we can find between ...,How many rational numbers we can find between ...,,How we can find between two rational numbers?,How many we can between two ?,How many rational numbers we can find between,How many collecting filled we can find between...,How many rational numbers we can find between ...,Can we find between two and three rational num...,How many 115 numbers we can find between two r...
4,The number of circle passing through a given p...,The number of circles passing a certain point is,The number of circle passing through a given p...,The number of circle passing through a given p...,,The number of passing through a given point is,The of through a given point is,The number of circle passing through a,The number of circle 91a92 6174 a given point is,The number of circle public bag a given point is,There is a number of circles passing through a...,The number of circle 941 through a given point is


In [20]:
# Drop the old "neg_1" negatives and promote "neg_0" into their place.
# Guarded on "neg_0" so that re-running the cell is a no-op; without the guard
# a second run would drop the freshly renamed "neg_1" column.
if "neg_0" in df.columns:
    df = df.drop(columns=["neg_1"]).rename(columns={"neg_0": "neg_1"})
df.head()

Unnamed: 0,question,positive1,positive2,positive3,positive4,neg_1,neg_2,neg_3,neg_4,neg_5,neg_6
0,A circle has an area of 36 square unit. The le...,One circle has an area of 36 square units.The ...,A circle has an area of 36 square unit. The le...,A circle has an area of thirty-six square unit...,,A has an area of 36 square unit. The length of...,A circle has an area of 36 square unit.,A circle has an area of 36 square km. The leng...,A circle has an area of square unit. The lengt...,36 square unit is the area of a circle,A circle has an area of entry square pictograp...
1,"In the figure given below, find the value of z.",Below you will find the value of e.g.,"In the figure given below, find the value of z.","In the figure given below, find the value of z.",,"In the figure given below, find the of z.","In the figure given below, find the","In the figure given below, find the value of","In the figure given convex, find the value of z.",The value of Z is given in the figure below.,"In the figure given sugar, find the value of z."
2,The number whch is not irrational will be,"The number, which is not irrational, is",The number whch is not irrational will be,The number whch is not irrational will be,,The is not irrational will be,The number whch is not,The number wrist is not irrational will be,The number improper is not irrational will be,"No, the number is not irrational",The number combined is not irrational will be
3,How many rational numbers we can find between ...,How many rational numbers can we find among tw...,How many rational numbers we can find between ...,How many rational numbers we can find between ...,,How we can find between two rational numbers?,How many rational numbers we can find between,How many collecting filled we can find between...,How many rational numbers we can find between ...,Can we find between two and three rational num...,How many 115 numbers we can find between two r...
4,The number of circle passing through a given p...,The number of circles passing a certain point is,The number of circle passing through a given p...,The number of circle passing through a given p...,,The number of passing through a given point is,The number of circle passing through a,The number of circle 91a92 6174 a given point is,The number of circle public bag a given point is,There is a number of circles passing through a...,The number of circle 941 through a given point is


In [21]:
# Write the updated training frame to disk without the pandas index column.
df.to_csv("../data/EM/em_train.csv", index=False)

In [22]:
# Load the EM validation CSV.
df_val = pd.read_csv("../data/EM/val_samples_em.csv")

In [23]:
# Preview the first validation rows before any column is regenerated.
df_val.head()

Unnamed: 0,question,positive1,positive2,positive3,positive4,neg_0,neg_1,neg_2,neg_3,neg_4,neg_5,neg_6
0,A cubical block of side 14 cm is surmounted by...,A cubic block with a side length of 14 cm is t...,A cubical block of side 14 cm is surmounted by...,A cubical block of side fourteen cm is surmoun...,A cubical block of side 14 centimetre is surmo...,A of side 14 cm is surmounted by a hemisphere....,A of side 14 cm is by a hemisphere. The total ...,A cubical block of side 14 cm is surmounted by...,A cubical block of side 14 cm is aob by a hemi...,A cubical block of side cm is surmounted by a ...,A cubical block of side 14 cm is ndash by a he...,A cubical block of side 14 mm is surmounted by...
1,Numerical coefficients of the expression 4x – ...,Numerical coefficients of expression 4x - 3y are,Numerical coefficients of the expression 4x – ...,Numerical coefficients of the expression fourx...,,Numerical coefficients of the 4x – 3y are,Numerical coefficients of the – 3y are,Numerical coefficients of the expression 4x,Numerical stamps of the expression 4x – 3y are,Numerical coefficients of the expression x – y...,The coefficients of the expression are 4x 3Y.,Numerical fridays of the expression 4x – 3y are
2,Rational numbers between any two rational numb...,Rational numbers are / are between any two rat...,Rational numbers between any two rational numb...,Rational numbers between any two rational numb...,,Rational numbers between any two is/are,Rational numbers between any two is/are,Rational numbers between any two,Rational numbers between any two nmeasures num...,Rational numbers between any rational numbers ...,Any two rational numbers are rational numbers,Rational 170 between any two marble numbers is...
3,"In the given figure, the value of a equals","In the given illustration, the value of a corr...","In the given figure, the value of a equals","In the given figure, the value of a equals",,"In the given , the value of a equals","In the given , the value of a","In the given figure, the value","In the given figure, the value of a took","In the given figure, the value of a bowls",The value of a equals is given in the figure,"In the given figure, the value of a yz"
4,The largest side of a right angle triangle is,The largest side of a rectangular triangle is,The largest side of a right angle triangle is,The largest side of a right angle triangle is,,The largest side of a is,The of a right is,The largest side of a right,The get side of a right angle triangle is,The bce side of a right angle triangle is,A right angle triangle has the largest side.,The braces side of a opting angle triangle is


In [24]:
# Regenerate the "neg_3" negatives for the validation frame: try named-entity
# replacement first and fall back to TF-IDF keyword replacement when that
# leaves the question unchanged.
neg_3_values = []
for qn in df_val["question"]:
    text = rne.generate(qn)
    if text == qn:
        text = tfidf.generate(qn)
    neg_3_values.append(text)
# Assign the whole column at once: the `row` yielded by DataFrame.iterrows()
# may be a copy, so writing to it is not guaranteed to update `df_val`.
df_val["neg_3"] = neg_3_values

In [25]:
# Drop the old "neg_1" negatives and promote "neg_0" into their place.
# Guarded on "neg_0" so that re-running the cell is a no-op; without the guard
# a second run would drop the freshly renamed "neg_1" column.
if "neg_0" in df_val.columns:
    df_val = df_val.drop(columns=["neg_1"]).rename(columns={"neg_0": "neg_1"})
df_val.head()

Unnamed: 0,question,positive1,positive2,positive3,positive4,neg_1,neg_2,neg_3,neg_4,neg_5,neg_6
0,A cubical block of side 14 cm is surmounted by...,A cubic block with a side length of 14 cm is t...,A cubical block of side 14 cm is surmounted by...,A cubical block of side fourteen cm is surmoun...,A cubical block of side 14 centimetre is surmo...,A of side 14 cm is surmounted by a hemisphere....,A cubical block of side 14 cm is surmounted by...,A 3yx block of side 14 cm is taj by a hemisphe...,A cubical block of side cm is surmounted by a ...,A cubical block of side 14 cm is ndash by a he...,A cubical block of side 14 mm is surmounted by...
1,Numerical coefficients of the expression 4x – ...,Numerical coefficients of expression 4x - 3y are,Numerical coefficients of the expression 4x – ...,Numerical coefficients of the expression fourx...,,Numerical coefficients of the 4x – 3y are,Numerical coefficients of the expression 4x,Numerical below of the expression 4x – 3y are,Numerical coefficients of the expression x – y...,The coefficients of the expression are 4x 3Y.,Numerical fridays of the expression 4x – 3y are
2,Rational numbers between any two rational numb...,Rational numbers are / are between any two rat...,Rational numbers between any two rational numb...,Rational numbers between any two rational numb...,,Rational numbers between any two is/are,Rational numbers between any two,Rational numbers between any two it numbers is...,Rational numbers between any rational numbers ...,Any two rational numbers are rational numbers,Rational 170 between any two marble numbers is...
3,"In the given figure, the value of a equals","In the given illustration, the value of a corr...","In the given figure, the value of a equals","In the given figure, the value of a equals",,"In the given , the value of a equals","In the given figure, the value","In the given figure, the value of a aman","In the given figure, the value of a bowls",The value of a equals is given in the figure,"In the given figure, the value of a yz"
4,The largest side of a right angle triangle is,The largest side of a rectangular triangle is,The largest side of a right angle triangle is,The largest side of a right angle triangle is,,The largest side of a is,The largest side of a right,The thelength side of a him angle triangle is,The bce side of a right angle triangle is,A right angle triangle has the largest side.,The braces side of a opting angle triangle is


In [26]:
# Write the updated validation frame to disk without the pandas index column.
df_val.to_csv("../data/EM/em_val.csv", index=False)