**We present EDA: easy data augmentation techniques for boosting performance of NMT**

Given a sentence in the training set, we perform the following operations:

**Synonym Replacement (SR):** Randomly choose n words from the sentence that are not stop words. Replace each of these words with one of its synonyms chosen at random.

**Random Insertion (RI):** Find a random synonym of a random word in the sentence that is not a stop word. Insert that synonym into a random position in the sentence. Do this n times.

**Random Swap (RS):** Randomly choose two words in the sentence and swap their positions. Do this n times.

**Random Deletion (RD):** For each word in the sentence, randomly remove it with probability p.


## **Import Google Drive so output is directly stored in Drive**

In [None]:
# Mount Google Drive under /content/gdrive; the corpus paths used by the
# cells below all live beneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
%cd /content/gdrive/MyDrive/Eda/english_telgu

/content/gdrive/MyDrive/Eda/english_telgu


# ##***1. Import the wordnet***

In [None]:
pip install --upgrade pyiwn


In [None]:
import pyiwn

# ***List of supported languages***

In [None]:
# Show the string form of every member of the pyiwn.Language enumeration.
[str(language) for language in pyiwn.Language]

['Language.ASSAMESE',
 'Language.BENGALI',
 'Language.BODO',
 'Language.GUJARATI',
 'Language.HINDI',
 'Language.KANNADA',
 'Language.KASHMIRI',
 'Language.KONKANI',
 'Language.MALAYALAM',
 'Language.MARATHI',
 'Language.MEITEI',
 'Language.NEPALI',
 'Language.ORIYA',
 'Language.PUNJABI',
 'Language.SANSKRIT',
 'Language.TAMIL',
 'Language.TELUGU',
 'Language.URDU']

In [None]:
# Load the IndoWordNet data for Telugu. `iwn` is the wordnet handle that the
# vocabulary and synonym-lookup cells below query.
iwn = pyiwn.IndoWordNet(lang=pyiwn.Language.TELUGU)


# *language defaults to Hindi*

to use other language wordnet(s),
use:


**iwn = pyiwn.IndoWordNet(lang=pyiwn.Language.KANNADA)**

# **2. Import stopwords**

In [None]:
pip install stopwordsiso

In [None]:
import stopwordsiso

In [None]:
from stopwordsiso import stopwords

In [None]:
# Stop words for language code "hi", materialised as a list; get_synonyms
# refuses to return synonyms for any word found here.
# NOTE(review): the wordnet above is loaded for Telugu while these stop words
# are requested for "hi" — confirm this pairing is intended.
stop_words=list(stopwords("hi"))

# ###***FOR SYNONYM REPLACEMENT*** (SR)

**Get a list of all the vocabulary present in wordnet**

In [None]:
# Collect every lemma known to the wordnet into `vocab`.
# A set is used because, in the code visible in this notebook, `vocab` is only
# consulted with `word in vocab`, once per corpus token: set membership is
# O(1), whereas scanning a list of all lemmas for every token is linear in
# the size of the wordnet.
synsets = iwn.all_synsets()
vocab = {lemma for synset in synsets for lemma in synset.lemma_names()}

In [None]:
def get_synonyms(word, n):
    """Return up to ``n - 1`` synonyms of ``word`` taken from its first synset.

    Args:
        word: Token to look up.
        n: Size budget; at most ``n - 1`` synonyms are returned (the budget
            counts the looked-up word itself, as callers already rely on).

    Returns:
        A list of lemma names from the first synset of ``word``, never
        containing ``word`` itself. The list is empty when ``n <= 1``, when
        ``word`` is a stop word, when it is not in ``vocab``, or when the
        wordnet returns no synset for it.
    """
    if n <= 1:
        return []
    if word in stop_words or word not in vocab:
        return []

    word_synsets = iwn.synsets(word)
    if not word_synsets:
        return []

    lemmas = word_synsets[0].lemma_names()
    # Filter the query word out by value instead of dropping position 0:
    # the word is not guaranteed to be the first lemma of its synset, and
    # returning it as its own "synonym" would yield an unchanged sentence.
    candidates = [lemma for lemma in lemmas if lemma != word]
    return candidates[:n - 1]

## **FOR A MONOLINGUAL TEXT FILE**

# If there is a monolingual file to be augmented we use the code below



In [None]:
import string 
import sys
 
# Synonym replacement over a monolingual corpus: for every token that has
# synonyms, append one copy of the sentence per synonym with that token
# replaced.
# The `with` statement guarantees both files are flushed and closed, and
# `print(..., file=f)` writes to the output file without rebinding sys.stdout
# for the rest of the notebook session.
with open("/content/gdrive/MyDrive/Eda/aug.tda.hi.txt", "r", encoding="utf-8") as text, \
        open("aug.tda.hi.sr.txt", "a", encoding="utf-8") as f:
    for line in text:
        # Remove the surrounding whitespace and newline character
        line = line.strip()

        # Split the line into words
        words = line.split(" ")
        for word in words:
            for syn in get_synonyms(word, 3):
                # Replace whole tokens only: str.replace would also rewrite
                # `word` wherever it occurs inside a longer token.
                line2 = " ".join(syn if token == word else token for token in words)
                print(line2, file=f)


  

# **For a bilingual file in the format**

**English sentence  ||| Hindi sentence**

***We augment the Hindi (or any other target-language) side and, at the end, concatenate the source and target sentences in the above format.***

In [None]:
import string 
import sys
 
# Synonym replacement over a bilingual corpus whose lines look like
# "source sentence ||| target sentence". Only the target side is augmented;
# every augmented target is written back paired with its unchanged source.
# The `with` statement guarantees both files are flushed and closed, and
# `print(..., file=f)` writes to the output file without rebinding sys.stdout.
with open("/content/gdrive/MyDrive/Eda/english_telgu/tda.eng_te.txt", "r", encoding="utf-8") as text, \
        open("aug.tda.eng_telugu.sr.txt", "a", encoding="utf-8") as f:
    for line in text:
        parts = line.split(" ||| ")
        if len(parts) < 2:
            # Blank or malformed line without the separator: there is no
            # target sentence to augment.
            continue
        eng = parts[0]

        # Remove the surrounding whitespace and newline character
        target = parts[1].strip()

        # Split the target sentence into words
        words = target.split(" ")
        for word in words:
            for syn in get_synonyms(word, 4):
                # Replace whole tokens only: str.replace would also rewrite
                # `word` wherever it occurs inside a longer token.
                augmented = " ".join(syn if token == word else token for token in words)
                print(eng + " ||| " + augmented, file=f)






<function TextIOWrapper.close>

# ***RANDOM INSERTION***

In [None]:
def random_insertion(words, n):
    """Return a copy of ``words`` after ``n`` synonym-insertion attempts.

    The input list itself is not modified; each attempt is delegated to
    ``add_word``, which mutates the copy in place.
    """
    augmented = words.copy()
    for _attempt in range(n):
        add_word(augmented)
    return augmented

def add_word(new_words):
    """Insert a synonym of one randomly chosen word into ``new_words`` in place.

    Up to 10 random words are tried; the first one for which ``get_synonyms``
    returns something supplies the synonym, which is inserted at a random
    index. If the list is empty, or no tried word has a synonym, the list is
    left unchanged.

    Args:
        new_words: List of tokens; mutated in place.

    Returns:
        None.
    """
    # An empty sentence has no word to pick, and randint(0, -1) would raise.
    if not new_words:
        return

    synonyms = []
    for _ in range(10):
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        synonyms = get_synonyms(random_word, 2)
        if synonyms:
            # Test for success before giving up, so a hit on the last
            # attempt is used instead of being thrown away.
            break
    else:
        return

    random_synonym = synonyms[-1]
    random_idx = random.randint(0, len(new_words) - 1)
    new_words.insert(random_idx, random_synonym)

In [None]:
# `random` is used by the augmentation functions in this notebook, so this
# cell has to be executed before any of them is called.
import random
from random import shuffle
# Seed the module-level generator with a fixed value so that repeated runs
# draw the same sequence of random numbers.
random.seed(1)


## **FOR A MONOLINGUAL TEXT FILE**

In [None]:
import string 
import sys


 
# Random insertion over a monolingual corpus: one augmented sentence is
# appended to the output file for every input line.
# The `with` statement guarantees both files are flushed and closed, and
# `print(..., file=f)` writes to the output file without rebinding sys.stdout.
with open("/content/gdrive/MyDrive/Eda/aug.tda.hi.txt", "r", encoding="utf-8") as text, \
        open("aug.tda.hi.ri.txt", "a", encoding="utf-8") as f:
    for line in text:
        # Remove the surrounding whitespace and newline character
        line = line.strip()

        # Split the line into words
        words = line.split(" ")
        a = random_insertion(words, 1)
        print(" ".join(a), file=f)

# **For Bilingual file**

In [None]:
import string 
import sys


 
# Random insertion over a bilingual corpus whose lines look like
# "source sentence ||| target sentence". Only the target side is augmented
# and it is written back paired with its unchanged source.
# The `with` statement guarantees both files are flushed and closed, and
# `print(..., file=f)` writes to the output file without rebinding sys.stdout.
with open("/content/gdrive/MyDrive/Eda/eng_tamil/tda.eng_ta.txt", "r", encoding="utf-8") as text, \
        open("aug.tda.eng_tamil.ri.txt", "a", encoding="utf-8") as f:
    for line in text:
        parts = line.split(" ||| ")
        if len(parts) < 2:
            # Blank or malformed line without the separator: there is no
            # target sentence to augment.
            continue
        eng = parts[0]

        # Remove the surrounding whitespace and newline character
        target = parts[1].strip()

        # Split the target sentence into words
        words = target.split(" ")
        a = random_insertion(words, 1)
        print(eng + " ||| " + " ".join(a), file=f)

<function TextIOWrapper.close>

# ###***FOR*** **RANDOM** **DELETION**

In [None]:
def random_deletion(words, p):
    """Drop each word of ``words`` independently with probability ``p``.

    Args:
        words: List of tokens.
        p: Deletion probability for each token.

    Returns:
        ``words`` itself when it holds fewer than two tokens (there is
        nothing that can sensibly be deleted); otherwise a new list with the
        surviving tokens in their original order. If every token would be
        deleted, a one-element list holding a randomly chosen original token
        is returned instead, so the result is never empty for non-empty input.
    """
    # Empty or single-word input is returned untouched; an empty list would
    # otherwise reach randint(0, -1) below and raise ValueError.
    if len(words) <= 1:
        return words

    # A word survives when its draw lands above p.
    new_words = [word for word in words if random.uniform(0, 1) > p]

    # If everything was deleted, fall back to one random word.
    if not new_words:
        rand_int = random.randint(0, len(words) - 1)
        return [words[rand_int]]

    return new_words

## **FOR A MONOLINGUAL TEXT FILE**

In [None]:
import string 
import sys


 
# Random deletion over a monolingual corpus: one augmented sentence is
# appended to the output file for every input line.
# The `with` statement guarantees both files are flushed and closed, and
# `print(..., file=f)` writes to the output file without rebinding sys.stdout.
with open("/content/gdrive/MyDrive/Eda/aug.tda.hi.txt", "r", encoding="utf-8") as text, \
        open("aug.tda.hi.rd.txt", "a", encoding="utf-8") as f:
    for line in text:
        # Remove the surrounding whitespace and newline character
        line = line.strip()

        # Split the line into words
        words = line.split(" ")
        a = random_deletion(words, 0.2)
        print(" ".join(a), file=f)

<function TextIOWrapper.close>

# **For Bilingual file**

In [None]:
import string 
import sys


 
# Random deletion over a bilingual corpus whose lines look like
# "source sentence ||| target sentence". Only the target side is augmented
# and it is written back paired with its unchanged source.
# The `with` statement guarantees both files are flushed and closed, and
# `print(..., file=f)` writes to the output file without rebinding sys.stdout.
with open("/content/gdrive/MyDrive/Eda/eng_tamil/tda.eng_ta.txt", "r", encoding="utf-8") as text, \
        open("aug.tda.eng_tamil.rd.txt", "a", encoding="utf-8") as f:
    for line in text:
        parts = line.split(" ||| ")
        if len(parts) < 2:
            # Blank or malformed line without the separator: there is no
            # target sentence to augment.
            continue
        eng = parts[0]

        # Remove the surrounding whitespace and newline character
        target = parts[1].strip()

        # Split the target sentence into words
        words = target.split(" ")
        a = random_deletion(words, 0.2)
        print(eng + " ||| " + " ".join(a), file=f)

<function TextIOWrapper.close>

# ***RANDOM SWAP(rs)***

In [None]:
def random_swap(words, n):
    """Return a copy of ``words`` after ``n`` rounds of random pair swapping.

    The input list itself is not modified; each round is delegated to
    ``swap_word``.
    """
    shuffled = words.copy()
    for _round in range(n):
        shuffled = swap_word(shuffled)
    return shuffled

def swap_word(new_words):
    """Swap two randomly chosen, distinct positions of ``new_words`` in place.

    Args:
        new_words: List of tokens; mutated in place.

    Returns:
        The same list object. It is unchanged when it holds fewer than two
        tokens, or when four consecutive draws fail to produce a second index
        different from the first.
    """
    # With fewer than two words there is no pair to swap, and an empty list
    # would make randint(0, -1) raise ValueError.
    if len(new_words) < 2:
        return new_words

    random_idx_1 = random.randint(0, len(new_words) - 1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words) - 1)
        counter += 1
        if counter > 3:
            # Give up after four unlucky draws and leave the list untouched.
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

## **FOR A MONOLINGUAL TEXT FILE**

In [None]:
import string 
import sys


 
# Random swap over a monolingual corpus: one augmented sentence is appended
# to the output file for every input line.
# The `with` statement guarantees both files are flushed and closed, and
# `print(..., file=f)` writes to the output file without rebinding sys.stdout.
with open("/content/gdrive/MyDrive/Eda/aug.tda.hi.txt", "r", encoding="utf-8") as text, \
        open("aug.tda.hi.rs.txt", "a", encoding="utf-8") as f:
    for line in text:
        # Remove the surrounding whitespace and newline character
        line = line.strip()

        # Split the line into words
        words = line.split(" ")
        a = random_swap(words, 1)
        print(" ".join(a), file=f)

<function TextIOWrapper.close>

# **For Bilingual file**

In [None]:
import string 
import sys


 
# Random swap over a bilingual corpus whose lines look like
# "source sentence ||| target sentence". Only the target side is augmented
# and it is written back paired with its unchanged source.
# The `with` statement guarantees both files are flushed and closed, and
# `print(..., file=f)` writes to the output file without rebinding sys.stdout.
with open("/content/gdrive/MyDrive/Eda/eng_tamil/tda.eng_ta.txt", "r", encoding="utf-8") as text, \
        open("aug.tda.eng_tamil.rs.txt", "a", encoding="utf-8") as f:
    for line in text:
        parts = line.split(" ||| ")
        if len(parts) < 2:
            # Blank or malformed line without the separator: there is no
            # target sentence to augment.
            continue
        eng = parts[0]

        # Remove the surrounding whitespace and newline character
        target = parts[1].strip()

        # Split the target sentence into words
        words = target.split(" ")
        a = random_swap(words, 1)
        print(eng + " ||| " + " ".join(a), file=f)

# **To extract Hindi text file from bilingual aug data**

In [None]:
import string 
import sys
 
# Extract the target-language side (the text after " ||| ") of every line of
# the augmented bilingual file and append it to hin.aug.txt.
# The `with` statement guarantees both files are flushed and closed, and
# `print(..., file=h)` writes to the output file without rebinding sys.stdout.
with open("/content/gdrive/MyDrive/Eda/augmented.bil.txt", "r", encoding="utf-8") as text, \
        open("hin.aug.txt", "a", encoding="utf-8") as h:
    for line in text:
        parts = line.split(" ||| ")
        if len(parts) < 2:
            # Blank or malformed line without the separator: no sentence pair.
            continue
        hindi = parts[1].strip()
        print(hindi, file=h)

# **To extract english text file from bilingual aug data**

In [None]:
import string 
import sys
 
# Extract the English side (the text before " ||| ") of every line of the
# augmented bilingual file and append it to eng.aug.txt.
# Output is written explicitly to `e`, the file opened in this cell, so the
# English sentences cannot end up in whatever sys.stdout was last bound to.
# The `with` statement guarantees both files are flushed and closed.
with open("/content/gdrive/MyDrive/Eda/augmented.bil.txt", "r", encoding="utf-8") as text, \
        open("eng.aug.txt", "a", encoding="utf-8") as e:
    for line in text:
        parts = line.split(" ||| ")
        if len(parts) < 2:
            # Blank or malformed line without the separator: no sentence pair.
            continue
        eng = parts[0].strip()
        print(eng, file=e)