Data Augmentation in NLP


In [1]:
!pip install --upgrade gensim --quiet
# quiet ensures only essential information, such as errors or warnings, will be displayed

In [2]:
import gensim

In [3]:
# Display the version string of the gensim package that was just imported
gensim.__version__

'4.3.2'

In [4]:
#install transformers to use base models like BERT
!pip install transformers --quiet

In [5]:
import transformers

In [7]:
#install the tokenizer required by back translation model
!pip install sacremoses --quiet

In [8]:
# Import the sacremoses tokenizer package (this line imports it; it does not install it)
import sacremoses

In [10]:
#install nlpaug model
!pip install nlpaug --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/410.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/410.5 kB[0m [31m811.9 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/410.5 kB[0m [31m967.2 kB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/410.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m409.6/410.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
# Import the nlpaug augmenter modules and helpers under short aliases
import nlpaug.augmenter.char as nac                      # imports the character-level augmentation methods from nlpaug
import nlpaug.augmenter.word as naw                      # word-level augmenters; used below as naw.WordEmbsAug, naw.SynonymAug, etc.
import nlpaug.augmenter.sentence as nas                  # sentence-level augmentation methods
import nlpaug.flow as nafc                               # nlpaug's flow module, aliased as nafc
from nlpaug.util import Action                         # Action class is used to specify the type of augmentation action to be performed, such as INSERT, SUBSTITUTE, or DELETE.

Download word embedding models

In [12]:
# Download the pretrained word-embedding files into the current working
# directory ('.'). Each download is skipped when the file that the augmenter
# cells load is already present, so re-running the notebook does not repeat
# large transfers (the word2vec archive alone is 1.65 GB).
# NOTE(review): the guards assume the download helpers leave the extracted
# files in '.' under the names checked below — confirm for your nlpaug version.
from pathlib import Path

from nlpaug.util.file.download import DownloadUtil

# word2vec (Google News vectors)
if not Path('GoogleNews-vectors-negative300.bin').exists():
    DownloadUtil.download_word2vec(dest_dir = '.')

# fastText: possible model_name values are 'wiki-news-300d-1M',
# 'wiki-news-300d-1M-subword', 'crawl-300d-2M' and 'crawl-300d-2M-subword'
if not Path('crawl-300d-2M.vec').exists():
    DownloadUtil.download_fasttext(dest_dir = '.', model_name = 'crawl-300d-2M')

# GloVe: possible model_name values are 'glove.6B', 'glove.42B.300d',
# 'glove.840B.300d' and 'glove.twitter.27B'
if not Path('glove.6B.300d.txt').exists():
    DownloadUtil.download_glove(dest_dir = '.', model_name = 'glove.6B')

Downloading...
From (original): https://drive.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
From (redirected): https://drive.google.com/uc?export=download&id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&confirm=t&uuid=11fc1ac3-e621-42fe-9432-5c1050669770
To: /content/GoogleNews-vectors-negative300.bin.gz
100%|██████████| 1.65G/1.65G [00:16<00:00, 98.5MB/s]


Example text

In [13]:
# Sample passage that every augmenter below receives as input.
# The triple-quoted literal keeps its leading/trailing newlines and the
# two-space indentation, which is why the printed "Original:" output shows them.
text = """
  Is daily coffee consumption good for our health?
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  """

Option 1: Substitute or insert word randomly using word embeddings similarity

In [15]:
# Build a word-embedding augmenter backed by the word2vec vectors.
#   model_type: one of "word2vec", "glove", or "fasttext"
#   action:     "substitute" here; "insert" is the alternative
aug = naw.WordEmbsAug(model_type = 'word2vec', model_path = 'GoogleNews-vectors-negative300.bin', action = "substitute")

# Augment the sample text, then show the input and the result one after the other
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep = "\n")


Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['Is daily coffee consumption BY good for our health? PINIELLA I generational guess Wide it is reasonable to believe http://www.ata.net.cn so, but HP it may By also depend Countesswells on how much Minit you drink.']


In [19]:

# Build a word-embedding augmenter backed by the fastText vectors.
#   model_type: one of "word2vec", "glove", or "fasttext"
#   action:     "substitute" here; "insert" is the alternative
aug = naw.WordEmbsAug(model_type = 'fasttext', model_path = 'crawl-300d-2M.vec', action = "substitute")

# Augment the sample text, then show the input and the result one after the other
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep = "\n")

Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['Is daily coffee consumption.There goooood -For our health? I guess on.It is reasonable to believe so, and it maynot too.There depend on how much.I you libation.']


In [23]:
# Build a word-embedding augmenter backed by the GloVe vectors.
#   model_type: one of "word2vec", "glove", or "fasttext"
#   model_path: check your "content" path to find out the specific model file names
#   action:     "substitute" here; "insert" is the alternative
aug = naw.WordEmbsAug(model_type = 'glove', model_path = 'glove.6B.300d.txt', action = "substitute")

# Augment the sample text, then show the input and the result one after the other
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep = "\n")

Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['Is daily coffee imported good way our health? I guess it is reasonable unable if now, but does may also depend on take much because juices.']


Option 2: Substitute or insert word by contextual word embeddings

In [25]:
# Substitute words using contextual word embeddings (BERT, DistilBERT, RoBERTa or XLNet).
#   model_path: 'bert-base-uncased' here; other models include
#               'distilbert-base-uncased', 'roberta-base', etc.
#   action:     "substitute" here; "insert" is the alternative
aug = naw.ContextualWordEmbsAug(model_path = 'bert-base-uncased', action = "substitute")

# Augment the sample text, then show the input and the result one after the other
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep = "\n")

Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['is daily coffee consumption good for our health? i know will seem reasonable me encourage you, but you may also disagree on how heavily you drink.']


Option 3: Substitute word by synonym

In [27]:
# Substitute words with synonyms taken from WordNet.
# WordNet is a lexical database of English words organized into synsets, where
# each synset represents a distinct concept and contains a set of synonyms.
aug = naw.SynonymAug(aug_src = 'wordnet')

# Augment the sample text, then show the input and the result one after the other
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep = "\n")


Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['Is daily coffee consumption good for our health? Atomic number 53 suppose information technology is sensible to believe thus, but it crataegus oxycantha also depend on how much you drink in.']


In [28]:
# Substitute words with WordNet synonyms, this time with aug_max = 3 to set
# the max number of words that may be replaced with a synonym.
aug = naw.SynonymAug(aug_src = 'wordnet', aug_max = 3)

# Augment the sample text, then show the input and the result one after the other
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep = "\n")

Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:
['Is daily coffee tree consumption good for our health? I guess it is reasonable to trust so, but it whitethorn also depend on how much you drink.']



Option 4: Paraphrase the text using back translation

In [33]:
# Back translation augmenter: the text goes through the English -> German model
# and then through the German -> English model to come back as English.
back_translation_aug = naw.BackTranslationAug(from_model_name = 'facebook/wmt19-en-de', to_model_name = 'facebook/wmt19-de-en')

# Print the input and the result label first; the augmented text itself is
# shown as the cell's output because the call is the last expression.
print("Original:", text, "Augmented Text:", sep = "\n")
back_translation_aug.augment(text)

Original:

  Is daily coffee consumption good for our health? 
  I guess it is reasonable to believe so, but it may also depend on how much you drink.
  
Augmented Text:


['Is daily coffee consumption good for our health? I think it is reasonable to believe so, but it can also depend on how much you drink.']