In [6]:
import nlpaug.augmenter.word as naw
from nltk.corpus import wordnet

original_text = "Data augmentation is a technique used to increase the size, of a dataset.Data augmentation is the process of artificially generating new data from existing data, primarily to train new machine learning models."

def get_synonyms(word):
    synonyms = [] #empty list to store sets of synonyms
    for syn in wordnet.synsets(word): #wordnet - dictionary usages, synsets - sets of synonyms
        for lemma in syn.lemmas(): # lemmas - single synonym
            synonyms.append(lemma.name())
    return synonyms

def augment_with_synonyms(text):
    augmented_text = []
    words = text.split() # splits words , remove whitespaces
    for word in words:
        synonyms = get_synonyms(word)
        if synonyms:
            augmented_text.append(synonyms[0])  # use only one/first synonym from the list= synonyms
        else:
            augmented_text.append(word)
    return ' '.join(augmented_text) # join the splitted words

augmented_text_synonyms = augment_with_synonyms(original_text)

print("Original Text:")
print(original_text)

print("\nAugmented Text (Synonyms):")
print(augmented_text_synonyms)


Original Text:
Data augmentation is a technique used to increase the size of a dataset.Data augmentation is the process of artificially generating new data from existing data, primarily to train new machine learning models.

Augmented Text (Synonyms):
data augmentation be angstrom technique use to addition the size of angstrom dataset.Data augmentation be the procedure of artificially generate new data from exist data, chiefly to train new machine learning models.


In [5]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw

original_text = "Data augmentation is a technique used to increase the size of a dataset."

aug_char = nac.RandomCharAug(action="insert") # to generate random character and insert , delete, substitute, swap
augmented_text_char = aug_char.augment(original_text)

aug_word = naw.RandomWordAug() # to generate random word based on the existing dataset
augmented_text_word = aug_word.augment(original_text)

print("Original Text:")
print(original_text)

print("\nAugmented Text (Character-level):")
print(augmented_text_char)

print("\nAugmented Text (Word-level):")
print(augmented_text_word)


Original Text:
Data augmentation is a technique used to increase the size of a dataset.

Augmented Text (Character-level):
['xDatha aBuYgLmentaution is a techn!iqsufe used to increase the sMilze of a daXtafseut.']

Augmented Text (Word-level):
['Data a technique used to increase a dataset.']


In [4]:
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
pip install nltk



In [2]:
pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/410.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m215.0/410.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11
