# Text Augmentation

In [1]:
import nltk
import random
import string
from nltk.corpus import wordnet

# Ensure required NLTK data is available
nltk.download('wordnet')
nltk.download('omw-1.4')  # Synonym database

# Sample text for augmentation
text = "Natural language processing is an exciting field of artificial intelligence. It is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language."

# Display the original text
print("Original Text:\n", text)

Original Text:
 Natural language processing is an exciting field of artificial intelligence. It is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language.


[nltk_data] Downloading package wordnet to /home/omar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/omar/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## 1. Synonym Replacement 🔄
Replace words in a sentence with their synonyms.

In [2]:
def get_synonyms(word):
    """Fetch synonyms for a given word."""
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    return synonyms

def synonym_replacement(sentence, n=3):
    """Replace 'n' random words in the sentence with their synonyms."""
    words = sentence.split()
    new_words = words.copy()
    random_word_list = list(set(words))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

# Perform synonym replacement
augmented_text = synonym_replacement(text, n=12)
print("Augmented Text with Synonyms:\n", augmented_text)

Augmented Text with Synonyms:
 innate language sue be AN turn_on field of hokey intelligence. IT be amp subfield of linguistics, computer science, and hokey word concerned with the fundamental_interaction betwixt computers and human_being language.


# 2. Random Insertion 📥
Randomly insert new words into a sentence.

In [3]:
def random_insertion(sentence, n=2):
    """Insert 'n' random synonyms into the sentence."""
    words = sentence.split()
    new_words = words.copy()
    for _ in range(n):
        add_word(new_words)
    return ' '.join(new_words)

def add_word(new_words):
    """Add a random synonym of a random word in the sentence."""
    synonyms = []
    counter = 0
    while len(synonyms) < 1 and counter < 10:
        random_word = new_words[random.randint(0, len(new_words) - 1)]
        synonyms = get_synonyms(random_word)
        counter += 1
    if synonyms:
        synonym = random.choice(synonyms)
        random_idx = random.randint(0, len(new_words) - 1)
        new_words.insert(random_idx, synonym)

# Perform random insertion
augmented_text = random_insertion(text, n=5)
print("Augmented Text with Insertions:\n", augmented_text)

Augmented Text with Insertions:
 Natural language be processing is embody an exciting field of artificial embody intelligence. It is artificial a subfield of linguistics, computer science, and artificial type_A intelligence concerned with the interactions between computers and human language.


# 3. Random Deletion ❌
Randomly delete words from a sentence.

In [4]:
def random_deletion(sentence, p=0.2):
    """Randomly delete words from the sentence with probability 'p'."""
    words = sentence.split()
    if len(words) == 1:
        return sentence
    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p:
            new_words.append(word)
    if not new_words:
        return random.choice(words)
    return ' '.join(new_words)

# Perform random deletion
augmented_text = random_deletion(text, p=0.3)
print("Augmented Text with Deletions:\n", augmented_text)

Augmented Text with Deletions:
 Natural language processing an artificial intelligence. It is computer artificial intelligence concerned with the between computers and human language.


# 4. Random Swap 🔀
Swap the positions of two words in a sentence.

In [5]:
def random_swap(sentence, n=2):
    """Swap the positions of two words 'n' times."""
    words = sentence.split()
    new_words = words.copy()
    for _ in range(n):
        new_words = swap_word(new_words)
    return ' '.join(new_words)

def swap_word(new_words):
    """Randomly swap two words in the list."""
    idx1 = random.randint(0, len(new_words) - 1)
    idx2 = idx1
    counter = 0
    while idx2 == idx1 and counter < 10:
        idx2 = random.randint(0, len(new_words) - 1)
        counter += 1
    new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]
    return new_words

# Perform random swap
augmented_text = random_swap(text, n=2)
print("Augmented Text with Swaps:\n", augmented_text)

Augmented Text with Swaps:
 Natural language processing is an exciting field computer artificial intelligence. It is a subfield of linguistics, of science, and artificial intelligence concerned with between interactions the computers and human language.


# 5. Back Translation 🌐
Translate a sentence to another language and then back to the original language.

In [6]:
from googletrans import Translator

def back_translation(sentence, src_lang='en', mid_lang='fr'):
    """Translate the sentence to 'mid_lang' and back to 'src_lang'."""
    translator = Translator()
    translated = translator.translate(sentence, src=src_lang, dest=mid_lang).text
    back_translated = translator.translate(translated, src=mid_lang, dest=src_lang).text
    return back_translated

# Perform back translation
augmented_text = back_translation(text, src_lang='en', mid_lang='ar')
print("Augmented Text with Back Translation:\n", augmented_text)

Augmented Text with Back Translation:
 Natural language processing is an artificial intelligence field. It is a sub -field of linguistics, computer science and artificial intelligence concerned with interactions between computers and human language.


# 6. Noise Injection 🐛
Add noise to a sentence by replacing characters with similar looking characters.

In [7]:
def noise_injection(sentence, n=2):
    """Inject noise into the sentence by adding typos."""
    words = list(sentence)
    num_chars = len(words)
    for _ in range(n):
        char_idx = random.randint(0, num_chars - 1)
        words[char_idx] = random.choice(string.ascii_letters)
    return ''.join(words)

# Perform noise injection
augmented_text = noise_injection(text, n=5)
print("Augmented Text with Noise Injection:\n", augmented_text)

Augmented Text with Noise Injection:
 Natural language processiRg is an exciting field of artificial intelligencL. It is a subfield of lUnguistiQs, computer science, and artificial intelligence concerned with the interactions cetween computers and human language.


## Putting It All Together
Let's create a complete text augmentation pipeline that can apply multiple augmentations to a text.

In [9]:
def augment_text(text, num_augmentations=4):
    """Apply multiple text augmentation techniques."""
    augmentations = [
        synonym_replacement,
        random_insertion,
        random_deletion,
        random_swap,
        noise_injection
    ]
    augmented_text = text
    for i in range(num_augmentations):
        func = random.choice(augmentations)
        if func == back_translation:
            augmented_text = func(augmented_text)
        else:
            augmented_text = func(augmented_text, 2)
    return augmented_text

# Perform text augmentation pipeline
augmented_text = augment_text(text)
print("Final Augmented Text:\n", augmented_text)

Final Augmented Text:
 Natural intelligence. processing is an computer field of artificVal language It is a subfield of linguistics, exciting between and artificial intelligence_information concerned with languZge. interactions science, computers and man the
