In [None]:
!pip3 install numpy requests nlpaug

## Synonym replacement

In [None]:
import nlpaug.augmenter.word as naw

# Demo: contextual word-embedding augmenter in "substitute" mode, backed by
# the HooshvareLab bert-fa checkpoint, applied to one sample sentence.
sentence = 'شهاب امروز با اتوبوس از خانه تا رستوران رفت تا چلوکباب بخورد'
aug = naw.ContextualWordEmbsAug(
    model_path='HooshvareLab/bert-fa-base-uncased',
    action="substitute",
)
augmented_text = aug.augment(sentence)
print(augmented_text)

## Random Insertion

In [None]:
# Demo: same contextual augmenter as above, but in "insert" mode, so new
# words are added to the sentence instead of replacing existing ones.
sentence = 'شهاب امروز با اتوبوس از خانه تا رستوران رفت تا چلوکباب بخورد'
aug = naw.ContextualWordEmbsAug(
    model_path='HooshvareLab/bert-fa-base-uncased',
    action="insert",
)
augmented_text = aug.augment(sentence)
print(augmented_text)

## Random Swap

In [None]:
# Demo: RandomWordAug configured with the 'swap' action on one sample sentence.
sentence = 'شهاب امروز با اتوبوس از خانه تا رستوران رفت تا چلوکباب بخورد'
aug = naw.RandomWordAug(
    action='swap',
)
augmented_text = aug.augment(sentence)
print(augmented_text)

## Random Deletion

In [None]:
# Demo: RandomWordAug configured with the 'delete' action on one sample
# sentence. `aug` and `sentence` are left in scope for the following cell.
sentence = 'شهاب امروز با اتوبوس از خانه تا رستوران رفت تا چلوکباب بخورد'
aug = naw.RandomWordAug(
    action='delete',
)
augmented_text = aug.augment(sentence)
print(augmented_text)

In [None]:
# Re-uses `aug` and `sentence` from the previous cell; n=2 is passed to
# augment() to request more than one augmented output (see nlpaug docs).
aug.augment(sentence, n=2)

## Creating augmented datasets

In [None]:
import pandas as pd
import nlpaug.augmenter.word as naw
from tqdm import tqdm

# Number of augmented samples generated for EACH positive text: the loops
# below add len(texts) * NUMBER_OF_AUGMENTATION_WANTED rows in total.
NUMBER_OF_AUGMENTATION_WANTED = 300

# Shared settings for the BERT-backed augmenters.
BERT_MODEL_PATH = 'HooshvareLab/bert-fa-base-uncased'
DEVICE = 'cuda'

targets = ["Anger", "Fear", "Happiness", "Hatred", "Sadness", "Wonder"]
target = targets[2]

df = pd.read_csv(f'/content/{target.lower()}.csv', usecols=["text", target])

# Keep only rows whose label is above 3 or below 2.
df = df[(df[target] > 3) | (df[target] < 2)]

# Binarise the label: 1 -> 0 and 4/5 -> 1. The replacement is restricted to
# the label column so values in the "text" column can never be rewritten.
df = df.assign(**{target: df[target].replace({1: 0, 4: 1, 5: 1})})

print('Value counts before augmentation: ')
print(df[target].value_counts())

# Only positively-labelled texts are augmented.
texts = df[df[target] > 0]['text'].tolist()

# NOTE: the (misspelled) name is kept because a later cell reads it.
augmeneted_data = {"text": [], f"{target}": []}


def _apply(augmenter, text):
  """Run one augmenter on `text` and return the result as a single string.

  Newer nlpaug releases return a list from ``augment`` even for a single
  input string, while older ones return the string itself. Unwrapping here
  keeps plain strings (not one-element lists) in the "text" column. If the
  augmenter yields an empty list, the input text is returned unchanged.
  """
  result = augmenter.augment(text)
  if isinstance(result, (list, tuple)):
    return result[0] if result else text
  return result


# Build the augmenters once; they do not depend on the loop variables, so
# re-creating them (and the BERT-backed ones in particular) per sample is
# wasted work.
augmentation_steps = (
  # Swap
  naw.RandomWordAug(action='swap', aug_p=0.6),
  # Synonym replacement
  naw.ContextualWordEmbsAug(model_path=BERT_MODEL_PATH, action="substitute", aug_p=0.6, device=DEVICE),
  # Deletion (previously built with action='swap', so no deletion happened)
  naw.RandomWordAug(action='delete', aug_p=0.3),
  # Insertion
  naw.ContextualWordEmbsAug(model_path=BERT_MODEL_PATH, action="insert", aug_p=0.3, device=DEVICE),
)

for text in tqdm(texts):
  for _ in range(NUMBER_OF_AUGMENTATION_WANTED):
    # Chain the four augmentations: each step consumes the previous output.
    augmented_text = text
    for augmenter in augmentation_steps:
      augmented_text = _apply(augmenter, augmented_text)

    augmeneted_data["text"].append(augmented_text)
    augmeneted_data[target].append(1)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. The index is deliberately not reset so the resulting frame
# matches what append() used to produce.
# Note: this cell rebinds `df`, so running it twice adds the augmented rows twice.
df = pd.concat([df, pd.DataFrame(augmeneted_data)])
print('Value counts after augmentation: ')
print(df[target].value_counts())

In [None]:
# Write the combined (original + augmented) frame to disk as UTF-8 CSV.
# No `index=` argument is given, so pandas' default index handling applies.
df.to_csv(path_or_buf=f"/content/mutated_{target}.csv", encoding="utf-8")