SYNONYM REPLACEMENT

In [None]:
from datasets import load_dataset

# Load the SQuAD "train" split, then carve it into two parts with a fixed
# seed: 80% stays under the "train" key and 20% goes under the "test" key.
squad = load_dataset("squad", split="train")
squad = squad.train_test_split(test_size=0.2, seed=42)

from transformers import AutoTokenizer

# Tokenizer for the uncased DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

from transformers import DefaultDataCollator

# Default collator instance; it is not used by the cells visible here
# (presumably consumed by a later training cell).
data_collator = DefaultDataCollator()

In [None]:
import nltk
from nltk import pos_tag
# NLTK resources needed by the augmentation cells below.
nltk.download('wordnet')   # WordNet synsets used for synonym lookup
nltk.download('omw-1.4')   # Open Multilingual WordNet data
# POS tagger model used by pos_tag. The resource id depends on the NLTK
# release: newer releases look for the "_eng" variant, older ones for the
# unsuffixed name, so both are requested (the original downloaded the
# unsuffixed name twice).
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
import random
import re
from random import shuffle

from nltk.corpus import wordnet
def get_only_chars(line):
    """Reduce *line* to lowercase ASCII letters separated by single spaces.

    Apostrophes (straight, curly, and the mis-decoded "mojibake" form of the
    curly one) are deleted so contractions stay one token ("don't" ->
    "dont"). Hyphens, tabs and newlines become spaces, and every remaining
    character outside ``a-z`` and space is replaced by a space. Runs of
    spaces are collapsed and a single leading space is removed.

    Args:
        line: Raw input text.

    Returns:
        The cleaned string. It may end with one trailing space (for example
        when the input ends with punctuation) and is empty when the input
        contains no letters.
    """
    # U+2019 is the curly apostrophe; the three-character sequence is what
    # U+2019 looks like after UTF-8 bytes are wrongly decoded as cp1252.
    for apostrophe in ("\u00e2\u20ac\u2122", "\u2019", "'"):
        line = line.replace(apostrophe, "")
    for separator in ("-", "\t", "\n"):
        line = line.replace(separator, " ")
    line = line.lower()

    allowed = 'qwertyuiopasdfghjklzxcvbnm '
    clean_line = "".join(char if char in allowed else " " for char in line)

    clean_line = re.sub(' +', ' ', clean_line)  # delete extra spaces
    # startswith() is safe on an empty string, unlike indexing [0].
    if clean_line.startswith(' '):
        clean_line = clean_line[1:]
    return clean_line

# Words that are never eligible for synonym replacement. The final empty
# string is listed too, so empty tokens are treated as stop words.
stop_words = [
    # personal pronouns, possessives, reflexives
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    # interrogatives and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # auxiliary verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or",
    "because", "as", "until", "while",
    # prepositions and particles
    "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs and quantifiers
    "again", "further", "then", "once", "here", "there",
    "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other",
    "some", "such", "no", "nor", "not", "only", "own", "same",
    "so", "than", "too", "very",
    # contraction fragments and remaining function words
    "s", "t", "can", "will", "just", "don", "should", "now",
    # empty token
    "",
]
def synonym_replacement(words, n):
    """Replace up to *n* distinct words in *words* with a random synonym.

    Candidate words are those that are not stop words, are not tagged as
    proper nouns (NNP/NNPS) and are not purely numeric. Candidates are
    visited in random order; every occurrence of a chosen word is replaced
    by the same synonym. Candidates for which ``get_synonyms`` returns
    nothing are skipped and do not count against the budget.

    Args:
        words: List of tokens; it is not modified.
        n: Maximum number of distinct words to replace. Values <= 0 leave
            the tokens unchanged.

    Returns:
        A new token list. Multi-word synonyms are split on spaces, so the
        result can be longer than the input.
    """
    new_words = words.copy()

    # Tag only real tokens: empty strings (produced by splitting on single
    # spaces) carry no information and are never candidates anyway.
    tags = pos_tag([w for w in words if w])

    stop_word_set = set(stop_words)
    # dict.fromkeys de-duplicates while keeping first-seen order, so a word
    # that occurs twice cannot consume the replacement budget twice.
    candidates = list(dict.fromkeys(
        word for word, pos in tags
        if word.lower() not in stop_word_set
        and pos not in ("NNP", "NNPS")   # skip proper nouns
        and not word.isdigit()           # skip numbers
    ))
    random.shuffle(candidates)

    num_replaced = 0
    for word in candidates:
        # Check the budget before replacing so that n == 0 replaces nothing.
        if num_replaced >= n:
            break
        synonyms = get_synonyms(word)
        if not synonyms:
            continue
        synonym = random.choice(synonyms)
        new_words = [synonym if w == word else w for w in new_words]
        num_replaced += 1

    # Re-split so that multi-word synonyms become separate tokens.
    return ' '.join(new_words).split(' ')

def get_synonyms(word):
    """Return the WordNet synonyms of *word* as a sorted list of strings.

    Every lemma name of every synset of *word* is lowercased, underscores
    and hyphens become spaces, and characters other than ``a-z`` and space
    are dropped. Whitespace is then normalised, and lemmas that end up
    empty (for example purely numeric ones) are discarded. The word itself
    is excluded, compared case-insensitively.

    Args:
        word: The word to look up.

    Returns:
        A list of distinct synonyms in sorted order. Sorting matters: set
        iteration order for strings varies between interpreter runs (hash
        randomisation), which would make ``random.choice`` on the result
        irreproducible even with a fixed random seed.
    """
    allowed = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join(char for char in synonym if char in allowed)
            # Collapse inner runs of spaces and trim the ends.
            synonym = " ".join(synonym.split())
            if synonym:
                synonyms.add(synonym)
    # All stored synonyms are lowercase, so compare against the lowercase word.
    synonyms.discard(word.lower())
    return sorted(synonyms)

In [None]:
import random
import re
from nltk.corpus import wordnet
from copy import deepcopy

# squad = load_dataset("squad")
# Slice the whole training split; the result is used below as a
# column-oriented mapping (column name -> values).
examples_dict = squad['train'][:]

# Convert column-oriented dict to list of row dicts
examples = [dict(zip(examples_dict.keys(), values)) for values in zip(*examples_dict.values())]

# Augmentation parameters
N_AUGMENTATIONS_PER_SAMPLE = 1   # augmented copies created per original example
MAX_REPLACEMENTS = 1             # replacement budget passed to synonym_replacement
random.seed(1)

augmented_samples = []

for ex in examples:
    original_q = ex['question']
    cleaned = get_only_chars(original_q)
    # split() without arguments drops empty tokens, so no empty string is
    # handed to the tagger and the rebuilt question has no trailing space.
    words = cleaned.split()

    # Create augmented versions: same row, new id suffix, rewritten question.
    for i in range(N_AUGMENTATIONS_PER_SAMPLE):
        augmented_words = synonym_replacement(words, MAX_REPLACEMENTS)
        augmented_q = ' '.join(augmented_words)

        new_sample = deepcopy(ex)
        new_sample['id'] = ex['id'] + f"_aug_{i}"
        new_sample['question'] = augmented_q
        augmented_samples.append(new_sample)

# Print results
print(f"Original examples: {len(examples)}")
print(f"Augmented examples: {len(augmented_samples)}\n")

# Preview examples 1-4. augmented_samples is built in example order with
# exactly N_AUGMENTATIONS_PER_SAMPLE entries per example, so the copies of
# example idx are a contiguous slice; no scan or id-prefix match is needed.
for idx in range(1, min(5, len(examples))):
    ex = examples[idx]
    print(f"\U0001F7E9 Original:  {ex['question']}")
    first = idx * N_AUGMENTATIONS_PER_SAMPLE
    for aug in augmented_samples[first:first + N_AUGMENTATIONS_PER_SAMPLE]:
        print(f"\U0001F7E6 Augmented: {aug['question']}")
    print('-' * 80)


In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Turn the list of augmented row dicts into a Dataset, going through a
# pandas DataFrame.
augmented_frame = pd.DataFrame(augmented_samples)
augmented_train_dataset = Dataset.from_pandas(augmented_frame)

# The held-out split is reused as-is.
test_dataset = squad['test']

# Bundle both splits under the keys 'train' and 'test'.
splits = {
    'train': augmented_train_dataset,
    'test': test_dataset,
}
augmented_squad = DatasetDict(splits)

# Sanity check: print the question at the same index from the original and
# the augmented training split.
sample_index = 4
for dataset_dict in (squad, augmented_squad):
    print(dataset_dict['train']['question'][sample_index])