In [12]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
import pandas as pd
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

In [32]:
# Load the initial dataset
initial_dataset = pd.read_csv('amazonreviews.tsv', delimiter='\t')

# Perform data augmentation using NLTK
augmented_data = []
stopwords_set = set(stopwords.words('english'))

for text in initial_dataset['review']:
    tokens = word_tokenize(text)
    augmented_tokens = []
    
    # Synonym Replacement
    for token in tokens:
        synsets = wordnet.synsets(token)
        if synsets:
            synonyms = [synset.lemmas()[0].name() for synset in synsets]
            augmented_token = synonyms[0] if len(synonyms) == 1 else synonyms[1]
        else:
            augmented_token = token
        augmented_tokens.append(augmented_token)
    
    # Random Swap
    n = len(augmented_tokens)
    for i in range(n):
        if i < n-1 and i % 2 == 0:
            augmented_tokens[i], augmented_tokens[i+1] = augmented_tokens[i+1], augmented_tokens[i]
    
    # Random Deletion
    augmented_tokens = [token for token in augmented_tokens if token.lower() not in stopwords_set or token.lower() == 'not']
    augmented_text = ' '.join(augmented_tokens)
    augmented_data.append(augmented_text)

# Create a new DataFrame with augmented data
augmented_dataset = pd.DataFrame({'label': initial_dataset['label'],'review': augmented_data})

# Save the augmented dataset to a file
augmented_dataset.to_csv('augmented_dataset.tsv', sep='\t', index=False)

# Merge the two datasets into a single file
merged_dataset = pd.extend([initial_dataset, augmented_dataset], ignore_index=True)

# Save the merged dataset to a file
merged_dataset.to_csv('merged_dataset.csv', index=False)

In [36]:
merged_dataset.tail()

Unnamed: 0,label,review
19995,pos,revelation vitamin_A life small indium America...
19996,pos,biography great vitamin_A concern identical : ...
19997,neg,subject concern hapless ; : presentation 'd di...
19998,neg,n't : buy box use look information_technology ...
19999,pos,pen beautiful fast . delivery : pen promptly s...
