In [1]:
pip install nlpaug numpy pandas tqdm torch transformers sentencepiece gensim

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 410.5/410.5 kB 8.1 MB/s eta 0:00:00
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import nlpaug.augmenter.word as naw
from nlpaug.util.file.download import DownloadUtil
import torch

In [3]:
# Read the essay CSV and keep only complete rows (any row containing a
# missing value in any column is discarded).
# Point `file_path` at your copy of the 17kEssays file.
file_path = '/kaggle/input/17k-essays/essay_training_set.csv'
df = pd.read_csv(file_path).dropna(axis=0, how='any')

# Report the size of the cleaned frame and how many essays carry each score.
print(
    f"Full dataset size: {len(df)}",
    "Original score distribution:",
    df['score'].value_counts().sort_index(),
    sep="\n",
)

Full dataset size: 17645
Original score distribution:
score
1    1252
2    4723
3    6280
4    3926
5     970
6     494
Name: count, dtype: int64


In [4]:
# Random 80/20 split: draw the training rows with a fixed seed, then take
# every row whose index label was not drawn as the validation set.
train_df = df.sample(frac=0.8, random_state=42)
val_df = df.loc[~df.index.isin(train_df.index)]

# Size and per-score counts of the training portion.
print(
    f"\nTraining set size: {len(train_df)}",
    "Training score distribution:",
    train_df['score'].value_counts().sort_index(),
    sep="\n",
)
# Same report for the validation portion.
print(
    f"Validation set size: {len(val_df)}",
    "Validation score distribution:",
    val_df['score'].value_counts().sort_index(),
    sep="\n",
)


Training set size: 14116
Training score distribution:
score
1    1002
2    3758
3    5040
4    3123
5     781
6     412
Name: count, dtype: int64
Validation set size: 3529
Validation score distribution:
score
1     250
2     965
3    1240
4     803
5     189
6      82
Name: count, dtype: int64


In [5]:
# Contextual word-embedding augmenter backed by the `roberta-base` checkpoint.
# Run on the GPU when one is visible to torch, otherwise on the CPU.
aug_device = 'cuda' if torch.cuda.is_available() else 'cpu'

roberta_aug = naw.ContextualWordEmbsAug(
    model_path='roberta-base',
    # Words are replaced in place; "insert" is the alternative action.
    action="substitute",
    # aug_p / aug_min / aug_max bound how many words are changed per text
    # (see the nlpaug documentation for how the three interact).
    aug_p=0.3,
    aug_min=1,
    aug_max=5,
    device=aug_device,
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [6]:
# Smoke test: run the augmenter on one short sentence and show the input
# next to whatever `augment` returned.
text = "Data augmentation is important for NLP tasks."
augmented_text = roberta_aug.augment(text)

print(f"Original: {text}", f"Augmented: {augmented_text}", sep="\n")

Original: Data augmentation is important for NLP tasks.
Augmented: ['Data sources are important across NLP tasks.']


In [7]:
# Augmentation logic (training set only; validation rows are never augmented).
#
# Every score class is pushed towards `target_count` rows:
#   * minority classes keep each essay and add augmented variants of it,
#   * majority classes are downsampled when larger than the target, otherwise
#     kept (and at most doubled via augmentation).
target_count = 3990
minority_scores = [1, 5, 6]
majority_scores = [2, 3, 4]
MAX_MINORITY_COPIES = 12  # cap on rows produced per minority essay (original included)
MAX_MAJORITY_COPIES = 2   # cap on rows produced per majority essay (original included)
SEED = 42                 # same value as the `random_state` used for sampling

augmented_texts = []
augmented_scores = []

augmenter = roberta_aug  # For RoBERTa

# Best-effort reproducibility: seed every RNG the augmentation may draw from,
# so re-running this cell regenerates the same variants. GPU kernels may still
# introduce small non-determinism.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


def _copies_per_essay(class_size, max_copies):
    """Return how many rows each essay of a class contributes (original included).

    The result is floor(target_count / class_size) clamped to [1, max_copies].
    `class_size` must be positive; callers skip empty classes beforehand.
    """
    return max(1, min(target_count // class_size, max_copies))


def _augment_once(text):
    """Return one augmented variant of `text`.

    Falls back to `text` itself when the augmenter returns nothing (e.g. for an
    empty input), and accepts a bare string in case the installed nlpaug
    version returns one instead of a list; indexing a string with [0] would
    silently keep only its first character.
    """
    result = augmenter.augment(text)
    if not result:
        return text
    if isinstance(result, str):
        return result
    return result[0]


def _expand_class(rows_df, copies, desc):
    """Expand the essays in `rows_df` to `copies` rows each.

    For every essay the original text comes first, followed by `copies - 1`
    augmented variants carrying the same score. Returns `(texts, scores)` as
    two parallel lists and shows a progress bar labelled `desc`.
    """
    texts, scores = [], []
    with tqdm(total=len(rows_df) * copies, desc=desc) as pbar:
        for text, essay_score in zip(rows_df['full_text'], rows_df['score']):
            texts.append(text)  # Original
            scores.append(essay_score)
            pbar.update(1)
            for _ in range(copies - 1):
                texts.append(_augment_once(text))
                scores.append(essay_score)
                pbar.update(1)
    return texts, scores


# Augment minority classes in training set
for score in minority_scores:
    minority_df = train_df[train_df['score'] == score]
    original_count = len(minority_df)
    if original_count == 0:
        # Nothing to augment, and the copy factor would divide by zero.
        print(f"\nNo training essays with score {score}; skipping")
        continue
    augmentation_factor = _copies_per_essay(original_count, MAX_MINORITY_COPIES)

    # Report the size actually produced; flooring the factor means it can fall
    # well short of the target.
    print(f"\nAugmenting minority score {score} from {original_count} "
          f"to {original_count * augmentation_factor} (target ~{target_count})")
    print(f"Each essay kept once plus {augmentation_factor - 1} augmented variant(s)")

    new_texts, new_scores = _expand_class(
        minority_df, augmentation_factor, f"Score {score} Progress")
    augmented_texts.extend(new_texts)
    augmented_scores.extend(new_scores)

# Adjust majority classes in training set
for score in majority_scores:
    majority_df = train_df[train_df['score'] == score]
    original_count = len(majority_df)
    if original_count == 0:
        print(f"\nNo training essays with score {score}; skipping")
        continue

    if original_count > target_count:
        # Too many essays: keep a fixed-seed random subset, no augmentation.
        sampled_df = majority_df.sample(n=target_count, random_state=SEED)
        adjustment_factor = 1
        print(f"\nDownsampling majority score {score} from {original_count} to {target_count}")
    else:
        sampled_df = majority_df
        adjustment_factor = _copies_per_essay(original_count, MAX_MAJORITY_COPIES)
        print(f"\nAdjusting majority score {score} from {original_count} "
              f"to {original_count * adjustment_factor} (target ~{target_count})")
        print(f"Each essay kept once plus {adjustment_factor - 1} augmented variant(s)")

    new_texts, new_scores = _expand_class(
        sampled_df, adjustment_factor, f"Score {score} Progress")
    augmented_texts.extend(new_texts)
    augmented_scores.extend(new_scores)

# Create augmented training DataFrame
augmented_train_df = pd.DataFrame({'full_text': augmented_texts, 'score': augmented_scores})

# Combine with validation set (unaugmented).
# NOTE: final_df carries no marker of which rows are validation rows. Do not
# re-split it for evaluation, since augmented variants of a training essay could
# then land in the held-out part. Use augmented_train_df / val_df for that.
final_df = pd.concat([augmented_train_df, val_df], ignore_index=True)

print(f"\nOriginal training set size: {len(train_df)}")
print(f"Augmented training set size: {len(augmented_train_df)}")
print("Augmented training score distribution:")
print(augmented_train_df['score'].value_counts().sort_index())
print(f"Validation set size (unchanged): {len(val_df)}")
print(f"Final combined dataset size: {len(final_df)}")
print("Final score distribution:")
print(final_df['score'].value_counts().sort_index())


Augmenting minority score 1 from 1002 to ~3990
Each essay augmented 3 times


Score 1 Progress: 100%|██████████| 3006/3006 [05:07<00:00,  9.78it/s]



Augmenting minority score 5 from 781 to ~3990
Each essay augmented 5 times


Score 5 Progress: 100%|██████████| 3905/3905 [12:04<00:00,  5.39it/s]



Augmenting minority score 6 from 412 to ~3990
Each essay augmented 9 times


Score 6 Progress: 100%|██████████| 3708/3708 [09:11<00:00,  6.73it/s]



Adjusting majority score 2 from 3758 to ~3990
Each essay augmented 1 times


Score 2 Progress: 100%|██████████| 3758/3758 [00:00<00:00, 21498.63it/s]



Downsampling majority score 3 from 5040 to 3990


Score 3 Progress: 100%|██████████| 3990/3990 [00:00<00:00, 21339.48it/s]



Adjusting majority score 4 from 3123 to ~3990
Each essay augmented 1 times


Score 4 Progress: 100%|██████████| 3123/3123 [00:00<00:00, 19310.47it/s]


Original training set size: 14116
Augmented training set size: 21490
Validation set size (unchanged): 3529
Final combined dataset size: 25019
Final score distribution:
score
1    3256
2    4723
3    5230
4    3926
5    4094
6    3790
Name: count, dtype: int64





In [8]:
# Write the three frames to CSV in the working directory, without the
# DataFrame index column: augmented training set, untouched validation set,
# and the two combined.
for frame, csv_name in (
    (augmented_train_df, 'augmented_train_dataset.csv'),
    (val_df, 'validation_dataset.csv'),
    (final_df, 'complete_essay_dataset.csv'),
):
    frame.to_csv(csv_name, index=False)

print("Datasets saved!")

Datasets saved!
