In [None]:
! pip install transformers==4.10.1

# **Data**

In [None]:
!pip install datasets

# WASSA dataset

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset CSVs stored
# under 'My Drive' can be read (and the augmented CSV written) by later cells.
from google.colab import drive
drive.mount('/content/drive')

In [4]:
import pandas as pd
import numpy as np

# Locations of the WASSA train and dev splits on the mounted Drive.
wassa_train_csv_path = '/content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/WASSA_train_all.csv'
wassa_dev_csv_path = '/content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/WASSA_dev_all.csv'

# Parse both CSV files (train first, then dev) into DataFrames.
wassa_train, wassa_dev = (
    pd.read_csv(csv_path)
    for csv_path in (wassa_train_csv_path, wassa_dev_csv_path)
)



In [5]:
# Report the training frame's dimensions, then preview its first ten essays.
print('wassa_train shape:', wassa_train.shape)
wassa_train.loc[:, 'essay'].head(10)

wassa_train shape: (1860, 2)


0    it is really diheartening to read about these ...
1    the phone lines from the suicide prevention li...
2    no matter what your heritage, you should be ab...
3    it is frightening to learn about all these sha...
4    the eldest generation of russians aren't being...
5    middle east is fucked up, I've honestly never ...
6    well first of all whoever wrote this article d...
7    well well well, look at what we have well, the...
8    just another fucked up mental sickness of amer...
9    it seems a horny male college student has fina...
Name: essay, dtype: object

In [6]:
# Preview the first ten emotion labels; they are stored as integer class ids.
wassa_train['emotion'].head(10)

0    0
1    0
2    6
3    2
4    0
5    3
6    3
7    4
8    3
9    4
Name: emotion, dtype: int64

In [7]:
# Inputs are the essay texts and targets are the emotion ids;
# the dev split serves as the held-out test set.
X_train, y_train = wassa_train['essay'], wassa_train['emotion']
X_test, y_test = wassa_dev['essay'], wassa_dev['emotion']

In [8]:
# Print the shape of every split, in the order train X/y then test X/y.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(1860,)
(1860,)
(270,)
(270,)


In [9]:
# Label ids as originally noted: 'sadness': 0, 'neutral': 6, 'fear': 2, 'anger': 3, 'disgust': 4, 'surprise': 5, 'joy': 6
# NOTE(review): that list assigns 6 to both 'neutral' and 'joy' and never uses 1,
# although label 1 does occur in the data. One of the two is presumably 1
# (likely 'joy', judging by class sizes) — TODO confirm against the label-encoding step.
# Class distribution of the training labels.
y_train.value_counts()

0    647
3    349
6    275
2    194
5    164
4    149
1     82
Name: emotion, dtype: int64

In [10]:
# Class distribution of the held-out (dev) labels, for comparison with the training split.
y_test.value_counts()

0    98
3    76
2    31
6    25
1    14
5    14
4    12
Name: emotion, dtype: int64

# Augmentation using Embedding

In [13]:
! pip install nlpaug==1.1.7

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug==1.1.7
  Downloading nlpaug-1.1.7-py3-none-any.whl (405 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m405.1/405.1 KB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.7


In [14]:
from pathlib import Path

from nlpaug.util.file.download import DownloadUtil

# Fetch the glove.6B vectors into the working directory. The download is skipped
# when the 200d vectors file that the augmenter loads is already on disk, so
# re-running the notebook does not fetch the archive again.
if not Path('./glove.6B.200d.txt').exists():
    DownloadUtil.download_glove('glove.6B', '.')

In [15]:
# build the embedding-based augmenter; it is applied to the train data, and the results saved to a file, in later cells
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action
import nlpaug.flow as naf

# Embedding-based augmentation pipeline: a GloVe word-substitution step followed
# by a GloVe word-insertion step, each wrapped in its own naf.Sometimes flow and
# chained with naf.Sequential.
# NOTE(review): naf.Sometimes presumably applies its augmenter only part of the
# time — confirm the default probability in the nlpaug docs.
glove_substitute = naw.WordEmbsAug(
    model_type='glove',
    model_path='./glove.6B.200d.txt',
    action="substitute",
)
glove_insert = naw.WordEmbsAug(
    model_type='glove',
    model_path='./glove.6B.200d.txt',
    action="insert",
)
aug_emb = naf.Sequential([
    naf.Sometimes([glove_substitute]),
    naf.Sometimes([glove_insert]),
])


In [16]:
# Download the NLTK resources (POS tagger, WordNet, Open Multilingual WordNet 1.4).
# NOTE(review): presumably needed by nlpaug augmenters — confirm they are required
# for the embedding-based augmenter used here.
# The final call is the cell's last expression, so its return value is displayed.
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [17]:
from tqdm.notebook import tqdm
# Register tqdm with pandas so that `progress_apply` becomes available.
tqdm.pandas()

# Pair each training essay with its label under generic column names.
data = {'text': X_train, 'label': y_train}
base_train = pd.DataFrame(data)
base_train.shape

(1860, 2)

In [18]:
rep = 5  # number of augmented variants requested for each record


def _augment_essay(essay):
    """Run the embedding augmenter on one essay, requesting `rep` variants."""
    return aug_emb.augment(essay, rep)


# Augment every essay (with a progress bar), then explode the per-essay results
# so that each augmented variant gets its own row next to the original text and label.
emb_train = (
    base_train
    .assign(paraphrase=base_train['text'].progress_apply(_augment_essay))
    .explode('paraphrase')
    .reset_index(drop=True)
)

  0%|          | 0/1860 [00:00<?, ?it/s]

In [19]:
# Inspect the first rows of the augmented frame: original text, label, and one paraphrase per row.
emb_train.head(20)

Unnamed: 0,text,label,paraphrase
0,it is really diheartening to read about these ...,0,it is really suzerains diheartening to read ab...
1,it is really diheartening to read about these ...,0,on-line it is langlands really diheartening to...
2,it is really diheartening to read about these ...,0,it is really diheartening to read about these ...
3,it is really diheartening to read about these ...,0,it is really diheartening so read about these ...
4,it is really diheartening to read about these ...,0,madhoo it is really diheartening to read nearl...
5,the phone lines from the suicide prevention li...,0,maiken the phone lines parkmore from shangai t...
6,the phone lines from the suicide prevention li...,0,the phone lines from joss the suicide stealthi...
7,the phone lines from the suicide prevention li...,0,cockettes the phone lines from the confernce s...
8,the phone lines from the suicide prevention li...,0,the subscriber lines from all suicide preventi...
9,the phone lines from the suicide prevention li...,0,the online lines from some suicide prevention ...


# Create CSV file from augmented dataset

In [22]:
# Write the augmented training set (text, label, paraphrase) to Drive as CSV.
# No `cd` is needed: the destination is an absolute path, and a `!cd ...` shell
# escape would only affect a throwaway subshell, never the kernel's working directory.
emb_train_csv_path = '/content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/wassa_all_emb_Aug_5.csv'
emb_train.to_csv(emb_train_csv_path, encoding='utf-8', index=False, sep=',')