In [None]:
! pip install transformers

# **Data**

# covid dataset

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np

# CSV locations of the two splits, relative to the current working directory.
wassa_train_csv_path, wassa_dev_csv_path = 'emoevent_train.csv', 'emoevent_test.csv'

# Load each split into its own DataFrame (train first, then the held-out split).
wassa_train = pd.read_csv(filepath_or_buffer=wassa_train_csv_path)
wassa_dev = pd.read_csv(filepath_or_buffer=wassa_dev_csv_path)


In [None]:
# Report the training frame's dimensions, then preview the first ten tweets.
print('wassa_train shape:', wassa_train.shape)
wassa_train['tweet'].head(10)

wassa_train shape: (5112, 2)


0    What is one thing that you can not live withou...
1    Hahahhaha bells will be ringing to show solida...
2    The Compassionate Civilization Collaborative (...
3    They say much was spared, but the images of HA...
4    "I may be small. I may be a girl, but I won’t ...
5    HASHTAG Today we have begun the definitive pha...
6    "The highest education is that which does not ...
7    “Hi Friends! Lots of people are making comment...
8    HASHTAG election: socialist party HASHTAG decl...
9    THAT EPISODE WAS FUCKING EVERYTHING... 🤯🤯🤯🤯🤯 H...
Name: tweet, dtype: object

In [None]:
# Preview the first ten emotion labels (stored as integer codes).
wassa_train['emotion'].head(10)

0    4
1    0
2    2
3    5
4    4
5    3
6    4
7    4
8    4
9    3
Name: emotion, dtype: int64

In [None]:
# Separate each split into its text column (X) and its emotion label column (y).
X_train, y_train = wassa_train['tweet'], wassa_train['emotion']
X_test, y_test = wassa_dev['tweet'], wassa_dev['emotion']

In [None]:
# Sanity check: print the shape of every split, one per line,
# in the order X_train, y_train, X_test, y_test.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)), sep='\n')

(5112,)
(5112,)
(2191,)
(2191,)


In [None]:
# Label mapping as originally recorded:
# label: 'sadness': 0, 'neutral': 6, 'fear': 2, 'anger': 3, 'disgust': 4, 'surprise': 5, 'joy': 6
# NOTE(review): the mapping above assigns id 6 twice and never mentions id 1, although
# label 1 does occur in the data — confirm the real id-to-emotion mapping before relying on it.
# Class distribution of the training labels.
y_train.value_counts()

4    2313
3    1427
1     536
5     291
0     274
6     165
2     106
Name: emotion, dtype: int64

In [None]:
# Class distribution of the held-out labels, for comparison with the training split.
y_test.value_counts()

4    992
3    612
1    229
5    125
0    118
6     70
2     45
Name: emotion, dtype: int64

# Augmentation using ProtAugmenter

In [None]:
! pip install nlpaug==1.1.7

In [None]:
from tqdm.notebook import tqdm
# Register tqdm's progress-aware `progress_apply` on pandas objects.
tqdm.pandas()

# Pair every training tweet with its label under the generic column
# names 'text' and 'label', then show the resulting dimensions.
data = dict(text=X_train, label=y_train)
base_train = pd.DataFrame(data)
base_train.shape

(5112, 2)

In [None]:
from nlpaug.augmenter.word import WordAugmenter

class ProtAugmenter(WordAugmenter):
    """Paraphrasing augmenter built on the ProtAugment paraphrase generator.

    Loads the ``tdopierre/ProtAugment-ParaphraseGenerator`` seq2seq checkpoint
    and produces beam-search paraphrases of an input text. The augmenter is
    registered with nlpaug under the ``'substitute'`` action.

    Args:
        n_data: Number of paraphrases to generate per input text
            (forwarded to ``generate`` as ``num_return_sequences``).
        num_beams: Beam width used during generation.
        name: Augmenter name forwarded to the nlpaug base class.

    Raises:
        ValueError: If ``n_data`` exceeds ``num_beams``; beam search cannot
            return more sequences than it keeps beams.
    """

    def __init__(self, n_data = 1, num_beams = 10,
        name='ProtAugmenter'):
        # Fail fast, before the (slow) model download, instead of at the first generate() call.
        if n_data > num_beams:
            raise ValueError(
                'n_data (%d) must not exceed num_beams (%d)' % (n_data, num_beams))

        # Forward `name` so the augmenter is actually labelled as requested.
        super().__init__(
            action='substitute', name=name)

        import torch
        from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

        checkpoint = "tdopierre/ProtAugment-ParaphraseGenerator"

        # Prefer the GPU, but stay usable on a CPU-only runtime.
        # Kept in a private attribute so it does not clash with base-class state.
        self._torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
        model = model.to(self._torch_device)

        self.model = model
        self.tokenizer = tokenizer
        self.n_data = n_data
        self.num_beams = num_beams

    def substitute(self, data, n=1):
        """Generate paraphrases of ``data``.

        Args:
            data: Text to paraphrase.
            n: Unused; kept for compatibility with the nlpaug action signature.
                The number of paraphrases is controlled by ``self.n_data``.

        Returns:
            The decoded generated sequences (special tokens removed) as a list
            of strings.
        """
        # Truncate over-long inputs to the tokenizer's model limit rather than overflowing it.
        batch = self.tokenizer(data, return_tensors='pt', truncation=True).to(self._torch_device)
        generated_ids = self.model.generate(
            input_ids=batch['input_ids'],
            # Pass the mask explicitly so generation never has to guess which tokens are padding.
            attention_mask=batch['attention_mask'],
            num_return_sequences=self.n_data,
            num_beams=self.num_beams,
        )
        result = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        return result

In [None]:
# Configure the augmenter to produce five paraphrases per input text.
aug_prot = ProtAugmenter(n_data=5)

# Paraphrase every training text (with a progress bar), attach the resulting
# lists as a new 'paraphrase' column on a copy of the base frame, then
# flatten so that each paraphrase gets its own row with a fresh 0..N-1 index.
ProtAugm_train = (
    base_train
    .assign(paraphrase=base_train['text'].progress_apply(aug_prot.augment))
    .explode('paraphrase')
    .reset_index(drop=True)
)

In [None]:
# After exploding the paraphrase lists there is one row per generated paraphrase
# (columns: text, label, paraphrase).
ProtAugm_train.shape

(25560, 3)

# Create CSV file from augmented dataset

In [None]:
# Write the augmented dataset into the Drive dataset folder.
# A `!cd ...` line runs in a throw-away subshell and does NOT change the kernel's
# working directory, so the destination is spelled out explicitly instead.
import os

output_dir = '/content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/'
ProtAugm_train.to_csv(os.path.join(output_dir, 'emoevent_prot_aug_5.csv'), encoding='utf-8', index=False, sep=',')