In [14]:
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as naw

from tqdm import tqdm
from gensim.models import KeyedVectors

In [2]:
def read_sent_and_labels(path):
    """Load a dataset split stored as two parallel text files.

    Reads ``{path}/label`` and then ``{path}/seq.in`` line by line, stripping
    leading/trailing whitespace from every line.

    Parameters:
        path: Directory that contains the ``label`` and ``seq.in`` files.

    Returns:
        tuple: ``(msgs, labels)`` - two lists of strings, one entry per line
        of the respective file (messages first, labels second).
    """
    def _stripped_lines(file_path):
        # One whitespace-stripped string per line of the file.
        with open(file_path, "r") as handle:
            return [raw.strip() for raw in handle]

    labels = _stripped_lines(f'{path}/label')
    msgs = _stripped_lines(f'{path}/seq.in')
    return msgs, labels

In [3]:
# Load the messages and labels stored under ./data/banking77/train_10; every augmentation below starts from these two lists.
msgs_10, labels_10 = read_sent_and_labels("./data/banking77/train_10")

In [4]:
def make_aug(msgs, labels, aug_method, num_gens=1):
    """Augment every message ``num_gens`` times and keep the labels aligned.

    Parameters:
        msgs: Sequence of input messages (strings).
        labels: Sequence of labels, parallel to ``msgs``.
        aug_method: Object exposing ``augment(text)`` (e.g. an nlpaug augmenter).
        num_gens: Number of ``augment`` calls made per input message.

    Returns:
        tuple: ``(new_msgs, new_lbls)`` - flat lists of augmented strings and
        the label of the message each string was derived from.
    """
    new_msgs = []
    new_lbls = []
    for msg, lbl in tqdm(zip(msgs, labels), total=len(msgs)):
        for _ in range(num_gens):
            augmented = aug_method.augment(msg)
            # ``augment`` may hand back a single string or a list of strings,
            # depending on the augmenter/library version. Appending the list
            # itself would later be written to disk as "['...']", so flatten
            # it and repeat the label for every produced string.
            if isinstance(augmented, str):
                augmented = [augmented]
            for text in (augmented or []):
                new_msgs.append(text)
                new_lbls.append(lbl)

    return (new_msgs, new_lbls)

In [5]:
import os
def save_sent_and_labels(path, labels, msgs):
    """Write labels and messages to ``{path}/label`` and ``{path}/seq.in``.

    The directory is created first when it does not exist. Every item is
    written on its own line. Note the argument order: labels come before
    messages.

    Parameters:
        path: Output directory.
        labels: Iterable of labels, one per output line of ``label``.
        msgs: Iterable of messages, one per output line of ``seq.in``.
    """
    if not os.path.exists(path):
        os.makedirs(path)

    outputs = ((f'{path}/label', labels), (f'{path}/seq.in', msgs))
    for file_path, items in outputs:
        with open(file_path, "w") as out_file:
            out_file.writelines(f"{item}\n" for item in items)

# Random swap (RS)

In [33]:
# Augmenter for the "Random swap" section: nlpaug RandomWordAug configured with action="swap".
random_swp_aug = naw.RandomWordAug(action="swap")

In [34]:
# One augmented variant per message (num_gens defaults to 1); the result is a (messages, labels) tuple.
rs_new = make_aug(msgs_10, labels_10, random_swp_aug)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 770/770 [00:00<00:00, 9993.89it/s]


In [43]:
# make_aug returns (messages, labels) but save_sent_and_labels takes (path, labels, msgs), hence the [1], [0] order.
save_sent_and_labels("./data/random_swap", rs_new[1], rs_new[0])

# Random delete (RD)

In [35]:
# Augmenter for the "Random delete" section: nlpaug RandomWordAug configured with action="delete".
random_del_aug = naw.RandomWordAug(action="delete")

In [36]:
# One augmented variant per message (num_gens defaults to 1); the result is a (messages, labels) tuple.
rd_new = make_aug(msgs_10, labels_10, random_del_aug)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 770/770 [00:00<00:00, 12784.17it/s]


In [44]:
# Index [1] holds the labels and [0] the messages, matching the (path, labels, msgs) signature.
save_sent_and_labels("./data/random_delete", rd_new[1], rd_new[0])

# Contextual Replacement (RoBERTa)

In [37]:
# Contextual word substitution driven by the 'roberta-base' model (per the cell output, the model files are downloaded on first use).
contextual_replace_aug = naw.ContextualWordEmbsAug(model_path='roberta-base', action="substitute")

HBox(children=(IntProgress(value=0, description='Downloading', max=481, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=898823, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=456318, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=1355863, style=ProgressStyle(description_wi…




HBox(children=(IntProgress(value=0, description='Downloading', max=501200538, style=ProgressStyle(description_…




In [38]:
# One augmented variant per message; the recorded run took about a minute for 770 messages.
context_replace_new = make_aug(msgs_10, labels_10, contextual_replace_aug)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 770/770 [01:01<00:00, 12.52it/s]


In [45]:
# Index [1] holds the labels and [0] the messages, matching the (path, labels, msgs) signature.
save_sent_and_labels("./data/contextual_replacement", context_replace_new[1], context_replace_new[0])

# Synonym Replacement (WordNet)

In [46]:
# NLTK resources fetched before the WordNet synonym augmenter is built
# (presumably required by SynonymAug - confirm against the nlpaug docs).
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/gdrozdov/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/gdrozdov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/gdrozdov/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [47]:
# Synonym substitution augmenter using WordNet as the synonym source.
synonym_replacement_aug = naw.SynonymAug(aug_src='wordnet')

In [48]:
# One augmented variant per message (num_gens defaults to 1); the result is a (messages, labels) tuple.
synonym_replace_new = make_aug(msgs_10, labels_10, synonym_replacement_aug)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 770/770 [00:05<00:00, 152.19it/s]


In [50]:
# Index [1] holds the labels and [0] the messages, matching the (path, labels, msgs) signature.
save_sent_and_labels("./data/synonym_replace", synonym_replace_new[1], synonym_replace_new[0])

# Word Replacement (Word2Vec)

In [23]:
# Fetch pretrained embeddings. Per the cell output, the gdown call saves GoogleNews-vectors-negative300.bin.gz into ./data/.
# NOTE(review): none of the later cells shown here loads the Word2Vec file or the GloVe 840B archive - confirm both downloads are still needed.
!gdown 0B7XkCwpI5KDYNlNUTTlSS21pQmM -O ./data/
!wget https://nlp.stanford.edu/data/glove.840B.300d.zip -O data/glove.840B.300d.zip

Downloading...
From: https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM
To: /home/gdrozdov/diploma/Text-Augmentation/data/GoogleNews-vectors-negative300.bin.gz
100%|██████████████████████████████████████| 1.65G/1.65G [00:16<00:00, 98.7MB/s]


In [20]:
# Download the GloVe 6B archive to data/glove.6B.zip.
# NOTE(review): no unzip step appears in the cells shown here, yet a later cell reads an extracted glove.6B.200d.txt.
!wget https://nlp.stanford.edu/data/glove.6B.zip -O data/glove.6B.zip

--2022-05-10 14:58:12--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-05-10 14:58:13--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘data/glove.6B.zip’


2022-05-10 15:00:55 (5.07 MB/s) - ‘data/glove.6B.zip’ saved [862182613/862182613]



In [11]:
# Decompress the downloaded Word2Vec binary in place.
# NOTE(review): not re-runnable as-is - once the .gz file has been decompressed this command has nothing to act on.
!gzip -d data/GoogleNews-vectors-negative300.bin.gz

In [52]:
# Embedding-based word substitution using 200-dimensional GloVe 6B vectors.
# NOTE(review): the archive was downloaded to data/glove.6B.zip, but this path points at the
# notebook's working directory and no extraction step is shown - confirm where the .txt file lives.
word_replacement_aug = naw.WordEmbsAug(
    model_type='glove', 
    model_path='./glove.6B.200d.txt',
    action="substitute"
)

In [53]:
# One augmented variant per message (num_gens defaults to 1); the result is a (messages, labels) tuple.
word_replace_new = make_aug(msgs_10, labels_10, word_replacement_aug)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 770/770 [00:29<00:00, 26.39it/s]


In [54]:
# Index [1] holds the labels and [0] the messages, matching the (path, labels, msgs) signature.
save_sent_and_labels("./data/glove_word_replace", word_replace_new[1], word_replace_new[0])

# Back Translation

In [6]:
# Back-translation augmenter: the model names indicate an English -> German model
# followed by a German -> English model (Helsinki-NLP opus-mt checkpoints).
back_translation_aug = naw.BackTranslationAug(
    from_model_name='Helsinki-NLP/opus-mt-en-de', 
    to_model_name='Helsinki-NLP/opus-mt-de-en'
)

In [7]:
# One back-translated variant per message; the recorded run took about 7 minutes for 770 messages.
back_translation_new = make_aug(msgs_10, labels_10, back_translation_aug)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 770/770 [07:15<00:00,  1.47it/s]


In [9]:
# Index [1] holds the labels and [0] the messages, matching the (path, labels, msgs) signature.
save_sent_and_labels("./data/back_translation", back_translation_new[1], back_translation_new[0])

# GPT-2 Generation

In [15]:
import csv
import os
import argparse
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_cosine_with_hard_restarts_schedule_with_warmup
import warnings
warnings.filterwarnings('ignore')

In [19]:
import pandas as pd

In [20]:
# Two-column frame (intent first, message second); MyDataset later reads the columns by position as row[0] and row[1].
df = pd.DataFrame({'intent': labels_10, 'message': msgs_10})

In [22]:
# Written without a header row: MyDataset turns every CSV row into a training
# sample, so a header would be fed to GPT-2 as the bogus example "intent: message".
df.to_csv('./data/banking77.csv', index=False, header=False)

In [23]:
class MyDataset(Dataset):
    """Dataset of ``"<col0>: <col1> <|endoftext|> "`` strings read from a CSV file.

    Every non-blank CSV row becomes one training string built from its first
    two columns. No header handling is done: if the file starts with a header
    row, that row is turned into a sample as well.

    Parameters:
        data_file_name: Path of the CSV file; opened exactly as given.
        data_dir: Accepted for backward compatibility but not used when
            building the path.
    """

    def __init__(self, data_file_name, data_dir='.data/'):
        super().__init__()

        # Only the file name takes part in the path; data_dir is ignored.
        data_path = os.path.join(data_file_name)

        self.data_list = []
        self.end_of_text_token = " <|endoftext|> "

        # newline='' lets the csv module handle line endings inside quoted
        # fields itself; UTF-8 matches what pandas.to_csv writes by default.
        with open(data_path, newline='', encoding='utf-8') as csv_file:
            for row in csv.reader(csv_file):
                if not row:
                    # Blank lines come back as empty lists; skip them
                    # instead of failing on row[0].
                    continue
                self.data_list.append(f"{row[0]}: {row[1]}{self.end_of_text_token}")

    def __len__(self):
        """Number of samples loaded from the CSV file."""
        return len(self.data_list)

    def __getitem__(self, item):
        """Return the formatted training string at position ``item``."""
        return self.data_list[item]

In [24]:
def get_data_loader(data_file_name):
    """Build a shuffling DataLoader over the CSV at ``data_file_name``.

    The file is wrapped in ``MyDataset``; the loader uses ``batch_size=1``,
    so it yields one sample per iteration.
    """
    return DataLoader(MyDataset(data_file_name), batch_size=1, shuffle=True)

In [30]:
def train(epochs, data_loader, batch_size, tokenizer, model, device,
          optimizer=None, scheduler=None):
    """Fine-tune ``model`` with gradient accumulation over single-sample batches.

    ``data_loader`` is expected to yield one text per iteration (``txt[0]`` is
    encoded with ``tokenizer``). Gradients are accumulated and an optimizer and
    scheduler step is taken after every ``batch_size`` samples; samples left
    over at the end of an epoch are applied in a final, smaller step. The
    accumulated loss is printed after every 10 optimizer steps.

    Parameters:
        epochs: Number of passes over ``data_loader``.
        data_loader: Iterable yielding single-sample batches of text.
        batch_size: Number of samples accumulated per optimizer step.
        tokenizer: Object with ``encode(text)`` returning token ids.
        model: Model called as ``model(ids, labels=ids)`` whose first output
            is the loss.
        device: Device the input ids are moved to.
        optimizer: Optimizer to step. When omitted, the module-level
            ``optimizer`` is used (previous behaviour).
        scheduler: LR scheduler to step. When omitted, the module-level
            ``scheduler`` is used (previous behaviour).

    Returns:
        The same ``model`` object after training.
    """
    # Fall back to the notebook globals so existing six-argument calls keep working.
    if optimizer is None:
        optimizer = globals()["optimizer"]
    if scheduler is None:
        scheduler = globals()["scheduler"]

    batch_counter = 0
    sum_loss = 0.0

    def _apply_update():
        # One optimizer/scheduler step plus the periodic loss report.
        nonlocal batch_counter, sum_loss
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        model.zero_grad()
        batch_counter += 1
        if batch_counter == 10:
            print(f"Total Loss is {sum_loss}") #printed after every 10*batch_size
            batch_counter = 0
            sum_loss = 0.0

    model.train()

    for epoch in range(epochs):
        print (f'Running {epoch+1} epoch')

        pending = 0  # samples whose gradients have not been applied yet
        for txt in data_loader:
            ids = torch.tensor(tokenizer.encode(txt[0]))
            ids = ids.unsqueeze(0).to(device)
            outputs = model(ids, labels=ids)
            loss = outputs[0]
            loss.backward()
            # .item() keeps the running sum a plain float instead of a tensor.
            sum_loss += loss.item()
            pending += 1

            # Step after a full batch. The previous `idx % batch_size == 0`
            # check fired at idx 0, i.e. after a single sample.
            if pending == batch_size:
                _apply_update()
                pending = 0

        # Apply the tail of the epoch so its gradients are neither dropped
        # nor mixed into the next epoch's first step.
        if pending:
            _apply_update()

    return model

def save_model(model, name):
    """
    Summary:
        Persist the model's state dict to disk
    Parameters:
        model: Trained model object
        name: Output path; ".pt" is always appended to it
    """
    print("Saving model to Disk")
    weights = model.state_dict()
    target_path = f"{name}.pt"
    torch.save(weights, target_path)

def load_models():
    """
    Summary:
        Fetch the pre-trained 'gpt2-medium' tokenizer and LM-head model
    Returns:
        (tokenizer, model) tuple
    """
    print('Loading/Downloading GPT-2 Model')
    checkpoint = 'gpt2-medium'
    return (
        GPT2Tokenizer.from_pretrained(checkpoint),
        GPT2LMHeadModel.from_pretrained(checkpoint),
    )

In [33]:
# Training hyper-parameters and file locations.
BATCH_SIZE = 32          # samples accumulated per optimizer step
EPOCHS = 10
LEARNING_RATE = 3e-5
WARMUP_STEPS = 300
MAX_SEQ_LEN = 200        # NOTE(review): not referenced by any cell shown here
MODEL_NAME = "banking77.pt"
DATA_FILE = "./data/banking77.csv"

TOKENIZER, MODEL = load_models()
LOADER = get_data_loader(DATA_FILE)

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

model = MODEL.to(DEVICE)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# LOADER yields single samples and one optimizer step is taken per BATCH_SIZE
# samples, so this is the number of scheduler steps the training loop performs.
STEPS_PER_EPOCH = (len(LOADER) + BATCH_SIZE - 1) // BATCH_SIZE
TOTAL_STEPS = EPOCHS * STEPS_PER_EPOCH
if WARMUP_STEPS >= TOTAL_STEPS:
    print(f"Note: WARMUP_STEPS ({WARMUP_STEPS}) >= total optimizer steps ({TOTAL_STEPS}); "
          "the learning rate never leaves the warm-up ramp.")

# The schedule needs the real step count. With num_training_steps=-1 the
# post-warmup progress is >= 1 almost immediately, which drives the learning
# rate to 0 right after the warm-up phase.
scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=TOTAL_STEPS)

Loading/Downloading GPT-2 Model


In [34]:
# Fine-tune GPT-2. No optimizer/scheduler is passed here, so train() relies on the
# module-level `optimizer` and `scheduler` created in the setup cell.
model = train(EPOCHS, LOADER, BATCH_SIZE, TOKENIZER, MODEL, DEVICE)

Running 1 epoch
Total Loss is 1433.6307373046875
Total Loss is 1501.3101806640625
Running 2 epoch
Total Loss is 1274.2281494140625
Total Loss is 1260.2935791015625
Total Loss is 1080.64599609375
Running 3 epoch
Total Loss is 820.4241333007812
Total Loss is 798.3790893554688
Running 4 epoch
Total Loss is 642.8914794921875
Total Loss is 658.754638671875
Total Loss is 634.7021484375
Running 5 epoch
Total Loss is 534.4229736328125
Total Loss is 566.64697265625
Running 6 epoch
Total Loss is 484.8356018066406
Total Loss is 492.71929931640625
Total Loss is 483.59454345703125
Running 7 epoch
Total Loss is 410.1628723144531
Total Loss is 436.7789001464844
Running 8 epoch
Total Loss is 381.4982604980469
Total Loss is 404.4756774902344
Total Loss is 403.80194091796875
Running 9 epoch
Total Loss is 334.16302490234375
Total Loss is 371.2838134765625
Running 10 epoch
Total Loss is 344.3932189941406
Total Loss is 343.9325866699219
Total Loss is 347.8749084472656


In [35]:
# save_model() appends ".pt", so with MODEL_NAME = "banking77.pt" the file on disk is "banking77.pt.pt".
save_model(model, MODEL_NAME)

Saving model to Disk


In [59]:
def find_between( s, first, last ):
    """Return the text of ``s`` between ``first`` and the next ``last``.

    The search for ``last`` starts right after the first occurrence of
    ``first``. An empty string is returned when either marker is missing.
    """
    marker_pos = s.find(first)
    if marker_pos == -1:
        return ""

    begin = marker_pos + len(first)
    stop = s.find(last, begin)
    if stop == -1:
        return ""

    return s[begin:stop]

def choose_from_top_k_top_n(probs, k=50, p=0.8):
    """Sample a token id using combined top-k and nucleus (top-p) filtering.

    The ``k`` most probable entries are ranked by probability; the ranking is
    cut as soon as the cumulative probability reaches ``p``. The surviving
    probabilities are renormalised and one index is drawn from them with
    ``np.random.choice``.

    Parameters:
        probs: 1-D array-like of probabilities, indexed by token id.
        k: Maximum number of candidates; clamped to ``len(probs)``.
        p: Cumulative-probability threshold at which the candidate list is cut.

    Returns:
        int: The sampled index into ``probs``.

    Raises:
        ValueError: If ``probs`` is empty.
    """
    probs = np.asarray(probs)
    if probs.size == 0:
        raise ValueError("probs must contain at least one entry")
    # argpartition fails when k exceeds the array length, so clamp it.
    k = max(1, min(int(k), probs.size))

    top_idx = np.argpartition(probs, -k)[-k:]
    # Most probable first; the stable sort keeps tie order deterministic.
    ranked = top_idx[np.argsort(-probs[top_idx], kind="stable")]

    cumulative = np.cumsum(probs[ranked])
    reached = np.nonzero(cumulative >= p)[0]
    # Keep everything up to and including the entry that crosses p;
    # if p is never reached, keep all k candidates.
    n_keep = int(reached[0]) + 1 if reached.size else ranked.size
    candidates = ranked[:n_keep]

    # float64 avoids np.random.choice rejecting weights that do not sum to 1
    # closely enough after float32 division.
    weights = probs[candidates].astype(np.float64)
    weights = weights / weights.sum()

    # No `size` argument -> scalar result, so int() never sees an array.
    return int(np.random.choice(candidates, p=weights))

def generate(tokenizer, model, sentences, label, max_new_tokens=100):
    """Sample ``sentences`` messages from ``model`` prompted with ``label``.

    For every sample the prompt is the encoded ``label``; tokens are drawn one
    at a time with ``choose_from_top_k_top_n`` until an end-of-text token is
    produced or ``max_new_tokens`` tokens have been generated. The decoded
    text is reduced to the part after the first ``": "``.

    Parameters:
        tokenizer: Tokenizer providing ``encode`` and ``decode``.
        model: Language model; its first output for ``model(ids)`` must be the
            logits.
        sentences: Number of samples to generate.
        label: Prompt text (the intent label).
        max_new_tokens: Upper bound on generated tokens per sample.

    Returns:
        list: ``sentences`` strings, one per sample. A sample is "" only when
        no ``": "`` separator was generated.
    """
    # Follow the model's device instead of assuming CPU.
    device = next(model.parameters()).device
    # Neither value changes between sampling steps, so compute them once.
    eos_ids = set(tokenizer.encode('<|endoftext|>'))
    prompt_ids = torch.tensor(tokenizer.encode(label)).unsqueeze(0).to(device)

    res = []
    with torch.no_grad():
        for _ in range(sentences):
            cur_ids = prompt_ids
            for _ in range(max_new_tokens):
                # Only logits are needed; omitting labels skips the loss computation.
                logits = model(cur_ids)[0]
                softmax_logits = torch.softmax(logits[0, -1], dim=0)

                next_token_id = choose_from_top_k_top_n(softmax_logits.to('cpu').numpy()) #top-k-top-n sampling
                next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
                cur_ids = torch.cat([cur_ids, next_token], dim=1)

                if next_token_id in eos_ids:
                    break

            output_text = tokenizer.decode(cur_ids.squeeze(0).tolist())
            text = find_between(output_text, ": ", " <|endoftext|>")
            if not text:
                # No " <|endoftext|>" terminator (token budget exhausted, or EOS
                # emitted without a leading space): keep what follows the
                # "label: " prefix instead of returning an empty sample.
                _, sep, tail = output_text.partition(": ")
                if sep:
                    text = tail.replace("<|endoftext|>", "").strip()
            res.append(text)

    return res

def load_models(model_name=None):
    """
    Summary:
        Loading the 'gpt2-medium' tokenizer and model, optionally restoring
        fine-tuned weights
    Parameters:
        model_name: Path of a state-dict file written with torch.save. When
            omitted, the plain pre-trained model is returned. This keeps the
            earlier zero-argument ``load_models()`` call working after this
            definition replaces the first one.
    Returns:
        (tokenizer, model) tuple
    """
    if model_name is None:
        print('Loading/Downloading GPT-2 Model')
    else:
        print ('Loading Trained GPT-2 Model')

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

    if model_name is not None:
        # map_location='cpu' lets weights saved from a GPU run be restored on
        # a machine without CUDA; the model itself is created on the CPU.
        state_dict = torch.load(model_name, map_location='cpu')
        model.load_state_dict(state_dict)
    return tokenizer, model

In [60]:
SENTENCES = 10  # passed to generate() as the number of samples per label
# The doubled extension matches what save_model() wrote: it appends ".pt" to "banking77.pt".
MODEL_NAME = "./banking77.pt.pt"
# NOTE(review): LABEL is not referenced by any later cell shown here; the trailing comment looks like a leftover from another dataset.
LABEL = "activate_my_card" # LABEL = "atis_flight"

TOKENIZER, MODEL = load_models(MODEL_NAME)

Loading Trained GPT-2 Model


In [63]:
# Sorted so the labels are processed (and the generated file is written) in the
# same order on every run; plain set iteration order is not reproducible.
uniq_labels = sorted(set(labels_10))

In [68]:
# Parallel accumulators for the generated messages and the label each one was generated for.
gpt2_lbls, gpt2_msgs = [], [] 

In [None]:
# Start from empty lists so that re-running this cell does not append a second
# copy of every generated sample.
gpt2_lbls, gpt2_msgs = [], []
for lbl in tqdm(uniq_labels):
    # SENTENCES samples per label; every sample is stored with the label that prompted it.
    res = generate(TOKENIZER, MODEL, SENTENCES, lbl)
    for cur in res:
        gpt2_msgs.append(cur)
        gpt2_lbls.append(lbl)


  0%|                                                                                                                                                                                                                  | 0/77 [00:00<?, ?it/s][A
  1%|██▌                                                                                                                                                                                                       | 1/77 [00:06<07:56,  6.28s/it][A
  3%|█████▏                                                                                                                                                                                                    | 2/77 [00:12<07:49,  6.26s/it][A
  4%|███████▊                                                                                                                                                                                                  | 3/77 [00:18<07:31,  6.10s/it][A
  5%|██████████▍               

In [72]:
# Sanity check: number of generated messages collected so far.
len(gpt2_msgs)

770

In [70]:
# NOTE(review): looks like a leftover kernel-responsiveness check; it does not touch any data - consider deleting this cell.
print(1)

1


In [75]:
# Persist the GPT-2 generated messages with their labels (argument order: path, labels, msgs).
save_sent_and_labels("./data/gpt2_msgs", gpt2_lbls, gpt2_msgs)

In [48]:
# Inspect the samples returned by the most recent generate() call.
# NOTE(review): the stored output still shows the "label: " prefix and the end-of-text marker,
# so it presumably predates the find_between post-processing in generate() - re-run or remove this cell.
res

['activate_my_card: i need to activate my card <|endoftext|>',
 'activate_my_card: how do i activate my card? <|endoftext|>',
 'activate_my_card: how do i activate my card? <|endoftext|>',
 'activate_my_card: where can i find my new card? <|endoftext|>',
 'activate_my_card: can i start using my card? <|endoftext|>',
 'activate_my_card: can i have my card activated? <|endoftext|>',
 'activate_my_card: i have activated my card. <|endoftext|>',
 'activate_my_card: how do i activate my card? <|endoftext|>',
 'activate_my_card: what is my option to activate my card? <|endoftext|>',
 'activate_my_card: how can i activate my card? <|endoftext|>']