# Data Augmentation for Aggression

In [1]:
## ENABLE GPU BEFORE PROCEEDING WITH NOTEBOOK

! pip3 install -qq transformers
import transformers

In [2]:
import transformers
import numpy as np
import nltk
import pandas as pd
from google.colab import files
from tqdm import tqdm
import warnings

# Fetch the NLTK stop-word corpus used by the masking step
nltk.download('stopwords')

# Fix the random seed so repeated runs pick the same words to mask
transformers.trainer_utils.set_seed(seed = 0)

# Silence every Python warning for the rest of the session
warnings.filterwarnings(action = "ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Set augmentation parameters
# DATA SPECIFIC PARAMETERS

# English stop words from the NLTK 'stopwords' corpus, stored as a set.
# The entries are used as-is, so any membership test against it is case-sensitive.
stop_words = set(nltk.corpus.stopwords.words('english'))

# Task whose label column is augmented.
#   task : name of the label column in the training CSV
#   map  : label -> integer id
#   low  : labels selected for augmentation (presumably the under-represented ones)
aug_map = dict(
    task = 'Sub-task B',
    map = dict(NGEN = 0, GEN = 1),
    low = ('GEN',),
)

# Task that is NOT augmented; same layout as aug_map.
non_aug_map = dict(
    task = 'Sub-task A',
    map = dict(NAG = 0, CAG = 1, OAG = 2),
    low = ('CAG', 'OAG'),
)

# Location of the English training split
train_data_url = 'https://raw.githubusercontent.com/Dutta-SD/NLP/master/Aggression_Detection/trac2_eng_train.csv'

In [4]:
# Load the training split and discard the 'ID' column together with the
# label column of the task that is not being augmented.
train = (
    pd.read_csv(train_data_url)
      .drop(columns = ['ID', non_aug_map['task']])
)

In [5]:
def create_aug_pipeline(model_name : str):
  '''
  Build a transformers "fill-mask" pipeline for the given model.
  model_name : str - model identifier handed to transformers.pipeline
  '''
  fill_mask_pipe = transformers.pipeline(task = "fill-mask", model = model_name)
  return fill_mask_pipe

In [6]:
def stringCleanerMasker(
    ip_string,
    stop_words,
    num_mask_per_str = 1,
    mask_delim = '[MASK]',
    max_valid_length = 512,
    max_mask_delim_replace = 1):
    '''
    Remove stop words from a string and replace one randomly chosen
    remaining word with a mask token.

    ip_string : str - whitespace-separated text to mask
    stop_words : container of str - words to drop (case-sensitive match)
    num_mask_per_str : int - number of candidates drawn; only the first is used
    mask_delim : str - token written in place of the chosen word
    max_valid_length : int - maximum number of words allowed after stop-word removal
    max_mask_delim_replace : int - how many occurrences of the chosen word to
        mask, counted from the left; negative means all (same convention as
        the count argument of str.replace)

    Returns the masked string, or the sentinel 'INVALID' when the input is not
    a string, has no words left after stop-word removal, is longer than
    max_valid_length, or no word could be drawn.
    '''

    # Missing cells arrive as NaN (float) from pandas; they cannot be masked
    if not isinstance(ip_string, str):
        return 'INVALID'

    # Remove Stop Words
    ip_list = [tok for tok in ip_string.split() if tok not in stop_words]

    if not ip_list or len(ip_list) > max_valid_length:
        return 'INVALID'

    # Draw a *position* rather than a word so the choice maps to a whole token
    try:
        mask_index = np.random.choice(len(ip_list), num_mask_per_str)[0]
    except (ValueError, TypeError, IndexError):
        # e.g. num_mask_per_str is zero, negative or not an integer
        return 'INVALID'
    mask_token = ip_list[mask_index]

    # Mask whole tokens only. Replacing on the joined string would also hit
    # substrings (choosing "man" would turn "woman" into "wo[MASK]").
    remaining = max_mask_delim_replace
    masked_tokens = []
    for tok in ip_list:
        if tok == mask_token and remaining != 0:
            masked_tokens.append(mask_delim)
            remaining -= 1
        else:
            masked_tokens.append(tok)

    return ' '.join(masked_tokens)

# Main Augmentation Function

In [7]:
def appendAugDataToDataFrame(
    train,
    aug_pipe_model_name,
    stopwords,
    target_col : str,
    target_label : str,):
    '''
    Create augmented rows for one label with a fill-mask model and return
    the original data with those rows appended.

    train : DataFrame - data to augment; must contain a 'Text' column and target_col
    aug_pipe_model_name : str - model name passed to create_aug_pipeline
    stopwords : container of str - stop words removed before masking
    target_col : str - name of the label column to augment
    target_label : str - label whose rows are augmented

    Every row labelled target_label is masked with stringCleanerMasker and
    passed through the pipeline; each returned candidate sentence becomes a
    new row carrying target_label. Rows that cannot be masked are skipped.

    Returns a new DataFrame with a fresh 0..n-1 index; train is not modified.
    '''
    text_col = 'Text'

    # Texts of the rows that carry the label to augment
    texts_to_augment = train.loc[train[target_col] == target_label, text_col]

    # Augmentation Pipeline
    aug_pipe = create_aug_pipeline(aug_pipe_model_name)

    # Augmented Data, collected as (text, label) records
    aug_rows = []
    for text in tqdm(texts_to_augment, total = len(texts_to_augment)):
        masked_text = stringCleanerMasker(text, stop_words = stopwords)
        if masked_text == 'INVALID':
            # Nothing to mask in this row; skip it instead of failing the run
            continue

        # Generate augmentations: one new row per candidate sentence
        for candidate in aug_pipe(masked_text):
            aug_rows.append((candidate['sequence'], target_label))

    if not aug_rows:
        return train.reset_index(drop = True)

    # Name the columns explicitly so the result does not depend on the column
    # order (or count) of train; concat aligns on column names.
    aug_data = pd.DataFrame(aug_rows, columns = [text_col, target_col])

    # DataFrame.append was removed in pandas 2.0; concat is the replacement
    return pd.concat([train, aug_data], ignore_index = True)

In [8]:
# Augmentation
# Augment the first label listed under aug_map['low'] in the aug_map['task'] column.
# NOTE: the result is assigned back to `train`, so re-running this cell augments
# the already-augmented data again; re-run the data-loading cell first.
train = appendAugDataToDataFrame(
    train = train,
    aug_pipe_model_name = "bert-base-multilingual-cased",
    stopwords = stop_words,
    target_col = aug_map['task'],
    target_label = aug_map['low'][0]
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
309it [01:20,  3.85it/s]


In [9]:
# Per-label row counts of the augmented task's column, after augmentation
train[aug_map['task']].value_counts()

NGEN    3954
GEN     1854
Name: Sub-task B, dtype: int64

# Download the data

In [11]:
# Output file is named after the augmented task
file_name = "AUG_{}_ENGLISH.csv".format(aug_map['task'])

# Write the augmented frame without its index, then hand the file to the
# Colab download helper
train.to_csv(path_or_buf = file_name, index = False)
files.download(file_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>