In [1]:
# Install libraries
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as nlpaw
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Import utility functions
from src.data_utils import analyze_dist
from src.data_utils import augment_sentence
from src.data_utils import augment_text
from src.data_utils import combine_toxic_classes
from src.data_utils import get_relevant_words
from src.data_utils import undersample_majority

In [2]:
# load the data 
train_valid = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
test_labels = pd.read_csv('data/test_labels.csv')

# check data 
print('Our (training + valid) data has ', train_valid.shape[0], ' rows.')
print('Our test data has ', test.shape[0], ' rows.')
print('Our test label data has ', test_labels.shape[0], ' rows.')

# Allow us to see full text (not truncated)
pd.set_option('display.max_colwidth', None)

Our (training + valid) data has  159571  rows.
Our test data has  153164  rows.
Our test label data has  153164  rows.


## Generating  a UnBalanced Dataset

undersampling the class with more data points, until toxic reaches around 20 % 

In [3]:
# Convert from multi-label --> binary classification
train_valid = combine_toxic_classes(train_valid)

# Undersample majority class (class=0)
unbalanced_df = undersample_majority(train_valid, .42)

# Generate 80-20 train-validation splits
X_train, X_valid, y_train, y_valid = train_test_split(unbalanced_df['comment_text'],
                                                      unbalanced_df['isToxic'],
                                                      train_size=0.8,
                                                      stratify=unbalanced_df['isToxic'],
                                                      shuffle=True,
                                                      random_state=42)

# Output splits of unbalanced dataset
X_train.to_csv('data/unbalanced/X_train.csv', index=False)
X_valid.to_csv('data/unbalanced/X_valid.csv', index=False)
y_train.to_csv('data/unbalanced/y_train.csv', index=False)
y_valid.to_csv('data/unbalanced/y_valid.csv', index=False)

## Generating Balanced Dataset
using nlpaug, to generate synthetic data we will be augmenting it to the dataset making it 50-50 balanced 

In [4]:
to_aug = pd.concat([X_train, y_train], axis=1)

# Select the first 128 words of text (the maximum token length we will be using)
# so that augmentation is only applied to these words.
to_aug['comment_text'].apply(lambda text: get_relevant_words(text, 128))

# Define nlpaug augmentation object 
aug10p = nlpaw.ContextualWordEmbsAug(model_path='bert-base-uncased', aug_min=1, aug_p=0.1, action="substitute")

# Upsample minority class ('isToxic'==1) to create a roughly 50-50 class distribution
balanced_df = augment_text(to_aug, aug10p, 8, 3)

# Get splits for Balanced Dataset
X_train_aug = balanced_df['comment_text']
X_valid_aug = X_valid
y_train_aug = balanced_df['isToxic']
y_valid_aug = y_valid

# Output balanced data
X_train_aug.to_csv('data/balanced/X_train_aug.csv', index=False)
X_valid_aug.to_csv('data/balanced/X_valid_aug.csv', index=False)
y_train_aug.to_csv('data/balanced/y_train_aug.csv', index=False)
y_valid_aug.to_csv('data/balanced/y_valid_aug.csv', index=False)

  0%|          | 0/3 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1670 > 512). Running this sequence through the model will result in indexing errors
  0%|          | 0/3 [15:50<?, ?it/s]


RuntimeError: The size of tensor a (533) must match the size of tensor b (512) at non-singleton dimension 1