### Quora preprocessing 

**(~1 h 30 min GPU run time)**

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

### Load data

In [None]:
# Read the raw training CSV into a DataFrame. low_memory=False makes pandas
# process the file in one piece rather than in internal chunks, which avoids
# mixed-dtype inference on columns.
raw_csv_path = "../data/raw/train.csv"
df_raw = pd.read_csv(raw_csv_path, low_memory=False)

# Preview the first rows.
df_raw.head()

In [None]:
# (number of rows, number of columns) of the raw dataset
df_raw.shape

In [None]:
# Downsample the target == 0 rows to a fixed-seed random subset of 480,000
# and keep every target == 1 row.
negatives = df_raw.loc[df_raw["target"] == 0]
positives = df_raw.loc[df_raw["target"] == 1]

sampled_df = negatives.sample(n=480_000, random_state=42)

# Sampled negatives first, then all positives (original index labels kept).
df = pd.concat([sampled_df, positives])
df.head()

In [None]:
# Target samples
# Number of rows in `df` minus a fixed 200_000.
# NOTE(review): the notebook does not say where the 200_000 comes from —
# confirm what this figure is meant to represent.
df.shape[0] - 200_000

### Preprocessing

In [None]:
# Train/Test
# Separate features from the label and hold out 10% of the rows as the test
# set (fixed seed so the split is reproducible).
X, y = df.drop('target', axis=1), df.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42
)

# Build the labelled frames as NEW objects. A plain `train_df = X_train`
# only creates a second name for the same DataFrame, so adding 'target' to it
# would also add the label to X_train / X_test and leak it into any later
# step that treats those frames as features. assign() returns a copy with the
# extra column (aligned on the index) and leaves the feature frames untouched.
train_df = X_train.assign(target=y_train)
test_df = X_test.assign(target=y_test)

### Random Oversampling

In [None]:
# Oversampling
# Rebalance the training split with imbalanced-learn's RandomOverSampler
# (fixed seed for reproducibility).
ros = RandomOverSampler(random_state=42)

# Never hand the label to the sampler as a feature: drop a 'target' column if
# an earlier cell attached one to X_train (errors='ignore' makes this a no-op
# when the column is absent).
features = X_train.drop(columns='target', errors='ignore')
X_resampled, y_resampled = ros.fit_resample(features, y_train)

# assign() returns a new frame, so X_resampled stays features-only instead of
# being mutated through an alias.
resampled_df = X_resampled.assign(target=y_resampled)

### nlpaug

Using:
  * KeyboardAug
  * ContextualWordEmbsAug
  * SynonymAug
  * BackTranslationAug
  * SpellingAug (listed here, but not applied in the cells shown below)

80_000 new examples for each augmenter

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

# Select the target == 1 rows of the training frame (as an independent copy)
# and pull their question text out as a plain Python list.
positive_mask = train_df["target"] == 1
df_1 = train_df.loc[positive_mask].copy()

texts = df_1["question_text"].tolist()
texts[:5]

In [None]:
# Character-level keyboard augmenter, with both the per-word character limit
# and the per-text word limit set to 2 (see nlpaug's KeyboardAug docs for the
# exact semantics of the two caps).
key_board_aug = nac.KeyboardAug(
    aug_word_max=2,
    aug_char_max=2,
)

# One augmented variant per input text.
key_board_texts = key_board_aug.augment(texts)
key_board_texts[:5]

In [None]:
# Word-level synonym augmenter with aug_max=2 (upper bound on the number of
# words it may replace in each text).
synonym_aug = naw.SynonymAug(
    aug_max=2,
)

# Augment every positive question and preview the first few results.
synonym_texts = synonym_aug.augment(texts)
synonym_texts[:5]

In [None]:
# Load the contextual word-embedding augmenter: DistilBERT (uncased) on the
# GPU, with aug_max=4 as the per-text cap on augmented words.
contextual_words_aug = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased',
    aug_max=4,
    device='cuda',
)

# Display the configured augmenter.
contextual_words_aug

In [None]:
# Run the contextual augmenter over `texts` in slices of 64 and collect all
# outputs in one flat list.
contextual_words = []

for i in range(0, len(texts), 64):
    # Progress line every 1024 texts, i.e. on every 16th slice.
    if i % 1024 == 0:
        print(f"{i}/{len(texts)}")
    batch = texts[i:i + 64]
    contextual_words.extend(contextual_words_aug.augment(batch))

contextual_words[:5]

In [None]:
# Back-translation augmenter (translate to another language and back).
# BackTranslationAug is a word-level augmenter: it is exposed by
# nlpaug.augmenter.word (imported as `naw`), not by the character module
# `nac`, and it has no `aug_char_max` option. The translation models are left
# at the library defaults; the augmenter is placed on the GPU like the
# contextual one.
back_translation_aug = naw.BackTranslationAug(device='cuda')
back_translation_aug

In [None]:
# Back-translate `texts` in slices of 64 and collect the outputs.
#
# The batched loop must call the back-translation augmenter and append to
# back_translation_texts. Calling the contextual augmenter here and appending
# to contextual_words would duplicate the contextual examples and leave the
# back-translation to a single unbatched call over the whole list.
back_translation_texts = []

for i in range(0, len(texts), 64):
    # Progress line every 1024 texts, i.e. on every 16th slice.
    if i % 1024 == 0:
        print(f"{i}/{len(texts)}")
    back_translation_texts += back_translation_aug.augment(texts[i: i + 64])

back_translation_texts[:5]

In [None]:
# Pool the outputs of the four augmenters into one flat list, in the order
# keyboard -> synonym -> contextual -> back-translation.
new_texts = [
    *key_board_texts,
    *synonym_texts,
    *contextual_words,
    *back_translation_texts,
]
new_texts[:5]

In [None]:
# Wrap the augmented texts in a frame and label every row as class 1.
# np.ones() defaults to float64, which would store the labels as 1.0 and
# upcast an integer 'target' column to float when this frame is concatenated
# with the original training rows, so an integer dtype is requested
# explicitly. (Presumably the original labels are integers read from the
# CSV — confirm.)
df_nlpaug = pd.DataFrame({
    'question_text': new_texts,
    'target': np.ones(len(new_texts), dtype=np.int64),
})
df_nlpaug.head()

In [None]:
# Append the augmented rows to the original training rows. Columns present in
# only one of the two frames are filled with NaN by concat.
# ignore_index=True gives the result a fresh 0..n-1 index: the augmented frame
# carries its own 0-based index, which would otherwise collide with labels
# inherited by train_df and leave duplicate index values in the combined frame.
train_nlpaug = pd.concat([train_df, df_nlpaug], ignore_index=True)
train_nlpaug.head()

In [None]:
# (number of rows, number of columns) of the augmented training set
train_nlpaug.shape

In [None]:
# Class balance of the augmented training set: number of rows per label.
train_nlpaug["target"].value_counts()

### Save

In [None]:
# Write each prepared frame to its CSV file, in this order. to_csv() is called
# with its defaults, so the row index is written as the first column.
outputs = [
    (resampled_df, "../data/ros/train.csv"),      # randomly oversampled train set
    (train_nlpaug, "../data/nlpaug/train.csv"),   # nlpaug-augmented train set
    (test_df, "../data/processed/test.csv"),      # held-out test set
]

for frame, csv_path in outputs:
    frame.to_csv(csv_path)