### Quora preprocessing 

**(~1:30h gpu run time)**

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

### Load data

In [2]:
df_raw = pd.read_csv("../data/raw/train.csv", low_memory=False)
df_raw.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [3]:
df_raw.shape

(1306122, 3)

In [4]:
sampled_df = df_raw[df_raw.target == 0].sample(n=480_000, random_state=42)

df = pd.concat(
    [sampled_df, df_raw[df_raw.target == 1]]
)
df.head()

Unnamed: 0,qid,question_text,target
1022714,c86ab618f85e9b7fc374,Is sadism a coping mechanism for people who ar...,0
641364,7d9ea6d66b8866e69240,Is it possible for me as a soul to go outside ...,0
1225111,f01982d0cd06aba308ed,Do Pet Animal Rescue workers minimize shows of...,0
1130433,dd8a6b5452a407cea2ac,How do you identify a sonnet and what can we i...,0
1220402,ef30e73bf0a81a06ccf6,Is there a special place in hell for the likes...,0


In [5]:
# Target samples
df.shape[0] - 200_000

360810

### Preprocessing

In [6]:
# Train/Test
X, y = df.drop('target', axis=1), df.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42
)

train_df = X_train
train_df['target'] = y_train

test_df = X_test
test_df['target'] = y_test

### Random Oversampling

In [7]:
# Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

resampled_df = X_resampled
resampled_df['target'] = y_resampled

### nlpaug

Using:
  * KeyboardAug
  * ContextualWordEmbsAug
  * SynonymAug
  * BackTranslationAug
  * SpellingAug

80_000 new examples for each augment 

In [8]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

df_1 = train_df[train_df.target==1].copy()

texts = list(df_1.question_text)
texts[:5]

['Do you see a time where white people will become Trump supporters and start systematically killing Hispanics just like Germany did with Jewish people and other minorities back in the 1930s?',
 'Are Brahmins responsible for the perpetuation of caste system and caste-based discrimination in India?',
 'Why do the Kardashians and Jenner sisters only date black men? Why do the Kardashian sisters seem to prefer black boyfriends?',
 'Why are books about ‘how to rape a woman’ in schools in the US?',
 'How can I train my girlfriend to stop speaking out of place in public?']

In [10]:
key_board_aug = nac.KeyboardAug(aug_char_max=2, aug_word_max=2)
key_board_texts = key_board_aug.augment(texts)
key_board_texts[:5]

['Do you see a time where white people will become Trump supporters and start systematically killing Hispanics just like Germany did with Jeeiah people and other minorities back in the 1920d?',
 'Are Brahmins responsible for the perpe4uatiIn of caste system and caste - based discrihinatlon in India?',
 'Why do the Kardashians and Jenner siCterx only date black men? Why do the Kardashian sisters seem to prebe% black boyfriends?',
 'Why are books about ‘ how to rape a wLmWn ’ in sfBools in the US?',
 'How can I 5raiH my girlfriend to stop speaking out of olacW in public?']

In [44]:
synonym_aug = naw.SynonymAug(aug_max=2)
synonym_texts = synonym_aug.augment(texts)
synonym_texts[:5]

['Do you see a time where white people will become Trump help and start systematically killing Hispanics just like Germany did with Jewish people and other minority back in the 1930s?',
 'Ar Brahmins responsible for the perpetuation of caste organization and caste - based discrimination in India?',
 'Why do the Kardashians and Jenner sisters only date black men? Why do the Kardashian sisters seem to prefer black fellow?',
 'Why are books about ‘ how to rape a woman ’ in schooling in the u?',
 'How can I train my lady friend to stop speaking out of place in public?']

In [12]:
# Load contextual words
contextual_words_aug = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', aug_max=4, device='cuda')
contextual_words_aug

<nlpaug.augmenter.word.context_word_embs.ContextualWordEmbsAug at 0x7fc67db84c10>

In [None]:
contextual_words = []

for i in range(0, len(texts), 64):
    if i%1024==0:
        print(f"{i}/{len(texts)}")
    contextual_words += contextual_words_aug.augment(texts[i: i + 64])

contextual_words[:5]

In [None]:
#back_translation_aug = nac.BackTranslationAug(aug_char_max=2)
#back_translation_aug

In [None]:
#back_translation_texts = []
#
#for i in range(0, len(texts), 64):
#    if i%1024==0:
#        print(f"{i}/{len(texts)}")
#    contextual_words += contextual_words_aug.augment(texts[i: i + 64])
#
#back_translation_texts = back_translation_aug.augment(texts)
#back_translation_texts[:5]

In [56]:
new_texts = key_board_texts + synonym_texts + contextual_words # + back_translation_texts
new_texts[:5]

['Do you see a time where white people will become Trump supporters and start systematically killing Hispanics just like Germany did with Jeeiah people and other minorities back in the 1920d?',
 'Are Brahmins responsible for the perpe4uatiIn of caste system and caste - based discrihinatlon in India?',
 'Why do the Kardashians and Jenner siCterx only date black men? Why do the Kardashian sisters seem to prebe% black boyfriends?',
 'Why are books about ‘ how to rape a wLmWn ’ in sfBools in the US?',
 'How can I 5raiH my girlfriend to stop speaking out of olacW in public?']

In [57]:
df_nlpaug = pd.DataFrame({'question_text': new_texts, 'target': np.ones(len(new_texts))})
df_nlpaug.head()

Unnamed: 0,question_text,target
0,Do you see a time where white people will beco...,1.0
1,Are Brahmins responsible for the perpe4uatiIn ...,1.0
2,Why do the Kardashians and Jenner siCterx only...,1.0
3,Why are books about ‘ how to rape a wLmWn ’ in...,1.0
4,How can I 5raiH my girlfriend to stop speaking...,1.0


In [58]:
train_nlpaug = pd.concat([train_df, df_nlpaug])
train_nlpaug.head()

Unnamed: 0,qid,question_text,target
90150,11aaa9f9ddea69b5809f,What is the meaning of merits of the case?,0.0
58642,0b816d1f1762919ca21d,How do I find out the original owner of house ...,0.0
350652,44b93de11db2266b17ac,Why was Veer Savarkar linked to Mahatma Gandhi...,0.0
982543,c07df1117a2439500d86,What is the best moveset for Xurkitree?,0.0
743026,918731b2ade71a2cd655,Which is the best portal to buy dermatologists...,0.0


In [59]:
train_nlpaug.shape

(723216, 3)

In [60]:
train_nlpaug.target.value_counts()

0.0    431900
1.0    291316
Name: target, dtype: int64

### Save

In [61]:
# train datasets
resampled_df.to_csv("../data/ros/train.csv")
train_nlpaug.to_csv("../data/nlpaug/train.csv")

# nlpaug
test_df.to_csv("../data/processed/test.csv")