In [2]:
import re
import string 
import random
import warnings  
import numpy as np
import pandas as pd 
import seaborn as sns 
from tqdm.notebook import tqdm 
import matplotlib.pyplot as plt

In [3]:
import nltk
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

In [4]:
# Register `progress_apply` on pandas objects, fetch the NLTK resources the
# cleaning helpers below rely on, and silence all warnings for the session.
tqdm.pandas()
for nltk_resource in ('omw-1.4', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)
warnings.filterwarnings('ignore')

[nltk_data] Downloading package omw-1.4 to /home/anindya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/anindya/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anindya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Training data; the cells below read its 'text' and 'reason' columns.
data = pd.read_csv('../Data/train.csv')

In [6]:
# Utility functions used to clean the text.

# Shared WordNet lemmatizer; process_text uses it to lemmatize tokens as verbs.
lemmatizer = WordNetLemmatizer()

# Ordered (pattern, expansion) pairs. Order matters: the two irregular forms
# must be handled before the generic "n't" rule, and "n't" before the bare "'t".
_CONTRACTION_RULES = (
    (r"won\'t", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)


def decontract(text):
    """Expand common English contractions in ``text``.

    The rules are applied in sequence with case-insensitive matching, so
    capitalised input such as "Won't" becomes "will not" instead of falling
    through to the generic "n't" rule and producing "Wo not". Expansions are
    always emitted in lowercase.

    Note that "'s" is always expanded to " is", so possessives are rewritten too.

    Args:
        text: The string to expand.

    Returns:
        The string with every matched contraction replaced.
    """
    for pattern, expansion in _CONTRACTION_RULES:
        text = re.sub(pattern, expansion, text, flags=re.IGNORECASE)
    return text

_ENGLISH_STOPWORDS = None  # filled lazily on the first process_text call


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def process_text(text):
    """Clean a tweet-style string and return its lemmatized tokens joined by spaces.

    Steps, in order:
      1. drop ``$``-prefixed words (e.g. ``$GE``);
      2. drop a leading ``RT`` marker;
      3. drop everything from ``http://`` / ``https://`` to the end of the line;
      4. drop ``#`` characters (the tag word itself is kept);
      5. drop every whitespace-delimited chunk that contains a digit;
      6. expand contractions with ``decontract``;
      7. tokenize with ``TweetTokenizer`` (lowercased, handles stripped,
         repeated characters shortened);
      8. discard stopwords and punctuation, lemmatize what remains as verbs.

    Args:
        text: Raw input string.

    Returns:
        The cleaned tokens joined with single spaces (may be an empty string).
    """
    # `$` has to be escaped: unescaped it is the end-of-string anchor and the
    # substitution would never remove anything.
    text = re.sub(r'\$\w*', '', text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'#', '', text)
    # Raw string avoids the invalid-escape warning of the plain "\S*\d\S*" literal.
    text = re.sub(r"\S*\d\S*", "", text).strip()
    text = decontract(text)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(text)

    stopwords_english = _english_stopwords()
    # Substring test on purpose (kept from the original): a token is treated as
    # punctuation when it occurs anywhere inside this string, which also
    # covers the multi-character token '...'.
    punctuation_chars = string.punctuation + '...'

    texts_clean = [
        lemmatizer.lemmatize(word, "v")
        for word in tokens
        if word not in stopwords_english and word not in punctuation_chars
    ]
    return " ".join(texts_clean)

In [7]:
# Clean both free-text columns in place with process_text.
data['text'] = data['text'].apply(process_text)
data['reason'] = data['reason'].apply(process_text)

### Dividing the data into two samples

In [14]:
# Seeded draw of 1000 rows; keep their index labels so the complement can be
# selected afterwards.
sample1 = data.sample(n=1000, random_state=143)
sample1_index = sample1.index

In [15]:
# Number of rows that remain once the 1000 rows of sample1 are taken out.
left = len(data) - 1000
# sample2 is every row of `data` whose index label is not in sample1.
sample2 = data.loc[~data.index.isin(sample1_index)]

In [16]:
# Give both samples a fresh 0..n-1 index, discarding the labels inherited from `data`.
sample1 = sample1.reset_index(drop=True)
sample2 = sample2.reset_index(drop=True)

**Negative sampling for Sample1**

In [17]:
import nlpaug
import nlpaug.augmenter.word as naw

2022-12-30 18:35:15.144452: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-30 18:35:15.348193: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-12-30 18:35:16.040137: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-12-30 18:35:16.040286: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] 

In [18]:
# Contextual word-substitution augmenter backed by bert-base-uncased;
# augment_text below calls it on each row.
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased',
    action="substitute",
    aug_max=3,
)

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [19]:
def augment_text(text):
    """Return one augmented variant of ``text`` produced by the ``aug`` augmenter.

    Cleaned rows can be empty strings, and an augmenter may hand back an empty
    result for input it cannot change; in both situations the input is
    returned unchanged instead of failing on the ``[0]`` lookup.

    Args:
        text: The string to augment.

    Returns:
        The first augmented variant, or ``text`` itself when there is nothing
        to augment.
    """
    if isinstance(text, str) and text == '':
        return text
    variants = aug.augment(text, n=1)
    # NOTE(review): assumes `augment` returns a sequence of variants — confirm
    # against the installed nlpaug version.
    return variants[0] if len(variants) > 0 else text

In [20]:
# Overwrite both text columns of sample1 with their augmented variants, then
# label every row 0.
sample1['text'] = sample1['text'].progress_apply(augment_text)
sample1['reason'] = sample1['reason'].progress_apply(augment_text)
sample1['label'] = 0

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

**Negative sampling for Sample2**

In [23]:
from transformers import pipeline

# GPT-2 text-generation pipeline used by generate_text_from_prompt.
generator = pipeline('text-generation', model='gpt2')

In [24]:
# NOTE(review): `text_samples` is not referenced again in the visible notebook.
text_samples = []
# Cleaned 'reason' strings of sample2, used as generation prompts below.
sampled_reasons = sample2['reason'].tolist()

In [25]:
def generate_text_from_prompt(prompt):
    """Generate a continuation of ``prompt`` with ``generator`` and return only the new text.

    ``max_length`` is a token budget that includes the prompt, whereas
    ``len(prompt)`` counts characters. The character count is kept as the
    budget (so existing behaviour is unchanged wherever it worked), but it is
    raised to "prompt tokens + 1" when the prompt is so short that no room
    would be left for at least one generated token. An empty prompt returns an
    empty string without calling the model.

    Args:
        prompt: Text to continue.

    Returns:
        The generated continuation with the prompt prefix removed and all
        newline characters deleted.
    """
    if prompt == '':
        return ''

    prompt_token_count = len(generator.tokenizer.encode(prompt))
    length_budget = max(len(prompt), prompt_token_count + 1)

    outputs = generator(
        prompt,
        max_length=length_budget,
        pad_token_id=50256,  # NOTE(review): presumably GPT-2's end-of-text id — confirm
        num_return_sequences=1,
    )
    # The pipeline output starts with the prompt itself; slice it off.
    generated_reason = outputs[0]['generated_text'][len(prompt):]
    return generated_reason.replace('\n', '')

In [26]:
# One generated continuation per sample2 'reason' prompt, in the same order.
generated_texts = [
    generate_text_from_prompt(prompt)
    for prompt in tqdm(sampled_reasons, total=len(sampled_reasons))
]

  0%|          | 0/1061 [00:00<?, ?it/s]

In [27]:
# One generated continuation per sample2 'text' prompt, in the same order.
sampled_texts = sample2['text'].tolist()
generated_reasons = [
    generate_text_from_prompt(prompt)
    for prompt in tqdm(sampled_texts, total=len(sampled_texts))
]

  0%|          | 0/1061 [00:00<?, ?it/s]

In [33]:
# Attach the generated strings to sample2 as two new columns.
for column_name, column_values in (
    ('generated_text', generated_texts),
    ('generated_reason', generated_reasons),
):
    sample2[column_name] = column_values

In [40]:
# Run the generated strings through the same cleaning as the original columns.
sample2['generated_text'] = sample2['generated_text'].progress_apply(process_text)
sample2['generated_reason'] = sample2['generated_reason'].progress_apply(process_text)

  0%|          | 0/1061 [00:00<?, ?it/s]

  0%|          | 0/1061 [00:00<?, ?it/s]

In [42]:
# Every row of sample2 gets label 0.
sample2['label'] = 0

In [44]:
# Keep only the generated columns and the label; the original text/reason are dropped.
sample2 = sample2.loc[:, ['generated_text', 'generated_reason', 'label']]

In [50]:
# Rename the generated columns so sample2 has the same column names as sample1.
sample2.columns = ['text', 'reason', 'label']
# Preview the first rows.
sample2.head()

Unnamed: 0,text,reason,label
0,timewe meet share class time,think case bite easier find,0
1,magazine addition also additional support offl...,installationproudly make easy,0
2,stability make search base search significantl...,app support video audio,0
3,low quality higher quality positive,think seem really crappy video,0
4,,sign password mail,0


In [46]:
# Create the output directory with plain Python instead of the `mkdir`
# automagic; exist_ok=True keeps the cell safe to re-run.
import os

os.makedirs('samples', exist_ok=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [48]:
# Persist both samples; the index is written as the first (unnamed) CSV column.
for frame, csv_path in (
    (sample1, 'samples/sample1.csv'),
    (sample2, 'samples/sample2.csv'),
):
    frame.to_csv(csv_path)

### Approach 3: 500 + 500 data points

In [62]:
# Four nlpaug word-level augmenters, each created with aug_max=3. random_aug
# below applies them to different quarters of a frame.
# NOTE(review): confirm which parameter the positional 'wordnet' binds to in
# AntonymAug — verify against the nlpaug documentation.
opp_aug = naw.AntonymAug('wordnet',aug_max=3)
syn_aug = naw.SynonymAug('wordnet', aug_max=3)
# Random word deletion / substitution.
del_aug = naw.RandomWordAug(aug_max=3, action='delete')
sub_aug = naw.RandomWordAug(aug_max=3, action='substitute')

In [53]:
def augment_text_oppo(text):
    """Return one antonym-augmented variant of ``text``.

    The original body called ``aug2``, a name that is never defined in this
    notebook, so the cell only worked with leftover kernel state. It now uses
    ``opp_aug``, the antonym augmenter defined in the notebook.
    NOTE(review): confirm that ``opp_aug`` is the augmenter ``aug2`` stood for.

    Empty input, or an empty result from the augmenter, yields ``text``
    unchanged instead of failing on the ``[0]`` lookup.

    Args:
        text: The string to augment.

    Returns:
        The first augmented variant, or ``text`` itself when there is nothing
        to augment.
    """
    if isinstance(text, str) and text == '':
        return text
    variants = opp_aug.augment(text, n=1)
    return variants[0] if len(variants) > 0 else text

In [57]:
# Seeded draw of 1000 rows from the cleaned data for the third approach.
sample1000 = data.sample(n=1000, random_state=119)

In [63]:
def random_aug(df):
    """Augment ``df`` in four positional chunks and shuffle texts between chunks.

    Rows 0-249, 250-499, 500-749 and 750-end are processed separately:

    ======  =====================  ==================
    chunk   'text' augmenter       'reason' augmenter
    ======  =====================  ==================
    1       ``opp_aug``            ``syn_aug``
    2       ``del_aug``            ``sub_aug``
    3       ``opp_aug``            ``del_aug``
    4       ``sub_aug``            ``syn_aug``
    ======  =====================  ==================

    Afterwards chunk 2 receives chunk 4's augmented texts and chunk 3 receives
    chunk 2's augmented texts, so those rows pair a text with a reason from a
    different row.

    Each chunk is an explicit copy, so ``df`` is never written to and no
    chained-assignment ambiguity arises.

    Args:
        df: Frame with string columns 'text' and 'reason'. Chunks 2, 3 and 4
            must have equal length (true for exactly 1000 rows, or for at
            most 250 rows where they are all empty).

    Returns:
        A new frame: the four processed chunks concatenated in order.

    Raises:
        ValueError: If chunks 2, 3 and 4 differ in length, which would make
            the text swap impossible.
    """
    def _augment(column, augmenter):
        """Apply ``augmenter`` to every value, keeping a value when no variant comes back."""
        def _first_variant(value):
            variants = augmenter.augment(value, n=1)
            return variants[0] if len(variants) > 0 else value
        return column.progress_apply(_first_variant)

    df1 = df.iloc[:250, :].copy()
    df2 = df.iloc[250:500, :].copy()
    df3 = df.iloc[500:750, :].copy()
    df4 = df.iloc[750:, :].copy()

    # Fail before any (slow) augmentation instead of at the swap further down.
    if not (len(df2) == len(df3) == len(df4)):
        raise ValueError(
            "random_aug needs chunks 2-4 of equal length to swap texts; got "
            f"{len(df2)}, {len(df3)} and {len(df4)} rows from {len(df)} input rows"
        )

    df1['text'] = _augment(df1['text'], opp_aug)
    df1['reason'] = _augment(df1['reason'], syn_aug)

    df2['text'] = _augment(df2['text'], del_aug)
    df2['reason'] = _augment(df2['reason'], sub_aug)

    # NOTE(review): this result is overwritten by the swap below — confirm
    # whether chunk 3's antonym-augmented text was meant to be used somewhere.
    df3['text'] = _augment(df3['text'], opp_aug)
    df3['reason'] = _augment(df3['reason'], del_aug)

    df4['text'] = _augment(df4['text'], sub_aug)
    df4['reason'] = _augment(df4['reason'], syn_aug)

    chunk2_texts = df2['text'].tolist()
    chunk4_texts = df4['text'].tolist()

    df2['text'] = chunk4_texts
    df3['text'] = chunk2_texts

    return pd.concat([df1, df2, df3, df4], axis=0)

In [55]:
# Overwrite both text columns of sample1000 with the output of augment_text_oppo.
sample1000['text'] = sample1000['text'].progress_apply(augment_text_oppo)
sample1000['reason'] = sample1000['reason'].progress_apply(augment_text_oppo)

  0%|          | 0/1000 [00:00<?, ?it/s]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anindya/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


  0%|          | 0/1000 [00:00<?, ?it/s]

In [64]:
# Apply the chunk-wise augmentation and text swap defined in random_aug.
sample1000_aug = random_aug(sample1000)

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

In [67]:
# Every row of the augmented sample gets label 0.
sample1000_aug['label'] = 0

In [69]:
# Save the augmented, labelled frame. The original cell wrote `sample1000`,
# which has neither the random_aug output nor the 'label' column, so the
# result of the two preceding cells was never persisted.
sample1000_aug.to_csv('samples/sample1000.csv')

### Done