# Common approaches to text data augmentation


1. **Random Insertion:** Pick a random word in the sentence that is not a stopword, find a synonym for it, and insert that synonym at a random position in the sentence.

2. **Random Deletion:** Randomly removing words within the sentence.

3. **Random Swapping:** Randomly choose two words within the sentence and swap their positions.
4. **Backtranslation:** A sentence is translated into another language and then translated back into the original language. The round trip usually yields a differently worded sentence with the same meaning.
5. **Generative Models:** A generative adversarial network (GAN) can be trained to generate text from a few seed words, and generative language models such as BERT, RoBERTa, BART and T5 can be used to generate text in a way that better preserves the class label.

# We will use the nlpaug library to augment the dataset with WordNet-based synonym replacement (`SynonymAug`)

In [None]:
!pip install nlpaug
!pip install keras_preprocessing

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11
Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


In [None]:
import nlpaug.augmenter.word as naw
# Build the WordNet-backed synonym augmenter. Every keyword argument is
# spelled out explicitly so the configuration is visible in one place.
aug = naw.SynonymAug(
    aug_src='wordnet',
    model_path=None,
    name='Synonym_Aug',
    aug_min=1,
    aug_max=10,
    aug_p=0.3,
    lang='eng',
    stopwords=None,
    tokenizer=None,
    reverse_tokenizer=None,
    stopwords_regex=None,
    force_reload=False,
    verbose=0,
)

# Smoke test on a short sentence; augment() returns a list, so show its
# first element.
test_sentence = "I am rich"
test_sentence_aug = aug.augment(test_sentence)
print(test_sentence_aug[0])

I be rich


# Adding the augmented data to the original dataset

In [None]:
import pandas as pd
import numpy as np
import bz2
import os
import re
import gc

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.utils import pad_sequences

from tensorflow.keras import models, layers, optimizers

from sklearn.model_selection import train_test_split

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used further down
# in this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

In [None]:
def assign_labels_and_comments(file, limit=20000, augmenter=None):
    """Read labelled comments from a bz2 file and build an augmented copy.

    Each line is decoded as UTF-8; the character at index 9 is parsed as the
    label digit (shifted down by one) and everything from index 10 onward is
    the comment text.
    NOTE(review): this layout looks like fastText's ``__label__<digit> text``
    format -- confirm against the data files.

    Args:
        file: Path (or file object) accepted by ``bz2.BZ2File``.
        limit: Maximum number of lines to read. Defaults to 20000, the cap
            that used to be hard-coded. ``None`` reads the whole file.
        augmenter: Object exposing ``augment(text)`` that returns a sequence
            whose first element is the augmented text. Defaults to the
            module-level ``aug`` instance.

    Returns:
        A 4-tuple ``(labels, comments, extra_labels, extra_comments)`` where
        ``extra_labels`` is an independent copy of ``labels`` and
        ``extra_comments`` holds the augmented version of every comment.
    """
    from itertools import islice

    if augmenter is None:
        # Fall back to the notebook-wide augmenter created in an earlier cell.
        augmenter = aug

    labels = []
    comments = []
    extra_comments = []
    # The context manager guarantees the compressed file is closed, even when
    # decoding or augmentation raises part-way through.
    with bz2.BZ2File(file) as handle:
        for raw_line in islice(handle, limit):
            line = raw_line.decode("utf-8")
            labels.append(int(line[9]) - 1)
            body = line[10:]
            comments.append(body.strip())
            extra_comments.append(augmenter.augment(body)[0].strip())

    # Separate list object: callers extend `labels` with `extra_labels`.
    extra_labels = list(labels)
    return labels, comments, extra_labels, extra_comments

In [None]:
# Read the train and test splits from Drive. Each call returns the original
# labels/comments plus an augmented copy of them.
(train_labels, train_comments, et_l, et_c) = assign_labels_and_comments(
    '/content/drive/MyDrive/Intel SIP/train.ft.txt.bz2'
)
(test_labels, test_comments, ete_l, ete_c) = assign_labels_and_comments(
    '/content/drive/MyDrive/Intel SIP/test.ft.txt.bz2'
)

In [None]:
print(f"Size of dataset before augmentation {len(train_labels)+len(test_labels)}")

# Append the augmented comments after the originals, in place.
# NOTE(review): the test split receives augmented copies as well, so test
# metrics are not measured on purely original data -- confirm this is intended.
train_comments += et_c
test_comments += ete_c

# Labels follow the same original-then-augmented order and become arrays.
train_labels = np.array(train_labels + et_l)
test_labels = np.array(test_labels + ete_l)

print(f"Size of dataset after augmentation {len(train_labels)+len(test_labels)}")

Size of dataset before augmentation 40000
Size of dataset after augmentation 80000


In [None]:
# Any non-word character (punctuation, whitespace, symbols).
not_alphanumeric = re.compile(r'[\W]')
# Anything other than lowercase ASCII letters, digits 0-9 and whitespace.
# The class previously read `0-1`, which silently dropped the digits 2-9.
not_ascii = re.compile(r'[^a-z0-9\s]')


def processed_comments(texts):
    """Normalise raw comments for tokenisation.

    Each text is lowercased, every non-word character is replaced by a single
    space, and every remaining character that is not a lowercase ASCII letter,
    a digit or whitespace is removed.

    Args:
        texts: Iterable of strings.

    Returns:
        A new list with one cleaned string per input, in the same order.
    """
    return [
        not_ascii.sub('', not_alphanumeric.sub(' ', text.lower()))
        for text in texts
    ]
# Clean the raw text of both splits.
train_comments, test_comments = (
    processed_comments(train_comments),
    processed_comments(test_comments),
)

# Hold out 20% of the training data for validation.
# NOTE(review): an earlier cell appends augmented copies to train_comments
# before this split, so an original comment and its augmented twin can land on
# opposite sides of the train/validation boundary -- confirm this is acceptable.
train_comments, val_comments, train_labels, val_labels = train_test_split(
    train_comments,
    train_labels,
    test_size=0.2,
    random_state=42,
)

# Fit the vocabulary on the training portion only, then encode every split.
maximum_features = 14000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=maximum_features)
tokenizer.fit_on_texts(train_comments)
train_comments, val_comments, test_comments = (
    tokenizer.texts_to_sequences(split)
    for split in (train_comments, val_comments, test_comments)
)

# Pad every split to the length of the longest training sequence.
maximum_length = max(map(len, train_comments))
train_comments_pad, val_comments_pad, test_comments_pad = (
    tf.keras.preprocessing.sequence.pad_sequences(split, maxlen=maximum_length)
    for split in (train_comments, val_comments, test_comments)
)

# The unpadded sequences are no longer needed.
del train_comments, val_comments, test_comments

In [None]:
def cnn_model():
    """Build, compile and summarise the 1-D convolutional classifier.

    Reads the module-level ``maximum_length`` (input width) and
    ``maximum_features`` (embedding vocabulary size).

    Returns:
        The compiled Keras model (RMSprop, binary cross-entropy, binary
        accuracy). ``model.summary()`` is printed as a side effect.
    """
    inputs = layers.Input(shape=(maximum_length,))
    net = layers.Embedding(maximum_features, 64)(inputs)

    # Two conv -> batch-norm -> max-pool stages; each stage uses the same
    # number for the convolution kernel size and the pooling window.
    for width in (3, 5):
        net = layers.Conv1D(64, width, activation='relu')(net)
        net = tf.keras.layers.BatchNormalization()(net)
        net = layers.MaxPool1D(width)(net)

    # Final convolution, collapsed over the sequence axis.
    net = layers.Conv1D(64, 5, activation='relu')(net)
    net = layers.GlobalMaxPool1D()(net)
    net = layers.Flatten()(net)

    # Dense head ending in a single sigmoid unit.
    net = layers.Dense(100, activation='relu')(net)
    outputs = layers.Dense(1, activation='sigmoid')(net)

    network = models.Model(inputs=inputs, outputs=outputs)
    network.compile(
        loss='binary_crossentropy',
        optimizer='rmsprop',
        metrics=['binary_accuracy'],
    )
    network.summary()
    return network

model = cnn_model()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 205)]             0         
                                                                 
 embedding (Embedding)       (None, 205, 64)           896000    
                                                                 
 conv1d (Conv1D)             (None, 203, 64)           12352     
                                                                 
 batch_normalization (Batch  (None, 203, 64)           256       
 Normalization)                                                  
                                                                 
 max_pooling1d (MaxPooling1  (None, 67, 64)            0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 63, 64)            20544 

In [None]:
# Train the CNN for three epochs, validating on the held-out split.
model.fit(
    x=train_comments_pad,
    y=train_labels,
    validation_data=(val_comments_pad, val_labels),
    epochs=3,
    batch_size=512,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7be422e683a0>

In [None]:
def rnn_model():
    """Build, compile and summarise the two-layer GRU classifier.

    Reads the module-level ``maximum_length`` (input width) and
    ``maximum_features`` (embedding vocabulary size).

    The recurrent layers use ``layers.GRU`` rather than the deprecated,
    GPU-only ``tf.compat.v1.keras.layers.CuDNNGRU``. Per the TensorFlow GRU
    documentation, a GRU left at its default arguments uses the cuDNN kernel
    when a GPU is available, and it also runs on CPU.

    Returns:
        The compiled Keras model (RMSprop, binary cross-entropy, binary
        accuracy). ``model.summary()`` is printed as a side effect.
    """
    sequences = layers.Input(shape=(maximum_length,))
    embedded = layers.Embedding(maximum_features, 64)(sequences)
    # First GRU emits the full sequence so the second GRU can consume it.
    x = layers.GRU(128, return_sequences=True)(embedded)
    x = layers.GRU(128)(x)
    x = layers.Dense(32, activation='relu')(x)
    x = layers.Dense(100, activation='relu')(x)
    predictions = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['binary_accuracy']
    )
    model.summary()
    return model

# NOTE(review): this rebinds the name `rnn_model` from the builder function to
# the built model, so re-running this line without re-running the `def` fails.
# The name is kept because the training cell below calls `rnn_model.fit(...)`.
rnn_model = rnn_model()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 205)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 205, 64)           896000    
                                                                 
 cu_dnngru (CuDNNGRU)        (None, 205, 128)          74496     
                                                                 
 cu_dnngru_1 (CuDNNGRU)      (None, 128)               99072     
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
 dense_3 (Dense)             (None, 100)               3300      
                                                                 
 dense_4 (Dense)             (None, 1)                 101 

In [None]:
# Train the GRU model with the same schedule and validation split as the CNN.
rnn_model.fit(
    x=train_comments_pad,
    y=train_labels,
    validation_data=(val_comments_pad, val_labels),
    epochs=3,
    batch_size=512,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7be4210b2170>

In [None]:
# Vocabulary cap: passed as `num_words` to the tokenizer and as the input
# dimension of the Embedding layer.
MAX_NB_WORDS = 10000
# Every tokenised comment is padded/truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 250
# Output dimension of the Embedding layer.
EMBEDDING_DIM = 100
def get_labels_and_texts(file, limit=20000):
    """Read labelled texts from a bz2 file.

    Each line is decoded as UTF-8; the character at index 9 is parsed as the
    label digit (shifted down by one) and everything from index 10 onward,
    stripped of surrounding whitespace, is the text.
    NOTE(review): this layout looks like fastText's ``__label__<digit> text``
    format -- confirm against the data files.

    Args:
        file: Path (or file object) accepted by ``bz2.BZ2File``.
        limit: Maximum number of lines to read. Defaults to 20000, the cap
            that used to be hard-coded. ``None`` reads the whole file.

    Returns:
        ``(labels, texts)`` where ``labels`` is a NumPy array of ints and
        ``texts`` is a list of strings in file order.
    """
    from itertools import islice

    labels = []
    texts = []
    # The context manager closes the compressed file even if decoding fails.
    with bz2.BZ2File(file) as handle:
        for raw_line in islice(handle, limit):
            line = raw_line.decode("utf-8")
            labels.append(int(line[9]) - 1)
            texts.append(line[10:].strip())
    return np.array(labels), texts
# Reload the un-augmented train and test splits from Drive.
train_labels, train_texts = get_labels_and_texts(
    '/content/drive/MyDrive/Intel SIP/train.ft.txt.bz2'
)
test_labels, test_texts = get_labels_and_texts(
    '/content/drive/MyDrive/Intel SIP/test.ft.txt.bz2'
)

# One row per comment: the raw text next to its label.
train_df = pd.DataFrame(
    zip(train_texts, train_labels),
    columns=['text', 'label'],
)
test_df = pd.DataFrame(
    zip(test_texts, test_labels),
    columns=['text', 'label'],
)
import regex as re
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.regexp import RegexpStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
def remove_special_characters(text):
    """Lowercase a Series of strings and strip digits, mentions, URLs and punctuation.

    The substitutions are applied in order:

    1. runs of digits are deleted;
    2. the literal token ``@mention`` becomes a space;
    3. ``http://`` / ``https://`` URLs become a space;
    4. ``www.<name>.com`` and bare ``<name>.com`` domains become a space
       (the previous pattern contained ``\\[``, a literal bracket, so the
       ``www.`` form never matched and left a stray ``www`` behind);
    5. a fixed set of punctuation characters is deleted.

    Uses ``Series.str.replace`` so the result does not depend on which
    regex module happens to be bound to the name ``re`` in this notebook.

    Args:
        text: pandas Series of strings.

    Returns:
        A new Series of cleaned strings with the same index.
    """
    substitutions = (
        (r'[0-9]+', ''),
        (r'@mention', ' '),
        (r'https?:\/\/\S+', ' '),
        (r'(?:www\.)?[a-z]+\.com', ' '),
        (r"[_\,\>\(\-:\)\\\/\!\.\^\!\:\];='#]", ''),
    )
    text = text.str.lower()
    for pattern, replacement in substitutions:
        text = text.str.replace(pattern, replacement, regex=True)
    return text
# Clean the text column of both frames.
train_df['text']=remove_special_characters(train_df['text'])
test_df['text']=remove_special_characters(test_df['text'])
from keras.preprocessing import text,sequence


# Fit the vocabulary on the training text. The backslash in `filters` is
# written as `\\` so the literal no longer contains the invalid escape `\]`
# (a SyntaxWarning on recent Python versions); the runtime string is unchanged.
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train_df['text'].values)
word_index = tokenizer.word_index
print(f'Found {len(word_index)} unique tokens.')
from keras_preprocessing.sequence import pad_sequences

# Encode the training text as integer sequences of fixed length.
train_text = tokenizer.texts_to_sequences(train_df['text'].values)
train_text = pad_sequences(train_text, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the labels and hold out 10% of the rows.
y = pd.get_dummies(train_df['label']).values
X_train, X_test, Y_train, Y_test = train_test_split(train_text,y, test_size = 0.10, random_state = 42)

Found 58537 unique tokens.


In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout,SpatialDropout1D,GlobalMaxPooling1D, Dense
import tensorflow as tf

# LSTM classifier: embedding -> spatial dropout -> LSTM -> dense head with a
# two-way softmax, matching the one-hot encoded labels.
model = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=train_text.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=128, activation='relu'),
    Dense(2, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 250, 100)          1000000   
                                                                 
 spatial_dropout1d (Spatial  (None, 250, 100)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense_5 (Dense)             (None, 128)               12928     
                                                                 
 dense_6 (Dense)             (None, 2)                 258       
                                                                 
Total params: 1093586 (4.17 MB)
Trainable params: 1093586 (4.17 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

In [None]:
# Training schedule for the LSTM model.
epochs, batch_size = 3, 512

# Fit on the training portion; 10% of it is set aside for validation.
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# Conclusion

* The **LSTM**-based model would be the preferred model due to its high accuracy and its low loss score.  
* Data augmentation helped the model be more accurate, and training could be performed on a much bigger dataset than before with very little effort.
* Data augmentation was worth the time.
* We should continuously improve the dataset by adding further human-made comments and adding the augmented data to improve the model further.


