In [None]:
!pip install nlpaug
!pip install nltk
!pip install sacremoses
!pip install datasets
!pip install accelerate -U
!pip install transformers[torch]



In [None]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
import pandas as pd
import accelerate
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import MarianMTModel, MarianTokenizer
from datasets import Dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter, defaultdict
import math
import copy
import random
import operator
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf
import time
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from transformers.modeling_outputs import TokenClassifierOutput
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset


# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer data
# (word_tokenize), the stop-word lists (stopwords.words) and WordNet
# (WordNetLemmatizer). Each call is a no-op when the package is already present.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
class BaseModel(nn.Module):
    """RoBERTa-base binary classifier bundled with its tokenizer and a HF ``Trainer``.

    ``train`` hands ``self.model`` (the stock ``RobertaForSequenceClassification``)
    to ``Trainer``; the extra ``classifier``/``activation``/``dropout`` head used by
    ``forward`` is therefore *not* what ``Trainer`` optimises. ``forward`` is kept
    usable for callers that invoke this wrapper directly.
    """

    def __init__(self, num_epochs=1):
        """Load the pretrained tokenizer/model and prepare the training arguments.

        Args:
            num_epochs: number of training epochs passed to ``TrainingArguments``.
        """
        super(BaseModel, self).__init__()

        self.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
        self.label2id = {"NEGATIVE": 0, "POSITIVE": 1}
        self.num_labels = 2

        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=self.num_labels, id2label=self.id2label, label2id=self.label2id)
        # Head used only by forward(): sigmoid probabilities scored with BCE.
        self.loss_fn = nn.BCELoss()
        self.classifier = nn.Linear(768, self.num_labels)
        self.activation = nn.Sigmoid()
        self.dropout = nn.Dropout(0.1)
        # Populated by train(); the evaluate_* helpers require it.
        self.trainer = None

        self.train_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=num_epochs,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            warmup_steps=500,
            weight_decay=0.01,
            logging_strategy='steps',
            logging_steps=10,
            evaluation_strategy="epoch",
            logging_dir='./logs',
        )

    def compute_metrics(self, pred):
        """Return accuracy, F1, precision and recall for one evaluation pass.

        Args:
            pred: object exposing ``label_ids`` and ``predictions``; the predicted
                class is the argmax over the last axis of ``predictions``.

        Returns:
            dict with keys ``accuracy``, ``f1``, ``precision`` and ``recall``
            (binary averaging).
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average="binary"
        )
        acc = accuracy_score(labels, preds)
        return {
            "accuracy": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
        }

    def apply_tokenizer(self, batch):
        """Tokenize ``batch["text"]`` with truncation to 100 tokens and padding."""
        return self.tokenizer(
            batch["text"],
            truncation=True,
            padding=True,
            max_length=100,
            add_special_tokens=True,
        )


    def forward(self, input_ids, attention_mask=None, labels=None):
        """Score a batch with the custom sigmoid head on top of the RoBERTa encoder.

        Args:
            input_ids: token ids, indexed as (batch, sequence).
            attention_mask: optional mask forwarded to the encoder.
            labels: optional targets; either class ids (one per example) or a
                tensor already shaped like the output probabilities.

        Returns:
            ``TokenClassifierOutput`` whose ``logits`` hold the sigmoid
            probabilities (one row per example, ``num_labels`` columns) and whose
            ``loss`` is the BCE loss, or ``None`` when ``labels`` is omitted.
        """
        # The sequence-classification wrapper returns only logits, so call the
        # underlying encoder to obtain per-token hidden states.
        encoder_out = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # First token (<s>) serves as the sequence representation.
        cls_outputs = encoder_out.last_hidden_state[:, 0, :]

        dropout_output = self.dropout(cls_outputs)

        outputs = self.classifier(dropout_output)

        logits = self.activation(outputs).view(-1, self.num_labels).float()

        loss = None

        if labels is not None:
            if labels.shape == logits.shape:
                targets = labels.to(logits.dtype)
            else:
                # BCELoss needs float targets shaped like its input, so expand
                # integer class ids to one-hot rows.
                targets = nn.functional.one_hot(
                    labels.view(-1).long(), num_classes=self.num_labels
                ).to(logits.dtype)
            loss = self.loss_fn(logits, targets)

        return TokenClassifierOutput(loss=loss, logits=logits)


    def train(self, train_df=True, dev_df=None):
        """Fine-tune ``self.model`` on ``train_df``, evaluating on ``dev_df``.

        This method shadows ``nn.Module.train(mode)``. To keep ``.eval()`` and
        ``.train()`` / ``.train(False)`` working, a boolean first argument is
        forwarded to ``nn.Module.train`` and the module is returned.

        Args:
            train_df: pandas DataFrame with the training rows (must contain the
                ``text`` column read by ``apply_tokenizer``), or a bool mode flag.
            dev_df: pandas DataFrame used as the evaluation set.
        """
        if isinstance(train_df, bool):
            return super().train(train_df)

        train_hf = Dataset.from_pandas(train_df)
        dev_hf = Dataset.from_pandas(dev_df)

        tokenized_train = train_hf.map(self.apply_tokenizer, batched=True)
        tokenized_dev = dev_hf.map(self.apply_tokenizer, batched=True)

        self.trainer = Trainer(
            model=self.model,
            args=self.train_args,
            tokenizer=self.tokenizer,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_dev,
            compute_metrics=self.compute_metrics
        )

        self.trainer.train()

    def evaluate_train(self, train_df):
        """Tokenize ``train_df`` and return the trainer's evaluation metrics on it."""
        input_hf = Dataset.from_pandas(train_df)
        tokenized_input = input_hf.map(self.apply_tokenizer, batched=True)
        return self.trainer.evaluate(tokenized_input)

    def evaluate_dev(self):
        """Return evaluation metrics on the dev set supplied to ``train``."""
        return self.trainer.evaluate()

In [None]:
class MultimodalModel(nn.Module):
    """RoBERTa text encoder combined with category, country and text-length features.

    NOTE(review): ``train`` passes ``self.model`` (the plain
    ``RobertaForSequenceClassification``) to ``Trainer``, so the embeddings,
    ``combined_fc`` and ``classifier`` defined here are not updated by
    ``train`` and do not influence the reported metrics. ``forward`` is only
    exercised when this wrapper is called directly.
    """

    def __init__(self, train_batch=16, eval_batch=16, lr=5e-5, embedding_dim=32, num_categories=10, num_countries=20, num_epochs=1):
        """Build the encoder, the side-feature layers and the training arguments.

        Every argument has a default so that ``MultimodalModel(num_epochs=5)``
        (the form used throughout this notebook) constructs successfully.

        Args:
            train_batch: per-device training batch size.
            eval_batch: per-device evaluation batch size.
            lr: learning rate handed to ``TrainingArguments``.
            embedding_dim: width of each side-feature representation.
            num_categories: number of rows in the category embedding table.
            num_countries: number of rows in the country embedding table.
            num_epochs: number of training epochs.
        """
        super(MultimodalModel, self).__init__()
        self.num_labels = 2  # Assuming binary classification
        self.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
        self.label2id = {"NEGATIVE": 0, "POSITIVE": 1}

        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=self.num_labels, id2label=self.id2label, label2id=self.label2id)

        self.category_embedding = nn.Embedding(num_embeddings=num_categories, embedding_dim=embedding_dim)
        self.country_embedding = nn.Embedding(num_embeddings=num_countries, embedding_dim=embedding_dim)

        self.text_length_linear = nn.Linear(1, embedding_dim)

        # 768 encoder features plus three side-feature vectors -> 512 -> num_labels.
        self.combined_fc = nn.Linear(768 + embedding_dim + embedding_dim + embedding_dim, 512)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(512, self.num_labels)
        self.loss_fn = nn.CrossEntropyLoss()
        # Populated by train(); the evaluate_* helpers require it.
        self.trainer = None

        self.train_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=num_epochs,
            per_device_train_batch_size=train_batch,
            per_device_eval_batch_size=eval_batch,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            evaluation_strategy='epoch',
            learning_rate=lr
        )

    def apply_tokenizer(self, batch):
        """Tokenize ``batch["text"]`` and carry the side-feature columns along.

        ``category_num`` and ``country_num`` are copied through unchanged;
        each ``text_length`` value is wrapped in a one-element list.
        """
        tokenized_inputs = self.tokenizer(
            batch["text"],
            truncation=True,
            padding=True,
            max_length=100,
            add_special_tokens=True,
        )

        tokenized_inputs['category_num'] = batch['category_num']
        tokenized_inputs['country_num'] = batch['country_num']
        tokenized_inputs['text_length'] = [[length] for length in batch['text_length']]

        return tokenized_inputs


    def compute_metrics(self, pred):
        """Return accuracy, F1, precision and recall (binary averaging) for ``pred``."""
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average="binary"
        )
        acc = accuracy_score(labels, preds)
        return {
            "accuracy": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
        }

    def forward(self, input_ids, attention_mask=None, categories=None, countries=None, text_lengths=None, labels=None):
        """Classify a batch from text plus category, country and length features.

        Args:
            input_ids: token ids, indexed as (batch, sequence).
            attention_mask: optional mask forwarded to the encoder.
            categories: integer ids looked up in ``category_embedding``.
            countries: integer ids looked up in ``country_embedding``.
            text_lengths: one length per example; reshaped to a column and cast
                to float before the linear projection.
            labels: optional class ids for the cross-entropy loss.

        Returns:
            ``(logits, loss)``; ``loss`` is ``None`` when ``labels`` is omitted.
        """
        # The classification wrapper exposes no pooled output, so take the first
        # token's hidden state from the underlying encoder instead.
        encoder_out = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = encoder_out.last_hidden_state[:, 0, :]

        category_features = self.category_embedding(categories)
        country_features = self.country_embedding(countries)

        # nn.Linear needs floating-point input; lengths usually arrive as ints.
        text_length_features = self.text_length_linear(text_lengths.view(-1, 1).float())
        combined_features = torch.cat((pooled_output, category_features, country_features, text_length_features), dim=1)

        # Project the concatenation down to the 512 features the classifier expects.
        hidden = torch.relu(self.combined_fc(combined_features))
        hidden = self.dropout(hidden)

        logits = self.classifier(hidden)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels.view(-1))

        return logits, loss

    def train(self, train_df=True, dev_df=None):
        """Fine-tune ``self.model`` on ``train_df``, evaluating on ``dev_df``.

        This method shadows ``nn.Module.train(mode)``. To keep ``.eval()`` and
        ``.train()`` / ``.train(False)`` working, a boolean first argument is
        forwarded to ``nn.Module.train`` and the module is returned.

        Args:
            train_df: pandas DataFrame with ``text``, ``category_num``,
                ``country_num`` and ``text_length`` columns, or a bool mode flag.
            dev_df: pandas DataFrame with the same columns, used for evaluation.
        """
        if isinstance(train_df, bool):
            return super().train(train_df)

        train_hf = Dataset.from_pandas(train_df)
        dev_hf = Dataset.from_pandas(dev_df)

        tokenized_train = train_hf.map(self.apply_tokenizer, batched=True)
        tokenized_dev = dev_hf.map(self.apply_tokenizer, batched=True)

        self.trainer = Trainer(
            model=self.model,
            args=self.train_args,
            tokenizer=self.tokenizer,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_dev,
            compute_metrics=self.compute_metrics,
        )

        self.trainer.train()

    def evaluate_train(self, train_df):
        """Tokenize ``train_df`` and return the trainer's evaluation metrics on it."""
        input_hf = Dataset.from_pandas(train_df)
        tokenized_input = input_hf.map(self.apply_tokenizer, batched=True)
        return self.trainer.evaluate(tokenized_input)

    def evaluate_dev(self):
        """Return evaluation metrics on the dev set supplied to ``train``."""
        return self.trainer.evaluate()

In [None]:
columns = ['id', 'identifier', 'category', 'country_code', 'text', 'multi_label']

# Read the headerless TSV, skipping its first three lines. A parse failure is
# allowed to propagate: swallowing it would only surface later as a confusing
# NameError on `data_df`.
data_df = pd.read_csv("/content/dontpatronizeme_pcl.tsv", sep='\t', header=None, names=columns, skiprows=3, index_col='id')

# Collapse the multi-valued annotation to binary: 0 and 1 -> 0, anything else -> 1.
data_df['label'] = data_df['multi_label'].apply(lambda x: 0 if x == 0 or x == 1 else 1)
# Drop rows with any missing field (this also guarantees `text` is a string below).
data_df = data_df.dropna()
# Character count of each paragraph.
data_df["text_length"] = data_df["text"].apply(len)

In [None]:
# Load the train/dev split files; their 'par_id' column is what selects rows of
# data_df for each split.
train_ids = pd.read_csv("/content/train_semeval_parids-labels.csv")
dev_ids = pd.read_csv("/content/dev_semeval_parids-labels.csv")

In [None]:
# Select each split by paragraph id. `.copy()` makes the splits independent
# frames, so adding columns to them later neither touches `data_df` nor triggers
# pandas' SettingWithCopyWarning.
train_df = data_df.loc[data_df.index.isin(train_ids['par_id'])].copy()
dev_df = data_df.loc[data_df.index.isin(dev_ids['par_id'])].copy()

In [None]:
# Integer-encode country and category. The code table is built from the union
# of both splits so a given string always maps to the same integer in train and
# dev; encoding each split on its own would shift the codes whenever one split
# lacks a value the other has.
country_values = sorted(set(train_df['country_code']) | set(dev_df['country_code']))
category_values = sorted(set(train_df['category']) | set(dev_df['category']))

# `assign` returns new frames instead of writing into a possible slice of data_df.
train_df = train_df.assign(
    country_num=pd.Categorical(train_df['country_code'], categories=country_values).codes,
    category_num=pd.Categorical(train_df['category'], categories=category_values).codes,
)
dev_df = dev_df.assign(
    country_num=pd.Categorical(dev_df['country_code'], categories=country_values).codes,
    category_num=pd.Categorical(dev_df['category'], categories=category_values).codes,
)

In [None]:
def augment_text(df, augmentor):
    """Oversample the positive class by appending augmented copies of its rows.

    The number of augmentation rounds is half the negative-to-positive ratio
    (integer division), so the classes end up partially rebalanced. Each round
    copies every ``label == 1`` row and replaces its text with an augmented
    version.

    Args:
        df: DataFrame with at least ``text`` and ``label`` columns. Not modified.
        augmentor: object whose ``augment(text)`` returns either a list of
            augmented strings (the first is used) or a single string.

    Returns:
        A new DataFrame: ``df`` followed by the augmented positive rows. Original
        index values are kept, so they repeat for augmented rows.
    """
    positives = df[df["label"] == 1]
    num_positive = len(positives)
    num_negative = len(df[df["label"] == 0])
    n = (num_negative // num_positive) // 2 if num_positive != 0 else 0

    all_data = [df]
    print(f"Data augmentation: rebalancing {n} times...")
    for i in range(n):
        print(f"    Iteration {i+1}")
        start_time = time.time()
        df_new = positives.copy(deep=True)
        augmented_text = []
        for text in df_new["text"].tolist():
            result = augmentor.augment(text)
            if not result:
                # Nothing produced (empty string/list): fall back to the source text.
                augmented_text.append(text)
            elif isinstance(result, str):
                # Some augmenters return a bare string; indexing it would keep
                # only its first character.
                augmented_text.append(result)
            else:
                augmented_text.append(result[0])
        df_new["text"] = augmented_text
        if "text_length" in df_new.columns:
            # Keep the derived length consistent with the rewritten text.
            df_new["text_length"] = [len(text) for text in augmented_text]
        all_data.append(df_new)
        print(f"Elapsed time is {int(time.time() - start_time)}s")
    return pd.concat(all_data, axis=0)

In [None]:
# Device string handed to the nlpaug augmenters: GPU when one is visible, else CPU.
if torch.cuda.is_available():
    device_type = 'cuda'
else:
    device_type = 'cpu'


In [None]:
def clean_text_1(text):
    """Normalise contractions and markup, then lemmatize every token.

    Steps, in order: "n't" -> "not", drop "'s", "<h>" -> ".", collapse runs of
    spaces, tokenize, lemmatize each token, re-join with single spaces and strip.
    """
    substitutions = (
        ('n\'t', 'not'),
        ('\'s', ''),
        ('<h>', '.'),
        (" +", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    words = word_tokenize(text)
    wnl = WordNetLemmatizer()
    return ' '.join(wnl.lemmatize(word) for word in words).strip()

def clean_text_2(text):
    """Like ``clean_text_1`` but also removes English stop words before lemmatizing.

    Steps, in order: "n't" -> "not", drop "'s", "<h>" -> ".", collapse runs of
    spaces, tokenize and drop stop words (case-insensitive), re-join, tokenize
    again, lemmatize each token, re-join with single spaces and strip.
    """
    substitutions = (
        ('n\'t', 'not'),
        ('\'s', ''),
        ('<h>', '.'),
        (" +", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    blocked = set(stopwords.words('english'))
    kept = [word for word in word_tokenize(text) if word.lower() not in blocked]

    # The filtered text is re-tokenized after joining, as in the original pipeline.
    words = word_tokenize(' '.join(kept))
    wnl = WordNetLemmatizer()
    return ' '.join(wnl.lemmatize(word) for word in words).strip()


def clean_text_3(text):
    """Normalise contractions, stem every token and strip digits.

    Steps, in order: "n't" -> "not", drop "'s", tokenize, Porter-stem each
    token, re-join with single spaces, delete every digit character, strip.
    """
    normalised = re.sub('\'s', '', re.sub('n\'t', 'not', text))

    words = word_tokenize(normalised)
    porter = PorterStemmer()
    stemmed = ' '.join(porter.stem(word) for word in words)

    return re.sub(r"\d", "", stemmed).strip()


In [None]:
# Three independent copies of the training split, each with `text` rewritten by
# one of the cleaning pipelines; all other columns are left as they were.
train_df_clean_1 = train_df.copy(deep=True)
train_df_clean_1["text"] = train_df["text"].apply(clean_text_1)

train_df_clean_2 = train_df.copy(deep=True)
train_df_clean_2["text"] = train_df["text"].apply(clean_text_2)

train_df_clean_3 = train_df.copy(deep=True)
train_df_clean_3["text"] = train_df["text"].apply(clean_text_3)

In [None]:
import nlpaug.augmenter.word as naw

# Define a text to be paraphrased
original_text = "The quick brown fox jumps over the lazy dog."

# Back-translation augmenter: English -> Spanish -> English using the two
# Helsinki-NLP checkpoints named below.
aug = naw.BackTranslationAug(
    from_model_name='Helsinki-NLP/opus-mt-en-es',
    to_model_name='Helsinki-NLP/opus-mt-es-en'
)

# Apply the augmentation three times to the same sentence
paraphrased_texts = [aug.augment(original_text) for _ in range(3)]

print("Original Text:", original_text)
print("Paraphrased Texts:")
for paraphrased_text in paraphrased_texts:
    print(paraphrased_text)


In [None]:
# Tokenizer/model pairs keyed by checkpoint name, so repeated calls to
# back_translate do not reload the same checkpoints from disk every time.
_MARIAN_CACHE = {}


def _load_marian(model_name):
    """Return the ``(tokenizer, model)`` pair for ``model_name``, loading it once."""
    if model_name not in _MARIAN_CACHE:
        _MARIAN_CACHE[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _MARIAN_CACHE[model_name]


def _marian_translate(text, model_name):
    """Translate ``text`` with the checkpoint ``model_name`` and decode the first output."""
    tokenizer, model = _load_marian(model_name)
    generated = model.generate(**tokenizer(text, return_tensors="pt", padding=True))
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def back_translate(text, src_language="en", intermediate_language="fr"):
    """Paraphrase ``text`` by translating it to another language and back.

    Args:
        text: the sentence to paraphrase. Only the first generated sequence of
            each translation step is decoded.
        src_language: language code of ``text``.
        intermediate_language: language code of the pivot language.

    Returns:
        The text translated ``src -> intermediate -> src`` using the
        ``Helsinki-NLP/opus-mt-{a}-{b}`` checkpoints.
    """
    intermediate_text = _marian_translate(
        text, f'Helsinki-NLP/opus-mt-{src_language}-{intermediate_language}'
    )
    src_text = _marian_translate(
        intermediate_text, f'Helsinki-NLP/opus-mt-{intermediate_language}-{src_language}'
    )

    return src_text

# Smoke-test back_translate with its default en -> fr -> en round trip.
original_text = "The quick brown fox jumps over the lazy dog."
back_translated_text = back_translate(original_text)

print("Original Text:", original_text)
print("Back-translated Text:", back_translated_text)

## Synonym Word Augmentation - 1

In [None]:
# Experiment 1 augmenter: swaps words for WordNet synonyms.
aug_1 = naw.SynonymAug(aug_src='wordnet')

text = 'The quick brown fox jumps over the lazy dog'

# Preview the augmenter on a sample sentence.
aug_1.augment(text)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


['The flying brown charles james fox jumps over the lazy weenie']

In [None]:
# MultimodalModel's batch sizes, learning rate and embedding width are required
# arguments, so they are passed explicitly here.
model_1 = MultimodalModel(train_batch=16, eval_batch=16, lr=5e-5, embedding_dim=32, num_epochs=5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Augment a private copy of the training split with the synonym augmenter.
train_df_1 = augment_text(train_df.copy(deep=True), aug_1)

Data augmentation: rebalancing 4 times...
    Iteration 1
Elapsed time is 3s
    Iteration 2
Elapsed time is 2s
    Iteration 3
Elapsed time is 2s
    Iteration 4
Elapsed time is 2s


In [None]:
# Inspect the result: original rows first, augmented positive rows appended last.
train_df_1

Unnamed: 0_level_0,identifier,category,country_code,text,multi_label,label,text_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0,620
2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0,237
3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0,0,158
4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0,162
5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0,0,273
...,...,...,...,...,...,...,...
10424,@@4665292,women,jm,""" I get along n ' t consider in miscarriage, I...",3,1,133
10445,@@3923193,refugee,gb,More than than cl unpaid worker spent the nigh...,3,1,112
10454,@@22338535,vulnerable,ie,""" We are challenged, I intimate, to turn this ...",4,1,240
10467,@@20282330,in-need,ng,""" She own one huge platform, and information c...",3,1,282


In [None]:
# Persist the augmented training set to CSV.
train_df_1.to_csv("/content/synonymaug.csv")

In [None]:
# Fine-tune on the augmented training set; dev_df is evaluated after every epoch.
model_1.train(train_df_1, dev_df)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1839,0.467804,0.903966,0.009852,0.25,0.005025
2,0.1306,0.300156,0.921166,0.357977,0.793103,0.231156
3,0.1247,0.309091,0.915432,0.517711,0.565476,0.477387
4,0.0758,0.45933,0.920688,0.517442,0.613793,0.447236
5,0.0004,0.527282,0.920688,0.508876,0.618705,0.432161


In [None]:
# Metrics on the (augmented) training data, for comparison with the dev metrics.
model_1.evaluate_train(train_df_1)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

{'eval_loss': 0.006418897304683924,
 'eval_accuracy': 0.9988745563154705,
 'eval_f1': 0.9983612756838524,
 'eval_precision': 0.9992429977289932,
 'eval_recall': 0.9974811083123426,
 'eval_runtime': 17.7457,
 'eval_samples_per_second': 650.918,
 'eval_steps_per_second': 40.686,
 'epoch': 5.0}

In [None]:
# Metrics on the dev set that was passed to train().
model_1.evaluate_dev()

{'eval_loss': 0.5272817015647888,
 'eval_accuracy': 0.9206880076445294,
 'eval_f1': 0.5088757396449705,
 'eval_precision': 0.6187050359712231,
 'eval_recall': 0.4321608040201005,
 'eval_runtime': 3.4365,
 'eval_samples_per_second': 609.045,
 'eval_steps_per_second': 38.12,
 'epoch': 5.0}

In [None]:
# Drop this name before the next experiment; the memory is only reclaimed once
# no other references to the model remain.
del model_1

## Random Insert Contextual - 2

In [None]:
# Experiment 2 augmenter: inserts words proposed by bert-base-cased.
aug_2 = naw.ContextualWordEmbsAug(model_path='bert-base-cased', action="insert", device=device_type)


text = 'The quick brown fox jumps over the lazy dog'

# Preview the augmenter on a sample sentence.
aug_2.augment(text)

['The next quick calculating brown fox jumps in over the lazy dog']

In [None]:
# MultimodalModel's batch sizes, learning rate and embedding width are required
# arguments, so they are passed explicitly here.
model_2 = MultimodalModel(train_batch=16, eval_batch=16, lr=5e-5, embedding_dim=32, num_epochs=5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Augment a private copy of the training split with the contextual-insert augmenter.
train_df_2 = augment_text(train_df.copy(deep=True), aug_2)

Data augmentation: rebalancing 4 times...
    Iteration 1
Elapsed time is 86s
    Iteration 2
Elapsed time is 87s
    Iteration 3
Elapsed time is 88s
    Iteration 4
Elapsed time is 86s


In [None]:
# Inspect the result: original rows first, augmented positive rows appended last.
train_df_2

Unnamed: 0_level_0,identifier,category,country_code,text,multi_label,label,text_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0,620
2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0,237
3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0,0,158
4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0,162
5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0,0,273
...,...,...,...,...,...,...,...
10424,@@4665292,women,jm,""" I s do n'ad t believe in an abortion, I thin...",3,1,133
10445,@@3923193,refugee,gb,More importantly than 150 Australian volunteer...,3,1,112
10454,@@22338535,vulnerable,ie,""" We are challenged, yes I suggest, to turn th...",4,1,240
10467,@@20282330,in-need,ng,""" She already has one huge platform, and infor...",3,1,282


In [None]:
# Persist the augmented training set to CSV.
train_df_2.to_csv("/content/randominsertcontextualaug.csv")

In [None]:
# Fine-tune on the augmented training set; dev_df is evaluated after every epoch.
model_2.train(train_df_2, dev_df)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1651,0.387136,0.90301,0.0,0.0,0.0
2,0.1435,0.275997,0.918299,0.408304,0.655556,0.296482
3,0.2389,0.327579,0.918299,0.53406,0.583333,0.492462
4,0.0612,0.428771,0.922121,0.411552,0.730769,0.286432
5,0.0149,0.507898,0.92451,0.5,0.675214,0.396985


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3500 already exists and is non-empty. Saving will proceed but saved res

In [None]:
# Metrics on the (augmented) training data, for comparison with the dev metrics.
model_2.evaluate_train(train_df_2)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

{'eval_loss': 0.008791192434728146,
 'eval_accuracy': 0.9978356852220587,
 'eval_f1': 0.9968422382215485,
 'eval_precision': 0.9997466430200151,
 'eval_recall': 0.9939546599496222,
 'eval_runtime': 17.5946,
 'eval_samples_per_second': 656.507,
 'eval_steps_per_second': 41.035,
 'epoch': 5.0}

In [None]:
# Metrics on the dev set that was passed to train().
model_2.evaluate_dev()

{'eval_loss': 0.5078981518745422,
 'eval_accuracy': 0.9245102723363593,
 'eval_f1': 0.5,
 'eval_precision': 0.6752136752136753,
 'eval_recall': 0.3969849246231156,
 'eval_runtime': 3.4005,
 'eval_samples_per_second': 615.491,
 'eval_steps_per_second': 38.523,
 'epoch': 5.0}

In [None]:
# Drop this name before the next experiment; the memory is only reclaimed once
# no other references to the model remain.
del model_2

## Random Substitute Contextual - 3

In [None]:
# Experiment 3 augmenter: substitutes words with ones proposed by bert-base-cased.
aug_3 = naw.ContextualWordEmbsAug(model_path='bert-base-cased', action="substitute", device=device_type)

text = 'The quick brown fox jumps over the lazy dog'

# Preview the augmenter on a sample sentence.
aug_3.augment(text)

['The quick brown fox skipped over the third piece']

In [None]:
# MultimodalModel's batch sizes, learning rate and embedding width are required
# arguments, so they are passed explicitly here.
model_3 = MultimodalModel(train_batch=16, eval_batch=16, lr=5e-5, embedding_dim=32, num_epochs=5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Augment a private copy of the training split with the contextual-substitute augmenter.
train_df_3 = augment_text(train_df.copy(deep=True), aug_3)

Data augmentation: rebalancing 4 times...
    Iteration 1
Elapsed time is 86s
    Iteration 2
Elapsed time is 85s
    Iteration 3
Elapsed time is 85s
    Iteration 4
Elapsed time is 85s


In [None]:
# Inspect the result: original rows first, augmented positive rows appended last.
train_df_3

Unnamed: 0_level_0,identifier,category,country_code,text,multi_label,label,text_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0,620
2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0,237
3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0,0,158
4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0,162
5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0,0,273
...,...,...,...,...,...,...,...
10424,@@4665292,women,jm,""" I could n'ever believe in abortion, nor hope...",3,1,133
10445,@@3923193,refugee,gb,More than 700 participants spent the night in'...,3,1,112
10454,@@22338535,vulnerable,ie,""" We are challenged, I suggest, we turn a time...",4,1,240
10467,@@20282330,in-need,ng,""" Katy has one huge platform, and Izzy can go ...",3,1,282


In [None]:
# Persist the augmented training set to CSV.
train_df_3.to_csv("/content/randomsubcontextualaug.csv")

In [None]:
# Fine-tune on the augmented training set; dev_df is evaluated after every epoch.
model_3.train(train_df_3, dev_df)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1966,0.347603,0.901099,0.0,0.0,0.0
2,0.141,0.339083,0.905877,0.195918,0.521739,0.120603
3,0.0624,0.355156,0.909221,0.483696,0.526627,0.447236
4,0.0577,0.412874,0.913999,0.464286,0.569343,0.39196
5,0.0478,0.526648,0.918299,0.501458,0.597222,0.432161


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3500 already exists and is non-empty. Saving will proceed but saved res

In [None]:
# Metrics on the (augmented) training data, for comparison with the dev metrics.
model_3.evaluate_train(train_df_3)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

{'eval_loss': 0.004596482962369919,
 'eval_accuracy': 0.9988745563154705,
 'eval_f1': 0.9983604489847396,
 'eval_precision': 0.9997474109623642,
 'eval_recall': 0.996977329974811,
 'eval_runtime': 17.5755,
 'eval_samples_per_second': 657.223,
 'eval_steps_per_second': 41.08,
 'epoch': 5.0}

In [None]:
# Metrics on the dev set that was passed to train().
model_3.evaluate_dev()

{'eval_loss': 0.5266479253768921,
 'eval_accuracy': 0.9182990922121357,
 'eval_f1': 0.5014577259475219,
 'eval_precision': 0.5972222222222222,
 'eval_recall': 0.4321608040201005,
 'eval_runtime': 3.3522,
 'eval_samples_per_second': 624.369,
 'eval_steps_per_second': 39.079,
 'epoch': 5.0}

In [None]:
# Drop this name before the next experiment; the memory is only reclaimed once
# no other references to the model remain.
del model_3

## Sentence Augmentation - 4

In [None]:
# Experiment 4 augmenter: sentence-level augmentation backed by GPT-2.
aug_4 = nas.ContextualWordEmbsForSentenceAug(model_path='gpt2', device=device_type)

text = 'The quick brown fox jumps over the lazy dog'

# Preview the augmenter on a sample sentence.
aug_4.augment(text)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

['The quick brown fox jumps over the lazy dog " is government the a and : " , same , - U time new only first same more , the will first A of the one way of more']

In [None]:
# MultimodalModel's batch sizes, learning rate and embedding width are required
# arguments, so they are passed explicitly here.
model_4 = MultimodalModel(train_batch=16, eval_batch=16, lr=5e-5, embedding_dim=32, num_epochs=5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Augment a private copy of the training split with the sentence augmenter.
train_df_4 = augment_text(train_df.copy(deep=True), aug_4)

Data augmentation: rebalancing 4 times...
    Iteration 1
Elapsed time is 134s
    Iteration 2
Elapsed time is 135s
    Iteration 3
Elapsed time is 144s
    Iteration 4
Elapsed time is 138s


In [None]:
# Inspect the result: original rows first, augmented positive rows appended last.
train_df_4

Unnamed: 0_level_0,identifier,category,country_code,text,multi_label,label,text_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0,620
2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0,237
3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0,0,158
4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0,162
5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0,0,273
...,...,...,...,...,...,...,...
10424,@@4665292,women,jm,""" I do n't believe in abortion , I think it is...",3,1,133
10445,@@3923193,refugee,gb,More than 150 volunteers spent the night in ' ...,3,1,112
10454,@@22338535,vulnerable,ie,""" We are challenged , I suggest , to turn this...",4,1,240
10467,@@20282330,in-need,ng,""" She has one huge platform , and information ...",3,1,282


In [None]:
# Persist the augmented training set to CSV.
train_df_4.to_csv("/content/sentenceaug.csv")

In [None]:
# Fine-tune on the augmented training set; dev_df is evaluated after every epoch.
model_4.train(train_df_4, dev_df)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1726,0.272194,0.911132,0.295455,0.6,0.19598
2,0.1937,0.295275,0.913999,0.333333,0.633803,0.226131
3,0.1497,0.377727,0.903966,0.544218,0.495868,0.603015
4,0.0582,0.461881,0.92021,0.486154,0.626984,0.396985
5,0.0003,0.539336,0.917821,0.527473,0.581818,0.482412


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3500 already exists and is non-empty. Saving will proceed but saved res

In [None]:
# Metrics on the (augmented) training data, for comparison with the dev metrics.
model_4.evaluate_train(train_df_4)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

{'eval_loss': 0.006051179021596909,
 'eval_accuracy': 0.9988745563154705,
 'eval_f1': 0.9983608624385323,
 'eval_precision': 0.9994950770007573,
 'eval_recall': 0.9972292191435769,
 'eval_runtime': 17.5289,
 'eval_samples_per_second': 658.967,
 'eval_steps_per_second': 41.189,
 'epoch': 5.0}

In [None]:
# Metrics on the dev set that was passed to train().
model_4.evaluate_dev()

{'eval_loss': 0.5393360257148743,
 'eval_accuracy': 0.917821309125657,
 'eval_f1': 0.5274725274725275,
 'eval_precision': 0.5818181818181818,
 'eval_recall': 0.4824120603015075,
 'eval_runtime': 3.3368,
 'eval_samples_per_second': 627.245,
 'eval_steps_per_second': 39.259,
 'epoch': 5.0}

In [None]:
del model_4

## Random Deletion



In [None]:
# Experiment 5 augmenter: random word deletion.
# aug_p=0.2 is nlpaug's proportion of words to augment (here: delete) per text.
aug_5 = naw.random.RandomWordAug(action='delete', aug_p=0.2)

# Smoke test on a sample sentence.
text = 'The quick brown fox jumps over the lazy dog'

# Last expression of the cell, so the augmented text is displayed (a list of strings).
aug_5.augment(text)

['Quick brown fox jumps over lazy dog']

In [None]:
model_5 = MultimodalModel(num_epochs=5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Start from a deep copy so the shared train_df is left as-is for the other experiments.
# (augment_text is defined earlier in the notebook; whether it mutates its input is not visible here.)
train_df_5 = train_df.copy(deep=True)
# Rebalance the training set with the random-deletion augmenter; progress is printed per iteration.
train_df_5 = augment_text(train_df_5, aug_5)

Data augmentation: rebalancing 4 times...
    Iteration 1
Elapsed time is 0s
    Iteration 2
Elapsed time is 0s
    Iteration 3
Elapsed time is 0s
    Iteration 4
Elapsed time is 0s


In [None]:
train_df_5

Unnamed: 0_level_0,identifier,category,country_code,text,multi_label,label,text_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0,620
2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0,237
3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0,0,158
4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0,162
5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0,0,273
...,...,...,...,...,...,...,...
10424,@@4665292,women,jm,""" I do ' in abortion, I think it wicked. I tel...",3,1,133
10445,@@3923193,refugee,gb,More than 150 volunteers night in ' camps ' to...,3,1,112
10454,@@22338535,vulnerable,ie,""" We challenged, I suggest, time of celebratio...",4,1,240
10467,@@20282330,in-need,ng,""" has one huge platform, information can go to...",3,1,282


In [None]:
train_df_5.to_csv("/content/delaug.csv")

In [None]:
model_5.train(train_df_5, dev_df)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1892,0.332073,0.903488,0.0,0.0,0.0
2,0.1574,0.289925,0.921166,0.404332,0.717949,0.281407
3,0.1064,0.373472,0.906355,0.5625,0.506024,0.633166
4,0.046,0.421799,0.919732,0.530726,0.597484,0.477387
5,0.0003,0.493509,0.927855,0.535385,0.690476,0.437186


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3500 already exists and is non-empty. Saving will proceed but saved res

In [None]:
model_5.evaluate_train(train_df_5)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

{'eval_loss': 0.0053885881789028645,
 'eval_accuracy': 0.9985282659509999,
 'eval_f1': 0.9978554308061057,
 'eval_precision': 0.9994945665908517,
 'eval_recall': 0.9962216624685138,
 'eval_runtime': 17.5762,
 'eval_samples_per_second': 657.197,
 'eval_steps_per_second': 41.078,
 'epoch': 5.0}

In [None]:
model_5.evaluate_dev()

{'eval_loss': 0.49350881576538086,
 'eval_accuracy': 0.9278547539417105,
 'eval_f1': 0.5353846153846153,
 'eval_precision': 0.6904761904761905,
 'eval_recall': 0.4371859296482412,
 'eval_runtime': 3.3998,
 'eval_samples_per_second': 615.618,
 'eval_steps_per_second': 38.531,
 'epoch': 5.0}

In [None]:
del model_5

## Sequential Contextual Word Embeddings

In [None]:
# Experiment 6 augmenter: a flow of two BERT ('bert-base-cased') contextual-embedding
# augmenters, one inserting words and one substituting words.
# device_type is defined earlier in the notebook.
# NOTE(review): the section title says "Sequential", but the flow is built with naf.Sometimes.
# nlpaug documents Sometimes as applying its augmenters randomly and Sequential as applying
# them all in order -- confirm which behaviour is intended before re-running.
aug_6 = naf.Sometimes([
            naw.ContextualWordEmbsAug(model_path='bert-base-cased', action="insert", device=device_type),
            naw.ContextualWordEmbsAug(model_path='bert-base-cased', action="substitute", device=device_type),
        ])

# Smoke test on a sample sentence.
text = 'The quick brown fox jumps over the lazy dog'

# Last expression of the cell, so the augmented text is displayed (a list of strings).
aug_6.augment(text)

['The tiny brown fox jumps over the wild cat']

In [None]:
model_6 = MultimodalModel(num_epochs=5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Start from a deep copy so the shared train_df is left as-is for the other experiments.
# (augment_text is defined earlier in the notebook; whether it mutates its input is not visible here.)
train_df_6 = train_df.copy(deep=True)
# Rebalance the training set with the contextual insert/substitute flow.
# This is slow: the recorded run took roughly 150s per rebalancing iteration.
train_df_6 = augment_text(train_df_6, aug_6)

Data augmentation: rebalancing 4 times...
    Iteration 1
Elapsed time is 153s
    Iteration 2
Elapsed time is 150s
    Iteration 3
Elapsed time is 149s
    Iteration 4
Elapsed time is 146s


In [None]:
train_df_6

Unnamed: 0_level_0,identifier,category,country_code,text,multi_label,label,text_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0,620
2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0,237
3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0,0,158
4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0,162
5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0,0,273
...,...,...,...,...,...,...,...
10424,@@4665292,women,jm,""" I must seemn't believe in Christian abortion...",3,1,133
10445,@@3923193,refugee,gb,2007 far more than 150 volunteers visited the ...,3,1,112
10454,@@22338535,vulnerable,ie,""" We are challenged, I suggest, simply to turn...",4,1,240
10467,@@20282330,in-need,ng,""" She has created one huge platform, and this ...",3,1,282


In [None]:
model_6.train(train_df_6, dev_df)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1528,0.295355,0.907788,0.254826,0.55,0.165829
2,0.2069,0.306254,0.913999,0.20354,0.851852,0.115578
3,0.129,0.307901,0.919732,0.502959,0.611511,0.427136
4,0.1101,0.435398,0.92021,0.473186,0.635593,0.376884
5,0.0193,0.480245,0.923077,0.519403,0.639706,0.437186


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3500 already exists and is non-empty. Saving will proceed but saved res

In [None]:
train_df_6.to_csv("/content/sequentialbothaug.csv")

In [None]:
model_6.evaluate_train(train_df_6)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

{'eval_loss': 0.009120704606175423,
 'eval_accuracy': 0.9979222578131763,
 'eval_f1': 0.9969696969696968,
 'eval_precision': 0.999493670886076,
 'eval_recall': 0.9944584382871536,
 'eval_runtime': 17.618,
 'eval_samples_per_second': 655.636,
 'eval_steps_per_second': 40.981,
 'epoch': 5.0}

In [None]:
model_6.evaluate_dev()

{'eval_loss': 0.4802446663379669,
 'eval_accuracy': 0.9230769230769231,
 'eval_f1': 0.5194029850746269,
 'eval_precision': 0.6397058823529411,
 'eval_recall': 0.4371859296482412,
 'eval_runtime': 3.4127,
 'eval_samples_per_second': 613.292,
 'eval_steps_per_second': 38.386,
 'epoch': 5.0}

In [None]:
del model_6

## Sequential Contextual Word Embeddings with Deletion

In [None]:
# Experiment 7 augmenter: BERT ('bert-base-cased') contextual word insertion and
# substitution, plus random word deletion (aug_p=0.2).
# device_type is defined earlier in the notebook.
# NOTE(review): the section title says "Sequential", but the flow is built with naf.Sometimes.
# nlpaug documents Sometimes as applying its augmenters randomly and Sequential as applying
# them all in order -- confirm which behaviour is intended before re-running.
aug_7 = naf.Sometimes([
            naw.ContextualWordEmbsAug(model_path='bert-base-cased', action="insert", device=device_type),
            naw.ContextualWordEmbsAug(model_path='bert-base-cased', action="substitute", device=device_type),
            naw.random.RandomWordAug(action='delete', aug_p=0.2)
        ])

# Smoke test on a sample sentence.
text = 'The quick brown fox jumps over the lazy dog'

# Last expression of the cell, so the augmented text is displayed (a list of strings).
aug_7.augment(text)

['23 quick brown fox jumps out the lazy dog']

In [None]:
model_7 = MultimodalModel(num_epochs=5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Start from a deep copy so the shared train_df is left as-is for the other experiments.
# (augment_text is defined earlier in the notebook; whether it mutates its input is not visible here.)
train_df_7 = train_df.copy(deep=True)
# Rebalance the training set with the contextual insert/substitute + deletion flow.
train_df_7 = augment_text(train_df_7, aug_7)

Data augmentation: rebalancing 4 times...
    Iteration 1
Elapsed time is 143s
    Iteration 2
Elapsed time is 142s
    Iteration 3
Elapsed time is 140s
    Iteration 4
Elapsed time is 146s


In [None]:
train_df_7

Unnamed: 0_level_0,identifier,category,country_code,text,multi_label,label,text_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0,620
2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0,237
3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0,0,158
4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0,162
5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0,0,273
...,...,...,...,...,...,...,...
10424,@@4665292,women,jm,""" All do ' er say abortion, only it is. would ...",3,1,133
10445,@@3923193,refugee,gb,For so few lined up all relaxing into ' danger...,3,1,112
10454,@@22338535,vulnerable,ie,""" We are lucky, I suggest, they turn the time ...",4,1,240
10467,@@20282330,in-need,ng,""" She has one huge platform, and information c...",3,1,282


In [None]:
model_7.train(train_df_7, dev_df)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1644,0.352037,0.902532,0.019231,0.222222,0.01005
2,0.143,0.307371,0.918299,0.439344,0.632075,0.336683
3,0.0489,0.396006,0.911132,0.513089,0.535519,0.492462
4,0.0721,0.449364,0.918777,0.47205,0.617886,0.38191
5,0.0116,0.505493,0.92451,0.49359,0.681416,0.386935


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3500 already exists and is non-empty. Saving will proceed but saved res

In [None]:
train_df_7.to_csv("/content/sequentialbothanddelaug.csv")

In [None]:
model_7.evaluate_train(train_df_7)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

{'eval_loss': 0.006081556901335716,
 'eval_accuracy': 0.9988745563154705,
 'eval_f1': 0.9983600353223161,
 'eval_precision': 1.0,
 'eval_recall': 0.9967254408060453,
 'eval_runtime': 17.799,
 'eval_samples_per_second': 648.968,
 'eval_steps_per_second': 40.564,
 'epoch': 5.0}

In [None]:
model_7.evaluate_dev()

{'eval_loss': 0.5054931044578552,
 'eval_accuracy': 0.9245102723363593,
 'eval_f1': 0.4935897435897436,
 'eval_precision': 0.6814159292035398,
 'eval_recall': 0.3869346733668342,
 'eval_runtime': 3.503,
 'eval_samples_per_second': 597.487,
 'eval_steps_per_second': 37.396,
 'epoch': 5.0}

In [None]:
del model_7

## Synonym + Sentence + Deletion (experiment 8)

In [None]:
# Experiment 8 augmenter: a naf.Sometimes flow (augmenters applied randomly, per nlpaug docs) of
#   - a GPT-2 sentence-level augmenter (the sample output below shows generated text appended
#     to the input sentence),
#   - WordNet synonym replacement,
#   - random word deletion (aug_p=0.2).
# device_type is defined earlier in the notebook.
aug_8 = naf.Sometimes([
            nas.ContextualWordEmbsForSentenceAug(model_path='gpt2', device=device_type),
            naw.SynonymAug(aug_src='wordnet'),
            naw.random.RandomWordAug(action='delete', aug_p=0.2)
        ])

# Smoke test on a sample sentence.
text = 'The quick brown fox jumps over the lazy dog'

# Last expression of the cell, so the augmented text is displayed (a list of strings).
aug_8.augment(text)

['The quick jumps over lazy dog " the a and: " the same to - " most only first more two will other A of and and way of new']

In [None]:
model_8 = MultimodalModel(num_epochs=5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Start from a deep copy so the shared train_df is left as-is for the other experiments.
# (augment_text is defined earlier in the notebook; whether it mutates its input is not visible here.)
train_df_8 = train_df.copy(deep=True)
# Rebalance the training set with the sentence + synonym + deletion flow.
train_df_8 = augment_text(train_df_8, aug_8)

Data augmentation: rebalancing 4 times...
    Iteration 1
Elapsed time is 108s
    Iteration 2
Elapsed time is 109s
    Iteration 3
Elapsed time is 112s
    Iteration 4
Elapsed time is 114s


In [None]:
train_df_8

Unnamed: 0_level_0,identifier,category,country_code,text,multi_label,label,text_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0,620
2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0,237
3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0,0,158
4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0,162
5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0,0,273
...,...,...,...,...,...,...,...
10424,@@4665292,women,jm,""" I come n ' believe abortion, Atomic think it...",3,1,133
10445,@@3923193,refugee,gb,More than 150 volunteer spent the night ' surv...,3,1,112
10454,@@22338535,vulnerable,ie,""" We are challenged, I suggest, to turn this t...",4,1,240
10467,@@20282330,in-need,ng,""" She has one huge platform, and information c...",3,1,282


In [None]:
train_df_8.to_csv("/content/synsentdelaug.csv")

In [None]:
model_8.train(train_df_8, dev_df)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2277,0.343188,0.903488,0.0,0.0,0.0
2,0.1532,0.261473,0.919732,0.363636,0.738462,0.241206
3,0.1202,0.340971,0.916388,0.523161,0.571429,0.482412
4,0.0607,0.441177,0.92021,0.469841,0.637931,0.371859
5,0.0003,0.536907,0.924032,0.495238,0.672414,0.39196


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3500 already exists and is non-empty. Saving will proceed but saved res

In [None]:
model_8.evaluate_train(train_df_8)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

{'eval_loss': 0.00626657297834754,
 'eval_accuracy': 0.9987014111332352,
 'eval_f1': 0.9981072555205047,
 'eval_precision': 1.0,
 'eval_recall': 0.9962216624685138,
 'eval_runtime': 17.6115,
 'eval_samples_per_second': 655.878,
 'eval_steps_per_second': 40.996,
 'epoch': 5.0}

In [None]:
model_8.evaluate_dev()

{'eval_loss': 0.536907434463501,
 'eval_accuracy': 0.9240324892498806,
 'eval_f1': 0.4952380952380952,
 'eval_precision': 0.6724137931034483,
 'eval_recall': 0.39195979899497485,
 'eval_runtime': 3.3817,
 'eval_samples_per_second': 618.916,
 'eval_steps_per_second': 38.738,
 'epoch': 5.0}

In [None]:
del model_8

## Synonym + Deletion (experiment 9)

In [None]:
# Experiment 9 augmenter: a naf.Sometimes flow (augmenters applied randomly, per nlpaug docs)
# of WordNet synonym replacement and random word deletion (aug_p=0.2).
aug_9 = naf.Sometimes([
            naw.SynonymAug(aug_src='wordnet'),
            naw.random.RandomWordAug(action='delete', aug_p=0.2)
        ])

# Smoke test on a sample sentence.
text = 'The quick brown fox jumps over the lazy dog'

# Last expression of the cell, so the augmented text is displayed (a list of strings).
aug_9.augment(text)

['Ready brown george fox over the lazy wiener']

In [None]:
model_9 = MultimodalModel(num_epochs=5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# NOTE: this experiment starts from train_df_clean_1 (defined earlier in the notebook),
# not the train_df used by experiments 5-8 and 11 in this section.
# A deep copy keeps the shared source frame as-is for the other experiments.
train_df_clean_1_9 = train_df_clean_1.copy(deep=True)
# Rebalance the training set with the synonym + deletion flow.
train_df_clean_1_9 = augment_text(train_df_clean_1_9, aug_9)

Data augmentation: rebalancing 4 times...
    Iteration 1
Elapsed time is 2s
    Iteration 2
Elapsed time is 2s
    Iteration 3
Elapsed time is 2s
    Iteration 4
Elapsed time is 2s


In [None]:
train_df_clean_1_9.to_csv("/content/syndelaug.csv")

In [None]:
model_9.train(train_df_clean_1_9, dev_df)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1827,0.378116,0.901099,0.046083,0.277778,0.025126
2,0.1477,0.311464,0.909699,0.341463,0.556818,0.246231
3,0.0715,0.366577,0.914955,0.470238,0.576642,0.396985
4,0.0378,0.436701,0.916388,0.474474,0.589552,0.396985
5,0.0006,0.528071,0.918777,0.484848,0.610687,0.40201


In [None]:
# Evaluate on the same augmented frame model_9 was trained on.
# The original cell passed `train_df_9`, which is never defined in this experiment (the
# augmented frame is `train_df_clean_1_9`), so it only worked because of a leftover kernel
# variable and fails with NameError on Restart & Run All.
model_9.evaluate_train(train_df_clean_1_9)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

{'eval_loss': 0.010332810692489147,
 'eval_accuracy': 0.998008830404294,
 'eval_f1': 0.9970955928778886,
 'eval_precision': 0.9997467713345151,
 'eval_recall': 0.9944584382871536,
 'eval_runtime': 17.627,
 'eval_samples_per_second': 655.301,
 'eval_steps_per_second': 40.96,
 'epoch': 5.0}

In [None]:
# Final dev-set metrics for experiment 9.
# NOTE(review): the recorded output below (eval_accuracy ~0.9245, eval_f1 ~0.5241) does not
# match epoch 5 of the training log above (accuracy 0.918777, F1 0.484848), unlike the other
# experiments in this section -- the outputs appear to come from different runs; re-run to confirm.
model_9.evaluate_dev()

{'eval_loss': 0.4947357475757599,
 'eval_accuracy': 0.9245102723363593,
 'eval_f1': 0.5240963855421688,
 'eval_precision': 0.6541353383458647,
 'eval_recall': 0.4371859296482412,
 'eval_runtime': 3.3659,
 'eval_samples_per_second': 621.822,
 'eval_steps_per_second': 38.92,
 'epoch': 5.0}

In [None]:
del model_9

## Synonym + Deletion + Contextual Word Insertion (experiment 10)

In [None]:
# Experiment 10 augmenter: a naf.Sometimes flow (augmenters applied randomly, per nlpaug docs) of
# WordNet synonym replacement, random word deletion (aug_p=0.2) and BERT contextual word insertion.
# NOTE: this uses the 'bert-base-uncased' checkpoint, whereas experiments 6 and 7 use
# 'bert-base-cased'; the sample output below is lower-cased.
# device_type is defined earlier in the notebook.
aug_10 = naf.Sometimes([
            naw.SynonymAug(aug_src='wordnet'),
            naw.random.RandomWordAug(action='delete', aug_p=0.2),
            naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert", device=device_type)
        ])

# Smoke test on a sample sentence.
text = 'The quick brown fox jumps over the lazy dog'

# Last expression of the cell, so the augmented text is displayed (a list of strings).
aug_10.augment(text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


['the same quick dark brownness for slyboots over your indolent dog']

In [None]:
model_10 = MultimodalModel(num_epochs=5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# NOTE: this experiment starts from train_df_clean_1 (defined earlier in the notebook),
# not the train_df used by experiments 5-8 and 11 in this section.
# A deep copy keeps the shared source frame as-is for the other experiments.
train_df_clean_1_10 = train_df_clean_1.copy(deep=True)
# Rebalance the training set with the synonym + deletion + contextual-insertion flow.
train_df_clean_1_10 = augment_text(train_df_clean_1_10, aug_10)

Data augmentation: rebalancing 4 times...
    Iteration 1
Elapsed time is 70s
    Iteration 2
Elapsed time is 67s
    Iteration 3
Elapsed time is 67s
    Iteration 4
Elapsed time is 68s


In [None]:
train_df_clean_1_10.to_csv("/content/bestprocaug.csv")

In [None]:
model_10.train(train_df_clean_1_10, dev_df)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.178,0.254411,0.91161,0.430769,0.555556,0.351759
2,0.172,0.235382,0.912566,0.287938,0.637931,0.18593
3,0.1092,0.240732,0.924032,0.533724,0.640845,0.457286
4,0.0484,0.384973,0.927377,0.525,0.694215,0.422111
5,0.0426,0.456483,0.930721,0.561934,0.704545,0.467337


In [None]:
# Persist the fine-tuned weights of model_10 (state_dict only).
# "56.19" in the file name matches the epoch-5 dev F1 (0.561934) in the training log above.
# NOTE(review): the recorded model_10.evaluate_dev() output further down reports eval_f1 ~0.503,
# so the saved outputs appear to come from different runs -- confirm which run this file belongs to.
torch.save(model_10.state_dict(), '/content/model56.19_state_dict.pth')

In [None]:
model_10.evaluate_train(train_df_clean_1_10)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

{'eval_loss': 0.014399005100131035,
 'eval_accuracy': 0.9976625400398234,
 'eval_f1': 0.9965913394773387,
 'eval_precision': 0.9989875980764363,
 'eval_recall': 0.9942065491183879,
 'eval_runtime': 17.5407,
 'eval_samples_per_second': 658.524,
 'eval_steps_per_second': 41.161,
 'epoch': 5.0}

In [None]:
# Final dev-set metrics for experiment 10.
# NOTE(review): the recorded output below (eval_accuracy ~0.9226, eval_f1 ~0.5031) does not
# match epoch 5 of the training log above (accuracy 0.930721, F1 0.561934), unlike the other
# experiments in this section -- the outputs appear to come from different runs; re-run to confirm.
model_10.evaluate_dev()

{'eval_loss': 0.49296295642852783,
 'eval_accuracy': 0.9225991399904443,
 'eval_f1': 0.5030674846625767,
 'eval_precision': 0.6456692913385826,
 'eval_recall': 0.4120603015075377,
 'eval_runtime': 3.3932,
 'eval_samples_per_second': 616.822,
 'eval_steps_per_second': 38.607,
 'epoch': 5.0}

In [None]:
del model_10

## Synonym + Contextual Word Insertion/Substitution + Deletion (experiment 11)

In [None]:
# Experiment 11 augmenter: a naf.Sometimes flow (augmenters applied randomly, per nlpaug docs) of
# WordNet synonym replacement, BERT ('bert-base-uncased') contextual word insertion and
# substitution, and random word deletion (aug_p=0.2).
# device_type is defined earlier in the notebook.
aug_11 = naf.Sometimes([
            naw.SynonymAug(aug_src='wordnet'),
            naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert", device=device_type),
            naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute", device=device_type),
            naw.random.RandomWordAug(action='delete', aug_p=0.2)
        ])

# Smoke test on a sample sentence.
text = 'The quick brown fox jumps over the lazy dog'

# Last expression of the cell, so the augmented text is displayed (a list of strings).
aug_11.augment(text)

['giant danger fox jumps over the work basket heel']

In [None]:
model_11 = MultimodalModel(num_epochs=5)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Start from a deep copy so the shared train_df is left as-is for the other experiments.
# (augment_text is defined earlier in the notebook; whether it mutates its input is not visible here.)
train_df_11 = train_df.copy(deep=True)
# Rebalance the training set with the synonym + contextual insert/substitute + deletion flow.
train_df_11 = augment_text(train_df_11, aug_11)

Data augmentation: rebalancing 4 times...
    Iteration 1
Elapsed time is 149s
    Iteration 2
Elapsed time is 146s
    Iteration 3
Elapsed time is 147s
    Iteration 4
Elapsed time is 146s


In [None]:
model_11.train(train_df_11, dev_df)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

Map:   0%|          | 0/2093 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1558,0.390397,0.904921,0.0,0.0,0.0
2,0.145,0.295353,0.913043,0.272,0.666667,0.170854
3,0.1183,0.293888,0.922121,0.519174,0.628571,0.442211
4,0.0589,0.38245,0.919732,0.358779,0.746032,0.236181
5,0.0792,0.437246,0.921644,0.484277,0.647059,0.386935


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
  _warn_prf(average, modifier, msg_start, len(result))
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-2500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-3500 already exi

In [None]:
model_11.evaluate_train(train_df_11)

Map:   0%|          | 0/11551 [00:00<?, ? examples/s]

{'eval_loss': 0.023345649242401123,
 'eval_accuracy': 0.9959310882174703,
 'eval_f1': 0.9940468651044965,
 'eval_precision': 0.9997452229299363,
 'eval_recall': 0.9884130982367758,
 'eval_runtime': 17.767,
 'eval_samples_per_second': 650.136,
 'eval_steps_per_second': 40.637,
 'epoch': 5.0}

In [None]:
model_11.evaluate_dev()

{'eval_loss': 0.43724560737609863,
 'eval_accuracy': 0.9216435738174868,
 'eval_f1': 0.4842767295597484,
 'eval_precision': 0.6470588235294118,
 'eval_recall': 0.3869346733668342,
 'eval_runtime': 3.3932,
 'eval_samples_per_second': 616.822,
 'eval_steps_per_second': 38.607,
 'epoch': 5.0}

In [None]:
del model_11