## Linear

In [None]:
# Install the third-party packages this notebook imports.
# %pip (instead of !pip) installs into the environment of the running kernel,
# so the packages are importable from the cells below.
# NOTE(review): versions are not pinned; pin them once known-good versions are confirmed.
%pip install nlpaug
%pip install nltk
%pip install sacremoses
%pip install datasets
%pip install accelerate -U
# Quoted so the shell does not try to glob-expand the [torch] extra.
%pip install "transformers[torch]"



In [None]:
import torch
import torch.nn as nn
from torchtext.data.utils import get_tokenizer
import pandas as pd
import accelerate
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import MarianMTModel, MarianTokenizer
from datasets import Dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter, defaultdict
import math
import copy
import random
import operator
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf
import time
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer
from transformers.modeling_outputs import TokenClassifierOutput
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
from transformers import Trainer, TrainingArguments
from torch.optim.lr_scheduler import _LRScheduler

# Fetch the NLTK resources (tokenizer models, stop-word lists, WordNet) by name.
# The last call is the final expression of the cell, so its return value is
# what the notebook displays as the cell output.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
from transformers import Trainer, get_linear_schedule_with_warmup, AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

class BaseModel(nn.Module):
    """Text-only RoBERTa binary classifier plus its Hugging Face ``Trainer`` setup.

    The wrapper owns a ``roberta-base`` tokenizer, a
    ``RobertaForSequenceClassification`` model (``self.model``) and the
    ``TrainingArguments`` used to fine-tune it.

    NOTE(review): ``train`` hands ``self.model`` (not this wrapper) to the
    ``Trainer``, so the custom head used by ``forward`` (``classifier``,
    ``activation``, ``loss_fn``) takes no part in the training/evaluation runs
    started from this class.

    NOTE(review): the ``train`` method defined here has the same name as
    ``nn.Module.train(mode)`` and replaces it on this wrapper.
    """

    def __init__(self, num_epochs=1, lr_scheduler_type='linear'):
        """Load the pretrained tokenizer/model and build the training arguments.

        Args:
            num_epochs: value passed as ``num_train_epochs``.
            lr_scheduler_type: value passed as ``lr_scheduler_type`` to
                ``TrainingArguments`` (e.g. ``'linear'``, ``'cosine'``).
        """
        super(BaseModel, self).__init__()

        self.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
        self.label2id = {"NEGATIVE": 0, "POSITIVE": 1}
        self.num_labels = 2

        self.num_epochs = num_epochs
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=self.num_labels, id2label=self.id2label, label2id=self.label2id)

        # Pieces of the custom head used only by forward().
        self.loss_fn = nn.BCELoss()
        self.classifier = nn.Linear(768, self.num_labels)  # 768 = roberta-base hidden size
        self.activation = nn.Sigmoid()
        self.dropout = nn.Dropout(0.1)

        # Created lazily by train(); the evaluate_* helpers need it.
        self.trainer = None
        self.lr_scheduler_type = lr_scheduler_type

        self.train_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=num_epochs,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            warmup_steps=500,
            weight_decay=0.01,
            logging_strategy='steps',
            logging_steps=10,
            evaluation_strategy="epoch",
            logging_dir='./logs',
            lr_scheduler_type=self.lr_scheduler_type,
        )

    def compute_metrics(self, pred):
        """Return accuracy, F1, precision and recall for a ``Trainer`` prediction object.

        Args:
            pred: object exposing ``label_ids`` and ``predictions``; the
                predicted class is the argmax over the last axis.

        Returns:
            dict with keys ``accuracy``, ``f1``, ``precision``, ``recall``
            (binary averaging).
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average="binary"
        )
        acc = accuracy_score(labels, preds)
        return {
            "accuracy": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
        }

    def apply_tokenizer(self, batch):
        """Tokenize ``batch["text"]`` with truncation/padding capped at 100 tokens."""
        return self.tokenizer(
            batch["text"],
            truncation=True,
            padding=True,
            max_length=100,
            add_special_tokens=True,
        )

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the custom sigmoid head on top of the RoBERTa encoder.

        Args:
            input_ids: token ids, forwarded to the encoder.
            attention_mask: optional mask, forwarded to the encoder.
            labels: optional targets. Either already shaped like the output
                (``(batch, num_labels)``) or class ids, which are one-hot
                encoded because ``BCELoss`` needs float targets with the same
                shape as its input.

        Returns:
            ``TokenClassifierOutput`` whose ``logits`` field holds the sigmoid
            activations (shape ``(-1, num_labels)``) and whose ``loss`` is the
            BCE loss, or ``None`` when no labels are given.
        """
        # self.model is a sequence-classification model whose output carries no
        # last_hidden_state, so query its encoder submodule directly.
        encoder_outputs = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask)

        # Representation of the first token of every sequence.
        cls_outputs = encoder_outputs.last_hidden_state[:, 0, :]

        dropout_output = self.dropout(cls_outputs)

        scores = self.classifier(dropout_output)

        logits = self.activation(scores).view(-1, self.num_labels).float()

        loss = None

        if labels is not None:
            if labels.shape == logits.shape:
                targets = labels.float()
            else:
                targets = nn.functional.one_hot(
                    labels.view(-1).long(), num_classes=self.num_labels
                ).float()
            loss = self.loss_fn(logits, targets)

        return TokenClassifierOutput(loss=loss, logits=logits)

    def train(self, train_df, dev_df):
        """Tokenize both frames and fine-tune ``self.model`` with a ``Trainer``.

        Args:
            train_df: pandas DataFrame used as the training set; must contain
                a ``text`` column (read by ``apply_tokenizer``).
            dev_df: pandas DataFrame used as the evaluation set.
        """
        train_hf = Dataset.from_pandas(train_df)
        dev_hf = Dataset.from_pandas(dev_df)

        tokenized_train = train_hf.map(self.apply_tokenizer, batched=True)
        tokenized_dev = dev_hf.map(self.apply_tokenizer, batched=True)

        # optimizer = AdamW(self.model.parameters(), lr=5e-5)
        # scheduler = ReduceLROnPlateau(optimizer, 'min', patience=10, factor=0.1, verbose=True)
        # num_training_steps = self.num_epochs * len(train_dataloader)
        # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1*num_training_steps, num_training_steps=num_training_steps)

        self.trainer = Trainer(
            model=self.model,
            args=self.train_args,
            tokenizer=self.tokenizer,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_dev,
            compute_metrics=self.compute_metrics,
            # optimizers=(optimizer, scheduler)
        )

        self.trainer.train()

    def evaluate_train(self, train_df):
        """Tokenize ``train_df`` and return ``self.trainer.evaluate`` on it.

        Requires ``train`` to have been called first (it creates the trainer).
        """
        input_hf = Dataset.from_pandas(train_df)
        tokenized_input = input_hf.map(self.apply_tokenizer, batched=True)
        return self.trainer.evaluate(tokenized_input)

    def evaluate_dev(self):
        """Return ``self.trainer.evaluate()`` using the trainer's own evaluation set.

        Requires ``train`` to have been called first (it creates the trainer).
        """
        return self.trainer.evaluate()

In [None]:
import torch
import torch.nn as nn
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

class MultimodalModel(nn.Module):
    """RoBERTa classifier wrapper with extra category / country / text-length inputs.

    The wrapper owns a ``roberta-base`` tokenizer, a
    ``RobertaForSequenceClassification`` model (``self.model``), embedding and
    linear layers for the extra features, and the ``TrainingArguments`` used
    for fine-tuning.

    NOTE(review): ``train`` hands ``self.model`` (not this wrapper) to the
    ``Trainer``. The layers defined here for the extra features and the
    ``forward`` method are therefore not what the training/evaluation runs
    started from this class execute.

    NOTE(review): ``forward`` expects ``categories`` / ``countries`` /
    ``text_lengths`` while ``apply_tokenizer`` emits ``category_num`` /
    ``country_num`` / ``text_length``; the names would have to be reconciled
    before this wrapper could be trained directly.

    NOTE(review): the ``train`` method defined here has the same name as
    ``nn.Module.train(mode)`` and replaces it on this wrapper.
    """

    def __init__(self, num_categories=10, num_countries=20, num_epochs=1, lr_scheduler_type='linear'):
        """Load the pretrained tokenizer/model and build layers and training arguments.

        Args:
            num_categories: number of rows in the category embedding table.
            num_countries: number of rows in the country embedding table.
            num_epochs: value passed as ``num_train_epochs``.
            lr_scheduler_type: value passed as ``lr_scheduler_type`` to
                ``TrainingArguments`` (e.g. ``'linear'``, ``'cosine'``,
                ``'polynomial'``).
        """
        super(MultimodalModel, self).__init__()
        self.num_labels = 2
        self.id2label = {0: "NEGATIVE", 1: "POSITIVE"}
        self.label2id = {"NEGATIVE": 0, "POSITIVE": 1}

        self.num_epochs = num_epochs
        self.lr_scheduler_type = lr_scheduler_type

        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        self.model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=self.num_labels, id2label=self.id2label, label2id=self.label2id)

        # Each extra feature is projected to a 4-dimensional vector.
        self.category_embedding = nn.Embedding(num_embeddings=num_categories, embedding_dim=4)
        self.country_embedding = nn.Embedding(num_embeddings=num_countries, embedding_dim=4)

        self.text_length_linear = nn.Linear(1, 4)

        # 768 text dimensions + 3 extra features x 4 dimensions each.
        self.combined_fc = nn.Linear(768 + 4 + 4 + 4, 512)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(512, self.num_labels)
        self.loss_fn = nn.CrossEntropyLoss()

        # Created lazily by train(); the evaluate_* helpers need it.
        self.trainer = None

        self.train_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=num_epochs,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=128,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=10,
            evaluation_strategy='epoch',
            lr_scheduler_type=self.lr_scheduler_type,
        )

    def apply_tokenizer(self, batch):
        """Tokenize ``batch["text"]`` and carry the extra feature columns along.

        Returns the tokenizer output extended with ``category_num``,
        ``country_num`` (copied as-is) and ``text_length`` (each value wrapped
        in a one-element list).
        """
        tokenized_inputs = self.tokenizer(
            batch["text"],
            truncation=True,
            padding=True,
            max_length=100,
            add_special_tokens=True,
        )

        tokenized_inputs['category_num'] = batch['category_num']
        tokenized_inputs['country_num'] = batch['country_num']
        tokenized_inputs['text_length'] = [[length] for length in batch['text_length']]

        return tokenized_inputs

    def compute_metrics(self, pred):
        """Return accuracy, F1, precision and recall for a ``Trainer`` prediction object.

        Args:
            pred: object exposing ``label_ids`` and ``predictions``; the
                predicted class is the argmax over the last axis.

        Returns:
            dict with keys ``accuracy``, ``f1``, ``precision``, ``recall``
            (binary averaging).
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average="binary"
        )
        acc = accuracy_score(labels, preds)
        return {
            "accuracy": acc,
            "f1": f1,
            "precision": precision,
            "recall": recall,
        }

    def forward(self, input_ids, attention_mask=None, categories=None, countries=None, text_lengths=None, labels=None):
        """Combine the text representation with the extra features and classify.

        Args:
            input_ids: token ids, forwarded to the encoder.
            attention_mask: optional mask, forwarded to the encoder.
            categories: integer category ids, one per example.
            countries: integer country ids, one per example.
            text_lengths: text length per example; cast to float and reshaped
                to a single column.
            labels: optional class ids; flattened before the cross-entropy loss.

        Returns:
            Tuple ``(logits, loss)``; ``loss`` is ``None`` without labels.
        """
        # The wrapper has no `roberta` attribute of its own; the encoder lives
        # inside the sequence-classification model.
        encoder_outputs = self.model.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # The encoder inside RobertaForSequenceClassification is created without
        # a pooling layer, so take the first-token hidden state instead of
        # pooler_output.
        pooled_output = encoder_outputs.last_hidden_state[:, 0, :]

        category_features = self.category_embedding(categories.view(-1))
        country_features = self.country_embedding(countries.view(-1))

        # nn.Linear needs floating-point input; lengths usually arrive as integers.
        text_length_features = self.text_length_linear(text_lengths.view(-1, 1).float())
        combined_features = torch.cat((pooled_output, category_features, country_features, text_length_features), dim=1)

        # Project the 780-dim concatenation down to the 512 inputs the
        # classifier layer was built for.
        hidden = torch.relu(self.combined_fc(combined_features))
        hidden = self.dropout(hidden)

        logits = self.classifier(hidden)

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels.view(-1))

        return logits, loss

    def train(self, train_df, dev_df):
        """Tokenize both frames and fine-tune ``self.model`` with a ``Trainer``.

        Args:
            train_df: pandas DataFrame used as the training set; must contain
                the ``text``, ``category_num``, ``country_num`` and
                ``text_length`` columns read by ``apply_tokenizer``.
            dev_df: pandas DataFrame used as the evaluation set.
        """
        train_hf = Dataset.from_pandas(train_df)
        dev_hf = Dataset.from_pandas(dev_df)

        tokenized_train = train_hf.map(self.apply_tokenizer, batched=True)
        tokenized_dev = dev_hf.map(self.apply_tokenizer, batched=True)

        self.trainer = Trainer(
            model=self.model,
            args=self.train_args,
            tokenizer=self.tokenizer,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_dev,
            compute_metrics=self.compute_metrics
        )

        self.trainer.train()

    def evaluate_train(self, train_df):
        """Tokenize ``train_df`` and return ``self.trainer.evaluate`` on it.

        Requires ``train`` to have been called first (it creates the trainer).
        """
        input_hf = Dataset.from_pandas(train_df)
        tokenized_input = input_hf.map(self.apply_tokenizer, batched=True)
        return self.trainer.evaluate(tokenized_input)

    def evaluate_dev(self):
        """Return ``self.trainer.evaluate()`` using the trainer's own evaluation set.

        Requires ``train`` to have been called first (it creates the trainer).
        """
        return self.trainer.evaluate()




In [None]:
# Column names of the CSV; this list is not referenced by the read below.
columns = ['id', 'identifier', 'category', 'country_code', 'text', 'multi_label']


def _binarize_label(multi_label):
    """Collapse the multi-class label: values 0 and 1 become 0, anything else 1."""
    if multi_label == 0 or multi_label == 1:
        return 0
    return 1


data_df = pd.read_csv("/content/bestprocaug.csv")

# Derive the binary target first, then discard every row that has a missing value.
data_df['label'] = data_df['multi_label'].apply(_binarize_label)
data_df = data_df.dropna()

# Character count of each text, used as an extra numeric feature.
data_df["text_length"] = data_df["text"].apply(len)

# 80/20 split with a fixed seed so the partition is reproducible.
train_df, dev_df = train_test_split(data_df, test_size=0.2, random_state=42)

In [None]:
def _encode_with_shared_vocabulary(train_values, dev_values):
    """Integer-encode two columns against one common category vocabulary.

    Encoding each split on its own assigns codes by the sorted unique values
    present in that split, so the same country/category could receive a
    different id in train and dev. Building the vocabulary from both splits
    keeps the ids consistent.

    Returns:
        Tuple ``(train_codes, dev_codes)`` of integer code arrays.
    """
    vocabulary = pd.Categorical(pd.concat([train_values, dev_values])).categories
    train_codes = pd.Categorical(train_values, categories=vocabulary).codes
    dev_codes = pd.Categorical(dev_values, categories=vocabulary).codes
    return train_codes, dev_codes


_country_train, _country_dev = _encode_with_shared_vocabulary(
    train_df['country_code'], dev_df['country_code']
)
_category_train, _category_dev = _encode_with_shared_vocabulary(
    train_df['category'], dev_df['category']
)

# .assign returns new frames, so nothing is written into a slice of data_df
# and re-running the cell gives the same result.
train_df = train_df.assign(country_num=_country_train, category_num=_category_train)
dev_df = dev_df.assign(country_num=_country_dev, category_num=_category_dev)

In [None]:
### eval f1s

# default ('linear', i.e. linear with warm-up) 0.9167213114754098

# scheduling
# cosine 0.9167213114754098
# polynomial 0.9138381201044387

## Linear

In [None]:
# Build the model wrapper for a 3-epoch run, requesting the 'linear' learning-rate schedule.
model_linear = MultimodalModel(num_epochs=3, lr_scheduler_type='linear')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune on the training split; the dev split is passed as the evaluation set.
model_linear.train(train_df, dev_df)

Map:   0%|          | 0/9240 [00:00<?, ? examples/s]

Map:   0%|          | 0/2311 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.19,0.335469,0.916486,0.862241,1.0,0.757842
2,0.2668,0.203257,0.931631,0.891929,0.980451,0.818068
3,0.0839,0.267112,0.941584,0.911475,0.95467,0.87202


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [None]:
# Re-tokenize the training split and evaluate the fine-tuned model on it.
model_linear.evaluate_train(train_df)

Map:   0%|          | 0/9240 [00:00<?, ? examples/s]

{'eval_loss': 0.045955561101436615,
 'eval_accuracy': 0.9886363636363636,
 'eval_f1': 0.9833041819049133,
 'eval_precision': 0.9922978177150192,
 'eval_recall': 0.9744721084147494,
 'eval_runtime': 49.2483,
 'eval_samples_per_second': 187.621,
 'eval_steps_per_second': 11.736,
 'epoch': 3.0}

In [None]:
# Evaluate with the trainer's own evaluation set (the dev split given to train()).
model_linear.evaluate_dev()

{'eval_loss': 0.26711151003837585,
 'eval_accuracy': 0.9415837299870186,
 'eval_f1': 0.9114754098360657,
 'eval_precision': 0.9546703296703297,
 'eval_recall': 0.8720200752823086,
 'eval_runtime': 12.3473,
 'eval_samples_per_second': 187.167,
 'eval_steps_per_second': 11.744,
 'epoch': 3.0}

In [None]:
# Drop the notebook's reference to this model before the next experiment.
del model_linear

## Polynomial

In [None]:
# Build the model wrapper for a 3-epoch run, requesting the 'polynomial' learning-rate schedule.
model_polynomial = MultimodalModel(num_epochs=3, lr_scheduler_type='polynomial')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune on the training split; the dev split is passed as the evaluation set.
model_polynomial.train(train_df, dev_df)

Map:   0%|          | 0/9240 [00:00<?, ? examples/s]

Map:   0%|          | 0/2311 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2161,0.331092,0.922977,0.874648,0.99679,0.779172
2,0.2158,0.215396,0.932929,0.896873,0.954674,0.845671
3,0.1001,0.267201,0.93942,0.908854,0.94452,0.875784


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [None]:
# Re-tokenize the training split and evaluate the fine-tuned model on it.
model_polynomial.evaluate_train(train_df)

Map:   0%|          | 0/9240 [00:00<?, ? examples/s]

{'eval_loss': 0.053351979702711105,
 'eval_accuracy': 0.9867965367965368,
 'eval_f1': 0.9805422647527912,
 'eval_precision': 0.9925734581853407,
 'eval_recall': 0.968799243618027,
 'eval_runtime': 49.2835,
 'eval_samples_per_second': 187.487,
 'eval_steps_per_second': 11.728,
 'epoch': 3.0}

In [None]:
# Evaluate with the trainer's own evaluation set (the dev split given to train()).
model_polynomial.evaluate_dev()

{'eval_loss': 0.26720118522644043,
 'eval_accuracy': 0.9394201644309823,
 'eval_f1': 0.9088541666666667,
 'eval_precision': 0.9445196211096076,
 'eval_recall': 0.875784190715182,
 'eval_runtime': 12.329,
 'eval_samples_per_second': 187.445,
 'eval_steps_per_second': 11.761,
 'epoch': 3.0}

In [None]:
# Drop the notebook's reference to this model before the next experiment.
del model_polynomial

## Cosine

In [None]:
# Build the model wrapper for a 3-epoch run, requesting the 'cosine' learning-rate schedule.
model_cosine = MultimodalModel(num_epochs=3, lr_scheduler_type='cosine')

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune on the training split; the dev split is passed as the evaluation set.
model_cosine.train(train_df, dev_df)

Map:   0%|          | 0/9240 [00:00<?, ? examples/s]

Map:   0%|          | 0/2311 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2124,0.338775,0.92341,0.87544,0.996795,0.780427
2,0.2222,0.206948,0.936824,0.90068,0.983655,0.830615
3,0.0639,0.253194,0.941584,0.911243,0.957182,0.869511


Checkpoint destination directory ./results/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./results/checkpoint-1500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


In [None]:
# Re-tokenize the training split and evaluate the fine-tuned model on it.
model_cosine.evaluate_train(train_df)

Map:   0%|          | 0/9240 [00:00<?, ? examples/s]

{'eval_loss': 0.05465716868638992,
 'eval_accuracy': 0.9863636363636363,
 'eval_f1': 0.9798722044728435,
 'eval_precision': 0.9935212180110139,
 'eval_recall': 0.9665931295304129,
 'eval_runtime': 49.2794,
 'eval_samples_per_second': 187.502,
 'eval_steps_per_second': 11.729,
 'epoch': 3.0}

In [None]:
# Evaluate with the trainer's own evaluation set (the dev split given to train()).
model_cosine.evaluate_dev()

{'eval_loss': 0.2531941533088684,
 'eval_accuracy': 0.9415837299870186,
 'eval_f1': 0.9112426035502958,
 'eval_precision': 0.9571823204419889,
 'eval_recall': 0.8695106649937264,
 'eval_runtime': 12.3536,
 'eval_samples_per_second': 187.071,
 'eval_steps_per_second': 11.737,
 'epoch': 3.0}

In [None]:
# Drop the notebook's reference to this model at the end of the experiment.
del model_cosine