In [1]:
import gc
import os
import sys
import numpy as np
import random
import pandas as pd
from tqdm.notebook import tqdm
import math
import torch
import transformers
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import GradScaler, autocast
from accelerate import Accelerator
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModel,
    AutoTokenizer
)
from transformers import AutoConfig

os.environ["TOKENIZERS_PARALLELISM"] = "false"
import warnings
from safetensors.torch import save_file, safe_open
warnings.simplefilter("ignore")

In [2]:
torch.__version__

'2.6.0+cu124'

In [3]:
! nvidia-smi

Tue Jun 10 19:04:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P0             26W /  250W |       3MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                     

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [5]:
model_ckpt = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
config = AutoConfig.from_pretrained(model_ckpt)
m = AutoModel.from_pretrained(model_ckpt)
tokenizer.save_pretrained('roberta_base_tokenizer')
word = m.embeddings.word_embeddings.weight


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

2025-06-10 19:04:17.803517: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749582258.012425      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749582258.069265      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
out = torch.rand(50265,768).split(128,1)

In [7]:
for i in range(len(out)):
    print(out[i].size())

torch.Size([50265, 128])
torch.Size([50265, 128])
torch.Size([50265, 128])
torch.Size([50265, 128])
torch.Size([50265, 128])
torch.Size([50265, 128])


In [8]:
word = m.embeddings.word_embeddings.weight
out = word.split(128,1)
for i in range(len(out)):
    tensors = {
        "embedding": out[i].contiguous()
    }
    save_file(tensors, f"roberta_base_embedding_{i}.safetensors")

In [9]:
out = []
for i in range(6):
    with safe_open(f"roberta_base_embedding_{i}.safetensors", framework="pt") as f:
        out.append(f.get_tensor('embedding'))
    

In [10]:
wb_new = torch.cat(out, dim=1)

In [11]:
wb_new.size()

torch.Size([50265, 768])

In [12]:
EPOCHS = 5
lr = 1e-3
SEED = 42
MAX_LEN = 128
BATCH_SIZE = 128
accumulation_steps = 2
seed_everything(SEED)

**Data Source**

from datasets import load_dataset


clinc = load_dataset("clinc_oos", "plus")

In [13]:
df = pd.read_csv('/kaggle/input/financial-sentiment-analysis/data.csv')
df.head(2)

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative


In [14]:
df = df.rename(columns={'Sentence':'Text'})
df.shape

(5842, 2)

In [15]:
mapping = {'positive':0,'negative':1,'neutral':2}
df['Target'] = df['Sentiment'].map(mapping)

In [16]:
df.head(5)

Unnamed: 0,Text,Sentiment,Target
0,The GeoSolutions technology will leverage Bene...,positive,0
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative,1
2,"For the last quarter of 2010 , Componenta 's n...",positive,0
3,According to the Finnish-Russian Chamber of Co...,neutral,2
4,The Swedish buyout firm has sold its remaining...,neutral,2


In [17]:
from sklearn.model_selection import train_test_split
n_classes = np.unique(df.Target).shape[0]
train,valid = train_test_split(df,test_size=0.2, random_state=42, shuffle=True, stratify=df['Target'])

In [18]:
# data_path = "../input/data-for-distilation"
# train = pd.read_csv("../input/data-for-distilation/Clinc_Train.csv")
# valid = pd.read_csv("../input/data-for-distilation/Clinc_valid.csv")
# n_classes = np.unique(train.Target).shape[0]
# train.head(2)

In [19]:
# train.Target.nunique()

In [20]:
!pip install einops



In [21]:
import torch
import torch.nn as nn
from einops import rearrange, reduce, repeat
from typing import Optional, Tuple


class EncoderAttention(nn.Module):
    def __init__(self, config, layer_idx: int) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )
        self.head_size = int(config.hidden_size // config.num_attention_heads)
        self.attention_bias = getattr(config, "attention_bias", True)
        self.layer_idx = layer_idx
        # self.qkv = nn.Linear(config.hidden_size,3*config.hidden_size)
        self.q = nn.Linear(
            config.hidden_size, config.hidden_size, bias=self.attention_bias
        )
        self.k = nn.Linear(
            config.hidden_size, config.hidden_size, bias=self.attention_bias
        )
        self.v = nn.Linear(
            config.hidden_size, config.hidden_size, bias=self.attention_bias
        )
        self.out = nn.Linear(
            config.hidden_size, config.hidden_size, bias=self.attention_bias
        )
        self.num_attention_heads = config.num_attention_heads

    def forward(
        self,
        hidden_state: torch.Tensor,
        attention_mask: torch.Tensor,
        freqs: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        q = self.q(hidden_state)
        k = self.k(hidden_state)
        v = self.v(hidden_state)
        # q,k,v = self.qkv(hidden_state).chunk(3, dim = -1) #b X l X d dim =-1 or 2
        # place holder for RoPe operation
        q = rearrange(q, "b l (h d) -> b h l d", h=self.num_attention_heads)
        k = rearrange(k, "b l (h d) -> b h l d", h=self.num_attention_heads)
        v = rearrange(v, "b l (h d) -> b h l d", h=self.num_attention_heads)
        if freqs is not None:
            q, k = apply_rotary_pos_emb(q, k, freqs)

        out = torch.nn.functional.scaled_dot_product_attention(
            query=q, key=k, value=v, attn_mask=attention_mask, is_causal=False
        )
        out = rearrange(out, "b h l d -> b l (h d)")
        out = self.out(out)
        return out



In [22]:
import torch
import torch.nn as nn
from einops import rearrange, reduce
from typing import Optional, Tuple




class RotaryEmbedding(nn.Module):
    """
    RotaryEmbedding is a PyTorch module that implements rotary positional embeddings for attention mechanisms.
    Args:
        config (object): Configuration object containing the following attributes:
            hidden_size (int): The hidden size of the model.
            num_attention_heads (int): The number of attention heads.
    Attributes:
        inv_freq (torch.Tensor): A tensor containing the inverse frequencies for the rotary embeddings.
    Methods:
        forward(seq_len):
            Computes the rotary positional embeddings for a given sequence length.
            Args:
                seq_len (int): The length of the input sequence.
            Returns:
                torch.Tensor: A tensor containing the rotary positional embeddings with shape (1, seq_len, dim).
    """

    def __init__(self, config):
        super().__init__()
        dim = int(config.hidden_size // config.num_attention_heads)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

    def forward(self, seq_len):
        t = torch.arange(seq_len, device=self.inv_freq.device).type_as(self.inv_freq)
        freqs = torch.einsum("i, j -> i j", t, self.inv_freq)

        return freqs[None, :, :]


def rotate_half(x):
    """
    Rotates half the hidden dimensions of the input tensor.

    Args:
        x (torch.Tensor): The input tensor to be rotated.

    Returns:
        torch.Tensor: The tensor with half of its hidden dimensions rotated.
    """
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
def apply_rotary_pos_emb(
    q, k, freqs, only_q: bool = False, unsqueeze_dim=1
) -> Tuple[torch.Tensor]:
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        freqs: precalculated frqs for sin cos
        only_q: bool = False for encoder decoder
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    emb = torch.cat((freqs, freqs), dim=-1)
    cos = emb.cos()
    sin = emb.sin()
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    #     print(cos.size(),sin.size(),q.size(),k.size())
    if only_q:
        q_embed = (q * cos) + (rotate_half(q) * sin)
    else:

        q_embed = (q * cos) + (rotate_half(q) * sin)
        k_embed = (k * cos) + (rotate_half(k) * sin)
        return q_embed, k_embed


# To do :  Alibi

In [23]:
import torch
import torch.nn as nn
from einops import rearrange, reduce
from typing import Optional, Tuple, Union



class FeedForward(nn.Module):
    def __init__(self, config, multiplier: Union[int, float] = 4) -> None:
        super().__init__()
        self.intermediate = nn.Linear(
            config.hidden_size, int(multiplier) * config.hidden_size
        )
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.layerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
       
        self.act_fn = nn.GELU()
        self.out = nn.Linear(int(multiplier) * config.hidden_size, config.hidden_size)

    def forward(
        self, hidden_state: torch.Tensor,input_tensor=None
    ) -> torch.Tensor:
        output = self.intermediate(hidden_state)
        output = self.act_fn(output)
        output = self.out(output)
        output = self.dropout(output)
        if input_tensor is None:
            output = self.layerNorm(output)
        else:
            output = self.layerNorm(output+input_tensor)
            
        return output

In [24]:
import torch
import torch.nn as nn
from typing import Optional, Tuple

from dataclasses import dataclass



@dataclass
class EncoderOutput(object):
    logits: torch.Tensor


@dataclass
class MLMOutput(object):
    hidden_state: torch.Tensor
    logits: torch.Tensor


class EncoderLayer(nn.Module):
    def __init__(self, config, layer_idx: int, attention_type: str = None) -> None:
        super().__init__()
        self.attention = (
            EncoderAttention(config, layer_idx=layer_idx)
          
        )
        
        self.feed_forward = FeedForward(config)
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_state: torch.Tensor,
        attention_mask: torch.Tensor,
        freqs: torch.Tensor = None,
    ) -> torch.Tensor:
        # residual = hidden_state
        out = self.attention(
            hidden_state=hidden_state, attention_mask=attention_mask, freqs=freqs
        )
        # out = residual+out
        # residual = out
        out = self.feed_forward(out,hidden_state)
        return out


class EncoderModel(nn.Module):

    def __init__(
        self,
        config,
        pos_embedding_type: Optional[str] = "absolute",
        attention_type: str = None,
    ) -> None:
        super().__init__()
        self.word_embeddings = nn.Embedding(
            config.vocab_size,
            config.hidden_size,
            padding_idx=getattr(config, "pad_token_id", None),
        )
       
       
        self.emb_freq = RotaryEmbedding(config)(config.max_position_embeddings)
       
        self.all_layer = nn.ModuleList(
            [
                EncoderLayer(config, layer_idx, attention_type)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )

    def forward(
        self, input_ids: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        bsz, seqlen = input_ids.shape
        hidden_state = self.word_embeddings(input_ids)
        freqs = None
        
        freqs = self.emb_freq[:, :seqlen].to(input_ids.device)

        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2).type_as(hidden_state)
        attention_mask = (1.0 - attention_mask) * torch.finfo(hidden_state.dtype).min

        for layer in self.all_layer:
            hidden_state = layer(hidden_state, attention_mask, freqs)
        return EncoderOutput(hidden_state)

    @classmethod
    def from_config(
        cls,
        config,
        pos_embedding_type: Optional[str] = "absolute",
        attention_type: str = None,
    ) -> nn.Module:
        return cls(config, pos_embedding_type, attention_type)

In [25]:
config

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.51.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

In [26]:
# config.hidden_size = 256
# config.intermediate_size = 1024
# config.num_attention_heads = 8

In [27]:
config.num_hidden_layers = 6
model = EncoderModel(config,pos_embedding_type='rope')

In [28]:
model

EncoderModel(
  (word_embeddings): Embedding(50265, 768, padding_idx=1)
  (all_layer): ModuleList(
    (0-5): 6 x EncoderLayer(
      (attention): EncoderAttention(
        (q): Linear(in_features=768, out_features=768, bias=True)
        (k): Linear(in_features=768, out_features=768, bias=True)
        (v): Linear(in_features=768, out_features=768, bias=True)
        (out): Linear(in_features=768, out_features=768, bias=True)
      )
      (feed_forward): FeedForward(
        (intermediate): Linear(in_features=768, out_features=3072, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (layerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (act_fn): GELU(approximate='none')
        (out): Linear(in_features=3072, out_features=768, bias=True)
      )
    )
  )
)

In [29]:
# mm = nn.AdaptiveAvgPool1d(256)

In [30]:
# out = mm(m.embeddings.word_embeddings.weight)

In [31]:
model.word_embeddings.weight = nn.Parameter(wb_new) #.embeddings.word_embeddings.weight #Ex768 to Ex256

In [32]:
class ClinicModel(nn.Module):
    def __init__(self,model,n_classes=n_classes):
        super(ClinicModel, self).__init__()
        self.model = model
        self.output = nn.Linear(768, n_classes)

    def forward(self, ids, mask):
        sequence_output = self.model(ids, mask).logits[:, 0, :]
        # print(sequence_output.size())
        #         sequence_output = sequence_output[:, 0, :]
        logits = self.output(sequence_output)
        return logits

In [33]:
train_texts = train["Text"].values.tolist()
val_texts = valid["Text"].values.tolist()
train_labels = train["Target"].values.tolist()
val_labels = valid["Target"].values.tolist()
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


class ClinicDatasetV2(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return {
            "ids": item.get("input_ids"),
            "mask": item.get("attention_mask"),
            "labels": item.get("labels"),
        }

    def __len__(self):
        return len(self.labels)


train_loader = torch.utils.data.DataLoader(
    ClinicDatasetV2(train_encodings, train_labels),
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)
val_loader = torch.utils.data.DataLoader(
    ClinicDatasetV2(val_encodings, val_labels),
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
)

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [35]:
def valid_func(model, val_loader):
    model.eval()
    loss_fn = torch.nn.CrossEntropyLoss()
    PROB = []
    TARGETS = []
    losses = []
    PREDS = []

    for batch_idx, data in enumerate(val_loader):
        input_ids = data["ids"].to(device)
        input_masks = data["mask"].to(device)
        targets = data["labels"].long().view(-1).to(device)
        with torch.no_grad():
            logits = model(input_ids, input_masks)
            
        # logits = logits.argmax(logits, 1)
            
        # logits, targets = accelerator.gather_for_metrics((logits, targets))

        PREDS += [torch.argmax(logits, 1).detach().cpu()]
        TARGETS += [targets.detach().cpu()]

        loss = loss_fn(logits, targets)
        losses.append(loss.item())

    PREDS = torch.cat(PREDS).cpu().numpy()
    TARGETS = torch.cat(TARGETS).cpu().numpy()
    accuracy = (PREDS == TARGETS).mean()

    loss_valid = np.mean(losses)
    return loss_valid, accuracy

In [36]:
model = ClinicModel(model)

In [37]:
model

ClinicModel(
  (model): EncoderModel(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (all_layer): ModuleList(
      (0-5): 6 x EncoderLayer(
        (attention): EncoderAttention(
          (q): Linear(in_features=768, out_features=768, bias=True)
          (k): Linear(in_features=768, out_features=768, bias=True)
          (v): Linear(in_features=768, out_features=768, bias=True)
          (out): Linear(in_features=768, out_features=768, bias=True)
        )
        (feed_forward): FeedForward(
          (intermediate): Linear(in_features=768, out_features=3072, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (layerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (act_fn): GELU(approximate='none')
          (out): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
  )
  (output): Linear(in_features=768, out_features=3, bias=True)
)

In [38]:
def main(model,train_loader,val_loader,lr=1e-4,num_epochs= 10,name='Rope_classification'):

    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    num_train_optimization_steps = int(EPOCHS * len(train_loader) / accumulation_steps)
    model.to(device)
    model.train()
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer,total_iters=5)
    use_amp=True
    if use_amp:
        scaler = torch.cuda.amp.GradScaler()
        
    for epoch in range(num_epochs):
        avg_loss = 0.0
        model.train()
        loss_list = []
        for step, data in enumerate(train_loader):
            input_ids = data["ids"].to(device)
            input_masks = data["mask"].to(device)
            targets = data["labels"].long().view(-1).to(device)
            if use_amp:
                with torch.cuda.amp.autocast():
                     pred = model(input_ids,input_masks)
                     loss = loss_fn(pred, targets)
                scaler.scale(loss).backward()
                if step % accumulation_steps == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
            else:
                pred = model(input_ids,input_masks)
                loss = loss_fn(pred, targets)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            loss_list.append(loss.detach().cpu().item())
                
        avg_loss = np.round(np.mean(loss_list), 4)
        scheduler.step()  
        vloss, vaccuracy = valid_func(model, val_loader)
        print(f'loss-{avg_loss}  vloss-{vloss}  vaccuracy-{vaccuracy} lr-{scheduler.get_lr()}')
        
    torch.save(model.state_dict(),'rope_classification_model.pt')

#clinc_oos

2. 
 #modified residuals slow convergence
 #Lr = 1e-4  torch.optim.lr_scheduler.LinearLR(optimizer,total_iters=5) 768 6 layers
#loss-3.519  vloss-1.416838092803955  vaccuracy-0.6712903225806451 lr-[6.533333333333334e-05]
# loss-0.7406  vloss-0.7829630851745606  vaccuracy-0.8187096774193549 lr-[7.714285714285715e-05]
# loss-0.2815  vloss-0.6723056697845459  vaccuracy-0.8435483870967742 lr-[8.962962962962965e-05]
# loss-0.1328  vloss-0.6958784103393555  vaccuracy-0.847741935483871 lr-[0.00010242424242424244]
# loss-0.076  vloss-0.7030221927165985  vaccuracy-0.8580645161290322 lr-[0.00011538461538461538]

3.
# No resudiols excpe norm(a+b)
 #Lr = 1e-4  torch.optim.lr_scheduler.LinearLR(optimizer,total_iters=5) 768 6 layers
# loss-4.6004  vloss-3.754952154159546  vaccuracy-0.28483870967741937 lr-[6.533333333333334e-05]
# loss-1.8394  vloss-1.0517843627929688  vaccuracy-0.7516129032258064 lr-[7.714285714285715e-05]
# loss-0.5617  vloss-0.6701418471336364  vaccuracy-0.847741935483871 lr-[8.962962962962965e-05]
# loss-0.2628  vloss-0.5679526901245118  vaccuracy-0.8625806451612903 lr-[0.00010242424242424244]
# loss-0.126  vloss-0.557751653790474  vaccuracy-0.8729032258064516 lr-[0.00011538461538461538]

4.
# No resudiols excpe norm(a+b)
 #Lr = 1e-4  torch.optim.lr_scheduler.LinearLR(optimizer,total_iters=5) 256-1024 8 heads 6 layers
# loss-5.0718  vloss-4.951975498199463  vaccuracy-0.02870967741935484 lr-[6.533333333333334e-05]
# loss-4.7267  vloss-4.321675539016724  vaccuracy-0.12870967741935485 lr-[7.714285714285715e-05]
# loss-3.2132  vloss-2.305281443595886  vaccuracy-0.58 lr-[8.962962962962965e-05]
# loss-1.822  vloss-1.5857904052734375  vaccuracy-0.7148387096774194 lr-[0.00010242424242424244]
# loss-1.1891  vloss-1.2349900889396668  vaccuracy-0.7706451612903226 lr-[0.00011538461538461538]
# loss-0.8445  vloss-1.0171792769432069  vaccuracy-0.8041935483870968 lr-[0.0001]
# loss-0.6212  vloss-0.9189858269691468  vaccuracy-0.8216129032258065 lr-[0.0001]
# loss-0.4807  vloss-0.8313448286056518  vaccuracy-0.8367741935483871 lr-[0.0001]
# loss-0.371  vloss-0.7519079637527466  vaccuracy-0.8406451612903226 lr-[0.0001]
# loss-0.2944  vloss-0.7201029181480407  vaccuracy-0.8519354838709677 lr-[0.0001]

5.
# No resudiols excpe norm(a+b)
#with roberta embbding load m.emd = orginal
 #Lr = 1e-4  torch.optim.lr_scheduler.LinearLR(optimizer,total_iters=5) 768 6 layers
# loss-4.6015  vloss-3.510575141906738  vaccuracy-0.4138709677419355 lr-[6.533333333333334e-05]
# loss-1.2933  vloss-0.6621843695640564  vaccuracy-0.8429032258064516 lr-[7.714285714285715e-05]
# loss-0.3632  vloss-0.4367950350046158  vaccuracy-0.8919354838709678 lr-[8.962962962962965e-05]
# loss-0.1829  vloss-0.41328111618757246  vaccuracy-0.9112903225806451 lr-[0.00010242424242424244]
# loss-0.0984  vloss-0.3604964795708656  vaccuracy-0.9187096774193548 lr-[0.00011538461538461538]
# loss-0.0554  vloss-0.36992620140314103  vaccuracy-0.9190322580645162 lr-[0.0001]
# loss-0.0323  vloss-0.39370779871940614  vaccuracy-0.9196774193548387 lr-[0.0001]
# loss-0.0211  vloss-0.37983833193778993  vaccuracy-0.9187096774193548 lr-[0.0001]
# loss-0.015  vloss-0.4007080364227295  vaccuracy-0.9219354838709677 lr-[0.0001]
# loss-0.0122  vloss-0.3864754791557789  vaccuracy-0.9248387096774193 lr-[0.0001]

6.
# No resudiols excpe norm(a+b)
#with roberta embbding load m.emd = orginal Ex256 nn.adaptive(256)
 #Lr = 1e-4  torch.optim.lr_scheduler.LinearLR(optimizer,total_iters=5) 256-1024 8 heads 6 layers
# loss-5.0258  vloss-4.900219707489014  vaccuracy-0.03225806451612903 lr-[6.533333333333334e-05]
# loss-4.7275  vloss-4.23483585357666  vaccuracy-0.2567741935483871 lr-[7.714285714285715e-05]
# loss-2.644  vloss-1.637240972518921  vaccuracy-0.7216129032258064 lr-[8.962962962962965e-05]
# loss-1.2201  vloss-1.0289009785652161  vaccuracy-0.8196774193548387 lr-[0.00010242424242424244]
# loss-0.7456  vloss-0.8052865409851074  vaccuracy-0.8412903225806452 lr-[0.00011538461538461538]
# loss-0.5062  vloss-0.6784856510162354  vaccuracy-0.8667741935483871 lr-[0.0001]
# loss-0.3587  vloss-0.5841309988498687  vaccuracy-0.8806451612903226 lr-[0.0001]
# loss-0.2681  vloss-0.5630187004804611  vaccuracy-0.8877419354838709 lr-[0.0001]
# loss-0.2035  vloss-0.5175435137748718  vaccuracy-0.8912903225806451 lr-[0.0001]
# loss-0.1571  vloss-0.5029506856203079  vaccuracy-0.8970967741935484 lr-[0.0001]

7.
# No resudiols excpe norm(a+b)
#with roberta embbding load m.emd = orginal Ex256 nn.adaptive(256)
# /kaggle/input/financial-sentiment-analysis
 #Lr = 1e-4  torch.optim.lr_scheduler.LinearLR(optimizer,total_iters=5) 256-1024 8 heads 6 layers
# loss-1.0498  vloss-0.9539654016494751  vaccuracy-0.5355004277159966 lr-[6.533333333333334e-05]
# loss-0.9553  vloss-0.9168255388736725  vaccuracy-0.5406330196749358 lr-[7.714285714285715e-05]
# loss-0.8969  vloss-0.8359969079494476  vaccuracy-0.6313088109495295 lr-[8.962962962962965e-05]
# loss-0.8281  vloss-0.7813791334629059  vaccuracy-0.6441402908468776 lr-[0.00010242424242424244]
# loss-0.7617  vloss-0.7287110447883606  vaccuracy-0.660393498716852 lr-[0.00011538461538461538]
# loss-0.672  vloss-0.7061142563819885  vaccuracy-0.688622754491018 lr-[0.0001]
# loss-0.5486  vloss-0.6293993830680847  vaccuracy-0.7245508982035929 lr-[0.0001]
# loss-0.422  vloss-0.6116037964820862  vaccuracy-0.7408041060735672 lr-[0.0001]
# loss-0.3374  vloss-0.619420051574707  vaccuracy-0.7450812660393499 lr-[0.0001]
# loss-0.2694  vloss-0.6812280535697937  vaccuracy-0.7416595380667237 lr-[0.0001]

7.0

In [39]:
main(model,train_loader,val_loader)

loss-1.1646  vloss-0.8836344659328461  vaccuracy-0.6313088109495295 lr-[6.533333333333334e-05]
loss-0.8567  vloss-0.7788777053356171  vaccuracy-0.6467065868263473 lr-[7.714285714285715e-05]
loss-0.7368  vloss-0.6653479874134064  vaccuracy-0.6988879384088965 lr-[8.962962962962965e-05]
loss-0.5746  vloss-0.5574774205684662  vaccuracy-0.7698887938408896 lr-[0.00010242424242424244]
loss-0.466  vloss-0.5204420030117035  vaccuracy-0.7741659538066724 lr-[0.00011538461538461538]
loss-0.3581  vloss-0.5468204647302628  vaccuracy-0.7741659538066724 lr-[0.0001]
loss-0.2688  vloss-0.5460979223251343  vaccuracy-0.7630453378956373 lr-[0.0001]
loss-0.1813  vloss-0.6542305141687393  vaccuracy-0.7399486740804107 lr-[0.0001]
loss-0.1506  vloss-0.75699542760849  vaccuracy-0.7305389221556886 lr-[0.0001]
loss-0.1448  vloss-0.805930107831955  vaccuracy-0.7339606501283148 lr-[0.0001]
