In [9]:
!mkdir src

mkdir: cannot create directory ‘src’: File exists


In [10]:
%%writefile ./src/__init__.py
from .classification_modeling import CustomClassifierEncoder

Overwriting ./src/__init__.py


In [11]:
%%writefile ./src/poolings.py
import torch
import torch.nn as nn
import torch.nn.functional as F

class ContextPooler(nn.Module):
    """Pool a sequence by projecting the hidden state at index 0 of dim 1.

    Args:
        config: object providing ``pooler_hidden_size``, ``pooler_dropout``,
            ``pooler_hidden_act`` and ``hidden_size``.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.pooler_hidden_size, config.pooler_hidden_size)
        self.dropout = StableDropout(config.pooler_dropout)
        self.config = config

    def forward(self, hidden_states, mask=None):
        """Return ``act(dense(dropout(hidden_states[:, 0])))``.

        ``mask`` is accepted for signature compatibility and is not used.
        """
        # ACT2FN is not imported at module level in this file; import it here
        # so the module stays importable and this method does not raise NameError.
        from transformers.activations import ACT2FN

        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        context_token = hidden_states[:, 0]
        context_token = self.dropout(context_token)
        pooled_output = self.dense(context_token)
        pooled_output = ACT2FN[self.config.pooler_hidden_act](pooled_output)
        return pooled_output

    @property
    def output_dim(self):
        """Value of ``config.hidden_size``."""
        return self.config.hidden_size


class XSoftmax(torch.autograd.Function):
    """
    Masked Softmax which is optimized for saving memory

    Args:
      input (:obj:`torch.tensor`): The input tensor that will apply softmax.
      mask (:obj:`torch.IntTensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation.
      dim (int): The dimension that will apply softmax

    Example::
      import torch
      # Make a tensor
      x = torch.randn([4,20,100])
      # Create a mask
      mask = (x>0).int()
      # Function.apply only takes positional arguments
      y = XSoftmax.apply(x, mask, -1)
    """

    @staticmethod
    def forward(ctx, input, mask, dim):
        """Softmax over ``dim`` with positions where ``mask == 0`` forced to 0."""
        ctx.dim = dim
        # Positions to ignore. The legacy torch<1.2 byte-mask branch was dropped:
        # it relied on a `version` name that is never imported in this module.
        rmask = ~(mask.bool())

        output = input.masked_fill(rmask, float("-inf"))
        output = torch.softmax(output, ctx.dim)
        # Zero the ignored positions (also clears NaNs of fully masked slices).
        output.masked_fill_(rmask, 0)
        ctx.save_for_backward(output)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Gradient w.r.t. ``input``; ``mask`` and ``dim`` receive no gradient."""
        (output,) = ctx.saved_tensors
        # Closed-form softmax backward: y * (g - sum(g * y)). Replaces the
        # private `_softmax_backward_data` helper, which was never imported.
        # Masked positions have y == 0, so their gradient is 0.
        weighted = (grad_output * output).sum(dim=ctx.dim, keepdim=True)
        input_grad = output * (grad_output - weighted)
        return input_grad, None, None


class DropoutContext:
    """Mutable record of one dropout application.

    Holds the dropout probability, an optional cached mask, a scale applied to
    the probability, and a flag saying whether the cached mask may be reused.
    """

    def __init__(self):
        # Defaults: zero probability, no cached mask, unit scale, reuse allowed.
        self.dropout, self.mask, self.scale, self.reuse_mask = 0, None, 1, True


def get_mask(input, local_context):
    """Resolve the dropout probability and the boolean drop mask for ``input``.

    Args:
        input: tensor whose shape/dtype/device the sampled mask follows.
        local_context: either a plain dropout probability or a
            ``DropoutContext`` carrying probability, scale and a cached mask.

    Returns:
        ``(mask, dropout)`` where ``mask`` is a bool tensor with True at the
        positions to drop (or None when nothing is sampled or cached) and
        ``dropout`` is the effective probability.
    """
    uses_context = isinstance(local_context, DropoutContext)
    if uses_context:
        dropout = local_context.dropout * local_context.scale
        mask = local_context.mask if local_context.reuse_mask else None
    else:
        dropout = local_context
        mask = None

    if dropout > 0 and mask is None:
        # bernoulli_(1 - p) marks kept positions with 1; invert to get drops.
        # The legacy torch<1.2 byte-mask branch was removed: it depended on a
        # `version` name that is never imported in this module (NameError).
        mask = (1 - torch.empty_like(input).bernoulli_(1 - dropout)).bool()

    # Cache the first mask so later calls with the same context can reuse it.
    if uses_context and local_context.mask is None:
        local_context.mask = mask

    return mask, dropout


class XDropout(torch.autograd.Function):
    """Dropout implemented with ``masked_fill`` on a boolean mask plus a rescale."""

    @staticmethod
    def forward(ctx, input, local_ctx):
        """Zero the masked entries of ``input`` and rescale by ``1 / (1 - p)``."""
        drop_mask, prob = get_mask(input, local_ctx)
        ctx.scale = 1.0 / (1 - prob)
        if prob > 0:
            ctx.save_for_backward(drop_mask)
            zeroed = input.masked_fill(drop_mask, 0)
            return zeroed * ctx.scale
        # No dropout requested: pass the input through untouched.
        return input

    @staticmethod
    def backward(ctx, grad_output):
        """Apply the same mask and scale to the incoming gradient."""
        if ctx.scale > 1:
            (drop_mask,) = ctx.saved_tensors
            zeroed = grad_output.masked_fill(drop_mask, 0)
            return zeroed * ctx.scale, None
        # scale <= 1 means forward saved no mask.
        return grad_output, None


class StableDropout(torch.nn.Module):
    """
    Dropout module that delegates to ``XDropout`` while training.

    Args:

        drop_prob (float): the dropout probability

    """

    def __init__(self, drop_prob):
        super().__init__()
        self.drop_prob = drop_prob
        self.count = 0
        self.context_stack = None

    def forward(self, x):
        """
        Apply dropout to ``x`` in training mode; return ``x`` unchanged otherwise.

        Args:
            x (:obj:`torch.tensor`): The input tensor to apply dropout

        """
        dropout_active = self.training and self.drop_prob > 0
        if not dropout_active:
            return x
        return XDropout.apply(x, self.get_context())

    def clear_context(self):
        """Forget all stored contexts and reset the call counter."""
        self.count = 0
        self.context_stack = None

    def init_context(self, reuse_mask=True, scale=1):
        """Enable context tracking and push ``reuse_mask``/``scale`` to every stored context."""
        if self.context_stack is None:
            self.context_stack = []
        self.count = 0
        for entry in self.context_stack:
            entry.reuse_mask, entry.scale = reuse_mask, scale

    def get_context(self):
        """Return the next ``DropoutContext``, or the bare probability when tracking is off."""
        stack = self.context_stack
        if stack is None:
            return self.drop_prob
        if self.count >= len(stack):
            stack.append(DropoutContext())
        current = stack[self.count]
        current.dropout = self.drop_prob
        self.count += 1
        return current

class MeanPooling(nn.Module):
    """Mask-weighted mean over dimension 1 of ``last_hidden_state``."""

    def __init__(self, clamp_min=1e-9):
        super().__init__()
        # Lower bound on the per-row mask sum used as the divisor.
        self.clamp_min = clamp_min

    def forward(self, last_hidden_state, attention_mask):
        """Return ``sum(hidden * mask, dim=1) / clamp(sum(mask, dim=1))``."""
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        totals = (last_hidden_state * weights).sum(1)
        counts = weights.sum(1).clamp(min=self.clamp_min)
        return totals / counts
    
class MaxPooling(nn.Module):
    """Maximum over dimension 1, with masked-out positions replaced by -1e4."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the per-feature max; the input tensor is not modified."""
        expanded_mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # masked_fill returns a new tensor, so no explicit clone is required.
        filled = last_hidden_state.masked_fill(expanded_mask == 0, -1e4)
        return torch.max(filled, dim=1).values

class MinPooling(nn.Module):
    """Minimum over dimension 1, with masked-out positions replaced by 1e4."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the per-feature min; the input tensor is not modified."""
        expanded_mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        # masked_fill returns a new tensor, so no explicit clone is required.
        filled = last_hidden_state.masked_fill(expanded_mask == 0, 1e4)
        return torch.min(filled, dim=1).values

class Multisample_Dropout(nn.Module):
    """Average a module's output over several dropout rates (0.1, 0.2, ...)."""

    def __init__(self, drop_range=5, predrop=0.0,):
        super().__init__()
        self.dropout = nn.Dropout(predrop)
        # One dropout layer per rate: (i + 1) * 0.1 for i in range(drop_range).
        rates = [(i + 1) * .1 for i in range(drop_range)]
        self.dropouts = nn.ModuleList(nn.Dropout(rate) for rate in rates)

    def forward(self, x, module):
        """Apply ``module`` to each dropped-out copy of ``x`` and average the results."""
        x = self.dropout(x)
        samples = [module(drop(x)) for drop in self.dropouts]
        return torch.stack(samples, dim=0).mean(dim=0)

class Multisample_StableDropout(nn.Module):
    """Average a module's output over several ``StableDropout`` rates (0.1, 0.2, ...).

    Args:
        drop_range: number of dropout layers; layer ``i`` uses rate ``(i + 1) * 0.1``.
        predrop: rate of the dropout applied once before the multi-sample pass.
    """

    def __init__(self, drop_range=5, predrop=0.0,):
        # Fixed: the original called super(Multisample_Dropout, self), which
        # raises TypeError because this class does not derive from that one.
        super().__init__()
        self.dropout = StableDropout(predrop)
        self.dropouts = nn.ModuleList([StableDropout((i+1)*.1) for i in range(drop_range)])

    def forward(self, x, module):
        """Apply ``module`` to each dropped-out copy of ``x`` and average the results."""
        x = self.dropout(x)
        return torch.mean(torch.stack([module(dropout(x)) for dropout in self.dropouts],dim=0),dim=0)

class WeightedLayerPooling(nn.Module):
    """Learned weighted average of the last ``layers`` entries of a hidden-state sequence."""

    def __init__(self, layers = 12):
        super().__init__()
        self.layers = layers
        # One learnable weight per pooled layer, initialised to 1.
        initial = torch.tensor([1] * layers, dtype=torch.float)
        self.layer_weights = nn.Parameter(initial)

    def forward(self, all_hidden_states):
        """Stack the states, keep the last ``layers`` and return their weighted mean."""
        stacked = torch.stack(all_hidden_states, dim=0)
        selected = stacked[-self.layers:, :, :, :]
        weights = self.layer_weights[:, None, None, None].expand(selected.size())
        numerator = (weights * selected).sum(dim=0)
        return numerator / self.layer_weights.sum()

class Weighted_Linear(nn.Module):
    """Weighted layer average followed by masked mean pooling."""

    def __init__(self, hidden_size, n_layers=12):
        super().__init__()
        self.hidden_size = hidden_size
        self.cat_size = 3 * hidden_size

        self.layer_pooler = WeightedLayerPooling(n_layers)
        self.sequence_pooler = MeanPooling()

    def forward(self, x, mask):
        """Pool ``x.hidden_states`` across layers, then across the sequence using ``mask``."""
        layer_mix = self.layer_pooler(x.hidden_states)
        return self.sequence_pooler(layer_mix, mask)

class Cat_LSTM(nn.Module):
    """Concatenate the last ``n_layers`` hidden states, run a BiLSTM, then mean-pool."""

    def __init__(self, hidden_size, n_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.cat_size = hidden_size * n_layers

        self.sequence_pooler = MeanPooling(1e-9)
        # Half-width per direction; the bidirectional output is ~cat_size wide.
        self.rnn = Bi_RNN_FOUT(self.cat_size, self.cat_size // 2)

    def forward(self, x, mask):
        """Return the masked mean of the BiLSTM run over concatenated layer features."""
        layer_states = x.hidden_states[-self.n_layers:]
        features = torch.cat(layer_states, dim=-1)

        # Zero the masked positions before they enter the recurrent layer.
        keep = mask.unsqueeze(-1).expand(features.size()).float()
        encoded = self.rnn(features * keep)

        return self.sequence_pooler(encoded, mask)

class LayerBaseLSTM(nn.Module):
    """Adapter that feeds ``x.hidden_states`` into ``LSTM_Layer_Pooling``.

    ``extra_head_instances`` is accepted but not used by this class.
    """

    def __init__(self, hidden_size,n_layers,extra_head_instances):
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.cat_size = hidden_size * n_layers
        self.pooler = LSTM_Layer_Pooling(hidden_size, num_hidden_layers=self.n_layers)

    def forward(self, x, mask):
        """Run the layer-wise pooler on the hidden states carried by ``x``."""
        return self.pooler(x.hidden_states, mask)

class LSTM_Layer_Pooling(nn.Module):
    """Mean-pool each layer's hidden states, then run an LSTM/GRU across the layers."""

    def __init__(self, hidden_size, num_hidden_layers=12, is_lstm=True,bidirectional=True):
        super().__init__()
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.is_lstm = is_lstm

        # The attribute is named `lstm` even when a GRU is selected.
        recurrent_cls = nn.LSTM if self.is_lstm else nn.GRU
        self.lstm = recurrent_cls(
            self.hidden_size,
            self.hidden_size,
            bidirectional=self.bidirectional,
            batch_first=True
        )

        # clamp_min of 0: the mask sum is used as the divisor without a floor.
        self.pooling = MeanPooling(.0)

    def forward(self, all_hidden_states, mask):
        """Return the recurrent output at the last position of the layer axis."""
        recent_layers = all_hidden_states[-self.num_hidden_layers:]
        pooled_layers = [self.pooling(layer_i, mask) for layer_i in recent_layers]
        layer_sequence = torch.stack(pooled_layers, dim=1)
        out, _ = self.lstm(layer_sequence)
        return out[:, -1, :]

class Bi_RNN(nn.Module):
    """Bidirectional LSTM that returns one vector per sequence."""

    def __init__(self, size, hidden_size, layers=1):
        super().__init__()
        self.layers = layers
        self.hidden_size = hidden_size
        self.rnn = nn.LSTM(
            size,
            hidden_size,
            num_layers=layers,
            bidirectional=True,
            bias=False,
            batch_first=True,
        )

    def forward(self, x):
        """Concatenate the forward half at the last step with the backward half at step 0."""
        sequence, _ = self.rnn(x)
        width = self.hidden_size
        forward_last = sequence[:, -1, :width]
        backward_first = sequence[:, 0, width:]
        return torch.cat((forward_last, backward_first), dim=-1)

class Bi_RNN_FOUT(nn.Module):
    """Bidirectional LSTM that returns the full output sequence."""

    def __init__(self, size, hidden_size, layers=1):
        super().__init__()
        self.layers = layers
        self.hidden_size = hidden_size
        self.rnn = nn.LSTM(
            size,
            hidden_size,
            num_layers=layers,
            bidirectional=True,
            bias=False,
            batch_first=True,
        )
        self.initialize_lstm(self.rnn)

    def initialize_lstm(self, lstm_layer):
        """Xavier-init input weights, orthogonal-init recurrent weights, zero any biases."""
        for param_name, tensor in lstm_layer.named_parameters():
            if 'weight_ih' in param_name:
                torch.nn.init.xavier_uniform_(tensor.data)
                continue
            if 'weight_hh' in param_name:
                torch.nn.init.orthogonal_(tensor.data)
                continue
            if 'bias' in param_name:
                tensor.data.fill_(0)

    def forward(self, x):
        """Return the per-step outputs of the LSTM; the final states are discarded."""
        sequence, _ = self.rnn(x)
        return sequence


class AttentionPooling(nn.Module):
    """Attention-weighted sum over dimension 1 that ignores masked positions.

    Args:
        hidden_size: size of the last dimension of the hidden states.
        initializer_range: std of the normal init used for the scoring layers.
            The original read ``self.config.initializer_range``, but this
            module has no ``config``; 0.02 matches the fallback used by the
            classifier head in this project.

    Notes:
        * The scoring network keeps the same parameter names
          (``attention.0.*``, ``attention.2.*``). The parameter-free softmax
          moved into ``forward`` so the mask can be applied before it.
        * The original passed the whole ``nn.Sequential`` to ``_init_weights``,
          so no branch matched and the custom init never ran. It is now
          applied to every sub-module.
    """

    def __init__(self, hidden_size, initializer_range=0.02):
        super().__init__()
        self.initializer_range = initializer_range
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
        )
        self.attention.apply(self._init_weights)

    def _init_weights(self, module):
        """Normal init for Linear/Embedding weights, identity init for LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, x, mask):
        """Pool ``x[0]`` with softmax attention over dimension 1.

        Args:
            x: indexable whose element 0 is the hidden-state tensor
                (assumed ``(batch, seq, hidden)`` — TODO confirm with callers).
            mask: tensor broadcastable to ``(batch, seq)`` with 0 at positions
                to ignore, or None to attend to every position.

        Returns:
            The attention-weighted sum of the hidden states over dimension 1.
            A row whose mask is entirely 0 yields NaN.
        """
        last_hidden_states = x[0]
        scores = self.attention(last_hidden_states)
        if mask is not None:
            # -inf scores become exactly zero weight after the softmax.
            ignored = mask.unsqueeze(-1) == 0
            scores = scores.masked_fill(ignored, float("-inf"))
        weights = torch.softmax(scores, dim=1)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

class LSTMPooling(nn.Module):
    """Bidirectional LSTM over the token states followed by masked mean pooling.

    Args:
        hidden_size: input feature size; each direction uses ``hidden_size // 2``.
        num_layers: number of stacked LSTM layers. The original ignored this
            argument and always built one layer; the default of 1 keeps
            existing callers unchanged.
        drop: inter-layer dropout passed to ``nn.LSTM`` (only meaningful when
            ``num_layers > 1``).
    """

    def __init__(self, hidden_size,num_layers=1,drop=0.0):
        super().__init__()
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size//2,
            num_layers=num_layers,
            dropout=drop,
            batch_first=True,
            bidirectional=True
        )
        self.pool = MeanPooling()
        self.initialize_lstm(self.lstm)

    def initialize_lstm(self, lstm_layer):
        """Xavier-init input weights, orthogonal-init recurrent weights, zero the biases."""
        for name, param in lstm_layer.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)

    def forward(self, x, mask):
        """Encode ``x[0]`` with the LSTM and mean-pool the result using ``mask``."""
        last_hidden_states = x[0]
        feature, _ = self.lstm(last_hidden_states)
        feature = self.pool(feature, mask)
        return feature
    
class LSTMAttnPooling(nn.Module):
    """Bidirectional LSTM over the token states followed by attention pooling.

    Args:
        hidden_size: input feature size; each direction uses ``hidden_size // 2``.
        num_layers: number of stacked LSTM layers. The original ignored this
            argument and always built one layer; the default of 1 keeps
            existing callers unchanged.
        drop: inter-layer dropout passed to ``nn.LSTM`` (only meaningful when
            ``num_layers > 1``).
    """

    def __init__(self, hidden_size,num_layers=1,drop=0.0):
        super().__init__()
        self.lstm = nn.LSTM(
            hidden_size,
            hidden_size//2,
            num_layers=num_layers,
            dropout=drop,
            batch_first=True,
            bidirectional=True
        )
        self.pool = AttentionPooling(hidden_size)
        self.initialize_lstm(self.lstm)

    def initialize_lstm(self, lstm_layer):
        """Xavier-init input weights, orthogonal-init recurrent weights, zero the biases."""
        for name, param in lstm_layer.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)

    def forward(self, x, mask):
        """Encode ``x[0]`` with the LSTM and pool it with the attention pooler."""
        last_hidden_states = x[0]
        feature, _ = self.lstm(last_hidden_states)
        # The attention pooler reads element 0 of its first argument.
        feature = self.pool([feature], mask)
        return feature

class Weighted_Linear_Attn(nn.Module):
    """Weighted layer average followed by attention pooling."""

    def __init__(self, hidden_size, n_layers=12):
        super().__init__()
        self.hidden_size = hidden_size
        self.cat_size = 3 * hidden_size

        self.layer_pooler = WeightedLayerPooling(n_layers)
        self.sequence_pooler = AttentionPooling(hidden_size)

    def forward(self, x, mask):
        """Mix ``x.hidden_states`` across layers, then pool with attention."""
        layer_mix = self.layer_pooler(x.hidden_states)
        # The attention pooler reads element 0 of its first argument.
        return self.sequence_pooler([layer_mix], mask)

class Weighted_Linear_LSTM(nn.Module):
    """Weighted layer average followed by LSTM + mean pooling.

    Args:
        hidden_size: feature size of the hidden states.
        n_layers: number of trailing layers mixed by the layer pooler.
    """

    def __init__(self, hidden_size, n_layers=12):
        super().__init__()
        self.hidden_size = hidden_size
        self.cat_size = hidden_size*3

        self.layer_pooler = WeightedLayerPooling(n_layers)
        self.sequence_pooler = LSTMPooling(hidden_size)

    def forward(self, x, mask):
        """Mix ``x.hidden_states`` across layers, then pool with the LSTM pooler."""
        x = self.layer_pooler(x.hidden_states)

        # LSTMPooling reads its input as x[0]. The original wrapped the tensor
        # in a dict keyed 'last_hidden_states', which raised KeyError on x[0].
        x = self.sequence_pooler([x], mask)

        return x

class LastTokenPooling(nn.Module):
    """Select, per row, the hidden state just before the first pad token.

    Args:
        config: object exposing ``pad_token_id`` (may be None).
    """

    def __init__(self,config):
        super().__init__()
        self.config = config

    def forward(self,input_ids,hidden_states):
        """Return one hidden state per row.

        Args:
            input_ids: token ids used to locate padding, or None.
            hidden_states: tensor indexed as ``[row, position]``.

        Raises:
            ValueError: if no pad token is configured and the batch holds more
                than one row.
        """
        # Take the batch size from hidden_states so the `input_ids is None`
        # fallback is reachable (the original read input_ids.shape first).
        batch_size = hidden_states.shape[0]
        pad_token_id = self.config.pad_token_id
        if pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if pad_token_id is None or input_ids is None:
            # Without padding information fall back to the final position.
            sequence_lengths = -1
        else:
            # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
            sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
            sequence_lengths = sequence_lengths % input_ids.shape[-1]
            sequence_lengths = sequence_lengths.to(hidden_states.device)

        pooled_logits = hidden_states[torch.arange(batch_size, device=hidden_states.device), sequence_lengths]
        return pooled_logits

def get_pooling(cfg,config=None):
    """Build the pooling module named by ``cfg.pool``.

    Args:
        cfg: object with a ``pool`` attribute naming the pooling strategy.
        config: backbone config; ``hidden_size`` / ``num_hidden_layers`` are
            read for the strategies that need them.

    Returns:
        The pooling ``nn.Module`` for the requested strategy.

    Raises:
        ValueError: if ``cfg.pool`` is not a known strategy. The original
            silently returned None, which only failed later when called.
    """
    # Lazy factories: only the selected pooler is built, so `config` may stay
    # None for strategies that do not read it.
    builders = {
        'last_token': lambda: LastTokenPooling(config),
        'mean': lambda: MeanPooling(),
        'max': lambda: MaxPooling(),
        'min': lambda: MinPooling(),
        'attention': lambda: AttentionPooling(config.hidden_size),
        'lstm_simple': lambda: LSTMPooling(config.hidden_size),
        'lstm_attention': lambda: LSTMAttnPooling(config.hidden_size),
        'lstm_cat': lambda: Cat_LSTM(config.hidden_size, config.num_hidden_layers),
        # LayerBaseLSTM unwraps `.hidden_states` before LSTM_Layer_Pooling, like
        # the other multi-layer poolers; the bare pooler expects the
        # hidden-state sequence itself. Its third argument is unused.
        'lstm_layer_base': lambda: LayerBaseLSTM(config.hidden_size, config.num_hidden_layers, None),
        'weighted_linear_mean': lambda: Weighted_Linear(config.hidden_size, config.num_hidden_layers),
        'weighted_linear_attn': lambda: Weighted_Linear_Attn(config.hidden_size, config.num_hidden_layers),
        'weighted_linear_lstm': lambda: Weighted_Linear_LSTM(config.hidden_size, config.num_hidden_layers),
    }
    try:
        build = builders[cfg.pool]
    except KeyError:
        raise ValueError(
            f"Unknown pooling type {cfg.pool!r}; expected one of {sorted(builders)}"
        ) from None
    return build()


Overwriting ./src/poolings.py


In [12]:
%%writefile ./src/classification_modeling.py
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, Union
#from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers.modeling_outputs import SequenceClassifierOutput
from .poolings import *
from transformers import AutoConfig, AutoModel, T5EncoderModel

class CustomClassifierEncoder(nn.Module):
    """Pretrained backbone + configurable pooling + linear classification head.

    Args:
        cfg: settings object. Attributes read here: ``model``, ``turn_off_drop``,
            ``use_only_encoder``, ``torch_dtype``, ``use_lora`` (plus ``lora.*``
            when enabled), ``pool``, ``cls_drop_type``, ``cls_drop``,
            ``multi_drop_range`` (optional, default 5) and ``num_labels``.
    """

    # Poolers that take the last hidden-state tensor rather than the full
    # backbone output object.
    _TENSOR_POOLS = ('mean', 'max', 'min')

    def __init__(self,cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model)
        if self.config.pad_token_id is None:
            # Fall back to EOS for padding; some configs store several EOS ids.
            eos_token_id = self.config.eos_token_id
            if isinstance(eos_token_id, (list, tuple)):
                eos_token_id = eos_token_id[0]
            self.config.pad_token_id = eos_token_id
        if cfg.turn_off_drop:
            # Different architectures name their dropout settings differently.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.

        backbone_cls = T5EncoderModel if cfg.use_only_encoder else AutoModel
        self.model = backbone_cls.from_pretrained(
            cfg.model,
            config=self.config,
            torch_dtype=cfg.torch_dtype
        )
        if cfg.use_lora:
            peft_config = LoraConfig(
                r=cfg.lora.r,
                lora_alpha=cfg.lora.lora_alpha,
                lora_dropout=cfg.lora.lora_dropout,
                bias=cfg.lora.bias,
                use_dora=cfg.lora.use_dora,
                target_modules=cfg.lora.target_modules,
            )
            self.model = get_peft_model(self.model, peft_config)

        self.pool = get_pooling(cfg,config=self.config)

        if cfg.cls_drop_type == 'stable':
            self.cls_drop = StableDropout(cfg.cls_drop)
        elif cfg.cls_drop_type == 'multi':
            self.cls_drop = Multisample_Dropout(getattr(cfg, 'multi_drop_range', 5))
        else:
            self.cls_drop = nn.Dropout(cfg.cls_drop)

        # The classifier input width must match what the chosen pooler emits.
        hidden_size = self.config.hidden_size
        if self.cfg.pool == 'lstm_cat':
            in_features = hidden_size * self.config.num_hidden_layers
        elif self.cfg.pool == 'lstm_layer_base':
            # LSTM_Layer_Pooling is bidirectional with `hidden_size` units per
            # direction, so its output is twice as wide.
            in_features = hidden_size * 2
        else:
            in_features = hidden_size
        self.fc = nn.Linear(in_features, self.cfg.num_labels)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise ``module`` with the backbone's ``initializer_range`` (0.02 if absent)."""
        if 'initializer_range' not in self.config.to_dict():
            self.config.initializer_range = 0.02
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        """Encode, pool and classify a batch.

        ``labels``, ``output_hidden_states`` and ``return_dict`` are accepted
        for API compatibility only: no loss is computed here (``loss`` is
        always None), hidden states are always requested, and the result is
        always a ``SequenceClassifierOutput``.
        """
        # Forward only the optional inputs that were supplied, so backbones
        # whose forward() lacks some of these keywords (e.g. the T5 encoder
        # has no position_ids / token_type_ids) still work.
        optional_inputs = {
            'position_ids': position_ids,
            'attention_mask': attention_mask,
            'inputs_embeds': inputs_embeds,
            'token_type_ids': token_type_ids,
            'output_attentions': output_attentions,
        }
        encoder_kwargs = {k: v for k, v in optional_inputs.items() if v is not None}

        # return_dict is forced on: the code below reads outputs.hidden_states
        # and outputs.attentions, which a tuple output would not provide.
        outputs = self.model(
            input_ids,
            output_hidden_states=True,
            return_dict=True,
            **encoder_kwargs,
        )

        if self.cfg.pool == 'last_token':
            pooled_output = self.pool(input_ids, outputs[0])
        elif self.cfg.pool in self._TENSOR_POOLS:
            # These poolers call .size() on their first argument, so they need
            # the tensor itself ('max'/'min' previously got the output object).
            pooled_output = self.pool(outputs[0],attention_mask)
        else:
            pooled_output = self.pool(outputs,attention_mask)

        if self.cfg.cls_drop_type != 'multi':
            pooled_output = self.cls_drop(pooled_output)
            logits = self.fc(pooled_output)
        else:
            # Multi-sample dropout applies the head once per dropout rate.
            logits = self.cls_drop(pooled_output,self.fc)

        return SequenceClassifierOutput(
            loss=None,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

Overwriting ./src/classification_modeling.py


In [13]:
import os
import gc
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from src import CustomClassifierEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from scipy.stats import rankdata
from tqdm.auto import tqdm, trange
from sklearn.metrics import accuracy_score, f1_score
from transformers import get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup, AutoModel, AutoTokenizer, AutoModelForMultipleChoice, AutoConfig
import wandb
pl.seed_everything(56)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [14]:
class CFG:
    """Experiment configuration: `data` for the data pipeline, `model` for training."""
    class data:
        # Kaggle input CSVs read by make_df.
        train_path = '/kaggle/input/t1-genii-text/train-14.csv'
        test_path = '/kaggle/input/t1-genii-text/test-11.csv'
        # Checkpoint name passed to AutoTokenizer.from_pretrained.
        tokenizer = "deepvk/USER-bge-m3"#'microsoft/mdeberta-v3-base'
        num_workers = 4  # DataLoader worker processes
        nfolds = 5  # number of StratifiedKFold splits
        batch_size = 16
        use_prefix = False  # not referenced in the visible cells
        max_length = 256  # tokenizer truncation / padding length
        seed = 56  # random_state of the fold split
    class model:
        # Checkpoint name passed to AutoConfig / AutoModel.from_pretrained.
        model = "deepvk/USER-bge-m3"#'microsoft/mdeberta-v3-base'
        optim = torch.optim.AdamW  # optimizer class instantiated in configure_optimizers
        use_only_encoder = False  # True loads T5EncoderModel instead of AutoModel
        grad_acum_steps = 1  # passed to Trainer(accumulate_grad_batches=...)
        torch_dtype = None  # forwarded to from_pretrained
        scheduler= 'cosine'  # 'cosine' or 'linear'; any other value disables the scheduler
        warmup_steps = 0.0 #0.25  -- fraction of num_training_steps used for warmup
        num_labels = 3  # output width of the classification head
        label_smoothing = 0.0
        lr = lr_fn = 1e-5  # lr: backbone parameter groups; lr_fn: non-backbone group
        cls_drop_type = None  # 'stable', 'multi', or anything else for plain nn.Dropout
        cls_drop = 0.0  # dropout probability before the classifier
        pool = 'attention'  # pooling strategy name resolved by get_pooling
        max_epoches = 10  # passed to Trainer(max_epochs=...)
        # Parameter-name substrings whose parameters get weight_decay 0.0.
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        turn_off_drop = True  # zero the backbone's dropout settings in its config
        num_cycles = 0.5  # cosine schedule cycles
        eps = 1e-7  # optimizer epsilon
        weight_decay = 0.0  # backbone weight decay
        weight_decay_fn = 0.0  # weight decay of the non-backbone group
        betas = (0.9, 0.999)  # optimizer betas
        use_lora = False  # wrap the backbone with a PEFT LoRA adapter when True
    seed = 56
    fold_number = 0  # index of the fold used for validation

def set_wandb_cfg():
    """Flatten ``CFG.model`` and ``CFG.data`` into a single dict.

    Attributes whose name contains ``'__'`` are skipped. On a name clash the
    value from ``CFG.data`` wins, and ``fold_number`` is added last.
    """
    config = {
        key: value
        for section in (CFG.model, CFG.data)
        for key, value in section.__dict__.items()
        if '__' not in key
    }
    config['fold_number'] = CFG.fold_number
    return config

In [15]:
def make_df(path,is_test=False):
    """Read a CSV and return a frame with ``text`` and ``label`` columns.

    ``text`` comes from the ``review`` column. ``label`` comes from
    ``sentiment``, or is filled with 0 when ``is_test`` is True.
    """
    raw = pd.read_csv(path)
    frame = pd.DataFrame()
    frame['text'] = raw['review']
    # Test files have no target column; use a constant placeholder instead.
    frame['label'] = 0 if is_test else raw['sentiment']
    return frame

In [16]:
class PLDataset(Dataset):
    """Tokenises one row of a ``text`` / ``label`` frame per item.

    Args:
        df: frame with ``text`` and ``label`` columns.
        tokenizer: callable tokenizer exposing ``pad_token`` / ``eos_token``.
    """

    def __init__(self, df, tokenizer):
        super().__init__()
        self.cfg = CFG.data
        self.data = df
        self.tokenizer = tokenizer
        # Borrow EOS as the pad token only when the tokenizer has none. The
        # original overwrote it unconditionally, replacing a valid pad token
        # on a tokenizer object that is shared between datasets.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for one row."""
        row = self.data.iloc[index]

        # Calling the tokenizer directly replaces the legacy encode_plus API.
        encodes = self.tokenizer(
            row['text'],
            max_length=self.cfg.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' adds a leading batch axis of size 1; drop it.
            'input_ids': encodes.input_ids.squeeze(0),
            'attention_mask': encodes.attention_mask.squeeze(0),
            'labels': torch.tensor(row['label'])
        }

In [17]:
class PLDataModule(pl.LightningDataModule):
    """Loads the train/test CSVs and serves the train/val/predict loaders for one fold."""

    def __init__(self):
        super().__init__()
        self.cfg = CFG.data
        # Guards that make prepare_data / setup run their work only once.
        self.is_setup = False
        self.is_prepared = False

    def prepare_data(self):
        """Read both CSVs and load the tokenizer (no-op after the first call)."""
        if self.is_prepared:
            return None
        self.df = make_df(self.cfg.train_path)
        self.test_df = make_df(self.cfg.test_path,is_test=True)
        self.test_df['text'] = self.test_df['text'].fillna('')
        self.tokenizer = AutoTokenizer.from_pretrained(self.cfg.tokenizer)
        self.is_prepared = True

    def setup(self, stage: str):
        """Split the training frame for ``CFG.fold_number`` and build the datasets."""
        if self.is_setup:
            return None
        splitter = StratifiedKFold(n_splits=self.cfg.nfolds, shuffle=True, random_state=self.cfg.seed)
        targets = np.stack(self.df['label'].values)
        folds = [(x, y) for x, y in splitter.split(self.df.values, targets)]
        train_idx, val_idx = folds[CFG.fold_number]
        self.train_df, self.val_df = self.df.iloc[train_idx], self.df.iloc[val_idx]
        self.train_dataset = PLDataset(self.train_df,self.tokenizer)
        self.val_dataset = PLDataset(self.val_df,self.tokenizer)
        self.predict_dataset = PLDataset(self.test_df,self.tokenizer)
        self.is_setup = True

    def _build_loader(self, dataset, shuffle):
        """Shared DataLoader construction for all three splits."""
        return DataLoader(dataset,
                          batch_size=self.cfg.batch_size,
                          num_workers=self.cfg.num_workers,
                          pin_memory=True,
                          shuffle=shuffle)

    def train_dataloader(self):
        return self._build_loader(self.train_dataset, True)

    def val_dataloader(self):
        return self._build_loader(self.val_dataset, False)

    def predict_dataloader(self):
        return self._build_loader(self.predict_dataset, False)

In [18]:
class AverageMeter():
    """Accumulates labels and predictions and records metrics per evaluation."""

    def __init__(self):
        self.preds = []
        self.labels = []
        # One metrics dict is appended per calc_metrics() call.
        self.history = []

    def update(self,y_t,y_p):
        """Append a batch of true labels ``y_t`` and predictions ``y_p``."""
        self.labels.extend(y_t)
        self.preds.extend(y_p)

    def clean(self):
        """Drop the accumulated labels and predictions; ``history`` is kept."""
        self.preds = []
        self.labels = []

    def calc_metrics(self):
        """Compute accuracy and macro/micro F1, store them in ``history`` and return them."""
        metrics = {
            'accuracy': accuracy_score(self.labels, self.preds),
            'f1_macro': f1_score(self.labels, self.preds,average='macro'),
            'f1_micro': f1_score(self.labels, self.preds,average='micro'),
        }
        self.history.append(metrics)
        return metrics

In [19]:
class PLModule(pl.LightningModule):
    """Lightning wrapper around ``CustomClassifierEncoder`` for classification training."""

    def __init__(self):
        super().__init__()
        self.cfg = CFG.model
        self.model = CustomClassifierEncoder(self.cfg)
        self.avg_meter = AverageMeter()
        # Honour the configured label smoothing (0.0 reproduces the plain loss).
        self.criterion = nn.CrossEntropyLoss(
            label_smoothing=getattr(self.cfg, 'label_smoothing', 0.0)
        )

    def forward(self, batch):
        """Run the classifier on a batch dict and return its output object."""
        return self.model(**batch)

    def training_step(self, batch, i):
        logits = self(batch).logits
        loss = self.criterion(logits, batch['labels'])
        self.log('train_loss', loss.item())
        return loss

    def validation_step(self, batch, i):
        logits = self(batch).logits
        loss = self.criterion(logits, batch['labels'])
        self.log('val_loss',loss.item())

        # Store plain Python ints for both lists; the original accumulated the
        # predictions as 0-d tensors while the labels were ints.
        preds = logits.argmax(dim=-1).cpu().tolist()
        labels = batch['labels'].cpu().tolist()

        self.avg_meter.update(labels,preds)

    def predict_step(self, batch, i):
        """Return the predicted class index for every row of the batch."""
        logits = self(batch).logits
        return logits.argmax(dim=-1)

    def on_validation_epoch_end(self):
        metrics = self.avg_meter.calc_metrics()
        self.log_dict(metrics)
        self.avg_meter.clean()

    def _total_training_steps(self):
        """Steps used to size the LR schedule.

        Uses ``cfg.num_training_steps`` when the caller set it, otherwise the
        trainer's own estimate, so the module also works when that attribute
        was never assigned.
        """
        configured = getattr(self.cfg, 'num_training_steps', None)
        if configured:
            return int(configured)
        return int(self.trainer.estimated_stepping_batches)

    def configure_optimizers(self):
        """Build the optimizer (three parameter groups) and the optional LR scheduler."""
        no_decay = self.cfg.no_decay
        backbone = list(self.model.model.named_parameters())
        optimizer_parameters = [
            # Backbone weights that receive weight decay.
            {'params': [p for n, p in backbone if not any(nd in n for nd in no_decay)],
             'lr': self.cfg.lr, 'weight_decay': self.cfg.weight_decay},
            # Backbone biases / LayerNorm parameters: no weight decay.
            {'params': [p for n, p in backbone if any(nd in n for nd in no_decay)],
             'lr': self.cfg.lr, 'weight_decay': 0.0},
            # Everything outside the backbone (pooler, classifier head). Match
            # on the "model." prefix rather than a substring so a head
            # parameter whose name merely contains "model" is not dropped.
            {'params': [p for n, p in self.model.named_parameters() if not n.startswith("model.")],
             'lr': self.cfg.lr_fn, 'weight_decay': self.cfg.weight_decay_fn}
        ]

        optim = self.cfg.optim(
            optimizer_parameters,
            lr=self.cfg.lr,
            betas=self.cfg.betas,
            weight_decay=self.cfg.weight_decay,
            eps=self.cfg.eps
        )

        if self.cfg.scheduler not in ('cosine', 'linear'):
            return optim

        total_steps = self._total_training_steps()
        # warmup_steps is a fraction of the schedule; the schedulers expect a
        # whole number of steps.
        warmup_steps = int(total_steps * self.cfg.warmup_steps)

        if self.cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(optim,
                                                        num_training_steps=total_steps,
                                                        num_warmup_steps=warmup_steps,
                                                        num_cycles=self.cfg.num_cycles)
        else:
            scheduler = get_linear_schedule_with_warmup(optim,
                                                        num_training_steps=total_steps,
                                                        num_warmup_steps=warmup_steps)

        scheduler = {'scheduler': scheduler,'interval': 'step', 'frequency': 1}

        return [optim], [scheduler]

In [20]:
def train_fold(fold_n=0):
    """Train one fold and return its accuracy and predictions.

    Args:
        fold_n: Fold index written to ``CFG.fold_number`` before the data
            module is created.

    Returns:
        A tuple ``(hist, val_preds, test_preds)``: the ``'accuracy'`` entry of
        the last record in ``model.avg_meter.history``, the predictions for
        ``dm.val_dataloader()`` and the predictions for
        ``dm.predict_dataloader()``.
    """
    pl.seed_everything(56)
    CFG.fold_number = fold_n

    dm = PLDataModule()
    dm.prepare_data()
    dm.setup(0)
    # Register '[PAD]' only when the tokenizer has no padding token of its own.
    # Overriding an existing pad token would change the pad id.
    # NOTE(review): if a token is actually added, the model embeddings may need
    # resizing -- confirm for the checkpoint in use.
    if dm.tokenizer.pad_token is None:
        dm.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    CFG.model.num_training_steps = len(dm.train_dataloader()) * CFG.model.max_epoches
    model = PLModule()

    # Never hard-code credentials: the key is taken from the environment.
    # With no key set, wandb falls back to its own credential lookup.
    # The key that used to be committed here should be revoked.
    wandb.login(key=os.environ.get('WANDB_API_KEY'))
    wandb.init(project='T1',name='labse_en_ru',config=set_wandb_cfg())

    try:
        lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')

        # Checkpointing is disabled, so no ModelCheckpoint callback is built.
        trainer = pl.Trainer(
            accelerator="gpu",
            precision=32,
            callbacks=[lr_monitor],
            logger=pl.loggers.WandbLogger(save_code=True),
            log_every_n_steps=1,
            accumulate_grad_batches=CFG.model.grad_acum_steps,
            enable_checkpointing=False,
            min_epochs=1,
            devices=1,
            check_val_every_n_epoch=1,
            max_epochs=CFG.model.max_epoches
        )
        trainer.fit(model, datamodule=dm)
        val_preds = trainer.predict(model, dm.val_dataloader())
        test_preds = trainer.predict(model, dm.predict_dataloader())
        hist = model.avg_meter.history[-1]['accuracy']
    finally:
        # Close the run so the next fold starts a fresh one instead of having
        # its WandbLogger attach to this still-active run.
        wandb.finish()

    # Release GPU memory before the next fold.
    model = model.cpu()
    del model, trainer, dm, lr_monitor
    gc.collect()
    torch.cuda.empty_cache()
    return hist, val_preds, test_preds

In [21]:
# Seed via Lightning's helper and select fold 0 for the interactive run below.
pl.seed_everything(56)
CFG.fold_number = 0

In [22]:
# Instantiate the data module and run its prepare_data / setup(0) steps.
# The recorded output shows tokenizer files being downloaded by this cell.
dm = PLDataModule()
dm.prepare_data()
dm.setup(0)
# Disabled in this run: registering '[PAD]' as the tokenizer's pad token.
#dm.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.33M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/963 [00:00<?, ?B/s]

In [23]:
# Total step count = batches per epoch * number of epochs. It is stored on the
# config before the module is built.
CFG.model.num_training_steps = len(dm.train_dataloader()) * CFG.model.max_epoches
model = PLModule()

# Never hard-code credentials in a notebook: the key is taken from the
# WANDB_API_KEY environment variable. With no key set, wandb falls back to its
# own credential lookup. The key that used to be committed here should be revoked.
wandb.login(key=os.environ.get('WANDB_API_KEY'))
wandb.init(project='T1',name='labse_en_ru',config=set_wandb_cfg())

config.json:   0%|          | 0.00/697 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mandrewkhl[0m ([33mandlh[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [24]:
# Learning-rate monitor configured with logging_interval='step'.
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')

# Checkpoint callback monitoring 'accuracy' (mode 'max'). It is created here
# but not registered with the Trainer below, which has checkpointing disabled.
checkpoint_cb = pl.callbacks.ModelCheckpoint(
    monitor='accuracy',
    mode='max',
    dirpath='./outputs/',
    filename='model_{epoch:02d}-{accuracy:.4f}',
    save_last=True,
)

trainer = pl.Trainer(
    # hardware / numerics
    accelerator="gpu",
    devices=1,
    precision=32,
    # schedule
    min_epochs=1,
    max_epochs=CFG.model.max_epoches,
    accumulate_grad_batches=CFG.model.grad_acum_steps,
    check_val_every_n_epoch=1,
    # logging / checkpointing
    log_every_n_steps=1,
    enable_checkpointing=False,
    logger=pl.loggers.WandbLogger(save_code=True),
    callbacks=[lr_monitor],  # alternative: [lr_monitor, checkpoint_cb]
)

In [25]:
# Fit the module on the data module.
# NOTE(review): the recorded output of this cell ends with
# "NameError: name 'exit' is not defined" after validation started. The
# offending call is not in this cell -- presumably an exit() somewhere in the
# training/validation code; verify before relying on this run.
trainer.fit(model, datamodule=dm)

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

NameError: name 'exit' is not defined

In [26]:
# Run prediction over the data module's predict dataloader; `preds` is later
# concatenated with torch.cat, so it is expected to be a sequence of tensors.
preds = trainer.predict(model, dm.predict_dataloader())

Predicting: |          | 0/? [00:00<?, ?it/s]

In [27]:
# Assemble the submission frame in one step: the 'index' column is read from
# the test CSV and the concatenated predictions become 'sentiment'.
df = pd.DataFrame({
    'index': pd.read_csv(CFG.data.test_path)['index'],
    'sentiment': torch.cat(preds).numpy(),
})

In [28]:
# Write the predictions to CSV in the working directory, without the
# DataFrame's row index.
df.to_csv('t1_preds_text4.csv',index=False)

In [None]:
# Run the train_fold pipeline for fold 0 (this cell has no recorded execution
# count, i.e. it was not run in the saved notebook).
hist,val_preds,test_preds = train_fold(0)