In [1]:
import os
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from src import CustomClassifierEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from scipy.stats import rankdata
from tqdm.auto import tqdm, trange
from sklearn.metrics import accuracy_score
from transformers import get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup, AutoModel, AutoTokenizer, AutoModelForMultipleChoice, AutoConfig
import wandb
pl.seed_everything(56)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

Seed set to 56


In [2]:
# for gemma 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
# for llama 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
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
class CFG:
    class data:
        train_path = 'train.csv'
        test_path = 'test.csv'
        tokenizer = 'BAAI/bge-multilingual-gemma2'#'microsoft/mdeberta-v3-base'
        num_workers = 8
        nfolds = 5
        batch_size = 4
        use_prefix = False
        max_length = 105 
        seed = 56
    class model:
        model = 'BAAI/bge-multilingual-gemma2'#'microsoft/mdeberta-v3-base'
        optim = torch.optim.AdamW
        use_only_encoder=False
        grad_acum_steps = 1
        torch_dtype = torch.bfloat16
        scheduler= 'cosine'
        warnap_steps = 0.0 #0.25
        num_labels = 50
        label_smoothing = 0.0
        lr = lr_fn = 1e-5 #1e-4
        cls_drop_type = None
        cls_drop = 0.0
        pool = 'last_token'
        max_epoches = 10
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        turn_off_drop = True
        num_cycles = 0.5
        eps = 1e-7
        weight_decay = 0.0
        weight_decay_fn = 0.0
        betas = (0.9, 0.999)
        use_lora = True
        class lora:
            r = 64
            lora_alpha = 128
            lora_dropout = 0.05
            bias = 'none'
            use_dora = False
            target_modules = ['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj']
            layers_to_transform = None#list(range(42))

    seed = 56
    fold_number = 0

def set_wandb_cfg():
    config = {}
    for k,v in CFG.model.__dict__.items():
        if '__' not in k:
            config[k] = v
    for k,v in CFG.data.__dict__.items():
        if '__' not in k:
            config[k] = v
    config['fold_number'] = CFG.fold_number
    return config

In [3]:
def make_df(path,is_test=False):
    data = pd.read_csv(path)
    df = pd.DataFrame()
    if is_test:
        df['label'] = [[0] * 50] * len(df)
    else:
        df['label'] = data.apply(lambda x: [x[f'trend_id_res{i}'] for i in range(50)],axis=1)
    df['text'] = data['text']
    return df

In [4]:
class PLDataset(Dataset):
    def __init__(self, df, tokenizer):
        super().__init__()
        self.cfg = CFG.data
        self.data = df
        self.tokenizer = tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token
        
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]        
        
        encodes = self.tokenizer.encode_plus(
            row['text'],
            max_length=self.cfg.max_length,
            truncation=True,
            padding='max_length',
            return_tensors='pt'
        )
        
        return {
            'input_ids': encodes.input_ids.squeeze(0),
            'attention_mask': encodes.attention_mask.squeeze(0),
            #'token_type_ids': encodes.token_type_ids.squeeze(0),
            'labels': torch.tensor(row['label'])
        }

In [5]:
class PLDataModule(pl.LightningDataModule):
    def __init__(self):
        super().__init__()
        self.cfg = CFG.data
        self.is_setup = False
        self.is_prepared = False
        
    def prepare_data(self):
        if self.is_prepared: return None
        self.df = make_df(self.cfg.train_path)
        self.test_df = make_df(self.cfg.test_path,is_test=True)
        self.test_df['text'] = self.test_df['text'].fillna('')
        self.tokenizer = AutoTokenizer.from_pretrained(self.cfg.tokenizer)
        self.is_prepared = True
        
    def setup(self, stage: str):
        if self.is_setup: return None
        kf = MultilabelStratifiedKFold(n_splits=self.cfg.nfolds, shuffle=True, random_state=self.cfg.seed)
        splits = [(x,y) for x,y in  kf.split(self.df.values,np.stack(dm.df['label'].values))][CFG.fold_number]
        self.train_df, self.val_df = self.df.iloc[splits[0]], self.df.iloc[splits[1]]
        self.train_dataset = PLDataset(self.train_df,self.tokenizer)
        self.val_dataset = PLDataset(self.val_df,self.tokenizer)
        self.predict_dataset = PLDataset(self.test_df,self.tokenizer)
        self.is_setup = True
    
    def train_dataloader(self):
        return DataLoader(self.train_dataset,
                         batch_size=self.cfg.batch_size,
                         num_workers=self.cfg.num_workers,
                         pin_memory=True,
                         shuffle=True)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.cfg.batch_size,
                          num_workers=self.cfg.num_workers,
                          pin_memory=True,
                          shuffle=False)
    
    def predict_dataloader(self):
        return DataLoader(self.predict_dataset,
                          batch_size=self.cfg.batch_size,
                          num_workers=self.cfg.num_workers,
                          pin_memory=True,
                          shuffle=False)

In [6]:
class AverageMeter():
    def __init__(self):
        self.preds = []
        self.labels = []
        self.history = []
    
    def update(self,y_t,y_p):
        self.labels += y_t
        self.preds += y_p
        
    def clean(self):
        self.preds = []
        self.labels = []

    def calc_metrics(self):
        metrics = {}

        preds = [list(map(lambda y: str(round(y)),x)) for x in self.preds]
        labels = [''.join(map(str,x)) for x in self.labels]
        for i in range(len(self.preds)):
            preds[i][np.argmax(self.preds[i])] = '1'
        preds = [''.join(x) for x in preds]
        
        metrics['accuracy'] = accuracy_score(labels, preds)
        self.history.append(metrics)
        return metrics

In [7]:
class PLModule(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.cfg = CFG.model
        self.model = CustomClassifierEncoder(self.cfg)
        self.avg_meter = AverageMeter()
        self.criterion = nn.BCEWithLogitsLoss()
        
    def forward(self, batch):
        output = self.model(**batch)
        return output

    def training_step(self, batch, i):
        logits = self(batch).logits
        loss = self.criterion(logits, batch['labels'].float())
        self.log('train_loss', loss.item())
        return loss
            
    def validation_step(self, batch, i):
        logits = self(batch).logits
        loss = self.criterion(logits, batch['labels'].float())
        self.log('val_loss',loss.item())
        
        preds = logits.sigmoid().tolist()
        labels = batch['labels'].tolist()
        
        self.avg_meter.update(labels,preds)
    
    def predict_step(self, batch, i):
        logits = self(batch).logits
        return logits.sigmoid().tolist()
                
    def on_validation_epoch_end(self):
        metrics = self.avg_meter.calc_metrics()
        self.log_dict(metrics)
        self.avg_meter.clean()
            
    def configure_optimizers(self):        
        optimizer_parameters = [
            {'params': [p for n, p in self.model.model.named_parameters() if not any(nd in n for nd in self.cfg.no_decay)],
             'lr': self.cfg.lr, 'weight_decay': self.cfg.weight_decay},
            {'params': [p for n, p in self.model.model.named_parameters() if any(nd in n for nd in self.cfg.no_decay)],
             'lr': self.cfg.lr, 'weight_decay': 0.0},
            {'params': [p for n, p in self.model.named_parameters() if "model" not in n],
             'lr': self.cfg.lr_fn, 'weight_decay': self.cfg.weight_decay_fn}
        ]
        
        optim = self.cfg.optim(
            optimizer_parameters,
            lr=self.cfg.lr,
            betas=self.cfg.betas,
            weight_decay=self.cfg.weight_decay,
            eps=self.cfg.eps
        )
        
        if self.cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(optim,
                                                        num_training_steps=self.cfg.num_training_steps,
                                                        num_warmup_steps=self.cfg.num_training_steps * self.cfg.warnap_steps,
                                                        num_cycles=self.cfg.num_cycles)
        elif self.cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(optim,
                                                        num_training_steps=self.cfg.num_training_steps,
                                                        num_warmup_steps=self.cfg.num_training_steps * self.cfg.warnap_steps)
        else:
            return optim
        
        scheduler = {'scheduler': scheduler,'interval': 'step', 'frequency': 1}

        return [optim], [scheduler]

In [8]:
dm = PLDataModule()
dm.prepare_data()
dm.setup(0)

tokenizer_config.json:   0%|          | 0.00/40.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [9]:
#dm.tokenizer.pad_token = dm.tokenizer.eos_token
dm.tokenizer.padding_side = 'right'

In [10]:
CFG.model.num_training_steps = len(dm.train_dataloader()) * CFG.model.max_epoches

In [11]:
model = PLModule()

config.json:   0%|          | 0.00/1.31k [00:00<?, ?B/s]

The repository for BAAI/bge-reranker-v2.5-gemma2-lightweight contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/BAAI/bge-reranker-v2.5-gemma2-lightweight.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


gemma_config.py:   0%|          | 0.00/3.20k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/BAAI/bge-reranker-v2.5-gemma2-lightweight:
- gemma_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


The repository for BAAI/bge-reranker-v2.5-gemma2-lightweight contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/BAAI/bge-reranker-v2.5-gemma2-lightweight.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


gemma_model.py:   0%|          | 0.00/35.5k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/BAAI/bge-reranker-v2.5-gemma2-lightweight:
- gemma_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/41.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/9.81G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/9.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/7.34G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
wandb.login(key="31520b01739d418e5d77a11fd8a79a70b189b8bc")
os.environ['WANDB_API_KEY'] = "31520b01739d418e5d77a11fd8a79a70b189b8bc"
wandb.init(project='DLS',name='BAAI/bge-reranker-v2.5-gemma2-lightweight',config=set_wandb_cfg())

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mandrewkhl[0m ([33mandlh[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [13]:
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')
checkpoint_cb = pl.callbacks.ModelCheckpoint(
    dirpath='./outputs/',
    filename='model_{epoch:02d}-{accuracy:.4f}',
    monitor='accuracy',
    mode='max',
    save_last=True
)

trainer = pl.Trainer(
    accelerator="gpu",
    precision='bf16',
    callbacks = [lr_monitor],
    logger = pl.loggers.WandbLogger(save_code=True),
    log_every_n_steps=1,
    accumulate_grad_batches=CFG.model.grad_acum_steps,
    enable_checkpointing=False,
    min_epochs=1,
    devices=1,
    check_val_every_n_epoch=1,
    max_epochs=CFG.model.max_epoches
)

/usr/local/lib/python3.11/dist-packages/lightning_fabric/connector.py:571: `precision=bf16` is supported for historical reasons but its usage is discouraged. Please set your precision to bf16-mixed instead!
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [14]:
trainer.fit(model,datamodule=dm)

You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                    | Params | Mode 
--------------------------------------------------------------
0 | model     | CustomClassifierEncoder | 9.5 B  | train
1 | criterion | BCEWithLogitsLoss       | 0      | train
--------------------------------------------------------------
216 M     Trainable params
9.2 B  

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [15]:
preds_val = trainer.predict(model,dm.val_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [16]:
preds_test = trainer.predict(model,dm.predict_dataloader())

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: |          | 0/? [00:00<?, ?it/s]

In [None]:
np.save("BAAI-bge-multilingual-gemma2-val-fold_attn_lrnk-0.npy",np.concatenate(preds_val))
np.save("BAAI/bge-multilingual-gemma2-test-fold_attnlrnk-0.npy",np.concatenate(preds_test))

In [19]:
def prepare_predict(pred):
    preds_r = list(map(lambda y: str(round(y)),pred))
    preds_r[np.argmax(pred)] = '1'
    return ' '.join([str(i) for i,x in enumerate(preds_r) if x == '1'])

In [26]:
preds_pr = np.concatenate(preds)

In [29]:
preds_pr[0]

array([1.35973096e-07, 9.92187500e-01, 1.00000000e+00, 1.47223473e-05,
       4.50015068e-06, 3.46451998e-07, 9.53674316e-06, 1.04773790e-08,
       3.12328339e-05, 2.14576721e-05, 4.04357910e-04, 3.93018126e-07,
       9.60937500e-01, 1.52504072e-08, 5.14984131e-05, 3.52859497e-05,
       6.34463504e-09, 6.89178705e-07, 4.94765118e-09, 1.04773790e-08,
       1.18743628e-08, 2.11596489e-06, 5.69969416e-07, 3.25962901e-07,
       3.84170562e-09, 6.00703061e-08, 3.25962901e-07, 2.72691250e-06,
       3.25962901e-07, 1.20699406e-06, 1.29938126e-05, 2.23517418e-07,
       4.50015068e-06, 1.44354999e-07, 6.89178705e-07, 3.29315662e-06,
       1.66992188e-01, 5.78165054e-06, 4.45172191e-07, 1.44354999e-07,
       5.36441803e-07, 8.58562998e-10, 9.53674316e-06, 6.84522092e-08,
       2.99769454e-09, 7.72997737e-08, 3.25962901e-07, 2.11596489e-06,
       1.52504072e-08, 6.48200512e-07])

In [30]:
preds_pr = [prepare_predict(x) for x in preds_pr]

In [32]:
df = pd.read_csv('sample_submission_formated.csv')

In [34]:
df['target'] = preds_pr

In [37]:
df.to_csv('simple_subv2_nlp.csv',index=False)