In [1]:
import os
import pandas as pd
import numpy as np
import pytorch_lightning as pl
from src import CustomEncoder,HardNegativesCrossEntropy, SoftNegativesCrossEntropy, NoDublicateSampler
from sklearn.model_selection import train_test_split, StratifiedKFold, GroupKFold
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from scipy.stats import rankdata
from tqdm.auto import tqdm, trange
from sklearn.metrics import accuracy_score
from transformers import get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup, AutoModel, AutoTokenizer, AutoModelForMultipleChoice, AutoConfig
import wandb
# Fix the global random seed through Lightning's helper so runs are repeatable.
pl.seed_everything(5656)
# Disable the tokenizers library's own parallelism; presumably to avoid
# fork-related warnings when DataLoader worker processes are used — TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

2024-09-26 12:26:56.008126: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-26 12:26:56.008237: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-26 12:26:56.009451: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-26 12:26:56.016333: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Seed set to 5656


In [2]:
class CFG:
    # Central experiment configuration. The nested classes are used purely as
    # namespaces of constants; nothing here is instantiated.
    class data:
        # Settings read by make_df / PLDataset / PLDataModule.
        train_path = 'candidates_bge_large.parquet'  # parquet file loaded by make_df
        misconcepts = 'eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv'  # MisconceptionId -> MisconceptionName table
        tokenizer = 'BAAI/bge-large-en-v1.5'  # name passed to AutoTokenizer.from_pretrained
        num_workers = 8  # DataLoader worker processes
        nfolds = 5  # n_splits for GroupKFold
        batch_size = 4  # anchors per batch (each anchor expands to several texts)
        use_prefix = False  # not referenced in the visible notebook code
        # Every text is truncated/padded to exactly this many tokens.
        max_length = 256 
        num_negs = 10  # negatives kept per anchor
        seed = 56  # not referenced in the visible code (seeding uses pl.seed_everything)
    class model:
        # The whole namespace is handed to CustomEncoder(cfg); fields that are not
        # read in this notebook are presumably consumed there — verify in src.
        model = 'BAAI/bge-large-en-v1.5'
        optim = torch.optim.AdamW  # optimizer class instantiated in configure_optimizers
        grad_acum_steps = 2  # passed to Trainer(accumulate_grad_batches=...)
        torch_dtype = None
        scheduler= 'cosine'  # 'cosine' | 'linear' | anything else -> no scheduler
        # Warmup length as a fraction of num_training_steps.
        warnap_steps = 0.0 #0.25
        label_smoothing = 0.0
        lr = 1e-5
        pool = 'mean'
        max_epoches = 5  # Trainer(max_epochs=...) and LR schedule length
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]  # name fragments of params that get weight_decay=0
        turn_off_drop = True
        num_cycles = 0.5  # forwarded to get_cosine_schedule_with_warmup
        eps = 1e-7
        weight_decay = 0.0
        weight_decay_fn = 0.0  # not referenced in the visible notebook code
        betas = (0.9, 0.999)
        use_lora = False
        # NOTE: `num_training_steps` is attached to this class at runtime by a
        # later cell, once the train dataloader length is known.
    seed = 56  # not referenced in the visible code
    fold_number = 0  # index of the GroupKFold split used as validation fold

def set_wandb_cfg():
    """Flatten the experiment configuration into one dict for ``wandb.init``.

    Collects every attribute of ``CFG.model`` and then ``CFG.data`` whose name
    does not contain a double underscore (this drops ``__module__``, ``__dict__``
    and friends), and finally adds ``fold_number``. On a name clash the
    ``CFG.data`` value wins because it is merged last.

    Returns:
        dict: flat mapping of setting name to value.
    """
    def public_settings(section):
        # Keep only user-defined attributes of a config namespace.
        return {
            name: value
            for name, value in section.__dict__.items()
            if '__' not in name
        }

    flat = {**public_settings(CFG.model), **public_settings(CFG.data)}
    flat['fold_number'] = CFG.fold_number
    return flat

In [3]:
def parse_anchor(x):
    """Build the anchor (query) text for one row.

    Missing values are replaced by empty strings first. The answer value is
    placed both at the start (followed by two spaces) and at the end of the
    text, around the construct name and the question text.

    Args:
        x: a row (pandas Series) exposing ``answer_value``, ``ConstructName``
            and ``QuestionText``.

    Returns:
        str: the concatenated anchor text.
    """
    filled = x.fillna('')
    answer = filled.answer_value
    construct = filled.ConstructName
    question = filled.QuestionText
    return f"{answer}  {construct} {question} {answer}"

def parse_negs(x,c=CFG.data.num_negs):
    """Return the first ``c`` candidates of a row that differ from its label.

    Args:
        x: a row with a ``candidates`` sequence and a ``label`` value.
        c: how many negatives to keep; the default is read from
            ``CFG.data.num_negs`` once, when this function is defined.

    Returns:
        list: candidates in their original order with the label removed,
        cut to at most ``c`` items.
    """
    kept = []
    for candidate in x['candidates']:
        if candidate != x['label']:
            kept.append(candidate)
    # Slice after filtering so the result matches "filter, then take c".
    return kept[:c]

def make_df(train_path):
    """Load the retrieval-candidates parquet and derive the training frame.

    Args:
        train_path: path to a parquet file providing the columns
            ``top25_candidates``, ``top25_scores``, ``id_in_candidates``,
            ``misconcpts_id``, ``QuestionId``, ``answer_id`` and the fields
            used by ``parse_anchor``.

    Returns:
        pandas.DataFrame with columns ``anchor``, ``candidates``,
        ``candidates_scores``, ``candidates_ranks``, ``misc_in_condidates``,
        ``label``, ``negs``, ``question_id`` and ``answer_id``.
    """
    raw = pd.read_parquet(train_path)

    # Answer column name -> option index 0..3.
    answer_index = {'AnswerAText':0,'AnswerBText':1,'AnswerCText':2,'AnswerDText':3}

    def to_int_list(ranks):
        # rankdata returns floats; store plain ints.
        return [int(rank) for rank in ranks]

    # Negate the scores so that the highest score receives rank 1.
    negated_scores = -raw['top25_scores']

    frame = pd.DataFrame()
    frame['anchor'] = raw.apply(parse_anchor, axis=1)
    frame['candidates'] = raw['top25_candidates']
    frame['candidates_scores'] = raw['top25_scores']
    frame['candidates_ranks'] = negated_scores.apply(rankdata).map(to_int_list)
    frame['misc_in_condidates'] = raw['id_in_candidates']
    frame['label'] = raw['misconcpts_id'].astype(int)
    # Needs 'candidates' and 'label', so it must come after both.
    frame['negs'] = frame.apply(parse_negs, axis=1)
    frame['question_id'] = raw['QuestionId']
    frame['answer_id'] = raw['answer_id'].apply(lambda key: answer_index[key])

    return frame

In [4]:
class PLDataset(Dataset):
    """Dataset yielding, per row, the tokenized anchor, positive and negatives.

    Each item is a dict of three tensors (``input_ids``, ``attention_mask``,
    ``token_type_ids``) of shape ``(2 + k, max_length)`` where ``k`` is the
    number of negatives actually available (at most ``CFG.data.num_negs``).
    Row 0 is the anchor text, row 1 the labelled misconception, the remaining
    rows are the negatives.

    Args:
        df: frame with the columns ``anchor``, ``label`` and ``candidates``.
        miscocepts_maper: mapping from misconception id to its text.
        tokenizer: Hugging Face tokenizer (callable on a list of strings).
    """

    def __init__(self, df, miscocepts_maper, tokenizer):
        super().__init__()
        self.cfg = CFG.data
        self.data = df
        self.miscocepts_maper = miscocepts_maper
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def simple_negative_sample(self,row):
        """Return up to ``num_negs`` candidate ids that differ from the label.

        The label is always filtered out, independent of the
        ``misc_in_condidates`` flag: when the label is absent the filter is a
        no-op, and when the flag is stale the positive can no longer leak into
        the negatives. Candidates may be a list or a numpy array; the result is
        always a list of plain Python values.
        """
        label = row['label']
        candidates = np.asarray(row['candidates']).tolist()
        return [cand for cand in candidates if cand != label][:self.cfg.num_negs]

    def __getitem__(self, index):
        row = self.data.iloc[index]
        # Positive first, then the negatives; the model/metric rely on this order.
        misconception_ids = [row['label']] + self.simple_negative_sample(row)
        texts = [row['anchor']] + [self.miscocepts_maper[i] for i in misconception_ids]

        # One batched call replaces a per-text encode_plus + torch.stack; since
        # every text is padded to max_length the tensors are already
        # (len(texts), max_length).
        encoded = self.tokenizer(
            texts,
            max_length=self.cfg.max_length,
            truncation=True,
            padding='max_length',
            return_token_type_ids=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'token_type_ids': encoded['token_type_ids']
        }

In [5]:
class PLDataModule(pl.LightningDataModule):
    """Lightning data module: loads the data, makes one GroupKFold split and
    builds the train/validation dataloaders.

    ``prepare_data`` and ``setup`` are both idempotent (guarded by
    ``is_prepared`` / ``is_setup``), so calling them by hand before
    ``trainer.fit`` does not repeat the work. Because ``is_setup`` stays
    ``True``, changing ``CFG.fold_number`` afterwards has no effect on an
    already set-up instance.
    """

    def __init__(self):
        super().__init__()
        self.cfg = CFG.data
        self.is_setup = False
        self.is_prepared = False

    def _load_resources(self):
        """Load the frame, the misconception-id -> text map and the tokenizer once."""
        if self.is_prepared:
            return
        self.df = make_df(self.cfg.train_path)
        mapping = pd.read_csv(self.cfg.misconcepts)
        self.misconcepts_maper = mapping.set_index('MisconceptionId')['MisconceptionName'].to_dict()
        self.tokenizer = AutoTokenizer.from_pretrained(self.cfg.tokenizer)
        self.is_prepared = True

    def prepare_data(self):
        # Kept for backward compatibility with code that calls it explicitly.
        # Lightning only runs this hook on a single process, so `setup` does
        # not rely on it having run.
        self._load_resources()

    def setup(self, stage: str):
        """Create the train/validation datasets for fold ``CFG.fold_number``.

        Args:
            stage: Lightning stage name; not used.
        """
        if self.is_setup: return None
        # Make sure the state exists even if prepare_data was not executed on
        # this process.
        self._load_resources()

        # Grouping by question keeps all answers of a question in one fold.
        kf = GroupKFold(n_splits=self.cfg.nfolds)
        folds = list(kf.split(self.df, groups=self.df['question_id']))
        train_idx, val_idx = folds[CFG.fold_number]
        self.train_df, self.val_df = self.df.iloc[train_idx], self.df.iloc[val_idx]

        self.train_dataset = PLDataset(self.train_df, self.misconcepts_maper, self.tokenizer)
        self.val_dataset = PLDataset(self.val_df, self.misconcepts_maper, self.tokenizer)
        #self.predict_dataset = PLDataset(self.test_df, self.misconcepts_maper, self.tokenizer)
        self.is_setup = True

    def train_dataloader(self):
        # Batches are composed by the project sampler, so batch_size/shuffle
        # must not be passed to DataLoader here.
        batch_sampler = NoDublicateSampler(
            self.train_dataset,
            self.train_df['negs'].tolist(),
            self.train_df['label'].tolist(),
            self.cfg.batch_size
        )
        return DataLoader(self.train_dataset,
                          num_workers=self.cfg.num_workers,
                          pin_memory=True,
                          batch_sampler=batch_sampler)

    def val_dataloader(self):
        return DataLoader(self.val_dataset,
                          batch_size=self.cfg.batch_size,
                          num_workers=self.cfg.num_workers,
                          pin_memory=True,
                          shuffle=False)

    def predict_dataloader(self):
        """Dataloader over ``self.predict_dataset``.

        Raises:
            AttributeError: if no ``predict_dataset`` has been assigned; this
                module never creates one itself.
        """
        predict_dataset = getattr(self, 'predict_dataset', None)
        if predict_dataset is None:
            raise AttributeError(
                "PLDataModule.predict_dataset is not set; assign a dataset "
                "before requesting the predict dataloader."
            )
        return DataLoader(predict_dataset,
                          batch_size=self.cfg.batch_size,
                          num_workers=self.cfg.num_workers,
                          pin_memory=True,
                          shuffle=False)

In [6]:
class AverageMeter():
    """Accumulates labels and predictions and turns them into epoch metrics.

    Attributes:
        preds: predictions collected since the last ``clean``.
        labels: matching ground-truth labels.
        history: one metrics dict per ``calc_metrics`` call.
    """

    def __init__(self):
        self.preds, self.labels = [], []
        self.history = []

    def update(self,y_t,y_p):
        """Append true labels ``y_t`` and predictions ``y_p`` to the buffers."""
        self.labels.extend(y_t)
        self.preds.extend(y_p)

    def clean(self):
        """Start fresh buffers; ``history`` is left untouched."""
        self.preds, self.labels = [], []

    def calc_metrics(self):
        """Compute accuracy over the buffered values, record and return it."""
        metrics = {'accuracy': accuracy_score(self.labels, self.preds)}
        self.history.append(metrics)

        return metrics

In [7]:
class PLModule(pl.LightningModule):
    """LightningModule that fine-tunes the text encoder with a contrastive loss.

    Each batch holds, per sample, several tokenized texts in a fixed order:
    index 0 is the anchor, index 1 the positive misconception, the rest are
    negatives. The texts are encoded together and the pooled embeddings are
    handed to the criterion.
    """

    def __init__(self):
        super().__init__()
        self.cfg = CFG.model
        self.model = CustomEncoder(self.cfg)
        self.avg_meter = AverageMeter()
        self.criterion = SoftNegativesCrossEntropy()

    def forward(self, batch):
        """Encode every text in the batch.

        Args:
            batch: dict with ``input_ids``, ``attention_mask`` and
                ``token_type_ids`` tensors of shape (bs, n_texts, seq_ln).
                The dict is not modified.

        Returns:
            The encoder output with ``pooler_output`` reshaped to
            (bs, n_texts, -1) and ``last_hidden_state`` to
            (bs, n_texts, seq_ln, -1).
        """
        bs, n_texts, seq_ln = batch['attention_mask'].shape

        # Work on a shallow copy so the caller's batch keeps its 3-D tensors.
        flat_batch = dict(batch)
        for key in ('attention_mask', 'input_ids', 'token_type_ids'):
            flat_batch[key] = batch[key].view((bs * n_texts), seq_ln)

        output = self.model(**flat_batch)
        output['pooler_output'] = output['pooler_output'].view(bs, n_texts, -1)
        output['last_hidden_state'] = output['last_hidden_state'].view(bs, n_texts, seq_ln, -1)

        return output

    def training_step(self, batch, i):
        out = self(batch)
        loss = self.criterion(out['pooler_output'])
        self.log('train_loss',loss.item())
        return loss

    def validation_step(self, batch, i):
        out = self(batch)
        pooled = out['pooler_output']
        loss = self.criterion(pooled)
        self.log('val_loss',loss.item())
        # Rank positive + negatives by cosine similarity to the anchor. The
        # positive sits first among the candidates, so the correct index is 0.
        anchor, candidates = pooled[:,0:1,:], pooled[:,1:,:]
        predicted = F.cosine_similarity(anchor, candidates, dim=2).argmax(dim=-1).tolist()
        self.avg_meter.update([0] * len(predicted), predicted)

    def on_validation_epoch_end(self):
        metrics = self.avg_meter.calc_metrics()
        self.log_dict(metrics)
        self.avg_meter.clean()

    def _schedule_lengths(self):
        """Return ``(total_steps, warmup_steps)`` counted in optimizer steps.

        ``cfg.num_training_steps`` counts batches, but with gradient
        accumulation the scheduler is stepped once per optimizer step, i.e.
        once every ``grad_acum_steps`` batches. Without the conversion the
        schedule would stop part-way through its decay.
        """
        accum = max(1, int(getattr(self.cfg, 'grad_acum_steps', 1) or 1))
        total_steps = -(-int(self.cfg.num_training_steps) // accum)  # ceil division
        warmup_steps = int(total_steps * self.cfg.warnap_steps)
        return total_steps, warmup_steps

    def configure_optimizers(self):
        """Build the optimizer and, for 'cosine'/'linear', a per-step scheduler.

        Only the parameters under ``self.model.model`` are optimized.
        NOTE(review): any parameters CustomEncoder holds outside ``.model``
        would not be trained — confirm against src.
        """
        decay_params, no_decay_params = [], []
        for name, param in self.model.model.named_parameters():
            if any(nd in name for nd in self.cfg.no_decay):
                no_decay_params.append(param)
            else:
                decay_params.append(param)

        optimizer_parameters = [
            {'params': decay_params, 'lr': self.cfg.lr, 'weight_decay': self.cfg.weight_decay},
            {'params': no_decay_params, 'lr': self.cfg.lr, 'weight_decay': 0.0},
        ]

        optim = self.cfg.optim(
            optimizer_parameters,
            lr=self.cfg.lr,
            betas=self.cfg.betas,
            weight_decay=self.cfg.weight_decay,
            eps=self.cfg.eps
        )

        if self.cfg.scheduler not in ('cosine', 'linear'):
            return optim

        total_steps, warmup_steps = self._schedule_lengths()
        if self.cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(optim,
                                                        num_training_steps=total_steps,
                                                        num_warmup_steps=warmup_steps,
                                                        num_cycles=self.cfg.num_cycles)
        else:
            scheduler = get_linear_schedule_with_warmup(optim,
                                                        num_training_steps=total_steps,
                                                        num_warmup_steps=warmup_steps)

        scheduler = {'scheduler': scheduler,'interval': 'step', 'frequency': 1}

        return [optim], [scheduler]

In [8]:
# Build the data module and run both hooks by hand so the train dataloader
# length is available before the Trainer exists.
dm = PLDataModule()
dm.prepare_data()
# The stage argument is not used by PLDataModule.setup, so any value works.
dm.setup(0)

In [9]:
# Total number of training batches over all epochs (batches per epoch x epochs).
# Stored on the config because PLModule.configure_optimizers reads it to size
# the LR schedule.
CFG.model.num_training_steps = len(dm.train_dataloader()) * CFG.model.max_epoches

In [10]:
# Instantiate the LightningModule (reads CFG.model) and move it to the GPU.
model = PLModule().cuda()

In [11]:
# SECURITY: the API key must not live in the notebook. It is read from the
# WANDB_API_KEY environment variable; when that is unset, wandb falls back to
# its stored credentials or an interactive prompt. The key that was previously
# embedded here has been exposed and should be revoked and regenerated.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
wandb.init(project='Kaggle_Eedi',name='bge_soft_cross_enropy ',config=set_wandb_cfg())

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mandrewkhl[0m ([33mandlh[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [12]:
# Log the learning rate at every optimizer step.
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='step')
# Checkpoint on the validation 'accuracy' logged by
# PLModule.on_validation_epoch_end (higher is better); save_last=True
# additionally saves the most recent state.
checkpoint_cb = pl.callbacks.ModelCheckpoint(
    dirpath='./outputs/',
    filename='model_{epoch:02d}-{accuracy:.4f}',
    monitor='accuracy',
    mode='max',
    save_last=True
)

# Single-GPU, full-precision trainer; gradient accumulation and epoch count
# come from CFG.model, validation runs after every epoch.
trainer = pl.Trainer(
    accelerator="gpu",
    precision=32,
    callbacks = [lr_monitor,checkpoint_cb],
    logger = pl.loggers.WandbLogger(save_code=True),
    log_every_n_steps=1,
    accumulate_grad_batches=CFG.model.grad_acum_steps,
    enable_checkpointing=True,
    min_epochs=1,
    devices=1,
    check_val_every_n_epoch=1,
    max_epochs=CFG.model.max_epoches
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [13]:
# Resume training from a checkpoint written by an earlier run.
# NOTE(review): the checkpoint path is hardcoded, so this cell fails on a fresh
# checkout where that file does not exist — pass ckpt_path=None to train from
# scratch.
trainer.fit(model, datamodule=dm, ckpt_path="outputs/model_epoch=01-accuracy=0.5412.ckpt")

You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /notebooks/outputs exists and is not empty.
Restoring states from the checkpoint path at outputs/model_epoch=01-accuracy=0.5412.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                      | Params | Mode 
-------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Number of iters 20352


Training: |          | 0/? [00:00<?, ?it/s]

Number of iters 53128


Validation: |          | 0/? [00:00<?, ?it/s]

Number of iters 64211


Validation: |          | 0/? [00:00<?, ?it/s]

Number of iters 82095


Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.
