In [None]:
!pip install -q transformers sentencepiece

In [None]:
import os
import torch
import logging
import gc
import transformers

import pandas as pd
import numpy as np
import pytorch_lightning as pl
import torch.nn.functional as F
import plotly.express as px

from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import f1_score
from torch import nn
from glob import glob

from torchmetrics.classification import BinaryAUROC, BinaryF1Score

# Directory holding the pickled datasets (train.pkl / test.pkl are read from here).
PATH_DATA = '/kaggle/input/nlp-get-started-cleaning-text-tr'

# Root folder under which one checkpoint sub-folder per fold is created.
SAVE_MODEL = 'model_folder'
# Root folder under which one log sub-folder per fold is created.
LOG_MODEL = 'log_folder'
# Sample submission file, used as the template for the final submission CSV.
PATH_SUBMISSION = '/kaggle/input/nlp-getting-started/sample_submission.csv'

In [None]:
# Switch off the tokenizers library's own parallelism through its environment flag.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs(SAVE_MODEL, exist_ok=True)

In [None]:
# Central experiment configuration read by the dataset, model and training cells.
config = {
    'batch_size': 8,
    'num_workers': 1,
    # Hugging Face model identifier (used for both tokenizer and encoder)
    'model': 'microsoft/deberta-v3-base',
    # when True, the training data is sub-sampled to 150 rows for a quick end-to-end run
    'debug_run': False,
    # forwarded to pl.Trainer(fast_dev_run=...)
    'dev_run': False,
    'n_fold': 5,
    'random_state': 1024157,
    'max_epochs': 6,
    # forwarded to pl.Trainer(max_steps=...); -1 disables the step limit
    'max_steps': -1,
    # forwarded to pl.Trainer(val_check_interval=...).
    # NOTE(review): the original comment said "put 0.95 to disable this" — confirm against the
    # Lightning docs what value actually restores one validation run per epoch.
    'val_check_interval': 0.1,
    'accelerator': "gpu" if torch.cuda.is_available() else "cpu",
    # NOTE(review): lr is 1e-6 here while 'version_experiment' below says "lr_1e5" — confirm which is intended.
    'lr': 1e-6,
    # when True, a checkpoint is written at the end of each fold's training
    'save_model': True,
    # used as the CSV logger "version" (sub-folder name of the metric logs)
    'version_experiment': 'deberta_baseline_lr_1e5'
}

In [None]:
logging.info('importing dataset')

# NOTE(review): unpickling executes arbitrary code — only load pickles from a trusted source.
train = pd.read_pickle(os.path.join(PATH_DATA, 'train.pkl'))

if config['debug_run']:
    # Seeded sub-sample so debug runs are reproducible; capped so tiny datasets do not raise.
    train = (
        train
        .sample(min(150, len(train)), random_state=config['random_state'])
        .reset_index(drop=True)
    )
    # Every fold must survive the sub-sampling, otherwise the CV loop would see an empty fold.
    assert train['fold_cv'].nunique() == config['n_fold'], \
        "debug sample does not contain every cross-validation fold"

In [None]:
# Report the training budget on stdout and in the log (only when models will be saved).
if config['save_model']:
    budget_message = f"Fitting model for exactly {config['max_epochs']} epoch and {config['max_steps']} step"
    print(budget_message)
    logging.info(budget_message)

In [None]:
# Restrict the transformers library's logging to error-level messages.
transformers.logging.set_verbosity_error()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(config['model'])

# Token count (special tokens included) of every training text; missing texts count as "".
train_texts = train['text_keyword_cleaned'].fillna("").values
lengths = []
for text in train_texts:
    encoded = tokenizer(text, add_special_tokens=True)
    lengths.append(len(encoded['input_ids']))

# The longest training sequence becomes the padding/truncation length used by the datasets.
config['max_len'] = max(lengths)

In [None]:
def best_threshold(y_true, pred_proba, proba_range = np.arange(.1,.9,.001), verbose=False):
    """Search the decision threshold that maximises the F1 score.

    For each candidate ``prob`` in ``proba_range`` a sample is predicted positive
    when its probability is strictly greater than ``prob``; the F1 score of that
    labelling against ``y_true`` is computed with ``f1_score``.

    Args:
        y_true: ground-truth labels, one per sample.
        pred_proba: predicted probabilities, one per sample (any array-like).
        proba_range: candidate thresholds to evaluate, in order.
        verbose: when True, print the score obtained for every candidate.

    Returns:
        Tuple ``(optimal_threshold, best_score)``. On ties the first candidate
        reaching the best score is returned.

    Raises:
        ValueError: if ``proba_range`` is empty.
    """
    # Vectorised comparison instead of a Python-level loop over every sample.
    pred_proba = np.asarray(pred_proba)

    scores = []
    for prob in proba_range:
        pred = (pred_proba > prob).astype(int)
        score = f1_score(y_true, pred)
        scores.append(score)
        if verbose:
            print("INFO: prob threshold: {}.  score :{}".format(round(prob,3), round(score,5)))

    if not scores:
        raise ValueError("proba_range must contain at least one threshold")

    best_idx = int(np.argmax(scores))
    return (proba_range[best_idx], scores[best_idx])

In [None]:
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean over dimension 1 of the hidden states."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` over dim 1, ignoring masked-out positions.

        ``attention_mask`` gets a trailing axis and is broadcast to the shape of
        ``last_hidden_state``; positions with mask 0 contribute nothing to the sum.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(1)
        # Clamp the divisor so a fully masked row divides by 1e-9 rather than by zero.
        mask_total = mask.sum(1).clamp(min=1e-9)
        return masked_total / mask_total

In [None]:
class TweetDataset(Dataset):
    """Dataset that tokenizes one text per item, padded/truncated to ``config['max_len']``.

    In training/validation mode (``inference=False``) each item is
    ``(inputs, label)``; in inference mode it is ``inputs`` only. ``inputs`` is a
    dict mapping every key returned by the tokenizer to a ``torch.long`` tensor.
    """

    def __init__(self,
                 config, dataset, inference,
                 text_col_name: str='text_keyword_cleaned', label_col_name: str = 'target_relabeled'
        ):
        """
        Args:
            config: dict providing ``'max_len'`` and the tokenizer name under ``'model'``.
            dataset: DataFrame holding the text column (and the label column unless ``inference``).
            inference: when True, labels are neither read nor returned.
            text_col_name: name of the text column.
            label_col_name: name of the label column.
        """
        self.max_len = config['max_len']
        self.tokenizer = AutoTokenizer.from_pretrained(config['model'])
        # Missing texts become "" — the same convention used when max_len is computed —
        # so a NaN/None entry cannot reach the tokenizer and crash it.
        self.texts = dataset[text_col_name].fillna("").values
        self.inference = inference

        if not inference:
            self.labels = dataset[label_col_name].values

    def prepare_input(self, text):
        """Tokenize ``text`` to exactly ``max_len`` tokens and convert each field to a long tensor."""
        inputs = self.tokenizer(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length', truncation=True
        )

        return {key: torch.tensor(value, dtype=torch.long) for key, value in inputs.items()}

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = self.prepare_input(self.texts[item])

        if self.inference:
            return inputs

        # Float label, as required by the BCE-with-logits loss used by the model.
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

In [None]:
class DeBertaClassifier(pl.LightningModule):
    """Binary text classifier: pretrained encoder -> masked mean pooling -> one-logit linear head.

    The module is trained with ``BCEWithLogitsLoss``. Per-step outputs are buffered in
    ``self.step_outputs`` and aggregated (mean loss, plus AUROC/F1 over the whole
    evaluation set for val/test) at the end of each train/validation/test epoch.

    Args:
        config: dict providing at least ``'model'`` (encoder name) and ``'lr'``.
    """

    def __init__(self, config):
        super().__init__()

        self.model = AutoModel.from_pretrained(config['model'])

        self.criterion = nn.BCEWithLogitsLoss()

        self.config = config

        # self.lr is only set when a learning rate is configured.
        if self.config['lr'] is not None:
            self.lr = self.config['lr']

        self.fc = nn.Linear(self.model.config.hidden_size, 1)
        self.pool = MeanPooling()

        # The binary metric classes always treat label 1 as the positive class; they do not
        # declare a `pos_label` argument, so none is passed.
        self.auc = BinaryAUROC()
        self.f1 = BinaryF1Score()

        # Per-mode buffers of step outputs, emptied after each aggregation.
        self.step_outputs = {
            'train': [],
            'val': [],
            'test': []
        }
        self.save_hyperparameters()

    def __metric_step(self, pred, labels):
        """Compute AUROC and F1 from raw logits and labels for one full evaluation set."""
        pred = torch.sigmoid(torch.flatten(pred))
        # Labels are stored as float for the BCE loss; the classification metrics expect
        # integer class targets.
        labels = labels.long()

        auc_score = self.auc(pred, labels)
        f1_value = self.f1(pred, labels)

        # Calling a metric also accumulates into its internal state; reset so successive
        # evaluations do not pile up old predictions.
        self.auc.reset()
        self.f1.reset()

        return {'auc': auc_score, 'f1': f1_value}

    def __loss_step(self, pred, labels):
        """Flatten the logits and return ``(loss, flattened_logits, labels)``."""
        pred = torch.flatten(pred)
        loss = self.criterion(pred, labels)

        return loss, pred, labels

    def training_step(self, batch, batch_idx):
        input_, labels = batch

        pred = self.forward(input_)
        loss, _, _ = self.__loss_step(pred, labels)
        # Buffer a detached copy: keeping the live loss would retain every step's autograd graph.
        self.step_outputs['train'].append(
            {'loss': loss.detach()}
        )

        return loss

    def validation_step(self, batch, batch_idx):
        input_, labels = batch
        pred = self.forward(input_)

        loss, pred, labels = self.__loss_step(pred, labels)
        self.step_outputs['val'].append(
            {'loss': loss, 'pred': pred, 'labels': labels}
        )

    def test_step(self, batch, batch_idx):
        input_, labels = batch
        pred = self.forward(input_)

        loss, pred, labels = self.__loss_step(pred, labels)
        self.step_outputs['test'].append(
            {'loss': loss, 'pred': pred, 'labels': labels}
        )

    def on_train_epoch_end(self):
        # The Lightning hook is `on_train_epoch_end`; under the previous name
        # (`on_training_epoch_end`) it was never invoked, so the train loss was never
        # logged and the 'train' buffer was never cleared.
        self.__share_eval_res('train')

    def on_validation_epoch_end(self):
        self.__share_eval_res('val')

    def on_test_epoch_end(self):
        self.__share_eval_res('test')

    def __share_eval_res(self, mode: str):
        """Aggregate the buffered outputs of ``mode``, print and log them, then clear the buffer.

        Always reports the mean loss; for 'val' and 'test' it also reports AUROC and F1
        computed over the concatenated predictions. Nothing is printed or logged while
        the trainer is running its sanity check.
        """
        outputs = self.step_outputs[mode]
        if not outputs:
            # Nothing was buffered (e.g. the epoch ran zero steps): torch.cat would raise.
            return

        loss = [out['loss'].reshape(1) for out in outputs]
        loss = torch.mean(torch.cat(loss))

        #initialize performance output
        res_dict = {
            f'{mode}_loss': loss
        }
        metric_message_list = [
            f'step: {self.trainer.global_step}',
            f'{mode}_loss: {loss:.5f}'
        ]
        #evaluate on all dataset
        if mode != 'train':
            preds = torch.cat([out['pred'] for out in outputs])
            labels = torch.cat([out['labels'] for out in outputs])

            metric_score = self.__metric_step(preds, labels)

            #calculate every metric on all batch
            metric_message_list += [
                f'{mode}_{metric}: {metric_value:.5f}'
                for metric, metric_value in metric_score.items()
            ]
            #get results
            res_dict.update(
                {
                    f'{mode}_{metric}': metric_value
                    for metric, metric_value in metric_score.items()
                }
            )

        if not self.trainer.sanity_checking:
            print(', '.join(metric_message_list))
            self.log_dict(res_dict)

        #free memory
        self.step_outputs[mode].clear()

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

    def __encoder(self, inputs):
        """Run the encoder and mean-pool its last hidden state using the attention mask."""
        outputs = self.model(**inputs)

        last_hidden_states = outputs['last_hidden_state']
        feature = self.pool(last_hidden_states, inputs['attention_mask'])

        return feature

    def forward(self, inputs):
        """Return the raw logit (no sigmoid) for each sample of the tokenized batch."""
        feature = self.__encoder(inputs)

        output = self.fc(feature)

        return output

    def predict_step(self, batch, batch_idx):
        """Return flattened sigmoid probabilities for an inference batch."""
        pred = self.forward(batch)
        pred = torch.flatten(pred)
        pred = torch.sigmoid(pred)

        return pred

In [None]:
def train_folder(
    fold_: int, train_data: pd.DataFrame, valid_data: pd.DataFrame,
    config: dict = config, save_model: str = SAVE_MODEL,
    log_model: str = LOG_MODEL
) -> None:
    """Train one cross-validation fold and optionally save its final checkpoint.

    Args:
        fold_: fold index, used to name the checkpoint and log sub-folders.
        train_data: rows used for fitting.
        valid_data: held-out rows used for validation during fitting.
        config: experiment configuration dict.
        save_model: root folder for checkpoints (``model_fold_<fold_>/model.ckpt``).
        log_model: root folder for CSV logs (``log_fold_<fold_>/csv_log/<version>``).
    """
    model_folder = os.path.join(save_model, f'model_fold_{fold_}')
    log_folder = os.path.join(log_model, f'log_fold_{fold_}')

    os.makedirs(model_folder, exist_ok=True)
    os.makedirs(log_folder, exist_ok=True)

    # Apply the configured seed so head initialisation and batch shuffling are reproducible.
    if config.get('random_state') is not None:
        pl.seed_everything(config['random_state'], workers=True)

    train_dataset = TweetDataset(config, train_data, inference=False)
    valid_dataset = TweetDataset(config, valid_data, inference=False)

    # Honour the configured worker count (falls back to in-process loading when absent).
    num_workers = config.get('num_workers', 0)

    train_loader = DataLoader(
        train_dataset,
        batch_size=config['batch_size'],
        shuffle=True,
        pin_memory=True,
        drop_last=True,
        num_workers=num_workers
    )

    valid_loader = DataLoader(
        valid_dataset,
        batch_size=config['batch_size']*2,
        shuffle=False,
        drop_last=False,
        num_workers=num_workers
    )

    model_ = DeBertaClassifier(config)
    loggers = pl.loggers.CSVLogger(
        save_dir=log_folder,
        name='csv_log',
        version=config['version_experiment']
    )

    trainer = pl.Trainer(
        max_epochs=config['max_epochs'],
        max_steps=config['max_steps'],
        fast_dev_run=config['dev_run'],
        accelerator=config['accelerator'],
        val_check_interval=config['val_check_interval'],
        enable_progress_bar=False,
        logger=[loggers],
        enable_checkpointing=False
    )

    trainer.fit(model_, train_loader, valid_loader)

    if config['save_model']:
        trainer.save_checkpoint(os.path.join(model_folder, "model.ckpt"))

    # Drop the references first; otherwise the cache cannot release the model's memory.
    del trainer, model_
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Cross-validation: for every fold, fit on the other folds and validate on the held-out one.
for fold_ in range(config['n_fold']):
    is_holdout = train['fold_cv'] == fold_
    train_data = train.loc[~is_holdout].reset_index(drop=True)
    valid_data = train.loc[is_holdout].reset_index(drop=True)

    print(f'\n\nStarting folder {fold_}\n\n')
    train_folder(fold_, train_data, valid_data)

    # Release whatever the finished fold left behind before starting the next one.
    torch.cuda.empty_cache()
    gc.collect()

# Log metric

In [None]:
# Collect each fold's CSV log explicitly by fold index, so the `fold` label always matches
# the folder it was read from (a lexicographic sort would put log_fold_10 before log_fold_2).
log_file_path = []
log_frames = []
for fold_ in range(config['n_fold']):
    fold_pattern = os.path.join(
        LOG_MODEL, f'log_fold_{fold_}', 'csv_log', config['version_experiment'], '*.csv'
    )
    fold_files = sorted(glob(fold_pattern))
    assert len(fold_files) == 1, f'expected exactly one log file for fold {fold_}, found {len(fold_files)}'

    log_file_path.append(fold_files[0])
    log_frames.append(pd.read_csv(fold_files[0]).assign(fold=fold_))

assert len(log_file_path) == config['n_fold']
log_list = pd.concat(log_frames, axis=0, ignore_index=True)

In [None]:
# Average every validation column over the folds, one row per (step, epoch) pair.
val_aggregations = {
    col: 'mean'
    for col in log_list.columns if 'val_' in col
}
log_list = log_list.groupby(['step', 'epoch']).agg(val_aggregations).reset_index()

In [None]:
# Line plot of every aggregated validation column against the training step.
fig = px.line(log_list, x='step', y=[col for col in log_list if 'val_' in col], template='plotly_white')
fig.show()

In [None]:
# argmin() returns a POSITION, so the row is read with .iloc rather than the label-based .loc
# (the two only coincide while the frame has a default RangeIndex).
best_pos = int(log_list['val_loss'].argmin())

best_epoch = log_list['epoch'].iloc[best_pos]
best_step = log_list['step'].iloc[best_pos]
best_score = log_list['val_loss'].iloc[best_pos]

print(f'Best epoch: {best_epoch} Best step: {best_step}, CV-Loss: {best_score:.5f}\n')
print('Other information\n')
print(log_list.iloc[best_pos])

# Summary of the checkpoint position with the lowest fold-averaged validation loss.
best_result = {
    'best_epoch': best_epoch,
    'best_step': best_step,
    'best_score': best_score
}

# Predict valid and test

In [None]:
# Load the test set from the same data directory as the training set.
# NOTE(review): unpickling executes arbitrary code — only load pickles from a trusted source.
test = pd.read_pickle(os.path.join(PATH_DATA, 'test.pkl'))

In [None]:
# Out-of-fold predictions for the training set and fold-averaged predictions for the test set.
predictions_valid = np.zeros((train.shape[0]))
prediction_test = np.zeros((test.shape[0]))

trainer = pl.Trainer(accelerator=config['accelerator'])

test_dataset = TweetDataset(config, test, inference=True)

for fold_ in range(config['n_fold']):
    #load the model
    model_folder = os.path.join(SAVE_MODEL, f'model_fold_{fold_}', 'model.ckpt')
    # load_from_checkpoint is a classmethod: call it on the class instead of building a
    # throwaway instance first (which downloaded/initialised the encoder a second time).
    model = DeBertaClassifier.load_from_checkpoint(model_folder, config=config)
    model.eval()

    #import oof data and test to predict
    # Plain boolean array so the same mask indexes both the DataFrame and the numpy buffer.
    mask_valid = (train['fold_cv']==fold_).to_numpy()
    valid_data = train.loc[mask_valid].reset_index(drop=True)

    valid_dataset = TweetDataset(config, valid_data, inference=True)
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=config['batch_size']*2,
        shuffle=False,
        drop_last=False
    )

    #predict valid
    pred_val = trainer.predict(model, valid_loader)
    pred_val = torch.cat(pred_val).cpu().numpy().reshape((-1))

    predictions_valid[mask_valid] = pred_val

    #predict test by resetting data loader
    test_loader = DataLoader(
        test_dataset,
        batch_size=config['batch_size']*2,
        shuffle=False,
        drop_last=False
    )

    pred_test = trainer.predict(model, test_loader)
    pred_test = torch.cat(pred_test).cpu().numpy().reshape((-1))

    # Each fold contributes an equal share to the averaged test prediction.
    prediction_test += pred_test/config['n_fold']

    # Drop the fold's model before clearing the cache so its memory can be released.
    del model
    _ = gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Choose the decision threshold that maximises F1 of predictions_valid against the relabeled targets.
optimal_threshold, best_score = best_threshold(train['target_relabeled'].values, predictions_valid)

print(f'Optimal Threshold is {optimal_threshold:.3f}; with the F1 Score of {best_score:.4f}')

In [None]:
# Drop the training frame and run a garbage-collection pass.
del train
_ = gc.collect()

# Inference

In [None]:
# Use the same strict ">" rule that best_threshold() applied when it scored each candidate,
# so the submitted labels follow exactly the decision rule the threshold was tuned for.
prediction_test = (prediction_test > optimal_threshold).astype(int)

In [None]:
# Load the submission template and check it has exactly one row per test prediction.
sub = pd.read_csv(PATH_SUBMISSION)
assert sub.shape[0] == prediction_test.shape[0]

In [None]:
# Store the test predictions in the template's 'target' column.
sub['target'] = prediction_test

In [None]:
# Write the submission file without the DataFrame index.
sub.to_csv('submission.csv', index=False)