In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint
from torch.utils.data import Dataset,DataLoader
from tokenizers import AddedToken

from sklearn import metrics
from torch.nn import functional as F
from tqdm import tqdm
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)
import warnings

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

from torch.optim import lr_scheduler
from torch.nn import Parameter
import time

warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv("/kaggle/input/gods-4-0-dataset/train (8).csv")
test = pd.read_csv("/kaggle/input/gods-4-0-dataset/test (6).csv")
ss = pd.read_csv("/kaggle/input/gods-4-0-dataset/SampleSubmission (13).csv")

train.dropna(inplace=True)
test.dropna(inplace=True)

In [4]:
T2ID = {
    "relationship-and-family-issues": 0,
    "anxiety": 1,
    "depression": 2,
    "ptsd-and-trauma": 3,
    "suicidal-thoughts-and-self-harm": 4
}

ID2T = {v: k for k, v in T2ID.items()}

In [5]:
train["target_id"] = train["target"].map(T2ID)

# Config

In [6]:
cfg = {"model_name": "microsoft/deberta-v3-base",
    "max_len": 512,
    "freeze" : False,

    "fold_num": 5,
    "val_fold": 0,
    "learning_rate": 1e-05,
    "min_lr": 8e-6,
    "T_max": 600,
    "valid_batch_size": 16,
    'train_batch_size' : 8,
 
    "epochs": 2, 
    "accumulation_steps": 6,
    "val_steps": 375,
    
    "scheduler" : 'cosine',
    " warmup_epochs": 1,

    "gradient_checkpoint" : False,
    'tokenizer' : AutoTokenizer.from_pretrained("microsoft/deberta-v3-base"),
    
    "input": "input/",
    "output": "output/"
}

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [7]:
cfg['n_accumulate'] = 4#
cfg['dropout'] = 0.2
cfg['apex'] = True
cfg["grad_norm"] = 20
cfg["gradient_checkpoint"] = False

## Add Special Tokens (\n & \n\n)

In [8]:
cfg["tokenizer"].add_tokens([AddedToken("\n", normalized=False)])
cfg["tokenizer"].add_tokens([AddedToken("\n\n", normalized=False)])

1

## Helper Functions

In [9]:
def set_seed(seed=42):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(42)

In [None]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        
def prepare_input(cfg, text, tokenizer):

    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=cfg["max_len"],
        padding='max_length',
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long) 
    return inputs


def collate(inputs):

    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [9]:
def optimizer_scheduler(model,steps):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and p.requires_grad],
                "weight_decay": 0.003,
            },
            {
                "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and p.requires_grad],
                "weight_decay": 0.0,
            },
        ]
        opt = AdamW(optimizer_parameters, lr=cfg['learning_rate'])

        sch = get_cosine_schedule_with_warmup(
            opt,
            num_warmup_steps=0,
            num_training_steps=steps,#self.num_train_steps,
            last_epoch=-1,
        )
        return opt, sch

## Dataset

In [67]:
class Dataset(Dataset):
    def __init__(self, df, test_phase=False):
        self.test_phase = test_phase
        self.cfg = cfg
        if not self.test_phase:
            self.labels = df['target_id'].values
        self.tokenizer = cfg['tokenizer']
        self.sep_token = self.tokenizer.sep_token
        self.texts = df["title"].values + self.sep_token + df['content'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        output = {}
        output["inputs"] = prepare_input(self.cfg, self.texts[item], self.tokenizer)
        if not self.test_phase:
            output["labels"] = torch.tensor(self.labels[item], dtype=torch.float) 
        return output

## Pooling & Model

In [11]:
def get_last_hidden_state(backbone_outputs):
    last_hidden_state = backbone_outputs[0]
    return last_hidden_state


def get_all_hidden_states(backbone_outputs):
    all_hidden_states = torch.stack(backbone_outputs[1])
    return all_hidden_states


def get_input_ids(inputs):
    return inputs['input_ids']


def get_attention_mask(inputs):
    return inputs['attention_mask']


class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, inputs, backbone_outputs):
        attention_mask = get_attention_mask(inputs)
        last_hidden_state = get_last_hidden_state(backbone_outputs)

        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.drop = nn.Dropout(p=cfg["dropout"])
        
        self.config = AutoConfig.from_pretrained(cfg["model_name"])
        #self.config.hidden_dropout = 0.
        self.config.hidden_dropout_prob = 0.007
        #self.config.attention_dropout = 0.
        self.config.attention_probs_dropout_prob = 0.008

        self.model = AutoModel.from_pretrained(cfg["model_name"], config=self.config)
        self.model.resize_token_embeddings(len(cfg["tokenizer"]))

        #odd_layer_freeze(self.model)
        if cfg["gradient_checkpoint"]:
            print('Enabling Grad Checkpointing')
            self.model.gradient_checkpointing_enable()  
        if cfg["freeze"]:
            print('freezing params')
            for parameter in self.model.parameters():
                parameter.requires_grad = False
        self.pool = MeanPooling()
        self.fc = nn.Sequential(
            nn.Linear(self.config.hidden_size, 5),
        )
        
    def forward(self, inputs):        
        out = self.model(**inputs)
        out = self.pool(inputs, out)
        out = self.drop(out)
        out = self.fc(out)
        return out

In [14]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device):
    model.train()

    total = 0
    running_loss = 0.0
    losses = AverageMeter()
    scaler = torch.cuda.amp.GradScaler(enabled=cfg["apex"])
    lr = []
    bar = tqdm(dataloader, total=len(dataloader))
    steps = len(dataloader)
    
    all_preds = np.array([])
    all_groud_truth = np.array([])
    
    for step, data in enumerate(bar):
        
        inputs = data.pop("inputs")
        labels = data.pop("labels")
        labels = labels.long()
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        
        with torch.cuda.amp.autocast(enabled=cfg["apex"]):
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        loss = loss / cfg['n_accumulate']
        
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        
        if (step + 1) % cfg['n_accumulate'] == 0 or step == steps:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if scheduler:
                scheduler.step()


        epoch_loss = losses.avg
        #acc = correct / total

        bar.set_postfix(
            Loss=epoch_loss, LR=optimizer.param_groups[0]['lr'])
        
        lr.append(optimizer.param_groups[0]['lr'])

    return losses.avg

@torch.no_grad()
def evaluate(model, dataloader, device):
    model.eval()

    total = 0
    losses = AverageMeter()
    correct = 0
    preds = []
    y_test = []
    for data in dataloader:
        inputs = data.pop("inputs")
        labels = data.pop("labels")
        labels = labels.long()
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)

        outputs = model(inputs)

        loss = criterion(outputs, labels)

        losses.update(loss.item(), batch_size)

        preds.append(outputs.softmax(dim=1).argmax(dim=1).detach().cpu().numpy())
        y_test.append(labels.detach().cpu().numpy())
        
    
    epoch_loss = losses
    preds = np.concatenate(preds)
    y_test = np.concatenate(y_test)

    return losses.avg, preds, y_test

In [15]:
def start_training(model, optimizer, scheduler, device, num_epochs,train_loader,valid_loader,fold=0):
    start = time.time()
    best_score = 0
    history = {"Train Loss": [], "Valid Loss": [],"LR": []}
    

    for epoch in range(1, num_epochs + 1):
        print("Epoch: ", epoch)
        train_epoch_loss = train_one_epoch(
            model, optimizer, scheduler, dataloader=train_loader, device=cfg["device"]
        )

        val_epoch_loss, preds, y_test = evaluate(
            model, valid_loader, device=cfg["device"]
        )
        score = accuracy_score(preds, y_test)

        print(f"Acc Score: {score}")
        print(f"Train Loss: {train_epoch_loss}")
        print(f"Valid Loss: {val_epoch_loss}")


        history["Train Loss"].append(train_epoch_loss)
        history["Valid Loss"].append(val_epoch_loss)


        if score >= best_score:
            print(
                f"Score Improved ({best_score} ---> {score})"
            )
            best_score = score
            PATH = f"fold_{fold}.bin"
            torch.save(model.state_dict(), PATH)
            
            print(f"Model Saved")
            best_y = preds

        print()

    end = time.time()
    time_elapsed = end - start
    print(
        "Training complete in {:.0f}h {:.0f}m {:.0f}s".format(
            time_elapsed // 3600,
            (time_elapsed % 3600) // 60,
            (time_elapsed % 3600) % 60,
        )
    )
    print(
        "Best Score: {:.4f}".format(
            best_score
        )
    )


    return history, best_y

## Validation Strategy: Stratified K Folds

In [16]:
def make_folds(train_df, n_splits):

    train_df["fold"] = -1
    X = train_df["content"]
    y = train_df["target"]
    skf = StratifiedKFold(n_splits=5)

    for i, (train_index, val_index) in enumerate(skf.split(X, y)):
        train_df.loc[val_index, "fold"] = i
    return train_df

In [20]:
train = make_folds(train, 5)

In [21]:
train.reset_index(inplace=True, drop=True)

## Training

In [23]:
criterion = nn.CrossEntropyLoss()
def run_folds() :
    
    for fold in range(1, 5) :

        print(f'--------------------------------Training Fold {fold+1}/5---------------------------------')
        train_ = train[train.fold!=fold]
        valid_ = train[train.fold==fold]
        
        print(f'train shape : {len(train_)}')
        print(f'valid shape : {len(valid_)}')
        
        train_dataset = Dataset(
                                    train_
                            )
        valid_dataset = Dataset(
                                    valid_
        )
        train_loader = DataLoader(
                train_dataset,
                batch_size=cfg["train_batch_size"],
                num_workers=4,
                shuffle=True,
                pin_memory=True,
                drop_last=True
                    )
        valid_loader = DataLoader(
            valid_dataset,
            batch_size=cfg["valid_batch_size"],
            num_workers=4,
            shuffle=False,
            pin_memory=True,
        )
        
        model = Model()
        model.to(cfg['device'])
        
        steps = len(train_loader)
        
        optimizer,_ = optimizer_scheduler(model,steps)
        scheduler = lr_scheduler.CosineAnnealingLR(
                        optimizer, T_max=cfg['T_max'], eta_min=cfg['min_lr'])
        
        
        history = start_training(
                        model, optimizer, scheduler, cfg['device'], cfg['epochs'] ,train_loader=train_loader,valid_loader=valid_loader,fold=fold)
        torch.cuda.empty_cache()

In [24]:
cfg["device"] = "cuda"

In [25]:
run_folds()


--------------------------------Training Fold 2/5---------------------------------
train shape : 17527
valid shape : 4382


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Epoch:  1


100%|██████████| 2190/2190 [22:35<00:00,  1.62it/s, LR=8.04e-6, Loss=0.199]


Acc Score: 0.783888635326335
Train Loss: 0.19922036789867975
Valid Loss: 0.6022728112450151
Score Improved (0 ---> 0.783888635326335)
Model Saved

Epoch:  2


100%|██████████| 2190/2190 [22:34<00:00,  1.62it/s, LR=9.85e-6, Loss=0.14] 


Acc Score: 0.7832040164308535
Train Loss: 0.14010992081470142
Valid Loss: 0.5957958937126553

Training complete in 0h 49m 5s
Best Score: 0.7839
--------------------------------Training Fold 3/5---------------------------------
train shape : 17527
valid shape : 4382
Epoch:  1


100%|██████████| 2190/2190 [22:29<00:00,  1.62it/s, LR=8.04e-6, Loss=0.205]


Acc Score: 0.7809219534459151
Train Loss: 0.2047937544513511
Valid Loss: 0.6170639854378485
Score Improved (0 ---> 0.7809219534459151)
Model Saved

Epoch:  2


100%|██████████| 2190/2190 [22:27<00:00,  1.63it/s, LR=9.85e-6, Loss=0.144]


Acc Score: 0.792332268370607
Train Loss: 0.14358798552593685
Valid Loss: 0.5817745146768815
Score Improved (0.7809219534459151 ---> 0.792332268370607)
Model Saved

Training complete in 0h 48m 54s
Best Score: 0.7923
--------------------------------Training Fold 4/5---------------------------------
train shape : 17527
valid shape : 4382
Epoch:  1


100%|██████████| 2190/2190 [22:34<00:00,  1.62it/s, LR=8.04e-6, Loss=0.208]


Acc Score: 0.7850296668188042
Train Loss: 0.20797410196365287
Valid Loss: 0.5994100058693583
Score Improved (0 ---> 0.7850296668188042)
Model Saved

Epoch:  2


100%|██████████| 2190/2190 [22:32<00:00,  1.62it/s, LR=9.85e-6, Loss=0.144]


Acc Score: 0.7948425376540392
Train Loss: 0.14353907135523616
Valid Loss: 0.5814797155174802
Score Improved (0.7850296668188042 ---> 0.7948425376540392)
Model Saved

Training complete in 0h 49m 4s
Best Score: 0.7948
--------------------------------Training Fold 5/5---------------------------------
train shape : 17528
valid shape : 4381
Epoch:  1


100%|██████████| 2191/2191 [22:32<00:00,  1.62it/s, LR=8.04e-6, Loss=0.201]


Acc Score: 0.7772198128281215
Train Loss: 0.2009999109342837
Valid Loss: 0.6455964157364189
Score Improved (0 ---> 0.7772198128281215)
Model Saved

Epoch:  2


100%|██████████| 2191/2191 [22:36<00:00,  1.61it/s, LR=9.85e-6, Loss=0.14] 


Acc Score: 0.7858936315909609
Train Loss: 0.14014928839294838
Valid Loss: 0.5997025298217855
Score Improved (0.7772198128281215 ---> 0.7858936315909609)
Model Saved

Training complete in 0h 49m 6s
Best Score: 0.7859


## Inference

In [53]:
test = pd.read_csv("/kaggle/input/gods-4-0-dataset/test (6).csv")

In [55]:
test["content"] = test["content"].fillna(" ")

In [75]:
valid_dataset = Dataset(df = test, test_phase=True)
test_loader = DataLoader(
                valid_dataset,
                batch_size=cfg["valid_batch_size"],
                num_workers=4,
                shuffle=False,
                pin_memory=True,
            )

In [76]:
final_preds = []
for fold in range(5):
    model = Model()
    model.to(cfg['device'])
    path = "/kaggle/input/deberta-base-fold0/fold_0.bin" if fold ==0 else f"fold_{fold}.bin"

    model.load_state_dict(
        torch.load(path)
    )
    preds = infer(model, test_loader, 'cuda')
    final_preds.append(preds)

final_preds = np.mean(final_preds, 0) 

100%|██████████| 154/154 [01:05<00:00,  2.34it/s]
100%|██████████| 154/154 [01:06<00:00,  2.33it/s]
100%|██████████| 154/154 [01:05<00:00,  2.33it/s]
100%|██████████| 154/154 [01:05<00:00,  2.33it/s]
100%|██████████| 154/154 [01:05<00:00,  2.33it/s]


In [81]:
for i in range(5):
    test[f'deb_base_preds_class_{i}'] = final_preds[:, i]

In [84]:
test = test[[col for col in test.columns if col not in ["title", "content"]]]

In [85]:
test.to_csv("deb_base_test.csv", index=False)