In [1]:
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup, AutoConfig
import torch
import torch.nn as nn
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Sampler, Dataset, DataLoader
from IPython.display import display
from accelerate import Accelerator
from tqdm.notebook import tqdm
import random
import os
import multiprocessing
import more_itertools

In [2]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [3]:
class cfg():
    max_len = 512
    model_name = "../input/deberta-v3-base/deberta-v3-base"
    train_batch_size = 2
    valid_batch_size = 16
    fold = 1
    device = "cuda" if torch.cuda.is_available() else "cpu"
    bilstm_hidden = 256
    epochs = 1
    n_folds = 5
    debug = False
    train_folds = [0]

In [4]:
real_df = pd.read_csv("../input/fake-and-real-news-dataset/Fake.csv")
fake_df = pd.read_csv("../input/fake-and-real-news-dataset/True.csv")

In [5]:
real_df["label"] = 1
fake_df["label"] = 0
full_df = pd.concat((real_df, fake_df))

In [6]:
if cfg.debug:
    cfg.train_batch_size=4
    cfg.valid_batch_size=8
    full_df = full_df[:100]

In [7]:
mskf = StratifiedKFold(n_splits=cfg.n_folds, shuffle=True, random_state=42)

full_df.reset_index(inplace=True)

for fold, (trn_, val_) in enumerate(mskf.split(full_df, full_df["label"])):
    print(len(trn_), len(val_))
    full_df.loc[val_, "kfold"] = fold
    
full_df["kfold"] = full_df["kfold"].astype(int)

35918 8980
35918 8980
35918 8980
35919 8979
35919 8979


In [8]:
class ClassificationDataset(torch.nn.Module):
    def __init__(self, df, tokenizer):
        self.texts = df["text"].values 
        self.labels = df["label"].values 
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        inputs = self.tokenizer(text, truncation=True, max_length=cfg.max_len, padding="max_length")
        inputs = {k : torch.tensor(v) for (k, v) in inputs.items()}
        return inputs, torch.tensor(self.labels[idx])

In [9]:
class WeightedLayerPooling(torch.nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        self.layer_weights = layer_weights if layer_weights is not None \
            else torch.nn.Parameter(
                torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)
            )

    def forward(self, all_hidden_states):
        all_layer_embedding = all_hidden_states[self.layer_start:, :, :, :]
        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()
        return weighted_average
    
class FeedBackModel(torch.nn.Module):
    def __init__(self):
        super(FeedBackModel, self).__init__()
        tconfig = AutoConfig.from_pretrained(cfg.model_name)
        tconfig.update({'output_hidden_states':True})
        self.model = AutoModel.from_pretrained(cfg.model_name, config=tconfig)
        self.model.base_model.embeddings.requires_grad_(False)
        self.fc = torch.nn.Linear(tconfig.hidden_size, 1)
        self.pooler = WeightedLayerPooling(tconfig.num_hidden_layers, layer_start=9, layer_weights=None)
        self.fc_dp = torch.nn.Dropout(0.2)
        
    def forward(self, inputs):        
        out_e = self.model(**inputs)
        out = torch.stack(out_e["hidden_states"])
        out = self.pooler(out)
        outputs = self.fc(self.fc_dp(out[:, 0]))
        return outputs

In [10]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [11]:
def train_epoch(dataloader, model, optimizer, loss_fn, scheduler, epoch, fold):
    model.train()
    print("="*15, ">" f"Fold {fold+1} Epoch {epoch}", "<", "*"*15, "\n\n")
    losses = AverageMeter()
    for batch_idx, (example, labels) in tqdm(enumerate(dataloader), total=len(dataloader)):
        optimizer.zero_grad()
        inputs = {k : v.to(cfg.device) for (k, v) in example.items()}
        with torch.cuda.amp.autocast(enabled=True):
            out = model(inputs)
        
        loss = loss_fn(out.cpu().float().squeeze(), labels.float())
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        losses.update(loss.item(), cfg.train_batch_size)
        
        if (batch_idx+1) % 100 == 0:
            print(f"Epoch [{epoch}] | Batch Number: [{batch_idx+1}/{len(dataloader)}] | Loss: [{losses.avg}]\n")
            
    return losses.avg

In [12]:
def validate_fn(dataloader, model, loss_fn):
    model.eval()
    losses = AverageMeter()
    for batch_idx, (example, labels) in tqdm(enumerate(dataloader), total=len(dataloader)):
        inputs = {k : v.to(cfg.device) for (k, v) in example.items()}
        with torch.no_grad():
            out = model(inputs)
        loss = loss_fn(out.cpu().squeeze(), labels.float())
        losses.update(loss.item(), cfg.train_batch_size)
    return losses.avg

In [13]:
def train_fold(fold): 
    train_df = full_df[full_df["kfold"] != fold]
    valid_df = full_df[full_df["kfold"] == fold]
    
    global tokenizer 
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)
    train_dataset = ClassificationDataset(train_df, tokenizer)
    train_dataloader = DataLoader(train_dataset, shuffle=True, num_workers=2, batch_size=cfg.train_batch_size)
    valid_dataset = ClassificationDataset(valid_df, tokenizer)
    valid_dataloader = DataLoader(valid_dataset, shuffle=True, num_workers=2, batch_size=cfg.valid_batch_size)
    
    model = FeedBackModel().to(cfg.device)
    optimizer = torch.optim.AdamW([
        {"params": model.fc.parameters(), "lr": 3e-5},
        {"params": model.pooler.parameters(), "lr": 3e-5},
        {"params": model.model.parameters(), "lr": 5e-6},
    ],
    lr=5e-4)
    scheduler = get_cosine_schedule_with_warmup(optimizer, 
                                                num_warmup_steps=0, 
                                                num_cycles=0.5, 
                                                num_training_steps=int(len(train_dataset) / cfg.train_batch_size * cfg.epochs))
    
    for epoch in range(cfg.epochs):
        train_loss = train_epoch(train_dataloader, model, optimizer, nn.BCEWithLogitsLoss(), scheduler, epoch+1, fold)
        valid_loss = validate_fn(valid_dataloader, model, nn.BCEWithLogitsLoss())
        print("="*15, ">" f"Fold {fold+1} Epoch {epoch+1} Results:", "<", "*"*15, "\n\n")
        print(f"Training Loss: {train_loss}")
        print(f"Validation Loss: {valid_loss}")
        torch.save(model.state_dict(), f"deberta_base_epoch_{epoch+1}_fold_{fold+1}.pth")

In [14]:
if __name__ == "__main__":
    for fold in cfg.train_folds:
        train_fold(fold)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at ../input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.classifer.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifer.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exact





  0%|          | 0/17959 [00:00<?, ?it/s]

Epoch [1] | Batch Number: [100/17959] | Loss: [0.6437284523248672]

Epoch [1] | Batch Number: [200/17959] | Loss: [0.4628923485800624]

Epoch [1] | Batch Number: [300/17959] | Loss: [0.34979303888976576]

Epoch [1] | Batch Number: [400/17959] | Loss: [0.2796166981989518]

Epoch [1] | Batch Number: [500/17959] | Loss: [0.23624375519156457]

Epoch [1] | Batch Number: [600/17959] | Loss: [0.20093147114074478]

Epoch [1] | Batch Number: [700/17959] | Loss: [0.17609144996851683]

Epoch [1] | Batch Number: [800/17959] | Loss: [0.15685757230152375]

Epoch [1] | Batch Number: [900/17959] | Loss: [0.14242995211078474]

Epoch [1] | Batch Number: [1000/17959] | Loss: [0.1365578143214807]

Epoch [1] | Batch Number: [1100/17959] | Loss: [0.12814203868239102]

Epoch [1] | Batch Number: [1200/17959] | Loss: [0.11926853398482005]

Epoch [1] | Batch Number: [1300/17959] | Loss: [0.1110254852559704]

Epoch [1] | Batch Number: [1400/17959] | Loss: [0.10377718660448279]

Epoch [1] | Batch Number: [1500/17

  0%|          | 0/562 [00:00<?, ?it/s]



Training Loss: 0.010950850711244997
Validation Loss: 0.0008322860623166206
