In [1]:

from config import config
import pandas as pd
from fastai.vision.all import *
import torch
from torch.utils.data import DataLoader,Dataset
import numpy as np
from transformers import AutoModel,AutoTokenizer
from tqdm import tqdm
from scipy.ndimage import convolve1d
from scipy.ndimage import gaussian_filter1d
from scipy.signal.windows import triang
def get_lds_kernel_window(kernel, ks, sigma):
    """Build the 1-D smoothing window used for label-distribution smoothing (LDS).

    Args:
        kernel: One of 'gaussian', 'triang' or 'laplace'.
        ks: Kernel size. The gaussian and laplace windows span
            ``2 * ((ks - 1) // 2) + 1`` taps; the triangular window has
            exactly ``ks`` taps.
        sigma: Scale of the gaussian / laplace kernel (unused for 'triang').

    Returns:
        numpy.ndarray holding the window. The gaussian and laplace windows
        are divided by their maximum, so their peak value is 1.

    Raises:
        AssertionError: if ``kernel`` is not one of the supported names.
    """
    assert kernel in ['gaussian', 'triang', 'laplace']
    half_ks = (ks - 1) // 2
    if kernel == 'gaussian':
        # Smooth a unit impulse once, then rescale so the peak equals 1.
        base_kernel = [0.] * half_ks + [1.] + [0.] * half_ks
        smoothed = gaussian_filter1d(base_kernel, sigma=sigma)
        kernel_window = smoothed / smoothed.max()
    elif kernel == 'triang':
        kernel_window = triang(ks)
    else:
        # Laplace density evaluated on the integer offsets around the centre,
        # computed in one vectorised pass and rescaled so the peak equals 1.
        offsets = np.arange(-half_ks, half_ks + 1)
        density = np.exp(-np.abs(offsets) / sigma) / (2. * sigma)
        kernel_window = density / density.max()

    return kernel_window
class fpe_dataset(torch.utils.data.Dataset):
    """Torch dataset that yields tokenized essay text plus six score labels.

    Each item is a dict holding the tokenizer outputs (with the leading batch
    dimension squeezed away) and a ``"labels"`` float tensor built from the
    columns listed in ``TARGET_COLS``.
    """

    # Score columns returned as the "labels" tensor, in this order.
    TARGET_COLS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    # Text columns sampled from (uniformly) when the dataset is in training mode.
    TEXT_COLS = ["full_text", "train(en2ch2en)"]

    def __init__(self,df,max_len,pretrained_path,train=True):
        """Store the frame (with a fresh 0..n-1 index) and load the tokenizer.

        Args:
            df: DataFrame containing the text and score columns.
            max_len: Length every sample is padded / truncated to.
            pretrained_path: Name or path passed to ``AutoTokenizer.from_pretrained``.
            train: When True, ``__getitem__`` randomly picks one of ``TEXT_COLS``;
                otherwise it always reads ``"full_text"``.
        """
        self.train = train
        self.df = df.reset_index(drop=True)
        self.max_len = max_len
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_path)

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self,idx):
        """Tokenize row ``idx`` and return the model inputs together with its labels."""
        if self.train:
            # NOTE(review): `random` is not imported explicitly in this file;
            # presumably it comes from `from fastai.vision.all import *` - confirm.
            text_col = random.choice(self.TEXT_COLS)
        else:
            text_col = "full_text"
        text = self.df.loc[idx, text_col]

        labels = torch.tensor(self.df.loc[idx, self.TARGET_COLS].values.tolist()).float()
        tokens = self.tokenizer(
                text,
                None,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=self.max_len,return_tensors="pt")
        # return_tensors="pt" adds a leading batch axis; drop it so that a
        # DataLoader can stack the samples itself.
        X = {k:v.squeeze(0) for k,v in tokens.items()}
        X["labels"] = labels
        return X

    def _prepare_weights(self,label_cols ,reweight='sqrt_inv', max_target=51, lds=True, lds_kernel='gaussian', lds_ks=5, lds_sigma=2):
        """Compute one re-weighting factor per row from the label frequency.

        Labels are binned with ``min(max_target - 1, int(label))``.
        NOTE(review): ``int()`` truncates, so fractional labels fall into the
        same bin as their floor - confirm this granularity is intended.

        Args:
            label_cols: Name of the single column whose values are binned
                (it is used as ``self.df[label_cols].tolist()``).
            reweight: 'none', 'inverse' or 'sqrt_inv'.
            max_target: Number of bins; larger labels are clamped to the last bin.
            lds: Whether to smooth the per-bin statistic with an LDS kernel.
            lds_kernel: Kernel name forwarded to ``get_lds_kernel_window``.
            lds_ks: Kernel size forwarded to ``get_lds_kernel_window``.
            lds_sigma: Kernel scale forwarded to ``get_lds_kernel_window``.

        Returns:
            A list with one weight per row, scaled so that the weights average
            to 1, or ``None`` when there are no rows or ``reweight == 'none'``.
        """
        assert reweight in {'none', 'inverse', 'sqrt_inv'}
        assert reweight != 'none' if lds else True, \
            "Set reweight to \'sqrt_inv\' (default) or \'inverse\' when using LDS"

        labels = self.df[label_cols].tolist()
        if not len(labels) or reweight == 'none':
            return None

        # Bin index of every sample, computed once and reused below.
        bins = [min(max_target - 1, int(label)) for label in labels]

        value_dict = {x: 0 for x in range(max_target)}
        for b in bins:
            value_dict[b] += 1
        if reweight == 'sqrt_inv':
            value_dict = {k: np.sqrt(v) for k, v in value_dict.items()}
        elif reweight == 'inverse':
            value_dict = {k: np.clip(v, 5, 1000) for k, v in value_dict.items()}  # clip weights for inverse re-weight
        num_per_label = [value_dict[b] for b in bins]
        print(f"Using re-weighting: [{reweight.upper()}]")

        if lds:
            lds_kernel_window = get_lds_kernel_window(lds_kernel, lds_ks, lds_sigma)
            print(f'Using LDS: [{lds_kernel.upper()}] ({lds_ks}/{lds_sigma})')
            # Force a float array: convolve1d returns the input dtype by default,
            # so the integer counts of the 'inverse' mode would otherwise have
            # their smoothed values truncated to integers.
            smoothed_value = convolve1d(
                np.asarray(list(value_dict.values()), dtype=float),
                weights=lds_kernel_window, mode='constant')
            num_per_label = [smoothed_value[b] for b in bins]

        weights = [np.float32(1 / x) for x in num_per_label]
        scaling = len(weights) / np.sum(weights)
        weights = [scaling * x for x in weights]
        return weights

    


namespace(accumulate_grad_batches=None, backbone_lr=2e-05, bert='microsoft/deberta-v3-base', bs=4, bt='/home/wangjingqi/input/dataset/fpell/train', ck='/home/wangjingqi/input/ck/fpell', crop=0.6, ddp=True, device_ids=[4, 5, 6, 7], epochs=7, head_lr=4e-05, layer_start=-1, llrd=0.2, log='/home/wangjingqi/fpell-pl/log', max_len=640, min_lr=2e-07, model_fname='deberta_ld2crop6-1-640-2022', nfolds=5, num_warmup_steps=0.05, patience=10, precision=16, prefix='ld2crop6-1', reinit_layers=1, seed=2022, submit='/home/wangjingqi/input/dataset/fpell/sample_submission.csv', test='/home/wangjingqi/input/dataset/fpell/test.csv', train='/home/wangjingqi/input/dataset/fpell/train.csv', unfreeze=None, used_folds=[0, 1, 2, 3, 4], val_check_interval=0.2, wd=2)


In [2]:
from config import config
import pandas as pd
# Load the training table from the CSV path stored in config.train.
train = pd.read_csv(config.train)


# Build the dataset in training mode: with train=True, __getitem__ randomly
# picks between the "full_text" and "train(en2ch2en)" text columns.
train_dataset = fpe_dataset(train,max_len=config.max_len,pretrained_path=config.bert,train=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Per-row weights for the "cohesion" column using the defaults (sqrt-inverse
# re-weighting with gaussian LDS). As the last expression of the cell, the
# whole returned list is echoed as the cell output.
train_dataset._prepare_weights("cohesion")

Using re-weighting: [SQRT_INV]
Using LDS: [GAUSSIAN] (5/2)


[0.9604951835318458,
 1.0228360731897475,
 0.9604951835318458,
 1.0428850822989122,
 1.0228360731897475,
 0.9604951835318458,
 0.9604951835318458,
 1.0228360731897475,
 0.9604951835318458,
 0.9604951835318458,
 0.9604951835318458,
 0.9604951835318458,
 0.9604951835318458,
 0.9604951835318458,
 1.0428850822989122,
 1.0228360731897475,
 0.9604951835318458,
 0.9604951835318458,
 0.9604951835318458,
 0.9604951835318458,
 1.0228360731897475,
 1.0228360731897475,
 0.9604951835318458,
 1.0428850822989122,
 1.0428850822989122,
 1.0228360731897475,
 0.9604951835318458,
 1.0428850822989122,
 0.9604951835318458,
 1.0228360731897475,
 1.0428850822989122,
 0.9604951835318458,
 0.9604951835318458,
 0.9604951835318458,
 1.0228360731897475,
 1.0428850822989122,
 0.9604951835318458,
 1.0428850822989122,
 1.0428850822989122,
 1.389548683643053,
 1.0428850822989122,
 1.0228360731897475,
 1.0228360731897475,
 0.9604951835318458,
 1.0428850822989122,
 1.0228360731897475,
 1.0428850822989122,
 0.96049518353