In [1]:
!python -m pip install --no-index --find-links '/kaggle/input/seqeval/' -r '/kaggle/input/seqeval/requirements.txt'

Looking in links: /kaggle/input/seqeval/
Processing /kaggle/input/seqeval/seqeval-1.2.2.tar.gz (from -r /kaggle/input/seqeval/requirements.txt (line 1))
  Preparing metadata (setup.py) ... [?25l- \ | done
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l- \ | done
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=a2c62ff892b1ce6e175ea406d86680b9d74779c3961eb8173210c1dd47d54826
  Stored in directory: /root/.cache/pip/wheels/be/6d/82/87acaf836bed90667f77936325c0a4b631944650898dee7802
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [2]:
import warnings
warnings.filterwarnings('ignore')

import math
import numpy as np
import pandas as pd
import inspect
import gc
from tqdm import tqdm
from contextlib import nullcontext
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModelForTokenClassification

from seqeval.metrics import recall_score, precision_score, classification_report, f1_score

In [3]:
##############################
eval_iters = 50
iter_num = 0
## training params
n_embd = 768
n_hidden  = n_embd*2
gradient_accumulation_steps = 8 # to simulate a larger batch size
batch_size = 4
k_random = 20
# micro step if gradient_accumulation_steps > 0 
dropout = .1
# optimizer
learning_rate = 2e-5
decay_lr = True 
lr_decay_iters = 650 # make equal to max_iters usually
min_lr = 1e-6 # learning_rate / 10 usually
warmup_iters = 10 #
max_iters = 650 
betas = (0.9,0.99)
weight_decay = .01
grad_clip = 1.0

# ctx
device='cuda' if torch.cuda.is_available() else 'cpu'
dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device == 'cpu' else torch.amp.autocast(device_type=device, dtype=ptdtype)

# base model
model_checkpoint = '/kaggle/input/debertav3base'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [4]:
data = pd.read_json('/kaggle/input/pii-detection-removal-from-educational-data/train.json')

In [5]:
def converttokenstodebert(sample,test=False):
    new_tokens = []
    new_labels = []
    mask = []
    for i,t in enumerate(tokenizer(sample.tokens)['input_ids']):
        if t==[]:
            continue
        new_tokens.extend(t[1:-1])
        mask.extend([i]*(len(t)-2))
        if not test:
            new_labels.extend([sample.labels[i]]*(len(t)-2))
    
    if test:
        return [new_tokens,mask] 
    else:
        return [new_tokens,new_labels,mask]

In [6]:
data.loc[:,['new_tokens','new_labels','mask']] = pd.DataFrame(data.apply(lambda x: converttokenstodebert(x),axis=1).tolist()).values

In [7]:
# get the set of unique labels in the train dataset
unique_labels = set()
for i in data['new_labels'].apply(lambda x: set(x)):
    unique_labels = unique_labels.union(i)
unique_labels = list(unique_labels)
unique_labels.sort()

In [8]:
# Identify the samples that have some target in the labels.
data = data.assign(Has_label = data.apply(lambda x: False in ['O'==i for i in x['new_labels']],axis=1))
data.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,new_tokens,new_labels,mask,Has_label
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...","[2169, 12103, 270, 3513, 28310, 4593, 341, 737...","[O, O, O, O, O, O, O, O, O, O, B-NAME_STUDENT,...","[0, 1, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 10, 12, ...",True
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...","[4941, 60488, 2169, 12103, 28525, 51146, 9395,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...","[0, 1, 3, 4, 5, 7, 8, 10, 11, 12, 14, 15, 16, ...",True
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...","[16514, 568, 293, 102829, 44365, 22496, 6738, ...","[O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, I-NA...","[0, 1, 3, 4, 5, 5, 7, 9, 10, 11, 12, 13, 14, 1...",True
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT...","[2169, 12103, 270, 8432, 63632, 608, 3365, 260...","[O, O, O, O, B-NAME_STUDENT, B-NAME_STUDENT, I...","[0, 1, 2, 3, 5, 5, 6, 6, 8, 8, 9, 11, 12, 14, ...",True
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST...","[28525, 877, 51146, 45730, 22543, 293, 877, 51...","[O, O, O, O, O, O, O, B-NAME_STUDENT, I-NAME_S...","[0, 1, 3, 5, 7, 9, 10, 12, 13, 15, 16, 18, 20,...",True


In [9]:
data_train, data_test = train_test_split(data,test_size=0.1, stratify=data['Has_label'], random_state=12)

In [10]:
#model


class Classifier(nn.Module):

    def __init__(self,model_checkpoint):
        super().__init__()

        self.model_checkpoint = AutoModelForTokenClassification.from_pretrained(model_checkpoint).deberta
        
        self.dropout = nn.Dropout(dropout)
        self.clf = nn.Linear(n_embd,len(unique_labels))
        self.filter_ = torch.tensor([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],requires_grad=False).to(device) # 'O' target


    def forward(self,x,targets=None,k_random=None):
        
        x = self.model_checkpoint(**x).last_hidden_state
        x = self.dropout(x)
        logits = self.clf(x)
        
        logits = logits.softmax(dim=-1)
        
        # training
        if targets is not None:
            # Identify the positions in the logits sequence that predict a target.
            mask_logits = logits.argmax(-1)!=12

            # do the same with the label sequence.
            mask_targets = torch.sum(targets!=self.filter_,-1)!=0

            # for the first 250 iterations, the cost function is applied to targets predicted by the model and the actual ones.
            if iter_num<250:
                mask = mask_targets+mask_logits
                # after 150 iterations, negative predictions are also being added.
                if np.random.rand() > 0.5 and iter_num >150:
                    random_positions = torch.randint(0,mask.shape[1],(iter_num//k_random,1))
                    mask[:,random_positions]=True
            else:
                # combine the two masks
                mask = mask_targets+mask_logits
                # add extra random tokens
                random_positions = torch.randint(0,mask.shape[1],(iter_num//k_random,1))
                mask[:,random_positions]=True

            logits = logits[mask]
            targets= targets[mask]
            
            loss = F.cross_entropy(logits,targets)
        
        # inference
        else:
            loss=None
            mask=None
            
        return logits, loss, mask

    def configure_optimizers(self,weight_decay, learning_rate, betas, device_type):

        # the parameters to which regularization is applied are separated.
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (nn.Linear,)
        blacklist_weight_modules = (nn.LayerNorm, nn.Embedding)

        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn,pn) if mn else pn

                if pn.endswith('bias'):
                    no_decay.add(fpn)
                    continue
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    no_decay.add(fpn)

        param_dict = {pn:p for pn, p in self.named_parameters()}
        # validation
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "Los parámetros %s están en los dos conjuntos decay y no_decay!" % str(inter_params)
        assert len(param_dict.keys() - union_params) == 0, "Los parámetros %s, no están en nungún conjunto decay/no_decay!" % str(param_dict.keys() - union_params)

        # optimization objects
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay":weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay":0.0}
        ]
        
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == 'cuda'
        extra_args = dict(fused=True) if use_fused else dict()

        optimizer = torch.optim.AdamW(optim_groups,lr=learning_rate,betas=betas)

        return optimizer
    
    @torch.no_grad()
    def predict(self,test):
        self.eval()
        preds = []
        for i in tqdm(range(0,test.shape[0],batch_size)):
            X = get_batch(test,test=True,i=i)
            
            logits,_,_ = self(X)
            pred = logits.cpu().argmax(-1)
            preds.append(pred)
        
        self.train()
        return preds

In [11]:
# training model
model = Classifier(model_checkpoint)
model.to(device)

# final model for inference
best_model = Classifier(model_checkpoint)
best_model.eval()

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at /kaggle/input/debertav3base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at /kaggle/input/debertav3base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Classifier(
  (model_checkpoint): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
        

In [12]:
# initialize a GradScaler. If enabled=False scaler is a no-op
scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

# optimizer
optimizer = model.configure_optimizers(weight_decay, learning_rate, betas, device)

In [13]:
# data collator
def get_batch(data,test=False,i=0):
    
    if not test:
        MAX_LENGHT = 1024
        # get random samples during training
        # start adding negative labels after 200 iters
        if iter_num<200:
            batch = data.query('Has_label==1')[['new_tokens','new_labels']].sample(batch_size)
        else:
            
            batch = data[['new_tokens','new_labels']].sample(batch_size) 
        MAX_INPUT_LENGHT = batch.new_tokens.apply(len).max()
    else:
        MAX_LENGHT=10000
        # get samples in sequential order in inference
        batch = data.iloc[i:i+batch_size][['new_tokens']]
        MAX_INPUT_LENGHT = batch.new_tokens.apply(len).max()

    # pad the lists to MAX_INPUT_LENGHT
    input_ids = torch.Tensor(batch['new_tokens'].apply(lambda x: [1]+x+[2] + [0]*(MAX_INPUT_LENGHT - len(x))).tolist()[:MAX_LENGHT]).to(torch.long)
    # mask attention
    attention_mask = torch.Tensor(batch['new_tokens'].apply(lambda x: [1]*(len(x)+2) + [0]*(MAX_INPUT_LENGHT - len(x))).tolist()[:MAX_LENGHT]).to(torch.long)
    # input fed into the model
    X = {'input_ids':input_ids[:,:MAX_LENGHT].to(device),
         'attention_mask':attention_mask[:,:MAX_LENGHT].to(device)}
    if not test:
        # convert the labels ['O','O',...] into one hot sparse lists [[0,0,1,0,..],[0,..]...]
        y = batch['new_labels'].apply(lambda x: x + ['O']*(MAX_INPUT_LENGHT - len(x)))
        y = y.apply(lambda x: [[0]*12+[1]] + [[0]*(unique_labels.index(l)+1-1)+[1]+(len(unique_labels)-unique_labels.index(l)-1)*[0] for l in x]+[[0]*12+[1]])
        y = torch.Tensor(y.tolist())[:,:MAX_LENGHT].to(device)
    
        return X,y  # X format ~ {'input_ids':tensor(...),'att_mask':tensor(0,..)}
    
    else:
        return X

In [14]:
# learning rate warmup with cosine decay
def get_lr(it):
    # 1) linear warmup for the first warmup_iters steps
    if it < warmup_iters:
        return learning_rate #* it / warmup_iters
    # 2) if it > lr_decay_iters, return min learning rate
    if it > lr_decay_iters:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
    return min_lr + coeff * (learning_rate - min_lr)

In [15]:
## evaluation function
np.seterr(invalid='ignore')
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        recall = torch.zeros(eval_iters)
        precision = torch.zeros(eval_iters)
        f1_scores = torch.zeros(eval_iters)
        for k in tqdm(range(eval_iters), ascii=True, desc=split):
            X,Y = get_batch(data_train) if split=='train' else get_batch(data_test)
            with ctx:
                logits, loss, mask = model(X,Y,k_random)
            losses[k] = loss.item()
            # f5 score
            predictions = (pd.Series(logits.cpu().argmax(-1).tolist())
                           .apply(lambda x: unique_labels[x])
                           .tolist()
                          )
            
            y_true = (pd.Series(Y[mask].cpu().argmax(-1).tolist())
                           .apply(lambda x: unique_labels[x])
                           .tolist()
                     )
            
            recall[k] = recall_score([predictions],[y_true],zero_division=0)
            precision[k] = precision_score([predictions], [y_true],zero_division=0)
            f1_scores[k] = (1 + 5*5) * recall[k] * precision[k] / (5*5*precision[k] + recall[k])
        out[split] = np.nanmean(losses)
        out[split+'_precision'] = torch.mean(precision)
        out[split+'_recall'] = torch.mean(recall)
        out[split+'_pred_f'] = np.nanmean(f1_scores)
    model.train()
    return out

In [16]:
X,Y = get_batch(data_train)
best_f5_score = 0
best_val_score = 0.85
accuracy_bak = []
loss_bak = []
while True:
    
    lr = get_lr(iter_num) if decay_lr else learning_rate
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # evaluation step 
    if iter_num % 5 ==0:
        print('Evaluating...')
        losses = estimate_loss()
        print(f"""step {iter_num}: train loss {losses['train']:.4f},train f1 {losses['train_pred_f']:.4f}, 
            val loss {losses['val']:.4f}, val precision {losses['val_precision']:.4f}, val recall {losses['val_recall']:.4f}, val f1 {losses['val_pred_f']:.4f}, learning rate {lr*10**4:.4f}\n""")
        
        if losses['val_pred_f'] > best_val_score or losses['val_pred_f'] > 0.97:
            best_val_score = losses['val_pred_f']
            ## Making predictions 
            batch_size = 1
            
            predictions = model.predict(data_test)
            # return batch size
            batch_size=4
            
            list_preds = []
            for tensor in predictions:
                for l in tensor.tolist():
                    list_preds.extend([[unique_labels[i] for i in l]])
            # calculating metrics              
            recall = recall_score(list_preds,data_test.new_labels.apply(lambda x: ['O']+x+['O']).tolist(),zero_division=0)
            precision = precision_score(list_preds,data_test.new_labels.apply(lambda x: ['O']+x+['O']).tolist(),zero_division=0)
            f5_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
            print(f"step: {iter_num}, precision: {precision}, recall: {recall}, f5: {f5_score}")
            
            # save the best model
            if f5_score > best_f5_score:
                best_f5_score = f5_score
                print("Model checkpoint")
                best_model.load_state_dict(model.state_dict())
            
            del predictions, list_preds
            gc.collect()
            torch.cuda.empty_cache()
            
    # one forward backward step with gradient accumulation to simulate larger batch size
    for micro_step in range(gradient_accumulation_steps):

        # forward step with autocast
        with ctx:
            _,loss,_ = model(X,Y,k_random)
            loss = loss/gradient_accumulation_steps
        
        # remove X and Y from memory
        del X, Y
        gc.collect()
        torch.cuda.empty_cache()
        
        # get the batch for the next step     
        X, Y = get_batch(data)
        # backward step with GradScaler when training is in fp16 to prevent "overflow"
        scaler.scale(loss).backward()
        
    # optional clip the gradients
    if grad_clip != 0.0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        
    # step the optimizer and scaler if training in fp16
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)

    # report loss train in each step
    loss_bak.append(loss.cpu().detach().numpy())
    # mean values of last 10 steps
    print(f"Iter {iter_num}: Loss: {np.nanmean(loss_bak[-10:]):.4f},")
    
    iter_num +=1
    
    if iter_num == 101:
        break
        

# delete training model, no needed anymore
del model.model_checkpoint, model.clf, model, optimizer, scaler
gc.collect()
torch.cuda.empty_cache()
        

Evaluating...


train: 100%|##########| 50/50 [00:16<00:00,  3.07it/s]
val: 100%|##########| 50/50 [00:15<00:00,  3.26it/s]


step 0: train loss 2.4943,train f1 0.0070, 
            val loss 2.4972, val precision 0.0004, val recall 0.0000, val f1 0.0020, learning rate 0.2000

Iter 0: Loss: 0.3119,
Iter 1: Loss: 0.3106,
Iter 2: Loss: 0.3108,
Iter 3: Loss: 0.3133,
Iter 4: Loss: 0.3150,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.50it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.35it/s]


step 5: train loss 2.6038,train f1 nan, 
            val loss 2.6164, val precision 0.0000, val recall 0.0000, val f1 nan, learning rate 0.2000

Iter 5: Loss: 0.3168,
Iter 6: Loss: 0.3178,
Iter 7: Loss: 0.3173,
Iter 8: Loss: 0.3172,
Iter 9: Loss: 0.3177,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.49it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.40it/s]


step 10: train loss 2.5682,train f1 nan, 
            val loss 2.5669, val precision 0.0000, val recall 0.0000, val f1 nan, learning rate 0.2000

Iter 10: Loss: 0.3184,
Iter 11: Loss: 0.3188,
Iter 12: Loss: 0.3188,
Iter 13: Loss: 0.3177,
Iter 14: Loss: 0.3166,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.41it/s]
val: 100%|##########| 50/50 [00:15<00:00,  3.31it/s]


step 15: train loss 2.4950,train f1 0.4925, 
            val loss 2.4770, val precision 0.0032, val recall 0.0800, val f1 0.3354, learning rate 0.2000

Iter 15: Loss: 0.3146,
Iter 16: Loss: 0.3122,
Iter 17: Loss: 0.3117,
Iter 18: Loss: 0.3115,
Iter 19: Loss: 0.3098,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.45it/s]
val: 100%|##########| 50/50 [00:15<00:00,  3.28it/s]


step 20: train loss 2.4368,train f1 0.3300, 
            val loss 2.3960, val precision 0.1980, val recall 0.3226, val f1 0.4606, learning rate 0.1999

Iter 20: Loss: 0.3079,
Iter 21: Loss: 0.3067,
Iter 22: Loss: 0.3053,
Iter 23: Loss: 0.3038,
Iter 24: Loss: 0.3025,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.49it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.39it/s]


step 25: train loss 2.3507,train f1 0.2229, 
            val loss 2.3398, val precision 0.2440, val recall 0.2231, val f1 0.2513, learning rate 0.1997

Iter 25: Loss: 0.3012,
Iter 26: Loss: 0.3004,
Iter 27: Loss: 0.2973,
Iter 28: Loss: 0.2947,
Iter 29: Loss: 0.2938,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.37it/s]
val: 100%|##########| 50/50 [00:15<00:00,  3.27it/s]


step 30: train loss 2.3018,train f1 0.3455, 
            val loss 2.2618, val precision 0.4185, val recall 0.3865, val f1 0.3993, learning rate 0.1995

Iter 30: Loss: 0.2927,
Iter 31: Loss: 0.2904,
Iter 32: Loss: 0.2905,
Iter 33: Loss: 0.2894,
Iter 34: Loss: 0.2886,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.45it/s]
val: 100%|##########| 50/50 [00:15<00:00,  3.23it/s]


step 35: train loss 2.2242,train f1 0.4505, 
            val loss 2.2099, val precision 0.5324, val recall 0.4146, val f1 0.4204, learning rate 0.1993

Iter 35: Loss: 0.2895,
Iter 36: Loss: 0.2896,
Iter 37: Loss: 0.2866,
Iter 38: Loss: 0.2859,
Iter 39: Loss: 0.2849,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.45it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.46it/s]


step 40: train loss 2.1163,train f1 0.6297, 
            val loss 2.0452, val precision 0.8315, val recall 0.7379, val f1 0.7374, learning rate 0.1990

Iter 40: Loss: 0.2828,
Iter 41: Loss: 0.2856,
Iter 42: Loss: 0.2816,
Iter 43: Loss: 0.2817,
Iter 44: Loss: 0.2768,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.38it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.37it/s]


step 45: train loss 2.0250,train f1 0.7086, 
            val loss 2.0019, val precision 0.8614, val recall 0.7354, val f1 0.7364, learning rate 0.1986

Iter 45: Loss: 0.2713,
Iter 46: Loss: 0.2685,
Iter 47: Loss: 0.2709,
Iter 48: Loss: 0.2661,
Iter 49: Loss: 0.2629,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.35it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.35it/s]


step 50: train loss 1.9524,train f1 0.7569, 
            val loss 1.9055, val precision 0.8104, val recall 0.8484, val f1 0.8440, learning rate 0.1982

Iter 50: Loss: 0.2634,
Iter 51: Loss: 0.2562,
Iter 52: Loss: 0.2579,
Iter 53: Loss: 0.2546,
Iter 54: Loss: 0.2566,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.47it/s]
val: 100%|##########| 50/50 [00:15<00:00,  3.26it/s]


step 55: train loss 1.9191,train f1 0.7589, 
            val loss 1.8492, val precision 0.8637, val recall 0.8374, val f1 0.8371, learning rate 0.1977

Iter 55: Loss: 0.2540,
Iter 56: Loss: 0.2546,
Iter 57: Loss: 0.2498,
Iter 58: Loss: 0.2515,
Iter 59: Loss: 0.2514,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.35it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.36it/s]


step 60: train loss 1.9023,train f1 0.7931, 
            val loss 1.8482, val precision 0.8213, val recall 0.8575, val f1 0.8539, learning rate 0.1972



100%|██████████| 681/681 [00:38<00:00, 17.62it/s]


step: 60, precision: 0.8039702233250621, recall: 0.5567010309278351, f5: 0.5633652109944491
Model checkpoint
Iter 60: Loss: 0.2489,
Iter 61: Loss: 0.2469,
Iter 62: Loss: 0.2427,
Iter 63: Loss: 0.2426,
Iter 64: Loss: 0.2426,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.55it/s]
val: 100%|##########| 50/50 [00:15<00:00,  3.26it/s]


step 65: train loss 1.8726,train f1 0.8175, 
            val loss 1.8613, val precision 0.7843, val recall 0.8567, val f1 0.8409, learning rate 0.1966

Iter 65: Loss: 0.2431,
Iter 66: Loss: 0.2443,
Iter 67: Loss: 0.2436,
Iter 68: Loss: 0.2399,
Iter 69: Loss: 0.2365,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.36it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.36it/s]


step 70: train loss 1.9813,train f1 0.8096, 
            val loss 1.8604, val precision 0.8233, val recall 0.9069, val f1 0.8970, learning rate 0.1959



100%|██████████| 681/681 [00:38<00:00, 17.61it/s]


step: 70, precision: 0.6972704714640199, recall: 0.7113924050632912, f5: 0.7108386845689824
Model checkpoint
Iter 70: Loss: 0.2332,
Iter 71: Loss: 0.2333,
Iter 72: Loss: 0.2313,
Iter 73: Loss: 0.2283,
Iter 74: Loss: 0.2235,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.44it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.40it/s]


step 75: train loss 1.8395,train f1 0.8149, 
            val loss 1.7954, val precision 0.9129, val recall 0.8904, val f1 0.8899, learning rate 0.1952

Iter 75: Loss: 0.2234,
Iter 76: Loss: 0.2184,
Iter 77: Loss: 0.2221,
Iter 78: Loss: 0.2222,
Iter 79: Loss: 0.2279,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.44it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.40it/s]


step 80: train loss 1.8645,train f1 0.7854, 
            val loss 1.7863, val precision 0.8981, val recall 0.9027, val f1 0.9018, learning rate 0.1944



100%|██████████| 681/681 [00:38<00:00, 17.61it/s]


step: 80, precision: 0.8436724565756824, recall: 0.4207920792079208, f5: 0.429063728583216
Iter 80: Loss: 0.2304,
Iter 81: Loss: 0.2326,
Iter 82: Loss: 0.2330,
Iter 83: Loss: 0.2323,
Iter 84: Loss: 0.2341,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.36it/s]
val: 100%|##########| 50/50 [00:15<00:00,  3.27it/s]


step 85: train loss 1.8221,train f1 0.8820, 
            val loss 1.8092, val precision 0.8469, val recall 0.9052, val f1 0.9020, learning rate 0.1936



100%|██████████| 681/681 [00:38<00:00, 17.60it/s]


step: 85, precision: 0.7617866004962779, recall: 0.5984405458089669, f5: 0.6034169942546114
Iter 85: Loss: 0.2330,
Iter 86: Loss: 0.2321,
Iter 87: Loss: 0.2300,
Iter 88: Loss: 0.2304,
Iter 89: Loss: 0.2277,
Evaluating...


train: 100%|##########| 50/50 [00:13<00:00,  3.61it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.38it/s]


step 90: train loss 1.8510,train f1 0.8403, 
            val loss 1.7809, val precision 0.9453, val recall 0.8990, val f1 0.8990, learning rate 0.1928

Iter 90: Loss: 0.2251,
Iter 91: Loss: 0.2291,
Iter 92: Loss: 0.2287,
Iter 93: Loss: 0.2327,
Iter 94: Loss: 0.2310,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.46it/s]
val: 100%|##########| 50/50 [00:14<00:00,  3.36it/s]


step 95: train loss 1.8218,train f1 0.8705, 
            val loss 1.8415, val precision 0.8904, val recall 0.8287, val f1 0.8280, learning rate 0.1918

Iter 95: Loss: 0.2313,
Iter 96: Loss: 0.2302,
Iter 97: Loss: 0.2304,
Iter 98: Loss: 0.2357,
Iter 99: Loss: 0.2387,
Evaluating...


train: 100%|##########| 50/50 [00:14<00:00,  3.45it/s]
val: 100%|##########| 50/50 [00:15<00:00,  3.27it/s]


step 100: train loss 1.8236,train f1 0.9048, 
            val loss 1.8396, val precision 0.8555, val recall 0.9185, val f1 0.9115, learning rate 0.1909



100%|██████████| 681/681 [00:38<00:00, 17.63it/s]


step: 100, precision: 0.7890818858560794, recall: 0.6162790697674418, f5: 0.6215139442231075
Iter 100: Loss: 0.2427,


In [17]:
## Making predictions
batch_size = 1
best_model.to(device)
predictions = best_model.predict(data_test)

100%|██████████| 681/681 [00:38<00:00, 17.60it/s]


In [18]:
list_preds = []
for tensor in predictions:
    for l in tensor.tolist():
        list_preds.extend([[unique_labels[i] for i in l]])

In [19]:
recall = recall_score(list_preds,data_test.new_labels.apply(lambda x: ['O']+x+['O']).tolist(),zero_division=0)
precision = precision_score(list_preds,data_test.new_labels.apply(lambda x: ['O']+x+['O']).tolist(),zero_division=0)
f5_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
print(f"precision: {precision}, recall: {recall}, f5: {f5_score}")

precision: 0.6972704714640199, recall: 0.7113924050632912, f5: 0.7108386845689824


In [20]:
test = pd.read_json('/kaggle/input/pii-detection-removal-from-educational-data/test.json')
submissions = pd.read_csv('/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv')

In [21]:
# transform test data
test.loc[:,['new_tokens','mask']] = pd.DataFrame(test.apply(lambda x: converttokenstodebert(x,True),axis=1).tolist()).values

In [22]:
## Making predictions
batch_size = 1
predictions = best_model.predict(test)

100%|██████████| 10/10 [00:00<00:00, 14.27it/s]


In [23]:
# reverse the transformation done by converttokenstodebert
positions = []
for i,pred in enumerate(predictions):
    
    # where there is a positive target
    mask_ = pred[0,1:-1]!=12
    
    # row_id: initialize the column with zeros, then we set it equal to the index.
    # document: document number from the 'document' column.
    # token: token position that we know thanks to the mask calculated in the transformation.
    positions = positions + [[0, test.iloc[i].document, test.iloc[i]['mask'][j],unique_labels[pred[0,j+1].item()], test.iloc[i].tokens[test.iloc[i]['mask'][j]]] for j,pos in enumerate(mask_) if pos]
    

submit = pd.DataFrame(positions,columns=submissions.columns.tolist()+['word']).drop_duplicates()
submit.row_id = submit.index
submit

Unnamed: 0,row_id,document,token,label,word
0,0,7,9,B-NAME_STUDENT,Nathalie
1,1,7,10,I-NAME_STUDENT,Sylla
3,3,7,482,B-NAME_STUDENT,Nathalie
4,4,7,483,I-NAME_STUDENT,Sylla
6,6,7,741,B-NAME_STUDENT,Nathalie
7,7,7,742,I-NAME_STUDENT,Sylla
9,9,10,0,B-NAME_STUDENT,Diego
10,10,10,1,I-NAME_STUDENT,Estrada
11,11,10,464,B-NAME_STUDENT,Diego
12,12,10,465,I-NAME_STUDENT,Estrada


In [24]:
submit[['row_id','document','token','label']].to_csv('submission.csv',index=False)


In [25]:
submit[['row_id','document','token','label']]

Unnamed: 0,row_id,document,token,label
0,0,7,9,B-NAME_STUDENT
1,1,7,10,I-NAME_STUDENT
3,3,7,482,B-NAME_STUDENT
4,4,7,483,I-NAME_STUDENT
6,6,7,741,B-NAME_STUDENT
7,7,7,742,I-NAME_STUDENT
9,9,10,0,B-NAME_STUDENT
10,10,10,1,I-NAME_STUDENT
11,11,10,464,B-NAME_STUDENT
12,12,10,465,I-NAME_STUDENT
