In [None]:
import os
# Set before transformers/tokenizers are imported in a later cell.
# NOTE(review): presumably this disables the tokenizers library's internal
# parallelism so it does not clash with the forked DataLoader workers used
# below (num_workers=2) — confirm against the tokenizers documentation.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import json
from pydoc import locate
import numpy as np
import pandas as pd
import sys
from tqdm.notebook import tqdm
from urllib.parse import urlparse

# --- Paths -------------------------------------------------------------
test_data_file = "/kaggle/input/pii-detection-removal-from-educational-data/test.json"
weghts_path = "/kaggle/input/piid-modelweights/"

# --- Inference knobs ---------------------------------------------------
chunk_size = 5_000       # number of test records processed per outer-loop chunk
scaler = 0.02            # multiplier applied to the class-0 ("O") probability before arg-max
tokenizer_stride = 32    # forwarded to the tokenizer as `stride` for overflowing windows

device = "cuda"          # set to "cpu" to run without a GPU

# One entry per ensemble member: sub-directory of `weghts_path` holding the
# weights, and the cap on the tokenizer window length used for that model.
model_params = [
    {"path": path, "max_tokens": max_tokens}
    for path, max_tokens in (
        ("model_387", 512),
        ("model_539", 1024),
        ("model_543", 1024),
        ("model_560", 2048),
        ("model_563", 2048),
        ("model_572", 1024),
    )
]



In [None]:
def chunk(l, n):
    """Yield consecutive slices of ``l``, each holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``;
    an empty ``l`` yields nothing.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

def decode_targets(targets:list, doc_ids:list) -> list:
    """Turn per-token class codes into BIO label strings.

    A token whose code equals the code of the previous token of the same
    document is treated as a continuation and receives the ``I-`` variant;
    every other non-zero code receives the ``B-`` variant. Code 0 is always
    ``"O"``. Codes that are not in the table come back as NaN.

    Args:
        targets: class code (0..7) of every token, in document order.
        doc_ids: document id of every token, aligned with ``targets``.

    Returns:
        List of label strings, one per input token.
    """
    frame = pd.DataFrame({"target": targets, "document": doc_ids})

    # Code of the preceding token within the same document (NaN for the first).
    previous = frame.groupby("document")["target"].shift(1)
    is_continuation = previous.notnull() & (previous == frame["target"])

    # Continuations are moved into the 100-range so one lookup table
    # distinguishes the B- and I- variants.
    shifted_codes = frame["target"] + 100 * is_continuation.astype(int)

    label_by_code = {
        0: "O",
        1: "B-NAME_STUDENT",
        2: "B-URL_PERSONAL",
        3: "B-ID_NUM",
        4: "B-EMAIL",
        5: "B-STREET_ADDRESS",
        6: "B-PHONE_NUM",
        7: "B-USERNAME",
        100: "O",
        101: "I-NAME_STUDENT",
        102: "I-URL_PERSONAL",
        103: "I-ID_NUM",
        104: "I-EMAIL",
        105: "I-STREET_ADDRESS",
        106: "I-PHONE_NUM",
        107: "I-USERNAME",
    }
    return shifted_codes.map(label_by_code).values.tolist()

# Load the test records once; later cells slice this list into chunks and read
# each record's "tokens" and "document" fields.
# The encoding is pinned because JSON text is UTF-8, whereas a bare open()
# falls back to the locale's preferred encoding.
with open(test_data_file, encoding="utf-8") as f:
    test = json.load(f)

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

def tokenize_and_batch_record(record, tokenizer, max_n_tokens, stride):
    """Tokenize one pre-split record into overlapping fixed-length windows.

    Args:
        record: dict with "tokens" (list of words) and "document" (id);
            may also carry "labels".
        tokenizer: callable tokenizer exposing ``model_max_length`` and
            returning an encoding with ``word_ids(i)``.
        max_n_tokens: upper bound on the window length; the effective length
            is the smaller of this and ``tokenizer.model_max_length``.
        stride: forwarded to the tokenizer as ``stride``.

    Returns:
        One dict per window with equal-length lists under "input_ids",
        "attention_mask", "word_ids", "targets" and "document". Positions
        without a source word (``word_ids`` is None) get -100 in "word_ids";
        "targets" is -100 wherever no label is available.

    NOTE(review): ``encode_targets`` is not defined in the visible part of
    this file; it is only reached when the record carries "labels".
    """
    window_len = min(tokenizer.model_max_length, max_n_tokens)
    encoded = tokenizer(
        record["tokens"],
        is_split_into_words=True,
        add_special_tokens=True,
        truncation=True,
        max_length=window_len,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=False,
        verbose=False,
    )

    # word index -> encoded label; empty for unlabeled (test) records.
    label_by_word = {}
    if "labels" in record:
        label_by_word = dict(enumerate(encode_targets(record["labels"])))

    windows = []
    for idx, ids in enumerate(encoded["input_ids"]):
        word_ids = encoded.word_ids(idx)
        windows.append({
            "input_ids": ids,
            "attention_mask": encoded["attention_mask"][idx],
            "word_ids": [-100 if w is None else w for w in word_ids],
            "targets": [label_by_word.get(w, -100) for w in word_ids],
            # Repeated per position so the id survives flattening later on.
            "document": [record["document"]] * len(ids),
        })
    return windows

def tokenize_and_batch(sample, tokenizer, max_n_tokens, stride):
    """Tokenize every record of ``sample`` and return all windows as one flat list.

    Windows keep the order of the records they came from; a progress bar is
    shown while the records are processed.
    """
    windows = []
    for rec in tqdm(sample):
        windows.extend(tokenize_and_batch_record(rec, tokenizer, max_n_tokens, stride))
    return windows

class ListDataset(Dataset):
    """Map-style dataset that serves items straight from an in-memory list."""

    def __init__(self, data_list):
        # Kept by reference (no copy); items are returned exactly as stored.
        self.data = data_list

    def __getitem__(self, index):
        """Return the item stored at position ``index``."""
        return self.data[index]

    def __len__(self):
        """Return how many items the dataset holds."""
        return len(self.data)

def collate_fn(batch):
    """Stack variable-length window dicts into right-padded batch tensors.

    Args:
        batch: non-empty list of dicts that share the same keys, each value
            being a list of ints.

    Returns:
        Dict with one ``(len(batch), longest_sequence)`` tensor per key.
        "word_ids" and "targets" are padded with -100, every other key with 0.
    """
    # Padding "word_ids" with 0 would make padded positions look like word 0
    # (of document 0), so they would pass a `word_ids != -100` filter and be
    # averaged into real predictions. -100 is the value this file already uses
    # for "no source word" / "no label".
    pad_value_by_key = {"word_ids": -100, "targets": -100}

    collated = {}
    for key in batch[0].keys():
        sequences = [torch.tensor(item[key]) for item in batch]
        collated[key] = torch.nn.utils.rnn.pad_sequence(
            sequences,
            batch_first=True,
            padding_value=pad_value_by_key.get(key, 0),
        )
    return collated
    
def create_dataloader(test_data, max_n_tokens, tokenizer_stride):
    """Build a sequential DataLoader over the tokenized windows of ``test_data``.

    The tokenizer is loaded from the "tokenizer" directory under
    ``weghts_path`` on every call. Batches hold 4 windows, are produced in
    order (no shuffling) by 2 worker processes and are collated with
    ``collate_fn``.
    """
    tok = AutoTokenizer.from_pretrained(weghts_path + "tokenizer")
    dataset = ListDataset(
        tokenize_and_batch(test_data, tok, max_n_tokens, tokenizer_stride)
    )
    loader_options = dict(
        batch_size=4,
        num_workers=2,
        pin_memory=True,
        shuffle=False,
        collate_fn=collate_fn,
    )
    return DataLoader(dataset, **loader_options)


In [None]:
class HFModel(torch.nn.Module):
    """Thin wrapper exposing a token-classification backbone through a dict interface."""

    def __init__(self, load_path):
        """Load the backbone from ``load_path`` via ``from_pretrained``."""
        super().__init__()
        self.backbone = AutoModelForTokenClassification.from_pretrained(load_path)

    def forward(self, d):
        """Run the backbone on a collated batch.

        Args:
            d: mapping with "input_ids" and "attention_mask"; other keys are ignored.

        Returns:
            ``{"logits": ...}`` holding the backbone's logits.
        """
        backbone_inputs = {
            "input_ids": d["input_ids"],
            "attention_mask": d["attention_mask"],
        }
        output = self.backbone(**backbone_inputs)
        return {"logits": output.logits}

def results_to_df(predictions, data):
    """Flatten one batch of model output into a per-position DataFrame.

    Args:
        predictions: dict with a "logits" tensor shaped (batch, seq, n_classes).
        data: collated batch holding (batch, seq) tensors under "document",
            "word_ids" and "targets".

    Returns:
        DataFrame with ``batch * seq`` rows and the columns ``document``,
        ``word_ids``, ``targets`` followed by ``prob_0`` .. ``prob_{n_classes-1}``
        (softmax over the last logits axis).
    """
    probs = torch.nn.functional.softmax(predictions["logits"], -1)
    probs = probs.flatten(0, 1).cpu().numpy()

    # Column names follow the logits' class axis instead of a fixed count.
    n_classes = probs.shape[1]
    probs = pd.DataFrame(probs, columns=[f"prob_{i}" for i in range(n_classes)])

    # Explicit .numpy() so the columns are plain integer arrays regardless of
    # how the installed pandas version treats torch tensors.
    res = pd.DataFrame({
        key: data[key].cpu().flatten().numpy()
        for key in ("document", "word_ids", "targets")
    })
    return pd.concat([res, probs], axis=1)

def inference(model_cfg, test_data):
    """Run one ensemble member over ``test_data`` and return per-position probabilities.

    Args:
        model_cfg: dict with "path" (weights sub-directory under
            ``weghts_path``) and "max_tokens" (tokenizer window cap).
        test_data: list of records to predict on.

    Returns:
        The row-wise concatenation of ``results_to_df`` over all batches, in
        loader order (each batch keeps its own 0-based index). An empty
        DataFrame is returned when the loader yields no batch.
    """
    model = HFModel(f"{weghts_path}/{model_cfg['path']}")
    model.to(device)
    model.eval()

    dl = create_dataloader(test_data, model_cfg['max_tokens'], tokenizer_stride)

    # Collect the per-batch frames and concatenate once: growing a DataFrame
    # with pd.concat inside the loop re-copies all previous rows every batch,
    # and seeding it with an empty DataFrame relies on deprecated dtype handling.
    batch_frames = []
    with torch.no_grad():
        for data in dl:
            data = {k: v.to(device) for k, v in data.items()}
            preds = model(data)
            batch_frames.append(results_to_df(preds, data))

    if not batch_frames:
        return pd.DataFrame()
    return pd.concat(batch_frames)


In [None]:
# Ensemble inference, post-processing and submission file.
# For every chunk of test records: average the class probabilities of all
# models (and all overlapping windows) per token, pick a class, apply the two
# post-processing rules, decode to BIO labels and keep the non-"O" tokens.
prob_cols = [f"prob_{i}" for i in range(8)]  # one column per class code 0..7
chunk_results = []

for test_chunk in chunk(test, chunk_size):
    # One frame per model, concatenated in a single call.
    preds_df = pd.concat([inference(cfg, test_chunk) for cfg in model_params])

    # Drop positions without a source word (marked -100 at tokenization time),
    # then average every remaining prediction of the same token.
    preds_df = preds_df[preds_df["word_ids"] != -100]
    preds_df = preds_df.groupby(["document", "word_ids"]).agg(**{c: (c, "mean") for c in prob_cols}).reset_index()

    # One row per original token, in document order.
    test_chunk_df = pd.DataFrame({
        "tokens": [tok for rec in test_chunk for tok in rec["tokens"]],
        "document": [rec["document"] for rec in test_chunk for _ in rec["tokens"]],
        "word_ids": [idx for rec in test_chunk for idx in range(len(rec["tokens"]))],
    })

    # Down-weight class 0 ("O") before the arg-max. An explicit copy is taken
    # because the array is modified in place and `.values` may hand back a
    # read-only view under pandas Copy-on-Write.
    probs = preds_df[prob_cols].to_numpy(copy=True)
    probs[:,0] *= scaler
    preds_df["preds"] = probs.argmax(-1)

    # Tokens that received no prediction fall back to class 0.
    test_chunk_df = test_chunk_df.merge(
        preds_df[["document", "word_ids", "preds"]],
        how = "left",
        on = ["document", "word_ids"],
    )
    test_chunk_df["preds"] = test_chunk_df["preds"].fillna(0).astype(int)

    # postp1: every token that is exactly "\n" is forced to class 5
    # (STREET_ADDRESS), unconditionally.
    # NOTE(review): this also tags newlines outside of addresses — confirm intended.
    test_chunk_df.loc[test_chunk_df["tokens"] == "\n", "preds"] = 5

    # postp2: a name prediction (class 1) on a non-title-cased token is reset
    # to 0; afterwards, every occurrence of a token whose highest class within
    # its document is 1 is set to 1.
    test_chunk_df.loc[(test_chunk_df["preds"] == 1) & (~test_chunk_df["tokens"].str.istitle()), "preds"] = 0
    max_per_token = test_chunk_df.groupby(["document", "tokens"])["preds"].max().rename("max_preds").reset_index()
    test_chunk_df = test_chunk_df.merge(max_per_token, how="left", on=["document", "tokens"])
    test_chunk_df.loc[test_chunk_df["max_preds"] == 1, "preds"] = 1

    test_chunk_df["label"] = decode_targets(test_chunk_df["preds"].values, test_chunk_df["document"].values)
    test_chunk_df = test_chunk_df[test_chunk_df["label"] != "O"][["document", "word_ids", "label"]].rename({"word_ids":"token"}, axis=1)

    chunk_results.append(test_chunk_df)

# With no test records at all, still write a submission that has the expected header.
if chunk_results:
    test_df = pd.concat(chunk_results)
else:
    test_df = pd.DataFrame(columns=["document", "token", "label"])

test_df = test_df.reset_index(drop=True)
sub_df = test_df.reset_index().rename({"index":"row_id"}, axis=1)

sub_df.to_csv("submission.csv", index=False)
sub_df
