In [1]:
import pandas as pd
import numpy as np
import os
import torch
import re
import json


from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
import torch.nn as nn

# import maplotlib.pyplot as plt
# import seaborn as sns
import random
from tqdm import tqdm

In [2]:
# Session settings
# pd.set_option('display.max_rows', 15)

# plt.rcParams['font.weight'] = 'semibold'
# plt.rcParams['figure.figsize'] = 14, 8
# plt.rcParams['font.size'] = 18
# plt.rcParams['savefig.format'] = 'pdf'

# make results reproducable
def set_seed(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(SEED)

SEED = 69
set_seed(SEED)

In [3]:
with open('/content/NER_data.json', 'r') as f:
    lines = f.readlines()
    data = []
    for line in lines:
        js = json.loads(line)
        data.append(js)

In [4]:
df = pd.DataFrame.from_dict(data).drop(columns=["extras"])
df.sample(5)

Unnamed: 0,content,annotation
68,Roshan Sinha\nApplication Developer - SAP ABAP...,"[{'label': ['Skills'], 'points': [{'start': 32..."
166,"Rahul Tayade\nGlobal Production Support Lead, ...","[{'label': ['Skills'], 'points': [{'start': 11..."
196,Raktim Podder\n6+ Exp in banking operations an...,"[{'label': ['Skills'], 'points': [{'start': 88..."
32,Kavitha K\nSenior System Engineer - Infosys Li...,"[{'label': ['Graduation Year'], 'points': [{'s..."
190,"Jatin Arora\nSDET Automation Engineer, Infosys...","[{'label': ['College Name'], 'points': [{'star..."


In [5]:
df["annotation"] = df["annotation"].apply(lambda x: [i for i in x if len(i["label"]) > 0 and i["label"][0] != "UNKNOWN"])

ents = list(df["annotation"].apply(
    lambda x:
        [
            (np.nan if len(i["label"]) == 0 else i["label"][0]) for i in x
        ]
).explode().unique())

#adding chunking
chunked_ents = [f'I_{ent}' for ent in ents] + [f'B_{ent}' for ent in ents] + ['O']
label2id = {k : i for i, k in enumerate(chunked_ents)}

In [6]:
from pprint import pprint

def preprocess(txt):
    # pat = r'(?<!\n)\n'
    # pat = r'[?=\n]\n'
    n_pat = r"\n"
    s_pat = r" {2,10}"
    # b_l_pat = r"(?<=[^\s])?(\()(?=[^\s])?"
    # b_r_pat = r"(?<=[^\s])?(\))(?=[^\s])?"
    # b_l_pat = r"(?<=[^\s])(\()"
    # b_r_pat = r"(\))(?=[^\s])"
    # txt = re.sub(b_l_pat, ' (', txt)
    # txt = re.sub(b_r_pat, ') ', txt)
    txt = re.sub(n_pat, ' ', txt)
    txt = re.sub(s_pat, ' ', txt)
    return txt
    # return txt

In [7]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("lakshyakh93/deberta_finetuned_pii")
model = AutoModelForTokenClassification.from_pretrained("lakshyakh93/deberta_finetuned_pii")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
num_params = 0
for p in model.parameters():
    num_params += p.nelement()
num_params // 10e6

13.0

In [9]:
def clean_labels(x):
    x = sorted(x, key = lambda x: x['points'][0]['start'])
    r = 0
    new_x = []
    for l in x:
        if r > l['points'][0]['start']: continue
        r = l['points'][0]['end']
        new_x.append(l)

    return new_x

df["annotation"] = df["annotation"].apply(clean_labels)

In [10]:
# labels = df["annotation"][0]
# txt = df["content"][0]

# input_ids = []
# new_labels = []
# last_r = 0
# for ent in labels:

#     #getting plain text and entity text
#     o = preprocess(txt[last_r:ent["points"][0]['start']])
#     ent_txt = preprocess(ent["points"][0]["text"])

#     #updating right border
#     last_r = ent["points"][0]['end'] + 1

#     # getting labels ids
#     label = ent["label"][0]
#     b_label_id = label2id[f'B_{label}']
#     i_label_id = label2id[f'I_{label}']
#     o_label_id = label2id["O"]

#     #tokenizing plain text
#     if len(o.strip()) != 0:
#         o_tokens = tokenizer(o, add_special_tokens=False)["input_ids"]
#         input_ids.extend(o_tokens)
#         new_labels.extend([o_label_id] * len(o_tokens))

#     #by space tokenization
#     ent_txt_words = ent_txt.split(" ")
#     b_ent = ent_txt[0]
#     i_ent = ent_txt[1:]

#     #tokenizing beggining of entity
#     b_ent_tokens = tokenizer(b_ent, add_special_tokens=False)["input_ids"]
#     input_ids.extend(b_ent_tokens)
#     new_labels.extend([b_label_id] * len(b_ent_tokens))

#     #tokenizing inner part of entity
#     i_ent_tokens = tokenizer(i_ent, add_special_tokens=False)["input_ids"]
#     input_ids.extend(i_ent_tokens)
#     new_labels.extend([i_label_id] * len(i_ent_tokens))

In [10]:
class NERDataset(Dataset):
    def __init__(
            self,
            texts,
            labels_lists,
            tokenizer,
            max_len=512
        ):
        self.labels = labels_lists
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len=max_len

    def __getitem__(self, idx):
        txt = self.texts[idx]
        labels = self.labels[idx]

        input_ids = []
        new_labels = []
        last_r = 0
        for ent in labels:

            #getting plain text and entity text
            o = preprocess(txt[last_r:ent["points"][0]['start']])
            ent_txt = preprocess(ent["points"][0]["text"])

            #updating right border
            last_r = ent["points"][0]['end'] + 1

            # getting labels ids
            label = ent["label"][0]
            b_label_id = label2id[f'B_{label}']
            i_label_id = label2id[f'I_{label}']
            o_label_id = label2id["O"]

            #tokenizing plain text
            if len(o.strip()) != 0:
                o_tokens = tokenizer(o, add_special_tokens=False)["input_ids"]
                input_ids.extend(o_tokens)
                new_labels.extend([o_label_id] * len(o_tokens))

            #by space tokenization
            ent_txt_words = ent_txt.split(" ")
            b_ent = ent_txt_words[0]
            i_ent = ent_txt_words[1:]

            #tokenizing beggining of entity
            b_ent_tokens = tokenizer(b_ent, add_special_tokens=False)["input_ids"]
            input_ids.extend(b_ent_tokens)
            new_labels.extend([b_label_id] * len(b_ent_tokens))

            #tokenizing inner part of entity
            i_ent_tokens = tokenizer(i_ent, is_split_into_words=True, add_special_tokens=False)["input_ids"]
            input_ids.extend(i_ent_tokens)
            new_labels.extend([i_label_id] * len(i_ent_tokens))

        #Truncating
        input_ids = input_ids[:self.max_len - 2]
        new_labels = new_labels[:self.max_len - 2]

        #adding special tokens_ids CLS ->
        input_ids = [1] + input_ids + [2]
        new_labels = [0] + new_labels + [0]

        #adding paddings and attention mask
        input_ids = input_ids + [0] * (self.max_len - len(input_ids))
        new_labels = new_labels + [0] * (self.max_len - len(new_labels))
        attention_mask = [1] * (len(new_labels)) + [0] * (self.max_len - len(new_labels))

        return {
            "input_ids" : torch.tensor(input_ids, dtype=torch.long),
            "labels" : torch.tensor(new_labels, dtype=torch.float),
            "attention_mask" : torch.tensor(attention_mask, dtype=torch.long)
        }

    def __len__(self, ):
        return len(self.texts)

In [11]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(df, test_size=0.2, random_state=SEED)

train, val = train.reset_index(drop=True), val.reset_index(drop=True)

train_ds = NERDataset(train["content"], train["annotation"], tokenizer=tokenizer, max_len=512)
val_ds = NERDataset(val["content"], val["annotation"], tokenizer=tokenizer, max_len=512)

In [13]:
# Setting model for finetuning

model.classifier = nn.Linear(768, len(label2id.keys()))

In [16]:
model

DebertaForTokenClassification(
  (deberta): DebertaModel(
    (embeddings): DebertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=0)
      (LayerNorm): DebertaLayerNorm()
      (dropout): StableDropout()
    )
    (encoder): DebertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaLayer(
          (attention): DebertaAttention(
            (self): DisentangledSelfAttention(
              (in_proj): Linear(in_features=768, out_features=2304, bias=False)
              (pos_dropout): StableDropout()
              (pos_proj): Linear(in_features=768, out_features=768, bias=False)
              (pos_q_proj): Linear(in_features=768, out_features=768, bias=True)
              (dropout): StableDropout()
            )
            (output): DebertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): DebertaLayerNorm()
              (dropout): StableDropout()
            )
          )
          (int

In [14]:
class NERworker:

    def __init__(self, device):
        self.device = device
        if (torch.cuda.is_available()):
          print(torch.cuda.get_device_name(0))

    def init_loaders(self, train, test, batch_size=16):

        self.train_loader = DataLoader(
            train,
            batch_size=batch_size,
            shuffle=True,
            # num_workers=0,
            # pin_memory=True
        )
        self.test_loader = DataLoader(
            test,
            batch_size=batch_size,
            shuffle=False,
            # num_workers=0,
            # pin_memory=True
        )

    def init_model(self, model):
        model.eval()
        self.model = model.to(self.device)

    def train_one_epoch(self):

        self.model.train()
        stream = tqdm(self.train_loader)
        self.lossi_train = []
        self.mean_loss_train = []

        for batch in stream:
            # batch = {k : i.to(self.device) for k, i in batch.items()}
            # out = self.model(
            #     input_ids = batch["input_ids"],
            #     attention_mask = batch["attention_mask"]
            # ).logits
            # print(batch["labels"].shape)
            # print(out.shape)
            # loss = nn.functional.cross_entropy(out, batch["labels"].view(len(batch["attention_mask"]), -1, 1))
            # self.lossi_train.append(loss.item())

            # self.optimizer.zero_grad()
            # loss.backward()
            # self.optimizer.step()

            # stream.set_prefix(train_loss=np.mean(self.lossi_train))

            # self.mean_loss_train.append(np.mean(self.lossi_train))
            pass

    @torch.no_grad()
    def eval_one_epoch(self):

        self.model.eval()

        stream = tqdm(self.test_loader)

        self.lossi_val = []
        self.mean_loss_val = []

        for batch in stream:

            batch = {k : i.to(self.device) for k, i in batch.items()}

            out = self.model(
                input_ids = batch["input_ids"],
                attention_mask = batch["attention_mask"]
            ).logits

            loss = nn.functional.cross_entropy(out, batch["labels"])
            self.lossi_val.append(loss.item())

            stream.set_prefix(eval_loss=np.mean(self.lossi_val))

            self.mean_loss_val.append(np.mean(self.lossi_val))

        return np.mean(self.lossi_val)



    def train(self, num_epochs, optimizer_name, lr):

        self.optimizer = getattr(torch.optim, optimizer_name)(self.model.parameters(), lr)

        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, factor=0.9, patience=5)

        for epoch in range(1, num_epochs + 1):

            print("EPOCH: ", epoch)

            self.train_one_epoch()
            val_loss = self.eval_one_epoch()

            scheduler.step(val_loss)



worker = NERworker(device=torch.device("cpu"))
worker.init_loaders(train_ds, val_ds, batch_size=4)
worker.init_model(model)

In [15]:
for batch in worker.train_loader:
  pass

In [16]:
# 1/0
torch.cuda.empty_cache()
worker.train(
    num_epochs=10,
    optimizer_name="Adam",
    lr=3e-3,
)

EPOCH:  1


100%|██████████| 44/44 [00:02<00:00, 18.75it/s]
  0%|          | 0/11 [00:05<?, ?it/s]


KeyboardInterrupt: 

In [17]:
del worker

In [18]:
import gc
gc.collect()

294

In [19]:
n_params
model.()

AttributeError: 'DebertaForTokenClassification' object has no attribute 'size'