# Data preprocessing

In [1]:
import json
import torch

from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer

2024-04-28 16:40:50.536165: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-28 16:40:50.536282: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-28 16:40:50.659848: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Names of the dataset splits; each one is read from a "<name>.jsonl" file.
DATA_VARIANT = ['train', 'dev', 'test']

In [3]:
def get_json_lines_dict(
    data_variant: str,
    data_dir: str = "/kaggle/input/nlp-assignment3-testphase-dataset",
):
    """Lazily yield one parsed JSON document per line of a split file.

    Args:
        data_variant: Split name; the file read is ``<data_dir>/<data_variant>.jsonl``.
        data_dir: Directory holding the ``.jsonl`` files. Defaults to the Kaggle
            input directory this notebook was written against, so existing
            one-argument calls behave as before.

    Yields:
        The object decoded from each non-blank line (a dict for this dataset).

    Raises:
        FileNotFoundError: If the split file does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(f"{data_dir}/{data_variant}.jsonl", encoding="utf8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise make json.loads raise.
            if not line.strip():
                continue
            yield json.loads(line)


# One list of parsed documents per split, filled straight from the line generator.
dataset = dict(train=[], dev=[], test=[])

for var in DATA_VARIANT:
    dataset[var].extend(get_json_lines_dict(var))

# Show the first training document as a quick sanity check of the schema.
print(dataset['train'][0])

train_docs, dev_docs, test_docs = dataset['train'], dataset['dev'], dataset['test']
print("Train samples: ", len(train_docs))
print("Dev samples: ", len(dev_docs))
print("Test samples: ", len(test_docs))

{'ners': [[0, 5, 'CITY'], [16, 23, 'PERSON'], [34, 41, 'PERSON'], [46, 62, 'LOCATION'], [115, 136, 'EVENT'], [138, 147, 'AGE'], [149, 164, 'PERSON'], [195, 213, 'EVENT'], [215, 223, 'DATE'], [273, 314, 'ORGANIZATION'], [316, 324, 'DATE'], [328, 333, 'ORDINAL'], [360, 368, 'AGE'], [370, 385, 'PERSON'], [400, 410, 'EVENT'], [414, 423, 'CITY'], [457, 465, 'CITY'], [714, 720, 'CITY'], [842, 849, 'ORGANIZATION'], [852, 891, 'ORGANIZATION'], [894, 914, 'ORGANIZATION'], [917, 939, 'ORGANIZATION'], [979, 981, 'ORGANIZATION'], [1026, 1034, 'AGE'], [1044, 1073, 'ORGANIZATION'], [1075, 1088, 'PERSON'], [1115, 1123, 'CITY'], [1157, 1163, 'CITY'], [1208, 1226, 'PROFESSION'], [1248, 1262, 'PERSON'], [1277, 1286, 'ORGANIZATION'], [1294, 1307, 'PERSON'], [1372, 1377, 'ORDINAL'], [1413, 1415, 'COUNTRY'], [1437, 1447, 'DATE'], [1476, 1486, 'DATE'], [1537, 1543, 'EVENT'], [1548, 1566, 'EVENT'], [1598, 1600, 'NUMBER'], [1628, 1630, 'NUMBER'], [1642, 1650, 'NUMBER'], [7, 14, 'EVENT'], [65, 78, 'DATE'], [97

In [4]:
# The 'ent_types' configuration of RuNNE lists the entity type names; the
# position of a name in its 'type' column is used as the integer class id.
# `trust_remote_code=True` is passed explicitly: this dataset is built by a
# loader script, and `datasets` warns that the flag will become mandatory.
entity_types = load_dataset(
    'MalakhovIlya/RuNNE',
    'ent_types',
    trust_remote_code=True,
)['ent_types']

# Number of distinct entity types (= number of classes for the classifiers).
ENTITIES = len(entity_types)
print(entity_types)
print(ENTITIES)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.35k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/236 [00:00<?, ?B/s]

Generating ent_types split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['type'],
    num_rows: 29
})
29


# Baseline solution

## Train

In [5]:
# Download the large Russian spaCy pipeline (shell command run through IPython).
!python -m spacy download ru_core_news_lg


import spacy
# NOTE(review): `sentences` is not used by any cell visible here — confirm before removing.
from spacy.lang.ru.examples import sentences 


# Loaded pipeline; the baseline below calls it to split test documents into tokens.
nlp = spacy.load("ru_core_news_lg")

Collecting ru-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.7.0/ru_core_news_lg-3.7.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.7.0)
  Downloading pymorphy3-2.0.1-py3-none-any.whl.metadata (1.8 kB)
Collecting dawg-python>=0.7.1 (from pymorphy3>=1.0.0->ru-core-news-lg==3.7.0)
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-lg==3.7.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.1-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.2/53.2 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Down

In [6]:
from collections import Counter


# Maps an annotated surface string to a Counter of the entity types it was
# labelled with in the training split.
vocab = dict()

for data in dataset['train']:
    document_text = data['sentences']
    for start_i, end_i, entity in data['ners']:
        # The +1 makes the slice include the character at index `end_i`.
        token = document_text[start_i: end_i + 1]
        # Create the Counter on first sight of the string, then count the label.
        vocab.setdefault(token, Counter())[entity] += 1

## Test

In [7]:
def process_dataset(dataset, nlp, vocab):
    """Label test documents by dictionary lookup of individual tokens.

    Args:
        dataset: Mapping with a 'test' list of documents; each document
            provides 'id' and the text under the key 'senences'.
        nlp: Callable that turns a text into an iterable of tokens exposing
            `.idx`, `.text` and `len()`.
        vocab: Mapping from surface string to a Counter of entity types.

    Returns:
        A list of {'id': ..., 'ners': [[start, end, label], ...]} dicts, where
        `end` is the index of the token's last character and `label` is the
        most frequent entity type recorded for the token text.
    """
    predictions = []

    for sample in dataset['test']:
        matched = []

        for tok in nlp(sample['senences']):
            surface = tok.text
            if surface not in vocab:
                continue

            first_char = tok.idx
            last_char = first_char + len(tok) - 1
            best_label = vocab[surface].most_common(1)[0][0]
            matched.append([first_char, last_char, best_label])

        predictions.append({'id': sample['id'], 'ners': matched})

    return predictions

# Run the dictionary baseline over the test split and show the first prediction.
test_ners = process_dataset(dataset=dataset, nlp=nlp, vocab=vocab)
print(test_ners[0])

{'id': 584, 'ners': [[30, 34, 'NUMBER'], [40, 45, 'PENALTY'], [64, 69, 'PERSON'], [128, 134, 'DATE'], [137, 137, 'NUMBER'], [145, 147, 'EVENT'], [149, 156, 'STATE_OR_PROVINCE'], [158, 167, 'EVENT'], [298, 302, 'NUMBER'], [320, 329, 'PENALTY'], [350, 358, 'AGE'], [382, 389, 'EVENT'], [403, 404, 'NUMBER'], [406, 414, 'NUMBER'], [472, 475, 'DATE'], [480, 483, 'DATE'], [485, 488, 'DATE'], [534, 537, 'DATE'], [567, 575, 'NUMBER']]}


## Prepare submission

In [8]:
with open('test.jsonl', 'w') as f:
    for sample in test_ners:
        json.dump(sample, f)
        f.write('\n')

!zip test test.jsonl

  adding: test.jsonl (deflated 76%)


# Main solution

Reference: LUKE model documentation — https://huggingface.co/docs/transformers/model_doc/luke

In [5]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [25]:
from transformers import AutoTokenizer, LukeForEntitySpanClassification


LUKE_CHECKPOINT = "studio-ousia/luke-large-finetuned-conll-2003"

tokenizer = AutoTokenizer.from_pretrained(LUKE_CHECKPOINT)

# `num_labels` differs from the checkpoint's classification head, so mismatched
# weights are allowed to be re-initialised instead of raising.
model = LukeForEntitySpanClassification.from_pretrained(
    LUKE_CHECKPOINT,
    num_labels=ENTITIES,
    ignore_mismatched_sizes=True,
)
model = model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at studio-ousia/luke-large-finetuned-conll-2003 were not used when initializing LukeForEntitySpanClassification: ['luke.embeddings.position_ids']
- This IS expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeForEntitySpanClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LukeForEntitySpanClassification were not initialized from the model checkpoint at studio-ousia/luke-large-finetuned-conll-2003 and are newly initialized because the shapes did not match:

# Preprocess data for LUKE model

In [26]:
# Maximum number of characters kept per document; it is passed to
# `apply_max_seq_length`, which uses it to slice the raw text.
MAX_SEQ_LENGTH = 400
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed.
MAX_ENTITIES = 16

In [27]:
def map_entity_to_int(entity: str) -> int:
    """Return the position of `entity` within the `entity_types['type']` column."""
    type_names = entity_types['type']
    return type_names.index(entity)


def map_int_to_entity(idx: int) -> str:
    """Return the entity type name stored at position `idx` of `entity_types['type']`."""
    type_names = entity_types['type']
    return type_names[idx]

In [28]:
def apply_max_seq_length(dataset: dict, data_variant: str, max_seq_length: int) -> dict:
    """Truncate every document of one split to `max_seq_length` characters.

    Args:
        dataset: Mapping from split name to a list of document dicts.
        data_variant: Split to process. Only 'train' is read with annotations
            (keys 'sentences' and 'ners'); any other split is read as text
            only, from the key 'senences'.
        max_seq_length: Number of leading characters of the text to keep.

    Returns:
        A dict with the lists 'text', 'entities' and 'entity_spans'.
        For 'train', the three lists are aligned per document: 'entities'
        holds integer class ids and 'entity_spans' holds the original
        (start_i, end_i) pairs, copied through unchanged. Only spans with
        `end_i < max_seq_length` are kept, and documents left without any
        span are dropped. For other splits only 'text' is filled.
    """
    data = {'text': [], 'entities': [], 'entity_spans': []}

    for data_sample in dataset[data_variant]:
        if data_variant != 'train':
            data['text'].append(data_sample['senences'][:max_seq_length])
            continue

        text_spans = []
        text_entities = []

        for start_i, end_i, entity_name in data_sample['ners']:
            # Map before filtering so an unknown entity type is reported even
            # when its span lies beyond the cut-off.
            entity = map_entity_to_int(entity_name)

            if end_i < max_seq_length:
                text_entities.append(entity)
                text_spans.append((start_i, end_i))

        # Both lists grow together, so checking one of them is sufficient.
        if text_spans:
            data['text'].append(data_sample['sentences'][:max_seq_length])
            data['entities'].append(text_entities)
            data['entity_spans'].append(text_spans)

    return data

In [29]:
# Truncate each split with the same character limit.
train_data, val_data, test_data = (
    apply_max_seq_length(dataset, split_name, MAX_SEQ_LENGTH)
    for split_name in ('train', 'dev', 'test')
)

In [30]:
# Preview the first truncated training document and its class ids.
first_text, first_entities = train_data['text'][0], train_data['entities'][0]
print("Text: ", first_text)
print("Entities: ", first_entities)

Text:  Бостон взорвали Тамерлан и Джохар Царнаевы из Северного Кавказа

19 апреля 2013 года в пригороде Бостона  проходит спецоперация по поимке 19-летнего Джохара Царнаева, подозреваемого в теракте на Бостонском марафоне 15 апреля и в смертельном ранении полицейского на кампусе Массачусетского технологического института 18 апреля.

Второй подозреваемый, его брат, 26-летний Тамерлан Царнаев, был ранен в 
Entities:  [2, 22, 22, 14, 8, 0, 22, 8, 5, 19, 5, 18, 0, 22, 8, 5, 2, 4, 4, 26, 8, 22, 10, 14, 14, 2]


In [31]:
# The label list and the span list should contain one entry per kept document.
for column_name in ('entities', 'entity_spans'):
    print(len(train_data[column_name]))

519
519


# Model for extracting entities

In [37]:
from transformers import BertForTokenClassification, AdamW
from transformers import BertTokenizer


EXTRACTION_CHECKPOINT = 'bert-base-uncased'

extraction_tokenizer = BertTokenizer.from_pretrained(EXTRACTION_CHECKPOINT)

# Two output labels per token, intended for the start and end positions;
# attention maps and hidden states are not returned.
extraction_model = BertForTokenClassification.from_pretrained(
    EXTRACTION_CHECKPOINT,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
extraction_model = extraction_model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader


EPOCHS = 3
BATCH_SIZE = 8

# The slow `BertTokenizer` cannot report character offsets, so a fast tokenizer
# over the same checkpoint is used to align character spans with word pieces.
extraction_fast_tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
encoded = extraction_fast_tokenizer(
    train_data['text'],
    return_tensors="pt",
    padding='max_length',
    truncation=True,
    return_offsets_mapping=True,
)

# Per-token targets for the 2-label head.
# NOTE(review): interpreted as 1 = token overlaps an annotated span, 0 = it does
# not; confirm this matches the intended "start/end" formulation.
# Special and padding tokens (empty offset range) keep -100, the default
# `ignore_index` of CrossEntropyLoss, so they do not contribute to the loss.
# Span ends are treated as inclusive, matching the baseline's
# `start_i: end_i + 1` slicing of the same annotations.
token_labels = torch.full(encoded['input_ids'].shape, -100, dtype=torch.long)
for row, spans in enumerate(train_data['entity_spans']):
    for col, (tok_start, tok_end) in enumerate(encoded['offset_mapping'][row].tolist()):
        if tok_start == tok_end:
            continue
        overlaps = any(
            tok_start <= span_end and span_start < tok_end
            for span_start, span_end in spans
        )
        token_labels[row, col] = int(overlaps)

# Create a DataLoader. A dedicated name is used so the raw `dataset` dict built
# at the top of the notebook is not overwritten.
extraction_dataset = TensorDataset(
    encoded['input_ids'],
    encoded['attention_mask'],
    token_labels,
)
dataloader = DataLoader(extraction_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Define loss function and optimizer (for the extraction model, not LUKE).
# weight_decay=0.0 keeps the default of the previously used transformers AdamW.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(extraction_model.parameters(), lr=1e-5, weight_decay=0.0)

# Training loop
extraction_model.train()  # from_pretrained returns the model in eval mode
for epoch in range(EPOCHS):
    for input_ids, attention_mask, label_ids in dataloader:
        # Move one batch at a time so the whole split need not sit on the GPU.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        label_ids = label_ids.to(device)

        optimizer.zero_grad()
        outputs = extraction_model(input_ids=input_ids, attention_mask=attention_mask)

        loss = loss_function(
            outputs.logits.view(-1, extraction_model.config.num_labels),
            label_ids.view(-1),
        )

        loss.backward()
        optimizer.step()

    # Loss of the last batch of the epoch.
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Train LUKE

In [None]:
from torch.nn import CrossEntropyLoss
from tqdm import tqdm


EPOCHS = 1
loss_function = CrossEntropyLoss()
# torch's AdamW replaces the deprecated `transformers.AdamW`; weight_decay=0.0
# keeps the deprecated class's default instead of torch's 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.0)

# from_pretrained returns the model in eval mode; enable dropout for training.
model.train()

# Training loop: one document per optimisation step.
for epoch in tqdm(range(EPOCHS)):
    for i in tqdm(range(len(train_data['text']))):
        text, entity_spans, entities = train_data['text'][i], train_data['entity_spans'][i], train_data['entities'][i]

        # The annotations store the index of an entity's LAST character, while
        # the LUKE tokenizer expects an exclusive end offset; without the +1
        # every entity would lose its final character.
        luke_spans = [(start_i, end_i + 1) for start_i, end_i in entity_spans]

        inputs = tokenizer(text, entity_spans=luke_spans, return_tensors="pt", padding='max_length').to(device)

        labels = torch.tensor(entities).to(device)

        outputs = model(**inputs)
        # Padding adds extra entity slots; keep only the logits of real spans.
        shortened_logits = outputs.logits[:, :labels.shape[0]].reshape(-1, ENTITIES)
        loss = loss_function(shortened_logits, labels)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Test model

In [None]:
text, entity_spans, entities = train_data['text'][0], train_data['entity_spans'][0], train_data['entities'][0]

# The annotations store the index of an entity's LAST character; the LUKE
# tokenizer and Python slicing both need an exclusive end, hence the +1.
luke_spans = [(start_i, end_i + 1) for start_i, end_i in entity_spans]
inputs = tokenizer(text, entity_spans=luke_spans, return_tensors="pt", padding='max_length').to(device)

# Inference only: disable dropout and gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits

# Index the single batch element explicitly; `squeeze()` would also collapse
# the entity axis if it ever had length one.
predicted_class_indices = logits.argmax(-1)[0].tolist()

# zip stops at the last real span, dropping predictions for padded slots.
# Class 0 is a regular entity type in this label set (ids come from
# `map_entity_to_int`), so no prediction is filtered out.
for span, predicted_class_idx in zip(luke_spans, predicted_class_indices):
    print(text[span[0] : span[1]], map_int_to_entity(predicted_class_idx))