In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/kse-ua-location-extraction-2025/ru_geo_dataset.csv
/kaggle/input/kse-ua-location-extraction-2025/README.md
/kaggle/input/kse-ua-location-extraction-2025/labeling_sample.csv
/kaggle/input/kse-ua-location-extraction-2025/test.csv
/kaggle/input/kse-ua-location-extraction-2025/uk_geo_dataset.csv
/kaggle/input/kse-ua-location-extraction-2025/uk_geo_dataset_processed_v1.parquet


# EDA

In [2]:
uk_data = pd.read_csv('/kaggle/input/kse-ua-location-extraction-2025/uk_geo_dataset.csv')
#other_data = pd.read_csv('/kaggle/input/kse-ua-location-extraction-2025/ru_geo_dataset.csv')
labeling_sample = pd.read_csv('/kaggle/input/kse-ua-location-extraction-2025/labeling_sample.csv')
test_data = pd.read_csv('/kaggle/input/kse-ua-location-extraction-2025/test.csv')
uk_geo_dataset = pd.read_parquet('/kaggle/input/kse-ua-location-extraction-2025/uk_geo_dataset_processed_v1.parquet')

In [3]:
# print(uk_geo_dataset.head())
print("Сolumns", uk_geo_dataset.columns.tolist())
print(uk_geo_dataset.dtypes)

Сolumns ['tokens', 'labels', 'is_valid']
tokens      object
labels      object
is_valid     int64
dtype: object


In [4]:
# print(uk_data.head())
# print("Сolumns", uk_data.columns.tolist())
# print(uk_data.dtypes)

In [5]:
# print(uk_geo_dataset.head())
# print("Сolumns", uk_geo_dataset.columns.tolist())
# print(uk_geo_dataset.dtypes)

In [4]:
uk_data['loc_markers'] = uk_data['loc_markers'].apply(eval)
uk_data['num_locs'] = uk_data['loc_markers'].apply(len)
uk_data['has_loc'] = uk_data['num_locs'] > 0

In [5]:
print(f"With loc: {uk_data['has_loc'].sum()} ({uk_data['has_loc'].mean()*100:.1f}%)")
print(f"Without loc: {(~uk_data['has_loc']).sum()} ({(~uk_data['has_loc']).mean()*100:.1f}%)")

uk_data['words'] = uk_data['text'].str.split().str.len()
print(f"Average length: {uk_data['words'].mean():.1f} words")


print("Excample:")
for i in range(5):
    row = uk_data[uk_data['has_loc']].iloc[i]
    locs = [row['text'][s:e] for s,e in row['loc_markers']]
    print(f"\n{row['text'][:60]}...")
    print(f"> {locs}")

With loc: 233421 (23.1%)
Without loc: 776579 (76.9%)
Average length: 14.6 words
Excample:

Подібні розіграші проводили в Великій Британії....
> ['Великій Британії']

У Львові 34-річний мешканець Яворівського району під час к...
> ['Львові', 'Яворівського району']

Нагадаємо, президент України Володимир Зеленський скликав по...
> ['України']

Слід зауважити, що протягом останнього часу в пунктах пропус...
> ['України', 'Білорусі']

Тим часом, О.Паращій вважає, що зміна глави Мінфіну навряд ч...
> ['України']


In [8]:
# fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# uk_data['num_locs'].value_counts().head(6).sort_index().plot(kind='bar', ax=axes[0], color='teal')
# axes[0].set_title('Number of locations in text')

# uk_data['words'].hist(bins=30, ax=axes[1], color='salmon')
# axes[1].set_title('Lenth of text (words)')
# axes[1].set_xlim(0, 50)
# axes[1].grid(False)

# plt.tight_layout()
# plt.show()

## Analyze the proposed target metric

The competition uses **entity-level F1-score**, where each entity is a text span (start, end).
The model receives a score of 1 only when it has completely correctly restored the location boundaries.
Partial matches (for example, finding “Львів” instead of “місто Львів”) are counted as errors.

**Advantages:**

* A classic approach to NER, consistent with standards (CoNLL, spaCy).
* Clearly penalizes incorrect boundaries, which encourages high-quality sequence labeling.
* Stable and understandable metric for comparing models.

**Disadvantages:**

* Complete dependence on exact span boundaries: even a 1-character offset = 0 points.
* Does not take partial information into account - a model that “almost guessed right” gets the same score as a model that missed completely.
* Very sensitive to tokenization (Byte Pair Encoding can cut names into components -> F1 drop).

**Edge cases:**

* Complex toponyms such as “Яворівського району” may have different segmentations -> risk of incorrect boundaries.
* Locations with declension (Львів/Львові/Львова) - the model may consider them different tokens.
* Several locations in a sentence next to each other -> risk of confusing intervals.
* Phrases with quotation marks or punctuation (“в місті Києві,”) - often errors on commas.
* 

## Suggest complementary metrics to capture other aspects of performance


Since the main metric is **entity-level F1**, which only takes into account exact span matches, it is useful to additionally monitor other metrics that show another side of quality:

### **1. Token-level F1/Token Accuracy**

Measures quality at the level of individual tokens rather than spans.
Shows whether the model correctly finds parts of locations, even if the boundaries are not yet perfect.

It is easy to see what is happening within a sentence — whether the model is generally heading in the right direction or is completely confused.

### **2. Precision/Recall at the local token level**

Sometimes the model is:

* either too “cautious” (high precision, low recall)
* or “aggressive” (low precision, high recall)

This helps to control the balance and avoid situations where F1 is stable but the model behaves unpredictably.

### **3. Partial match/Overlap-based F1 (soft F1)**

Takes into account partial matches between the predicted span and ground truth.

In Entity-Level F1, partial matches = 0.
A "soft F1" will show whether at least some of the predicted names are correct.

### **4. Character-level F1**

Accuracy assessment at the character level.

This is especially useful for Ukrainian cases and complex names, where a difference of 1–2 characters does not mean that the model completely misunderstands the context.

### **5. Exact Match Rate (EMR)**

The percentage of texts for which the model found all locations 100% correctly.

This shows “user” quality: in a real system, we are interested in cases where the model did not make any mistakes.

## Propose an alternative target metric, with justification.

### **1. Character-level F1-score (character F1)**

### **Why it is the better choice:**

1. **Less dependent on exact span borders.**
   If the model shifts the border by a few characters (comma, space, case), character-F1 still records partial correctness.

2. **More resistant to different tokenizers (BPE, char-level).**
   Different word breaks do not affect the metric as much.

3. **Better reflects the “quality of information.”**
   If the model finds a partial country name (“Ukrain” instead of “Ukraine”), Entity-F1 gives a complete zero, while Character-F1 evaluates the actual progress.

4. **Less penalty for boundary errors**, which are often random.

5. **More fair to Ukrainian cases**, which complicate accurate span.

### **2. Soft Overlap F1 (partial matching F1)**

This metric counts an entity as correct if **the overlap is greater than N%** (for example, 50%).
This also reduces sensitivity to boundary shifts.

### Brief rationale

**Character-level F1** would be a more stable and informative metric for the task of location tagging because it:
* reduces the impact of boundary errors,
* works better with case and punctuation,
* adequately evaluates partially correct predictions,* is less sensitive to differences in tokenization.


# Validation

The task already provides for an official split via the is_valid field, where:

is_valid = 0 -> training data

is_valid = 1 -> validation data

This is not a random split - it was formed by the authors of the dataset in order to:

* maintain the same distribution of location types in train and validation;

* divide sentences from one document into different parts, which prevents data leakage;

* simulate the expected statistics on the test, which ensures correct correlation with the leaderboard.

Therefore, using this particular split (is_valid) is the most reliable strategy, consistent with the official Kaggle evaluation.

In [6]:
print("is_valid uk_data:")
print(uk_data['is_valid'].value_counts())

print("is_valid uk_geo_dataset parquet:")
print(uk_geo_dataset['is_valid'].value_counts())

is_valid uk_data:
is_valid
0    1000000
1      10000
Name: count, dtype: int64
is_valid uk_geo_dataset parquet:
is_valid
0    1000000
1      10000
Name: count, dtype: int64


## Motivation for the chosen strategy

The selected split is_valid has a high correlation with Kaggle LB because:
* it was created by the organizers specifically for the test structure;
* train/val/test have similar distributions of texts and location types;
* the division is made by documents -> no leakage;
* random split or k-fold almost guarantee an overestimated CV due to repeated fragments of the same documents.

Conclusion:

is_valid is the only strategy that gives a realistic validation score and accurately reflects the behavior of the model on Kaggle LB.

In [7]:
train = uk_geo_dataset[uk_geo_dataset['is_valid'] == 0] 
val = uk_geo_dataset[uk_geo_dataset['is_valid'] == 1]   

print(f"Train: {len(train):,} samples")
print(f"Val: {len(val):,} samples")

Train: 1,000,000 samples
Val: 10,000 samples


## Possible reasons for low/high correlation with Kaggle LB

Why correlation may be low:

* Data leakage in your CV if you accidentally changed the split.
* Difference in tokenization between local code and official verification script.
* Overfitting on validation - the model adapts to is_valid but does not generalize.
* Different text structure in the test (longer/shorter, different types of names).

Why correlation may be high:
* is_valid is used.
* train/val/test have similar statistics.
* The model is stable, without hard retraining.
* Tokenization and post-processing are the same for CV and submission.

## Adversarial validation

Adversarial validation shows how statistically similar train and test are.

The idea is simple: we train the model to distinguish train sentences from test sentences.

If the model cannot do this, the distributions are close, and CV <-> LB will be stable.

If the model easily recognizes test, the distributions differ, and another strategy or post-processing is needed.

In this competition:
* train and test are formed very similarly;
* is_valid already mimics the test set;
* the amount of data is huge — no significant shifts are expected.

What we expect from adversarial validation:
* ROC-AUC ≈ 0.50-0.60 -> train/test are similar
* ROC-AUC > 0.75 -> there is a distribution shift
* ROC-AUC ~ 0.90 -> critical shift, CV does not match LB

Adversarial validation is a formal check that is_valid is a correct split.

In most official NER datasets, train/test are close, so the model's ability to distinguish between them is expected to be low (close to random guessing).

In [11]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import roc_auc_score

# train_texts = uk_data['text'].astype(str)
# test_texts = test_data['text'].astype(str)

# df_adv = pd.DataFrame({
#     'text': pd.concat([train_texts, test_texts]),
#     'label': [0]*len(train_texts) + [1]*len(test_texts)
# })

# tfidf = TfidfVectorizer(
#     max_features=50000,
#     ngram_range=(1,2)
# )
# X = tfidf.fit_transform(df_adv['text'])
# y = df_adv['label']

# clf = LogisticRegression(max_iter=200)
# clf.fit(X, y)

# preds = clf.predict_proba(X)[:,1]

# auc = roc_auc_score(y, preds)
# print("Adversarial Validation ROC-AUC:", round(auc, 4))

Adversarial Validation ROC-AUC: 0.9732


# Transformer Encoder model:

In [8]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=e7cf320bd3d352326643765cce5030ff9072d8d4a0848bd2c3f4ee621da2bd54
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [13]:
# !pip uninstall -y transformers
# !pip install transformers==4.36.0 -q

In [9]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback
)
from datasets import Dataset as HFDataset
from seqeval.metrics import f1_score, precision_score, recall_score

2025-12-09 11:37:35.140584: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765280255.318418      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765280255.369790      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [11]:
import wandb

In [12]:
os.environ['WANDB_API_KEY'] = '6b462b2f87d400f7a62c6246ef43f027d6c8adfb'

In [14]:
import json

class NERDataProcessor:
    def __init__(self, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label2id = {'O': 0, 'LOC': 1}
        self.id2label = {0: 'O', 1: 'LOC'}
        self.num_labels = 2 
    
    def tokenize_and_encode(self, examples):
        all_input_ids = []
        all_attention_masks = []
        all_labels = []
        
        for tokens_str, labels_str in zip(examples['tokens'], examples['labels']):
            tokens = json.loads(tokens_str)
            labels = json.loads(labels_str)
            
            encoded = self.tokenizer(
                tokens,
                is_split_into_words=True,
                truncation=True,
                padding='max_length',
                max_length=self.max_length
            )
            
            word_ids = encoded.word_ids()
            label_ids = []
            
            for word_id in word_ids:
                if word_id is None:
                    label_ids.append(-100)
                else:
                    if word_id < len(labels):
                        label_ids.append(self.label2id[labels[word_id]])
                    else:
                        label_ids.append(self.label2id['O'])
            
            all_input_ids.append(encoded['input_ids'])
            all_attention_masks.append(encoded['attention_mask'])
            all_labels.append(label_ids)
        
        return {
            'input_ids': all_input_ids,
            'attention_mask': all_attention_masks,
            'labels': all_labels
        }
    
    def prepare_dataset(self, df):
        dataset = HFDataset.from_pandas(df[['tokens', 'labels']])
        tokenized = dataset.map(
            self.tokenize_and_encode,
            batched=True,
            remove_columns=dataset.column_names,
            desc="Tokenizing"
        )
        return tokenized

In [15]:
processor = NERDataProcessor(
    AutoTokenizer.from_pretrained('bert-base-multilingual-cased'),
    max_length=128
    #max_length=256
)

train_dataset = processor.prepare_dataset(train)
val_dataset = processor.prepare_dataset(val)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Tokenizing:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=2)
    id2label = {0: 'O', 1: 'LOC'}
    
    true_labels, pred_labels = [], []
    for pred, label in zip(predictions, labels):
        true_seq, pred_seq = [], []
        for p, l in zip(pred, label):
            if l != -100:
                true_seq.append(id2label[l])
                pred_seq.append(id2label[p])
        true_labels.append(true_seq)
        pred_labels.append(pred_seq)
    
    return {
        'f1': f1_score(true_labels, pred_labels),
        'precision': precision_score(true_labels, pred_labels),
        'recall': recall_score(true_labels, pred_labels)
    }

In [17]:
def train_transformer_ner(model_name, train_dataset, val_dataset, run_name, config=None):
    
    wandb.init(
        project='dl-assignment4-ner',
        name=run_name,
        config=config,
        reinit=True
    )
    
    print(f"Training: {model_name}")
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    label2id = {'O': 0, 'LOC': 1}
    id2label = {0: 'O', 1: 'LOC'}
    
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=2,
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )
    
    training_args = TrainingArguments(
        output_dir=f'./results/{run_name}',
        num_train_epochs=config['epochs'],
        per_device_train_batch_size=config['batch_size'],
        per_device_eval_batch_size=config['batch_size'] * 2,
        learning_rate=config['learning_rate'],
        weight_decay=config['weight_decay'],
        warmup_ratio=config['warmup_ratio'],
        eval_strategy='epoch',
        save_strategy='epoch',
        load_best_model_at_end=False,
        metric_for_best_model='f1',
        logging_steps=50,
        report_to='wandb',
        fp16=torch.cuda.is_available(),
        save_total_limit=2,
        #lr_scheduler_type='cosine',
        #optim='adamw_torch',
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorForTokenClassification(tokenizer, padding=True),
        compute_metrics=compute_metrics
    )
    
    print("Starting training...")
    trainer.train()
    
    print("Evaluating...")
    results = trainer.evaluate()
    
    print(f"Results:")
    print(f"  F1: {results['eval_f1']:.4f}")
    print(f"  Precision: {results['eval_precision']:.4f}")
    print(f"  Recall: {results['eval_recall']:.4f}")
    
    wandb.finish()  
    
    return trainer, results

In [18]:
train_sample = train.sample(n=5000, random_state=42).reset_index(drop=True)
val_sample = val.sample(n=1000, random_state=42).reset_index(drop=True)

print(f"Original - Train: {len(train)}, Val: {len(val)}")
print(f"Sample - Train: {len(train_sample)}, Val: {len(val_sample)}")

Original - Train: 1000000, Val: 10000
Sample - Train: 5000, Val: 1000


In [19]:
processor = NERDataProcessor(
    AutoTokenizer.from_pretrained('bert-base-multilingual-cased'),
    max_length=128
    #max_length=256
)

train_dataset_small = processor.prepare_dataset(train_sample)
val_dataset_small = processor.prepare_dataset(val_sample)

Tokenizing:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [20]:
config = {
    'batch_size': 16,
    'learning_rate': 2e-5,
    'epochs': 3,
    'warmup_ratio': 0.1,
    'weight_decay': 0.01
}

In [24]:
# trainer, results = train_transformer_ner(
#     'bert-base-multilingual-cased',
#     train_dataset_small,
#     val_dataset_small,
#     'mbert-baseline',
#     config
# )

## Experiment with different BERT-like multilingual encoders

In [21]:
trainer_mbert, results_mbert = train_transformer_ner(
    'bert-base-multilingual-cased',
    train_dataset_small,
    val_dataset_small,
    'mbert-baseline',
    config
)

[34m[1mwandb[0m: Currently logged in as: [33miandroshchuk[0m ([33miandroshchuk-kse[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training: bert-base-multilingual-cased


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...




Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.0204,0.022375,0.828383,0.856655,0.801917
2,0.0145,0.024068,0.847201,0.804598,0.894569
3,0.0045,0.024578,0.835913,0.810811,0.86262


Evaluating...


Results:
  F1: 0.8359
  Precision: 0.8108
  Recall: 0.8626


0,1
eval/f1,▁█▄▄
eval/loss,▁▆██
eval/precision,█▁▂▂
eval/recall,▁█▆▆
eval/runtime,▁▅█▆
eval/samples_per_second,█▄▁▃
eval/steps_per_second,█▄▁▃
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▅▆▆▆▇▇▇████
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▅▆▆▆▇▇▇████
train/grad_norm,▂▂▄▄█▂▃▁▁▅▃▂▁▃▁▁▁▂

0,1
eval/f1,0.83591
eval/loss,0.02458
eval/precision,0.81081
eval/recall,0.86262
eval/runtime,3.7984
eval/samples_per_second,263.267
eval/steps_per_second,8.425
total_flos,979862837760000.0
train/epoch,3.0
train/global_step,939.0


In [22]:
trainer_xlm_base, results_xlm_base = train_transformer_ner(
    'xlm-roberta-base',
    train_dataset_small,
    val_dataset_small,
    'xlm-roberta-base-baseline',
    config
)

Training: xlm-roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.1244,0.117202,0.0,0.0,0.0
2,0.116,0.091467,0.231441,0.365517,0.169329
3,0.0814,0.086924,0.275194,0.349754,0.226837


Evaluating...


Results:
  F1: 0.2752
  Precision: 0.3498
  Recall: 0.2268


0,1
eval/f1,▁▇██
eval/loss,█▂▁▁
eval/precision,▁███
eval/recall,▁▆██
eval/runtime,▄▁▂█
eval/samples_per_second,▅█▇▁
eval/steps_per_second,▅██▁
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▅▆▆▆▇▇▇████
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▅▆▆▆▇▇▇████
train/grad_norm,▃▁▁▂▂▁▂▂▃▃▃█▆█▅▅▃▅

0,1
eval/f1,0.27519
eval/loss,0.08692
eval/precision,0.34975
eval/recall,0.22684
eval/runtime,3.8648
eval/samples_per_second,258.746
eval/steps_per_second,8.28
total_flos,979862837760000.0
train/epoch,3.0
train/global_step,939.0


In [26]:
trainer_mdeberta, results_mdeberta = train_transformer_ner(
    'microsoft/mdeberta-v3-base',
    train_dataset_small,
    val_dataset_small,
    'mdeberta-v3-baseline',
    config
)

Training: microsoft/mdeberta-v3-base


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [28]:
# all_results = [
#     {'model': 'mbert', 'val_f1': results_mbert['eval_f1']},
#     {'model': 'xlm-roberta', 'val_f1': results_xlm_base['eval_f1']},
#     {'model': 'mdeberta', 'val_f1': results_mdeberta['eval_f1']},
# ]

# results_df = pd.DataFrame(all_results)
# print(results_df)

## THRESHOLD

In [24]:
# import torch
# import torch.nn.functional as F

# import wandb
# wandb.init(
#     project="dl-assignment4-ner",
#     name="threshold-search",
#     reinit=True
# )

# pred_output = trainer_mbert.predict(val_dataset_small)
# logits = pred_output.predictions
# label_ids = pred_output.label_ids

# probs_loc = F.softmax(torch.from_numpy(logits), dim=-1)[:, :, 1].numpy()
# def spans_from_seq(seq):
#     spans = []
#     start = None
#     for i, v in enumerate(seq):
#         if v == 1:
#             if start is None:
#                 start = i
#             end = i
#         else:
#             if start is not None:
#                 spans.append((start, end))
#                 start = None
#     if start is not None:
#         spans.append((start, end))
#     return spans

# def entity_f1_from_probs(probs_loc, label_ids, threshold):
#     tp = 0
#     fp = 0
#     fn = 0
#     for pl, lab in zip(probs_loc, label_ids):
#         valid_mask = lab != -100
#         true_seq = lab[valid_mask]
#         pred_seq = (pl[valid_mask] > threshold).astype(int)
#         true_spans = set(spans_from_seq(true_seq))
#         pred_spans = set(spans_from_seq(pred_seq))
#         tp += len(true_spans & pred_spans)
#         fp += len(pred_spans - true_spans)
#         fn += len(true_spans - pred_spans)
#     if tp == 0:
#         return 0.0
#     precision = tp / (tp + fp)
#     recall = tp / (tp + fn)
#     return 2 * precision * recall / (precision + recall)
    
# thresholds = np.linspace(0.4, 0.8, 9)
# best_t = None
# best_f1 = -1.0

# for t in thresholds:
#     f1 = entity_f1_from_probs(probs_loc, label_ids, t)
#     print("t =", round(t, 2), "F1 =", round(f1, 4))
#     if f1 > best_f1:
#         best_f1 = f1
#         best_t = t

# print("Best threshold:", best_t, "with F1:", best_f1)

# wandb.finish()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## Entity-Merging - вийшов провал, скіп

In [30]:
# from transformers import AutoTokenizer
# import torch
# import torch.nn.functional as F
# import numpy as np
# import json
# import pandas as pd
# from datasets import Dataset as HFDataset

# tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

# test_inputs = []
# test_offset_mappings = []

# for text in test_data['text']:
#     encoded = tokenizer(
#         text,
#         truncation=True,
#         padding='max_length',
#         max_length=128,
#         return_tensors=None,
#         return_offsets_mapping=True  
#     )
    
#     test_inputs.append({
#         'input_ids': encoded['input_ids'],
#         'attention_mask': encoded['attention_mask']
#     })
#     test_offset_mappings.append(encoded['offset_mapping'])

# test_dataset = HFDataset.from_dict({
#     'input_ids': [x['input_ids'] for x in test_inputs],
#     'attention_mask': [x['attention_mask'] for x in test_inputs]
# })

# model = trainer_mbert.model
# model.eval()

# all_probs_loc = []

# with torch.no_grad():
#     for i in range(0, len(test_dataset), 16):
#         batch = test_dataset[i:min(i+16, len(test_dataset))]

#         input_ids = torch.tensor(batch['input_ids']).to(model.device)
#         attention_mask = torch.tensor(batch['attention_mask']).to(model.device)

#         outputs = model(input_ids=input_ids, attention_mask=attention_mask)

#         probs = F.softmax(outputs.logits, dim=-1)[:, :, 1]
#         all_probs_loc.append(probs.cpu().numpy())

# all_probs_loc = np.concatenate(all_probs_loc, axis=0)

# all_locations = []

# for sample_probs, offsets, text in zip(all_probs_loc, test_offset_mappings, test_data['text']):
#     location_spans = []
#     current_start = None

#     for p_loc, (start, end) in zip(sample_probs, offsets):
#         if start == 0 and end == 0:
#             continue

#         is_loc = p_loc > best_t

#         if is_loc:
#             if current_start is None:
#                 current_start = start
#             current_end = end
#         else:
#             if current_start is not None:
#                 location_spans.append((current_start, current_end))
#                 current_start = None
    
#     if current_start is not None:
#         location_spans.append((current_start, current_end))

#     location_texts = [text[s:e].strip() for s, e in location_spans]
#     location_texts = [loc for loc in location_texts if loc]

#     all_locations.append(location_texts)

# submission = pd.DataFrame({
#     "text_id": test_data["text_id"],
#     "locations": [json.dumps(locs, ensure_ascii=False) for locs in all_locations]
# })

# submission.to_csv('submission_threshold_50-10_16_5.csv', index=False)

# print("Submission with threshold saved!")
# print(submission.head(10))

In [31]:
p1 = trainer_mbert.predict(val_dataset_small)
p2 = trainer_xlm.predict(val_dataset_small)
p3 = trainer_mdeberta.predict(val_dataset_small)

probs = (
    F.softmax(torch.from_numpy(p1.predictions), dim=-1) + 
    F.softmax(torch.from_numpy(p2.predictions), dim=-1) + 
    F.softmax(torch.from_numpy(p3.predictions), dim=-1)
) / 3

preds = np.argmax(probs.numpy(), axis=-1)

id2label = {0: 'O', 1: 'LOC'}
true_labels, pred_labels = [], []

for pred, label in zip(preds, p1.label_ids):
    true_seq, pred_seq = [], []
    for p, l in zip(pred, label):
        if l != -100:
            true_seq.append(id2label[l])
            pred_seq.append(id2label[p])
    if true_seq:
        true_labels.append(true_seq)
        pred_labels.append(pred_seq)

ensemble_f1 = f1_score(true_labels, pred_labels)

print(f"mBERT:    {results_mbert['eval_f1']:.4f}")
print(f"XLM-R:    {results_xlm['eval_f1']:.4f}")
print(f"DeBERTa:  {results_mdeberta['eval_f1']:.4f}")
print(f"ENSEMBLE: {ensemble_f1:.4f}")
print(f"+{ensemble_f1 - max(results_mbert['eval_f1'], results_xlm['eval_f1'], results_mdeberta['eval_f1']):.4f}")


NameError: name 'trainer_mbert' is not defined