In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-ua-locations-extractions/ru_geo_dataset.csv
/kaggle/input/nlp-ua-locations-extractions/README.md
/kaggle/input/nlp-ua-locations-extractions/labeling_sample.csv
/kaggle/input/nlp-ua-locations-extractions/test.csv
/kaggle/input/nlp-ua-locations-extractions/uk_geo_dataset.csv
/kaggle/input/nlp-ua-locations-extractions/uk_geo_dataset_processed_v1.parquet
/kaggle/input/data-for-training/small_valid_processed.json
/kaggle/input/data-for-training/small_train_processed.json
/kaggle/input/tokenized-data/small_valid_processed.json
/kaggle/input/tokenized-data/small_train_processed.json
/kaggle/input/tokenized-data/valid_processed.json
/kaggle/input/medium-dataset/medium_train_processed.json


In [0]:
import re

In [3]:
def preprocess_text(text):
    """Strip noise from a message before it is passed to the NER model.

    The following are removed, in this order: web links, Telegram links,
    phone-number-like digit groups, newline/tab/carriage-return characters
    (replaced by a space), ``@mentions`` and emojis/pictographs. Runs of
    spaces are finally collapsed into a single space.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned message as a ``str``. Leading/trailing spaces are not
        stripped.
    """
    # Remove links. The dot after "www" is escaped on purpose: an unescaped
    # "www." matches "www" followed by ANY character and would delete
    # ordinary text such as "awwwesome".
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove Telegram links that carry no scheme/www prefix (e.g. "t.me/xyz")
    pattern = r"(?:https?:\/\/)?(?:www\.)?(?:t\.me\/\S+|telegram\.me\/\S+|telegram\.dog\/\S+)"
    text = re.sub(pattern, '', text)

    # Remove phone numbers: optional country code followed by 3-3-2-2 digit
    # groups, optionally separated by "-", "." or whitespace
    phone_regex = r'\(?\+?\d{0,3}\)?[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{2}[-.\s]?\d{2}'
    text = re.sub(phone_regex, '', text)

    # Replace line breaks and tabs with a plain space
    text = re.sub(r'[\n\t\r]', ' ', text)

    # Remove @mentions
    text = re.sub(r'@\w+', '', text)

    # Remove emojis and other pictographic symbols
    emoji_pattern = re.compile(
        pattern="["
                u"\U0001F600-\U0001F64F"  # emoticons
                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                u"\U00002702-\U000027B0"
                u"\U000024C2-\U0001F251"
                u"\U0001f926-\U0001f937"
                u'\U00010000-\U0010ffff'
                u"\u200d"
                u"\u2640-\u2642"
                u"\u2600-\u2B55"
                u"\u23cf"
                u"\u23e9"
                u"\u231a"
                u"\u3030"
                "]+", flags=re.UNICODE
    )
    text = emoji_pattern.sub(r'', text)

    # Collapse the gaps left behind by the removals above
    text = re.sub(r' +', ' ', text)

    return text

# Data preparation

# Training

In [4]:
!pip install evaluate
!pip install seqeval
!pip install sacremoses

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16165 sha256=9ee702402bdbf2c15f5bd90bf5c4dcf9791dd5932cb9b9a79660aa5638ab7186
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Collecting sacremoses


In [5]:
import evaluate
import numpy as np
from transformers import AutoTokenizer

# Checkpoint to fine-tune. Alternative left by the author: 'xlm-mlm-100-1280'
model_name = 'bert-base-multilingual-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tag inventory; the position of a tag in the list is its integer class id.
label_names = ['O', 'B-LOC', 'I-LOC']
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = dict(enumerate(label_names))

# Metric implementation loaded through `evaluate` (used by compute_metrics).
metric = evaluate.load("seqeval")


def align_labels_with_tokens(labels, word_ids):
    """Expand word-level tag ids to one tag id per token.

    Args:
        labels: Sequence of integer tag ids, indexed by word position.
        word_ids: For every token, the index of the word it belongs to, or
            ``None`` for a token that belongs to no word.

    Returns:
        A list with one entry per item of ``word_ids``: ``-100`` for ``None``
        entries, the word's tag for the first token of a word, and for later
        tokens of the same word the word's tag with odd ids bumped by one
        (B-XXX -> I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Token without a word: mark it with the -100 sentinel.
            previous_word = None
            aligned.append(-100)
            continue

        tag = labels[word_id]
        continues_word = word_id == previous_word
        previous_word = word_id
        # Only the first token of a word may keep a B- tag (odd id).
        if continues_word and tag % 2 == 1:
            tag += 1
        aligned.append(tag)

    return aligned


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (passed to the
            tokenizer with ``is_split_into_words=True``) and a parallel
            ``"ner_tags"`` entry holding the tag ids of each sentence.

    Returns:
        The tokenizer output with an added ``"labels"`` entry containing, for
        every sentence, the result of ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"], is_split_into_words=True, truncation=True
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded


def compute_metrics(eval_preds):
    """Turn raw evaluation outputs into overall precision/recall/F1/accuracy.

    Args:
        eval_preds: A ``(logits, labels)`` pair. Positions whose label is
            ``-100`` are excluded from scoring.

    Returns:
        A dict with the keys ``precision``, ``recall``, ``f1`` and
        ``accuracy``, taken from the ``overall_*`` entries returned by
        ``metric.compute``.
    """
    logits, gold = eval_preds
    predicted = np.argmax(logits, axis=-1)

    # Map ids back to tag names, skipping positions labelled -100.
    references = [
        [label_names[tag] for tag in gold_row if tag != -100]
        for gold_row in gold
    ]
    hypotheses = []
    for pred_row, gold_row in zip(predicted, gold):
        hypotheses.append(
            [label_names[p] for (p, tag) in zip(pred_row, gold_row) if tag != -100]
        )

    scores = metric.compute(predictions=hypotheses, references=references)
    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short: scores[full] for short, full in reported}




Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [7]:
import torch
from datasets import load_dataset

from transformers import AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification

from transformers import Trainer, TrainingArguments

from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup



# Load the training and validation splits from their JSON files.
raw_datasets_ua = load_dataset(
    "json",
    data_files=dict(
        train='/kaggle/input/medium-dataset/medium_train_processed.json',
        val='/kaggle/input/tokenized-data/valid_processed.json',
    ),
)

# Tokenize both splits and attach token-level labels; the original columns
# are dropped so only the tokenizer outputs and "labels" remain.
tokenized_datasets_ua = raw_datasets_ua.map(
    tokenize_and_align_labels,
    remove_columns=raw_datasets_ua["train"].column_names,
    batched=True,
)

# Token-classification model built from the pretrained checkpoint, configured
# with this notebook's label mappings.
model = AutoModelForTokenClassification.from_pretrained(
    model_name, label2id=label2id, id2label=id2label
)

# Collator used by the Trainer to assemble batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Two parameter groups: a small learning rate for the pretrained encoder and
# a larger one for the classification head (which the model-loading output
# reports as newly initialized).
optimizer = AdamW(
    [
        {'params': list(model.bert.parameters()), 'lr': 1e-5},
        {'params': list(model.classifier.parameters()), 'lr': 1e-3}
    ]
)

train_batch_size = 16
val_batch_size = 16
num_train_epochs = 3  # single source of truth for scheduler AND TrainingArguments
warmup_ratio = 0.1    # fraction of all optimizer steps spent warming up

# The schedule length must follow the TRAINING batch size (the validation
# batch size has no influence on the number of optimizer steps) and must be a
# whole number of steps. Ceil division accounts for the last, partial batch.
# NOTE(review): assumes a single device and no gradient accumulation, so one
# optimizer step per training batch -- adjust if either changes.
steps_per_epoch = -(-tokenized_datasets_ua['train'].num_rows // train_batch_size)
num_training_steps = num_train_epochs * steps_per_epoch
num_warmup_steps = int(warmup_ratio * num_training_steps)

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

args = TrainingArguments(
    "bert-ua-loc-ner",  # output directory for checkpoints
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=val_batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=num_train_epochs,
    fp16=True,
    fp16_full_eval=True,
    no_cuda=False,
)

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-03d45eb6c6ece586/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-03d45eb6c6ece586/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/100 [00:00<?, ?ba/s]

  0%|          | 0/14 [00:00<?, ?ba/s]



In [8]:
# Assemble the Trainer from the pieces built above. The custom optimizer and
# LR scheduler are passed explicitly through `optimizers`, and
# `compute_metrics` supplies the precision/recall/F1/accuracy values reported
# at each evaluation.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets_ua["train"],
    eval_dataset=tokenized_datasets_ua["val"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    optimizers=(optimizer, scheduler)
)

In [9]:
# Fine-tune the model. Evaluation and checkpoint saving are configured per
# epoch in `args`. NOTE: in the recorded run this cell was interrupted
# (KeyboardInterrupt) after the second epoch's metrics were logged.
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0071,0.015691,0.900445,0.905781,0.903105,0.996767
2,0.0051,0.013596,0.902842,0.918239,0.910476,0.996994


KeyboardInterrupt: 

In [10]:
from transformers import pipeline

# Replace this with your own checkpoint. The path points into the Trainer's
# output directory ("bert-ua-loc-ner"); the step number in the directory name
# depends on dataset size and batch size, so it must be updated if either
# changes.
model_checkpoint = "./bert-ua-loc-ner/checkpoint-12500/"
# aggregation_strategy="simple" makes the pipeline return grouped entity
# spans (each with a "word" string) rather than one result per token.
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

In [11]:
# Load the competition test set and preview it; the displayed preview shows
# the columns text_id, text and locations.
test_df = pd.read_csv('/kaggle/input/nlp-ua-locations-extractions/test.csv')
test_df.head()

Unnamed: 0,text_id,text,locations
0,0,"❗️Кількість поранених зросла до трьох, – Кличк...",[]
1,1,"🥤В Києві за 91,13 млн гривень починаються робо...",[]
2,2,▪️Сьогодні вночі росіяни завдали ракетного уда...,[]
3,3,Наразі у запасах росіян найбільше балістичних ...,[]
4,4,"⛸В один день, 29 серпня, ДП ""Центральна учбово...",[]


In [12]:
# Clean every test message with preprocess_text; the result goes into a new
# column so the raw `text` column stays untouched.
test_df['clean_text'] = test_df['text'].apply(preprocess_text)

In [13]:
def predict(text):
    """Return the entity strings the token-classification pipeline finds.

    Args:
        text: The message to run through ``token_classifier``.

    Returns:
        A list with the ``'word'`` value of every returned entity, excluding
        those that start with ``'#'`` (presumably dangling sub-word pieces --
        TODO confirm).
    """
    return [
        entity['word']
        for entity in token_classifier(text)
        if not entity['word'].startswith('#')
    ]

In [14]:
# Run the model on every cleaned message, one pipeline call per row; the
# result is a Series of lists of location strings.
preds = test_df['clean_text'].apply(predict)

In [15]:
# Build the submission table: one row per test message with its id and the
# list of predicted locations.
df = pd.DataFrame({
    'text_id': test_df['text_id'],
    'locations': preds
})
df.head()

Unnamed: 0,text_id,locations
0,0,[]
1,1,"[Києві, Шулявського шляхопроводу, Шулявського ..."
2,2,"[Гоголеве, Миргородського району, Полтавської ..."
3,3,[]
4,4,"[проспекті Академіка Глушкова, 9]"


In [16]:
# Write the submission file to the Kaggle working directory; index=False
# keeps the DataFrame index out of the CSV.
df.to_csv('/kaggle/working/submission2.csv', index=False)

The submission score is 0.4860.