## Init

In [8]:
import torch
from torch import nn
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import random
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [9]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/xlm-roberta-large-en-ru")
base_model = AutoModel.from_pretrained("DeepPavlov/xlm-roberta-large-en-ru")

In [10]:
import json
# Open and load JSON file
with open('D:\\project_IRM\\robustLLM\\samples_8.json', 'r', encoding="utf-8") as file:
    data = json.load(file)

In [11]:
unique_labels = ['address', 'email', 'fio', 'ip', 'ipv6', 'login', 'org', 'password', 'phone']

In [13]:
import random
from itertools import chain
from typing import Any, Dict, List, Tuple

def sample_and_split_diverse(
    data: Dict[Any, Dict[Any, Dict[Any, Dict[Any, List[Dict]]]]],
    n_samples: int,
    val_ratio: float,
    seed: int = None
) -> Tuple[Tuple[List[str], List[Dict]], Tuple[List[str], List[Dict]]]:
    """
    Perform stratified sampling across (log_env, length) groups so that
    both train and val contain entries from each group.

    Args:
        data: nested dict of shape
              data[log_env][length][template][lang] -> list of {text, spans}
        n_samples: total number of examples (train + val)
        val_ratio: fraction of examples for validation
        seed: random seed for reproducibility

    Returns:
        ((train_texts, train_spans), (val_texts, val_spans))
    """
    if seed is not None:
        random.seed(seed)

    # 1. Flatten into groups
    groups: List[Tuple[Tuple[Any, Any], List[Dict]]] = []
    for log_env_val, lengths in data.items():
        for length_val, templates in lengths.items():
            entries = []
            for tmpl in templates.values():
                for lang_list in tmpl.values():
                    entries.extend(lang_list)
            if entries:
                groups.append(((log_env_val, length_val), entries))

    # Compute total available entries
    total_entries = sum(len(entries) for _, entries in groups)
    print('total_entries:', total_entries)
    if total_entries < n_samples:
        raise ValueError(f"Not enough total examples: {total_entries} < {n_samples}")

    n_val = int(n_samples * val_ratio)
    n_train = n_samples - n_val

    train_samples: List[Dict] = []
    val_samples:   List[Dict] = []

    # 2. For each group, allocate proportional samples then split
    acc_train, acc_val = 0, 0
    for (env_len, entries) in groups:
        group_size = len(entries)
        # number of samples to take from this group
        group_n = max(1, int(group_size / total_entries * n_samples))
        # adjust last group to exactly fill
        if acc_train + acc_val + group_n > n_samples:
            group_n = n_samples - (acc_train + acc_val)

        # number of val samples from this group
        group_val_n = int(group_n * val_ratio)
        group_train_n = group_n - group_val_n

        # sample without replacement
        sampled = random.sample(entries, group_n)
        val_subset   = sampled[:group_val_n]
        train_subset = sampled[group_val_n:]

        val_samples.extend(val_subset)
        train_samples.extend(train_subset)
        acc_val   += len(val_subset)
        acc_train += len(train_subset)

    # 3. If rounding left us short, fill remaining slots from the pools
    def fill(samples: List[Dict], pool: List[Dict], need: int):
        if need <= 0:
            return
        remaining = [e for e in pool if e not in samples]
        samples.extend(random.sample(remaining, need))

    fill(val_samples,   list(chain.from_iterable(g for _, g in groups)), n_val - len(val_samples))
    fill(train_samples, list(chain.from_iterable(g for _, g in groups)), n_train - len(train_samples))

    # 4. Unpack texts and spans
    train_texts = [e['text'] for e in train_samples]
    train_spans = [e['spans'] for e in train_samples]
    val_texts   = [e['text'] for e in val_samples]
    val_spans   = [e['spans'] for e in val_samples]

    return ( (train_texts, train_spans), (val_texts, val_spans) )


In [14]:
(data_train, ner_train), (data_val, ner_val) = sample_and_split_diverse(data, 10, 0.05)

total_entries: 167600


In [15]:
len(data_train), data_train[14], ner_train[14]

IndexError: list index out of range

In [97]:
data_log = ['80.97.168.58 - gorshkovnikon [Тимофеев Серафим Антонович] (email: viktor15@example.net, pass: _7Ys%IVQp2) phone=8 826 587 31 88 addr="к. Армавир, наб. Грибоедова, д. 9/3 стр. 5/3, 937143" org=Медведева и партнеры ipv6=64be:ec33:33f1:1207:5d95:7f1:8710:a5','A support ticket was created by user jeffrey97 from organization Hill-Hall. The ticket relates to an issue reported by (575)679-5357x44829. The affected client has IP address 9468:d568:5c7e:3ece:55c2:8542:2c96:8ca1 and client ID 79dbb385-04e2-43ca-a6ec-dbd0ca04b741.']
ner_data = [[
  {
    "label": "ip",
    "start": 1,
    "end": 13,
    "value": "80.97.168.58"
  },
  {
    "label": "login",
    "start": 16,
    "end": 29,
    "value": "gorshkovnikon"
  },
  {
    "label": "fio",
    "start": 31,
    "end": 57,
    "value": "Тимофеев Серафим Антонович"
  },
  {
    "label": "email",
    "start": 67,
    "end": 87,
    "value": "viktor15@example.net"
  },
  {
    "label": "password",
    "start": 95,
    "end": 105,
    "value": "_7Ys%IVQp2"
  },
  {
    "label": "phone",
    "start": 113,
    "end": 128,
    "value": "8 826 587 31 88"
  },
  {
    "label": "address",
    "start": 135,
    "end": 187,
    "value": "к. Армавир, наб. Грибоедова, д. 9/3 стр. 5/3, 937143"
  },
  {
    "label": "org",
    "start": 193,
    "end": 213,
  },
  {
    "label": "ipv6",
    "start": 219,
    "end": 256,
    "value": "64be:ec33:33f1:1207:5d95:7f1:8710:a5a"
  }
],
[{'label': 'login', 'start': 38, 'end': 47, 'value': 'jeffrey97'},
 {'label': 'org', 'start': 66, 'end': 75, 'value': 'Hill-Hall'},
 {'label': 'phone', 'start': 120, 'end': 139, 'value': '(575)679-5357x44829'},
 {'label': 'ipv6',
  'start': 176,
  'end': 215,
  'value': '9468:d568:5c7e:3ece:55c2:8542:2c96:8ca1'}]]

In [16]:
def prepared_data(data_log,ner_data,unique_labels):
    df = pd.DataFrame({
    "sentence": data_log,
    "ner": ner_data,
  })
  #Creat label dictionary
    unique_labels_ner = ["O"] + sorted([f"B-{e}" for e in unique_labels] + [f"I-{e}" for e in unique_labels])
    label2id = {label: idx for idx, label in enumerate(unique_labels_ner)}
    id2label = {v: k for k, v in label2id.items()}
  
  # Tokenize text
    tokenized = df["sentence"].apply(lambda x: tokenizer.encode_plus(
        x,
        add_special_tokens=True,  # Adds [CLS] and [SEP]
        max_length=256,           # Pad/truncate to max length
        padding='max_length',
        truncation=True,
        return_offsets_mapping=True,
        return_attention_mask=True,
        return_tensors='pt'       # Return PyTorch tensors
    ))
    df["input_ids"] = tokenized.apply(lambda x: x["input_ids"].squeeze(0))
    df["attention_mask"] = tokenized.apply(lambda x: x["attention_mask"].squeeze(0))
    df["offset_mapping"] = tokenized.apply(lambda x: x["offset_mapping"].squeeze(0))
    return df,label2id,id2label,unique_labels_ner

df,label2id,id2label,unique_labels_ner = prepared_data(data_train,ner_train,unique_labels)
df_val,_,_,_ = prepared_data(data_val,ner_val,unique_labels)

In [17]:
def mark_label(labels,idx,label2id,label_ner,start_ner =-1,start_token = 1):
    if (start_token < start_ner):
        labels[idx] = label2id[f"B-{label_ner}"]
    else:
        labels[idx] = label2id[f"I-{label_ner}"]
    return labels

In [18]:
def create_labels(ners,offsets,label2id):
    total_labels = []
    for ner,offset in tqdm(zip(ners,offsets)):
        labels = [label2id["O"]] * len(offset)
        idx = 0
        for item in offset:
            start_token = item[0]
            end_token = item[1]
            if start_token == 0 and end_token == 0:
                labels[idx] = -100 
            else:
                for entity in ner:
                    start_ner = entity["start"]
                    end_ner = entity["end"]
                    label_ner = entity["label"]
                    if end_token >= start_ner and end_token < end_ner:
                        labels = mark_label(labels,idx,label2id,label_ner,start_ner,start_token)
                        break
                    elif end_ner > (start_token +1) and start_token > start_ner:
                        labels = mark_label(labels,idx,label2id,label_ner)
                        break
            idx +=1
        total_labels.append(labels)
    return total_labels
df['labels'] = create_labels(df["ner"],df["offset_mapping"],label2id)
df_val['labels'] = create_labels(df_val["ner"],df_val["offset_mapping"],label2id)

0it [00:00, ?it/s]

10it [00:00, 84.32it/s]
0it [00:00, ?it/s]


In [19]:
df['ner'][1]

[{'label': 'fio', 'start': 185, 'end': 197, 'value': 'Amber Bailey'},
 {'label': 'address',
  'start': 198,
  'end': 230,
  'value': 'Unit 3350 Box 4909, DPO AA 96340'}]

In [103]:
df["sentence"][0]

'127.0.0.1 - cynthiagraham [22/Jul/2025:15:56:14 ] "OPTIONS search HTTP/2" 502 230057 "http://walters.com/explore/list/categoryregister.asp" "Mozilla/5.0 (Macintosh; U; PPC Mac OS X 10_9_6 rv:3.0; nso-ZA) AppleWebKit/534.33.4 (KHTML, like Gecko) Version/5.0.1 Safari/534.33.4" 55d7e231-cb01-4718-89ec-623d6e6f7c71 c0fe:5627:a21d:4355:9882:c12d:c999:aa3c 7630 Taylor Extension, East Lisa, RI 45839 (264)562-0255x461 epotter@hotmail.com 36391'

In [20]:
df

Unnamed: 0,sentence,ner,input_ids,attention_mask,offset_mapping,labels
0,"127.0.0.1 - holly47 [22/Jul/2025:15:56:14 ] ""D...","[{'label': 'ipv6', 'start': 303, 'end': 339, '...","[tensor(0), tensor(427), tensor(25691), tensor...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[[tensor(0), tensor(0)], [tensor(0), tensor(2)...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,127.0.0.1 - kendraday [22/Jul/2025:15:56:14 ] ...,"[{'label': 'fio', 'start': 185, 'end': 197, 'v...","[tensor(0), tensor(427), tensor(25691), tensor...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[[tensor(0), tensor(0)], [tensor(0), tensor(2)...","[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"167.130.254.131 - - [22/Jul/2025:15:56:14 ] ""P...","[{'label': 'ip', 'start': 1, 'end': 16, 'value...","[tensor(0), tensor(20035), tensor(5), tensor(1...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[[tensor(0), tensor(0)], [tensor(0), tensor(3)...","[-100, 4, 13, 13, 13, 13, 13, 13, 0, 0, 0, 0, ..."
3,99.178.165.218 - Kimberly Reeves [22/Jul/2025:...,"[{'label': 'ip', 'start': 1, 'end': 15, 'value...","[tensor(0), tensor(4426), tensor(5), tensor(17...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[[tensor(0), tensor(0)], [tensor(0), tensor(2)...","[-100, 4, 13, 13, 13, 13, 13, 13, 13, 0, 3, 12..."
4,127.0.0.1 - Catherine Bailey ([22/Jul/2025:15:...,"[{'label': 'fio', 'start': 13, 'end': 29, 'val...","[tensor(0), tensor(427), tensor(25691), tensor...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[[tensor(0), tensor(0)], [tensor(0), tensor(2)...","[-100, 0, 0, 0, 0, 0, 3, 12, 12, 0, 0, 0, 0, 0..."
5,127.0.0.1 - sara16 (Traci Vega) [22/Jul/2025:1...,"[{'label': 'login', 'start': 13, 'end': 19, 'v...","[tensor(0), tensor(427), tensor(25691), tensor...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[[tensor(0), tensor(0)], [tensor(0), tensor(2)...","[-100, 0, 0, 0, 0, 0, 6, 15, 15, 0, 3, 12, 12,..."
6,7.35.90.59 - karl_2016 [22/Jul/2025:15:56:15 ]...,"[{'label': 'ip', 'start': 1, 'end': 11, 'value...","[tensor(0), tensor(1897), tensor(3050), tensor...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[[tensor(0), tensor(0)], [tensor(0), tensor(2)...","[-100, 4, 13, 13, 13, 13, 13, 0, 0, 0, 0, 0, 0..."
7,170.29.149.208 - Анжела Львовна Молчанова НПО ...,"[{'label': 'ip', 'start': 1, 'end': 15, 'value...","[tensor(0), tensor(7182), tensor(5), tensor(24...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[[tensor(0), tensor(0)], [tensor(0), tensor(3)...","[-100, 4, 13, 13, 13, 13, 13, 13, 0, 3, 12, 12..."
8,"127.0.0.1 - alla98 [15:56:16] ""OPTIONS categor...","[{'label': 'login', 'start': 13, 'end': 19, 'v...","[tensor(0), tensor(427), tensor(25691), tensor...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[[tensor(0), tensor(0)], [tensor(0), tensor(2)...","[-100, 0, 0, 0, 0, 0, 6, 15, 0, 0, 0, 0, 0, 0,..."
9,199.40.168.7 - Michele Harris - nathanpham@yah...,"[{'label': 'ip', 'start': 1, 'end': 13, 'value...","[tensor(0), tensor(20331), tensor(5), tensor(1...","[tensor(1), tensor(1), tensor(1), tensor(1), t...","[[tensor(0), tensor(0)], [tensor(0), tensor(3)...","[-100, 4, 13, 13, 13, 13, 13, 13, 0, 3, 12, 12..."


In [105]:
df['ner'][0]

[{'label': 'ipv6',
  'start': 314,
  'end': 353,
  'value': 'c0fe:5627:a21d:4355:9882:c12d:c999:aa3c'},
 {'label': 'address',
  'start': 354,
  'end': 396,
  'value': '7630 Taylor Extension, East Lisa, RI 45839'},
 {'label': 'phone', 'start': 397, 'end': 414, 'value': '(264)562-0255x461'},
 {'label': 'email', 'start': 415, 'end': 434, 'value': 'epotter@hotmail.com'}]

In [21]:
def test_tokens(labels,input_ids,offset):
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    token_labels = [id2label[l] if l != -100 else "IGNORED" for l in labels]
    for t, l, o in zip(tokens, token_labels,offset):
        print(f"{t:15} → {l}, {o}")
test_tokens(df['labels'][0],df['input_ids'][0],df["offset_mapping"][0])

<s>             → IGNORED, tensor([0, 0])
▁12             → O, tensor([0, 2])
7.0             → O, tensor([2, 5])
.               → O, tensor([5, 6])
0.1             → O, tensor([6, 9])
▁-              → O, tensor([ 9, 11])
▁holl           → O, tensor([11, 16])
y               → O, tensor([16, 17])
47              → O, tensor([17, 19])
▁[              → O, tensor([19, 21])
22              → O, tensor([21, 23])
/               → O, tensor([23, 24])
Jul             → O, tensor([24, 27])
/20             → O, tensor([27, 30])
25              → O, tensor([30, 32])
:15             → O, tensor([32, 35])
:               → O, tensor([35, 36])
56              → O, tensor([36, 38])
:14             → O, tensor([38, 41])
▁               → O, tensor([41, 42])
]               → O, tensor([42, 43])
▁"              → O, tensor([43, 45])
DE              → O, tensor([45, 47])
LE              → O, tensor([47, 49])
TE              → O, tensor([49, 51])
▁main           → O, tensor([51, 56])
/               

In [107]:
class TokenClassifierHead(nn.Module):
    def __init__(self, hidden_size, num_labels):
        """
        Classification head for token-level predictions.
        """
        super(TokenClassifierHead, self).__init__()
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, x):
        """
        Args:
            x: tensor of shape [batch_size, seq_len, hidden_size]
        Returns:
            logits: tensor of shape [batch_size, seq_len, num_labels]
        """
        logits = self.classifier(x)
        return logits
    

In [108]:
class BERTForNER(nn.Module):
    def __init__(self, model, num_labels):
        """
        BERT + token classification head for NER.
        """
        super(BERTForNER, self).__init__()
        self.bert = model
        hidden_size = self.bert.config.hidden_size
        self.classifier = TokenClassifierHead(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None):
        """
        Args:
            input_ids: [batch_size, seq_len]
            attention_mask: [batch_size, seq_len]
            token_type_ids: [batch_size, seq_len]
        Returns:
            logits: [batch_size, seq_len, num_labels]
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )

        # Get all token embeddings
        sequence_output = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]

        # Predict logits for each token
        logits = self.classifier(sequence_output)  # [batch_size, seq_len, num_labels]
        return logits


In [109]:
from torch.utils.data import Dataset

class NERDataset(Dataset):
    def __init__(self, input_ids,attention_mask, labels):
        """
        Args:
            encodings: tokenizer output (input_ids, attention_mask, etc.)
            labels: list of BIO tag IDs aligned with tokens
        """
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item ={}
        item['input_ids'] =  torch.tensor(self.input_ids[idx])
        item['attention_mask']  = torch.tensor(self.attention_mask[idx])
        item["labels"] =  torch.tensor(self.labels[idx])
        return item

In [110]:
def train(model, df, df_val, unique_labels_ner, epochs):
    num_labels = len(unique_labels_ner)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device == 'cuda':
        torch.cuda.empty_cache()

    optimizer = AdamW(model.parameters(), lr=1e-3)
    criterion = CrossEntropyLoss(ignore_index=-100)

    # Create dataloaders
    train_dataset = NERDataset(df['input_ids'], df['attention_mask'], df['labels'])
    val_dataset = NERDataset(df_val['input_ids'], df_val['attention_mask'], df_val['labels'])

    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)

    for epoch in range(epochs):
        # Training phase
        model.train()
        total_loss_train = 0.0

        for b_n, batch in tqdm(enumerate(train_loader)):
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask)

            loss = criterion(
                logits.view(-1, num_labels),  # Flatten
                labels.view(-1)
            )

            loss.backward()
            optimizer.step()

            total_loss_train += loss.item()
            if b_n % 100 == 0:
                print('train_loss:', loss.item())
                model.eval()
                total_loss_val = 0.0

                with torch.no_grad():
                    for batch in val_loader:
                        input_ids = batch["input_ids"].to(device)
                        attention_mask = batch["attention_mask"].to(device)
                        labels = batch["labels"].to(device)

                        logits = model(input_ids, attention_mask)

                        loss = criterion(
                            logits.view(-1, num_labels),
                            labels.view(-1)
                        )
                        total_loss_val += loss.item()

                avg_loss_val = total_loss_val / len(val_loader)

                print(f"Epoch {epoch+1}/{epochs} batch {b_n}/{len(train_loader)} - Loss_val: {avg_loss_val:.4f}")

        
        avg_loss_train = total_loss_train / len(train_loader)

        # Validation phase
        model.eval()
        total_loss_val = 0.0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                logits = model(input_ids, attention_mask)

                loss = criterion(
                    logits.view(-1, num_labels),
                    labels.view(-1)
                )
                total_loss_val += loss.item()

        avg_loss_val = total_loss_val / len(val_loader)

        print(f"Epoch {epoch+1}/{epochs} - Loss_train: {avg_loss_train:.4f}, Loss_val: {avg_loss_val:.4f}")

    return

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTForNER(base_model, len(unique_labels_ner)).to(device)

In [None]:
train(model,df,df_val,unique_labels_ner,2)

0it [00:00, ?it/s]

train_loss: 3.342041492462158


1it [00:14, 14.16s/it]

Epoch 1/2 batch0/179 - Loss_val: 1.9312


100it [05:55,  3.45s/it]

train_loss: 1.6870251893997192


101it [06:09,  6.64s/it]

Epoch 1/2 batch100/179 - Loss_val: 1.7530


118it [07:08,  3.46s/it]

In [None]:
torch.save(model_test, "/home/jupyter/datasphere/project/model/bertner_full.pth")

In [23]:
df_test['sentence'][0]

'200.168.1.1 - aguljaeva - Ермил Арсенович Жуков [22/Jul/2025:15:56:19 ] "POST http://www.bankovski.info/ HTTP/1.1/200" 594358 "http://www.rao.biz/app/search/postssearch.htm" "Mozilla/5.0 (compatible; MSIE 8.0; Windows CE; Trident/4.0)" 594358 oao.biz д. Саранск, бул. Алтайский, д. 530, 529045'

In [25]:
def predict(text, model, tokenizer, id2label, device="cpu"):
    """
    Run NER prediction on raw text.

    Args:
        text (str): Raw input text.
        model (nn.Module): Your trained BERTForNER model.
        tokenizer: Hugging Face tokenizer.
        id2label (dict): Mapping from label IDs to label names.
        device (str): "cpu" or "cuda".
    Returns:
        List of (token, predicted_label)
    """
    # Tokenize input text
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,  # Adds [CLS] and [SEP]
        max_length=512,           # Pad/truncate to max length
        padding='max_length',
        truncation=True,
        return_offsets_mapping=True,
        return_attention_mask=True,
        return_tensors='pt'       # Return PyTorch tensors
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Set model to eval mode
    model.eval()
    with torch.no_grad():
        # Forward pass
        logits = model(input_ids, attention_mask)
        predictions = torch.argmax(logits, dim=-1)  # [batch_size, seq_len]

    # Convert IDs to tokens and labels
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    predicted_labels = [id2label[p.item()] for p in predictions[0]]

    # Filter out special tokens ([CLS], [SEP], [PAD])
    filtered_results = []
    for token, label in zip(tokens, predicted_labels):
        if token not in tokenizer.all_special_tokens:
            filtered_results.append((token, label))

    return filtered_results

In [27]:
filtered_results = predict(df_test['sentence'][0], model_test, tokenizer, id2label, device="cuda")

In [28]:
set(filtered_results

[('▁200', 'O'),
 ('.', 'O'),
 ('168', 'O'),
 ('.1.1', 'O'),
 ('▁-', 'O'),
 ('▁a', 'O'),
 ('gul', 'O'),
 ('ja', 'O'),
 ('eva', 'O'),
 ('▁-', 'O'),
 ('▁Ер', 'O'),
 ('мил', 'O'),
 ('▁Арсен', 'O'),
 ('ович', 'O'),
 ('▁Жу', 'O'),
 ('ков', 'O'),
 ('▁[', 'O'),
 ('22', 'O'),
 ('/', 'O'),
 ('Jul', 'O'),
 ('/20', 'O'),
 ('25', 'O'),
 (':15', 'O'),
 (':', 'O'),
 ('56', 'O'),
 (':', 'O'),
 ('19', 'O'),
 ('▁', 'O'),
 (']', 'O'),
 ('▁"', 'O'),
 ('P', 'O'),
 ('OST', 'O'),
 ('▁http', 'O'),
 ('://', 'O'),
 ('www', 'O'),
 ('.', 'O'),
 ('ban', 'O'),
 ('kov', 'O'),
 ('ski', 'O'),
 ('.', 'O'),
 ('info', 'O'),
 ('/', 'O'),
 ('▁HTTP', 'O'),
 ('/', 'O'),
 ('1.1', 'O'),
 ('/', 'O'),
 ('200', 'O'),
 ('"', 'O'),
 ('▁59', 'O'),
 ('43', 'O'),
 ('58', 'O'),
 ('▁"', 'O'),
 ('http', 'O'),
 ('://', 'O'),
 ('www', 'O'),
 ('.', 'O'),
 ('ra', 'O'),
 ('o', 'O'),
 ('.', 'O'),
 ('bi', 'O'),
 ('z', 'O'),
 ('/', 'O'),
 ('app', 'O'),
 ('/', 'O'),
 ('search', 'O'),
 ('/', 'O'),
 ('post', 'O'),
 ('s', 'O'),
 ('search', 'O'),
 ('