In [None]:
# Import all libs
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from seqeval.metrics import classification_report as seqeval_report
from typing import List, Union
import numpy as np
from collections import defaultdict
import numpy as np
import random
import json
import re
import time

In [None]:
# Fixing all random seeds
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds PyTorch (CPU and all CUDA devices), NumPy's global generator and
    Python's ``random`` module, then puts cuDNN into deterministic mode with
    benchmark auto-tuning switched off.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Trade cuDNN speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)

In [None]:
# Tokenizer
# Regex alternatives are tried left to right and the first match wins, so the
# multi-part numeric patterns must come BEFORE the generic word pattern;
# otherwise ``\w+`` swallows the leading digits and the decimal / fraction
# branches can never match. A bare ``\d+`` branch is not needed because
# ``\w+`` already covers plain integers.
_TOKEN_RE = re.compile(r'''
    \d+\.\d+|        # decimal number, e.g. 3.14
    \d+/\d+|         # fraction, e.g. 1/2
    \w+[-\w]*|       # word or integer, optionally continued with hyphens
    [^\w\s]          # any single non-word, non-space character
''', re.VERBOSE)


def tokenizer(text):
    """Split ``text`` into word, number and punctuation tokens.

    Decimals (``3.14``) and fractions (``1/2``) are kept as single tokens;
    words may contain inner hyphens; every other non-space symbol becomes a
    token of its own. Whitespace is discarded.

    NOTE(review): if the training data was tokenized with a splitter that
    breaks decimals/fractions into separate tokens, re-tokenize it (or
    confirm the vocabulary contains such tokens) so inference matches.

    Args:
        text: Input string.

    Returns:
        List of token strings in order of appearance (empty for empty text).
    """
    return _TOKEN_RE.findall(text)

In [None]:
class NERDataset(Dataset):
    """Map-style dataset yielding ``(word_ids, tag_ids)`` tensor pairs.

    Args:
        data: Indexable collection of entries; each entry is read through
            its ``"tokens"`` and ``"tags"`` keys.
        vocab: Mapping from token to integer id. Tokens that are missing
            from it are encoded as index 1.
        tag_encoder: Object whose ``transform`` turns a sequence of tag
            labels into integer ids.
    """

    def __init__(self, data, vocab, tag_encoder):
        self.data, self.vocab, self.tag_encoder = data, vocab, tag_encoder

    def __len__(self):
        """Return the number of entries."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode entry ``idx`` as a pair of ``torch.long`` tensors."""
        sample = self.data[idx]
        sample_tokens, sample_tags = sample["tokens"], sample["tags"]

        lookup = self.vocab.get
        encoded_words = torch.tensor(
            [lookup(tok, 1) for tok in sample_tokens], dtype=torch.long
        )
        encoded_tags = torch.tensor(
            self.tag_encoder.transform(sample_tags), dtype=torch.long
        )
        return encoded_words, encoded_tags

def collate_fn(batch):
    """Pad a batch of ``(word_ids, tag_ids)`` pairs to a common length.

    Word ids are padded with 0 and tag ids with -100; both results are
    batch-first.

    Args:
        batch: Sequence of ``(word_ids, tag_ids)`` tensor pairs.

    Returns:
        Tuple ``(words_padded, tags_padded)``.
    """
    word_seqs, tag_seqs = map(list, zip(*batch))
    return (
        pad_sequence(word_seqs, batch_first=True, padding_value=0),
        pad_sequence(tag_seqs, batch_first=True, padding_value=-100),
    )

In [None]:
class UniversalCNN_NER(nn.Module):
    """Token tagger: embedding -> stacked 1-D convolutions -> optional LSTM -> linear.

    Every convolution keeps the sequence length unchanged, so the model
    emits one score vector per input token.

    Args:
        vocab_size: Number of rows in the embedding table. Index 0 is the
            padding index.
        num_tags: Number of output classes per token.
        embedding_dim: Size of the token embeddings.
        num_filters: Output channels of every convolution layer.
        filter_sizes: Kernel size of each stacked convolution, either a
            single int (one layer) or a list (one layer per element). Odd
            and even sizes are both supported. The default list is never
            mutated.
        use_lstm: If True, run a single-layer LSTM over the convolution
            output.
        lstm_hidden: Hidden size of the LSTM (per direction).
        bidirectional: Whether the LSTM is bidirectional.
        use_layernorm: If True, apply LayerNorm before the final dropout.
        dropout: Dropout probability used after every convolution and
            before the output layer.
    """

    def __init__(self,
                 vocab_size: int,
                 num_tags: int,
                 embedding_dim: int = 100,
                 num_filters: int = 128,
                 filter_sizes: Union[int, List[int]] = [3, 5, 7],
                 use_lstm: bool = False,
                 lstm_hidden: int = 256,
                 bidirectional: bool = True,
                 use_layernorm: bool = False,
                 dropout: float = 0.3):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        if isinstance(filter_sizes, int):
            filter_sizes = [filter_sizes]
        self.num_conv_layers = len(filter_sizes)

        self.conv_layers = nn.ModuleList()
        in_channels = embedding_dim

        for fs in filter_sizes:
            # fs // 2 preserves the length only for odd kernels; an even
            # kernel would return seq_len + 1 positions and break the
            # token/tag alignment, so let PyTorch compute 'same' padding
            # in that case.
            padding = fs // 2 if fs % 2 == 1 else 'same'
            self.conv_layers.append(
                nn.Sequential(
                    nn.Conv1d(
                        in_channels=in_channels,
                        out_channels=num_filters,
                        kernel_size=fs,
                        padding=padding
                    ),
                    nn.GELU(),
                    nn.Dropout(dropout)
                )
            )
            in_channels = num_filters

        # Width of the features leaving the convolution stack. Equals
        # num_filters whenever at least one layer exists and falls back to
        # embedding_dim for an empty stack, keeping later layers consistent.
        feature_size = in_channels

        self.use_lstm = use_lstm
        if use_lstm:
            self.lstm = nn.LSTM(
                input_size=feature_size,
                hidden_size=lstm_hidden,
                num_layers=1,
                bidirectional=bidirectional,
                batch_first=True
            )
            lstm_output_size = lstm_hidden * 2 if bidirectional else lstm_hidden
        else:
            lstm_output_size = feature_size

        self.use_layernorm = use_layernorm
        if use_layernorm:
            self.layernorm = nn.LayerNorm(lstm_output_size)

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(lstm_output_size, num_tags)

        self._init_weights()

    def _init_weights(self):
        """Initialise conv (Kaiming), LSTM (Xavier) and output (Xavier) weights; zero all biases."""
        for layer in self.conv_layers:
            # layer[0] is the Conv1d inside the Sequential block.
            nn.init.kaiming_normal_(layer[0].weight, mode='fan_out', nonlinearity='relu')
            nn.init.constant_(layer[0].bias, 0)

        if hasattr(self, 'lstm'):
            for name, param in self.lstm.named_parameters():
                if 'weight' in name:
                    nn.init.xavier_normal_(param)
                else:
                    nn.init.constant_(param, 0)

        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)

    def forward(self, x):
        """Compute per-token tag scores.

        Args:
            x: Integer tensor of token ids, batch-first ``(batch, seq_len)``.

        Returns:
            Tensor of unnormalised scores, ``(batch, seq_len, num_tags)``.
        """
        x = self.embedding(x)
        # Conv1d expects channels before the sequence axis.
        x = x.permute(0, 2, 1)

        # Apply the stacked CNN layers
        for conv in self.conv_layers:
            x = conv(x)

        # Back to (batch, seq_len, features) for the LSTM / linear layers.
        x = x.permute(0, 2, 1)

        if self.use_lstm:
            x, _ = self.lstm(x)

        if self.use_layernorm:
            x = self.layernorm(x)

        return self.fc(self.dropout(x))
    
# Train
def train_model(model, dataloader, num_tags, epochs=20, lr=1e-3, model_save_path='best_model.pt'):
    """Train ``model`` with Adam and token-level cross-entropy.

    The model is moved to CUDA when available, otherwise it stays on CPU.
    After each epoch the average batch loss is printed, and the model's
    ``state_dict`` is written to ``model_save_path`` only when that average
    is lower than in every previous epoch, so the file holds the best
    (lowest training loss) weights rather than simply the latest ones.

    Args:
        model: Module mapping a batch of token ids to scores whose last
            dimension is ``num_tags``.
        dataloader: Iterable of ``(words, tags)`` tensor batches. Tag
            positions equal to -100 are excluded from the loss.
        num_tags: Number of tag classes (size of the last score dimension).
        epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.
        model_save_path: Destination of the best checkpoint.

    Returns:
        The trained model (weights of the final epoch), on the training device.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # -100 is the value used to pad tag sequences; naming it explicitly
    # documents why padded positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=-100)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_loss = float('inf')

    for epoch in range(epochs):

        epoch_loss = 0.0
        num_batches = 0
        model.train()

        for words, tags in dataloader:
            words = words.to(device)
            tags = tags.to(device)

            optimizer.zero_grad()
            # Forward pass
            outputs = model(words)

            # Flatten to (tokens, num_tags) / (tokens,) as CrossEntropyLoss expects
            outputs = outputs.reshape(-1, num_tags)
            tags = tags.reshape(-1)

            loss = criterion(outputs, tags)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")

        # Loss of the last batch only (noisy); the average follows below.
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
        avg_loss = epoch_loss / num_batches

        # Checkpoint only on improvement so the file really is the best model.
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), model_save_path)

        print(f'\nEpoch {epoch+1}/{epochs} | Avg Loss: {avg_loss:.4f}')
    return model

# Predict
def predict(model, sentence, vocab, tag_encoder):
    """Tag one tokenized sentence and print the entities that were found.

    Args:
        model: Trained tagger; called with a ``(1, len(sentence))`` tensor
            of token ids placed on the model's own device.
        sentence: List of token strings.
        vocab: Mapping from token to id; unknown tokens map to index 1.
        tag_encoder: Object whose ``inverse_transform`` maps predicted
            class ids back to tag labels.

    Returns:
        The per-token tag labels as returned by
        ``tag_encoder.inverse_transform`` (an empty array for an empty
        sentence).
    """
    model.eval()
    word_ids = [vocab.get(word, 1) for word in sentence]

    # Nothing to tag: an empty id list cannot be fed to the embedding layer.
    if not word_ids:
        return np.array([], dtype=object)

    # Build the input on whatever device the model lives on (it is left on
    # CUDA after GPU training); parameter-free models fall back to CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    with torch.no_grad():
        outputs = model(torch.tensor([word_ids], dtype=torch.long, device=device))
        _, predicted = torch.max(outputs, 2)

    # Index the single batch element rather than squeeze(): squeeze() would
    # also drop the sequence axis of a one-token sentence.
    predicted_tags = tag_encoder.inverse_transform(predicted[0].cpu().numpy())

    entities = []
    current_entity = None
    start_idx = 0

    for i, tag in enumerate(predicted_tags):
        if tag.startswith('B-'):
            # A new entity begins; close the one that was open, if any.
            if current_entity is not None:
                entities.append((start_idx, i, current_entity))
            current_entity = tag[2:]
            start_idx = i
        elif tag == 'O' and current_entity is not None:
            entities.append((start_idx, i, current_entity))
            current_entity = None

    # Close an entity that runs to the end of the sentence.
    if current_entity is not None:
        entities.append((start_idx, len(sentence), current_entity))

    # Report results
    for start, end, entity_type in entities:
        entity_text = ' '.join(sentence[start:end])
        print(f"{entity_text} -> {entity_type}")

    return predicted_tags


# Evaluation
def evaluate_model(model, test_data, vocab, tag_encoder):
    """Evaluate ``model`` on tagged sentences at the entity level.

    Each entry is tagged on its own (batch size 1). Two results are
    produced: the seqeval classification report, and per-entity-type
    precision / recall / F1 / support computed from exact ``(start, end)``
    span matches as extracted by ``get_entities``.

    Args:
        model: Trained tagger.
        test_data: Iterable of entries read through their ``"tokens"`` and
            ``"tags"`` keys.
        vocab: Mapping from token to id; unknown tokens map to index 1.
        tag_encoder: Object whose ``inverse_transform`` maps class ids back
            to tag labels.

    Returns:
        Dict with ``'entity_level'`` (the seqeval report) and
        ``'entity_metrics'`` (entity type -> dict with ``precision``,
        ``recall``, ``f1`` and ``support``).
    """
    model.eval()

    # Inputs must live on the model's device (CUDA after GPU training).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    all_true_tags = []
    all_pred_tags = []

    with torch.no_grad():
        for entry in test_data:
            tokens = entry["tokens"]
            true_tags = entry["tags"]

            word_ids = [vocab.get(word, 1) for word in tokens]
            if word_ids:
                outputs = model(torch.tensor([word_ids], dtype=torch.long, device=device))
                _, predicted = torch.max(outputs, 2)
                # predicted[0] keeps the sequence axis even for one-token
                # entries, unlike squeeze().
                pred_tags = tag_encoder.inverse_transform(predicted[0].cpu().numpy()).tolist()
            else:
                # Empty entry: nothing to feed through the model.
                pred_tags = []

            all_true_tags.append(true_tags)
            all_pred_tags.append(pred_tags)

    # Entity-level
    entity_report = seqeval_report(all_true_tags, all_pred_tags, zero_division=0)

    # Calculate metrics
    entity_types = set(tag[2:] for tags in all_true_tags for tag in tags if tag != 'O')
    entity_metrics = {}

    # Extract the spans once per sentence instead of once per entity type.
    span_pairs = [
        (get_entities(true_seq), get_entities(pred_seq))
        for true_seq, pred_seq in zip(all_true_tags, all_pred_tags)
    ]

    for entity in entity_types:
        tp = 0  # True Positives
        fp = 0  # False Positives
        fn = 0  # False Negatives

        for true_entities, pred_entities in span_pairs:
            true_set = set((start, end) for (start, end, e_type) in true_entities if e_type == entity)
            pred_set = set((start, end) for (start, end, e_type) in pred_entities if e_type == entity)

            tp += len(true_set & pred_set)
            fp += len(pred_set - true_set)
            fn += len(true_set - pred_set)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        entity_metrics[entity] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': tp + fn
        }

    return {
        'entity_level': entity_report,
        'entity_metrics': entity_metrics
    }

def get_entities(tag_sequence):
    """Extract entity spans from a sequence of tag labels.

    A tag starting with ``'B-'`` opens a new entity (closing any entity
    that is still open); an ``'O'`` tag closes the open entity; every other
    tag leaves the current state unchanged. An entity still open at the end
    is closed at ``len(tag_sequence)``.

    Args:
        tag_sequence: Sequence of tag strings.

    Returns:
        List of ``(start, end, entity_type)`` tuples with ``end`` exclusive.
    """
    spans = []
    label = None
    begin = 0

    for pos, tag in enumerate(tag_sequence):
        opens_entity = tag.startswith('B-')
        closes_entity = opens_entity or tag == 'O'

        if closes_entity and label is not None:
            spans.append((begin, pos, label))
            label = None

        if opens_entity:
            label, begin = tag[2:], pos

    if label is not None:
        spans.append((begin, len(tag_sequence), label))

    return spans

In [None]:
# Read data
# Forward slashes are accepted by open() on Windows as well as on POSIX
# systems, so the relative paths work regardless of the operating system.
with open("../../data/raw/dataset_train.json", 'r', encoding='utf-8') as fp:
    sample_data_train = json.load(fp)
print(len(sample_data_train))

with open("../../data/raw/dataset_test.json", 'r', encoding='utf-8') as fp:
    sample_data_test = json.load(fp)
print(len(sample_data_test))

In [None]:
# Build the vocabulary, the tag encoder and the training dataloader
word_counts = defaultdict(int)
all_tags = []

for entry in sample_data_train:
    for token in entry["tokens"]:
        word_counts[token] += 1
    all_tags += entry["tags"]

# Ids start at 2: index 0 is reserved for padding and index 1 for unknown tokens.
vocab = {word: idx for idx, word in enumerate(word_counts, start=2)}
vocab_size = 2 + len(vocab)

tag_encoder = LabelEncoder().fit(all_tags)
num_tags = len(tag_encoder.classes_)

dataset = NERDataset(sample_data_train, vocab, tag_encoder)
dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# Train simple CNN (1 layer)
time_start = time.time()
model = UniversalCNN_NER(
    # Size the embedding table from the vocabulary built from the training
    # data instead of a hard-coded constant, so every token id is in range.
    vocab_size=vocab_size,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=3,
    use_lstm=False
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

In [None]:
# Train simple CNN (2 layer)
time_start = time.time()
model = UniversalCNN_NER(
    # Size the embedding table from the vocabulary built from the training
    # data instead of a hard-coded constant, so every token id is in range.
    vocab_size=vocab_size,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=[3, 5],
    num_filters=256,
    use_lstm=False
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

In [36]:
# Train simple CNN (3 layer)
time_start = time.time()
model = UniversalCNN_NER(
    # Size the embedding table from the vocabulary built from the training
    # data instead of a hard-coded constant, so every token id is in range.
    vocab_size=vocab_size,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=[3, 5, 7],
    num_filters=256,
    use_lstm=False
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

Epoch 4, Loss: 0.1109

Epoch 4/20 | Avg Loss: 0.0839
Epoch 5, Loss: 0.0659

Epoch 5/20 | Avg Loss: 0.0734
Epoch 6, Loss: 0.0252

Epoch 6/20 | Avg Loss: 0.0591
Epoch 7, Loss: 0.0007

Epoch 7/20 | Avg Loss: 0.0565
Epoch 8, Loss: 0.0837

Epoch 8/20 | Avg Loss: 0.0534
Epoch 9, Loss: 0.0158

Epoch 9/20 | Avg Loss: 0.0509
Epoch 10, Loss: 0.0344

Epoch 10/20 | Avg Loss: 0.0496
Epoch 11, Loss: 0.0014

Epoch 11/20 | Avg Loss: 0.0476
Epoch 12, Loss: 0.0014

Epoch 12/20 | Avg Loss: 0.0463
Epoch 13, Loss: 0.0922

Epoch 13/20 | Avg Loss: 0.0397
Epoch 14, Loss: 0.0374

Epoch 14/20 | Avg Loss: 0.0412
Epoch 15, Loss: 0.0252

Epoch 15/20 | Avg Loss: 0.0380
Epoch 16, Loss: 0.1932

Epoch 16/20 | Avg Loss: 0.0384
Epoch 17, Loss: 0.1215

Epoch 17/20 | Avg Loss: 0.0411
Epoch 18, Loss: 0.1632

Epoch 18/20 | Avg Loss: 0.0394
Epoch 19, Loss: 0.0187

Epoch 19/20 | Avg Loss: 0.0366
Epoch 20, Loss: 0.0004

Epoch 20/20 | Avg Loss: 0.0373
Time train:  207.43672037124634 c

=== Entity-level Metrics ===
             

In [43]:
# Train simple CNN (4 layer)
time_start = time.time()
model = UniversalCNN_NER(
    # Size the embedding table from the vocabulary built from the training
    # data instead of a hard-coded constant, so every token id is in range.
    vocab_size=vocab_size,
    embedding_dim=50,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=[3, 5, 5, 7],
    num_filters=256,
    use_lstm=False
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

Epoch 1, Loss: 0.1978

Epoch 1/20 | Avg Loss: 0.6642
Epoch 2, Loss: 0.0731

Epoch 2/20 | Avg Loss: 0.2388
Epoch 3, Loss: 0.1725

Epoch 3/20 | Avg Loss: 0.1592
Epoch 4, Loss: 0.0135

Epoch 4/20 | Avg Loss: 0.1275
Epoch 5, Loss: 0.1317

Epoch 5/20 | Avg Loss: 0.1049
Epoch 6, Loss: 0.0218

Epoch 6/20 | Avg Loss: 0.0952
Epoch 7, Loss: 0.1287

Epoch 7/20 | Avg Loss: 0.0863
Epoch 8, Loss: 0.0008

Epoch 8/20 | Avg Loss: 0.0804
Epoch 9, Loss: 0.1102

Epoch 9/20 | Avg Loss: 0.0785
Epoch 10, Loss: 0.2032

Epoch 10/20 | Avg Loss: 0.0737
Epoch 11, Loss: 0.0162

Epoch 11/20 | Avg Loss: 0.0680
Epoch 12, Loss: 0.0502

Epoch 12/20 | Avg Loss: 0.0692
Epoch 13, Loss: 0.0225

Epoch 13/20 | Avg Loss: 0.0672
Epoch 14, Loss: 0.0015

Epoch 14/20 | Avg Loss: 0.0650
Epoch 15, Loss: 0.0967

Epoch 15/20 | Avg Loss: 0.0600
Epoch 16, Loss: 0.2870

Epoch 16/20 | Avg Loss: 0.0605
Epoch 17, Loss: 0.1219

Epoch 17/20 | Avg Loss: 0.0633
Epoch 18, Loss: 0.1211

Epoch 18/20 | Avg Loss: 0.0562
Epoch 19, Loss: 0.0261

Epoc

In [45]:
# Train simple CNN (1 layer) + LSTM
time_start = time.time()
model = UniversalCNN_NER(
    # Size the embedding table from the vocabulary built from the training
    # data instead of a hard-coded constant, so every token id is in range.
    vocab_size=vocab_size,
    embedding_dim=50,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=[3],
    num_filters=256,
    use_lstm=True,
    bidirectional=False
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

Epoch 1, Loss: 0.1572

Epoch 1/20 | Avg Loss: 0.6893
Epoch 2, Loss: 0.0288

Epoch 2/20 | Avg Loss: 0.2147
Epoch 3, Loss: 0.0159

Epoch 3/20 | Avg Loss: 0.1264
Epoch 4, Loss: 0.0098

Epoch 4/20 | Avg Loss: 0.0837
Epoch 5, Loss: 0.0108

Epoch 5/20 | Avg Loss: 0.0592
Epoch 6, Loss: 0.2980

Epoch 6/20 | Avg Loss: 0.0489
Epoch 7, Loss: 0.0016

Epoch 7/20 | Avg Loss: 0.0374
Epoch 8, Loss: 0.1246

Epoch 8/20 | Avg Loss: 0.0301
Epoch 9, Loss: 0.0647

Epoch 9/20 | Avg Loss: 0.0258
Epoch 10, Loss: 0.0059

Epoch 10/20 | Avg Loss: 0.0226
Epoch 11, Loss: 0.0017

Epoch 11/20 | Avg Loss: 0.0197
Epoch 12, Loss: 0.0502

Epoch 12/20 | Avg Loss: 0.0168
Epoch 13, Loss: 0.0014

Epoch 13/20 | Avg Loss: 0.0147
Epoch 14, Loss: 0.0440

Epoch 14/20 | Avg Loss: 0.0134
Epoch 15, Loss: 0.0011

Epoch 15/20 | Avg Loss: 0.0130
Epoch 16, Loss: 0.0087

Epoch 16/20 | Avg Loss: 0.0125
Epoch 17, Loss: 0.0130

Epoch 17/20 | Avg Loss: 0.0122
Epoch 18, Loss: 0.0010

Epoch 18/20 | Avg Loss: 0.0104
Epoch 19, Loss: 0.0923

Epoc

In [44]:
# Train simple CNN (1 layer) + BiLSTM
time_start = time.time()
model = UniversalCNN_NER(
    # Size the embedding table from the vocabulary built from the training
    # data instead of a hard-coded constant, so every token id is in range.
    vocab_size=vocab_size,
    embedding_dim=50,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=[3],
    num_filters=256,
    use_lstm=True
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

Epoch 1, Loss: 0.3573

Epoch 1/20 | Avg Loss: 0.6224
Epoch 2, Loss: 0.2820

Epoch 2/20 | Avg Loss: 0.1732
Epoch 3, Loss: 0.0279

Epoch 3/20 | Avg Loss: 0.0939
Epoch 4, Loss: 0.0797

Epoch 4/20 | Avg Loss: 0.0611
Epoch 5, Loss: 0.0320

Epoch 5/20 | Avg Loss: 0.0424
Epoch 6, Loss: 0.1321

Epoch 6/20 | Avg Loss: 0.0317
Epoch 7, Loss: 0.0876

Epoch 7/20 | Avg Loss: 0.0234
Epoch 8, Loss: 0.0232

Epoch 8/20 | Avg Loss: 0.0182
Epoch 9, Loss: 0.0015

Epoch 9/20 | Avg Loss: 0.0160
Epoch 10, Loss: 0.0299

Epoch 10/20 | Avg Loss: 0.0128
Epoch 11, Loss: 0.0078

Epoch 11/20 | Avg Loss: 0.0120
Epoch 12, Loss: 0.0345

Epoch 12/20 | Avg Loss: 0.0105
Epoch 13, Loss: 0.0117

Epoch 13/20 | Avg Loss: 0.0095
Epoch 14, Loss: 0.0034

Epoch 14/20 | Avg Loss: 0.0114
Epoch 15, Loss: 0.0037

Epoch 15/20 | Avg Loss: 0.0083
Epoch 16, Loss: 0.0108

Epoch 16/20 | Avg Loss: 0.0058
Epoch 17, Loss: 0.0003

Epoch 17/20 | Avg Loss: 0.0068
Epoch 18, Loss: 0.0011

Epoch 18/20 | Avg Loss: 0.0070
Epoch 19, Loss: 0.0017

Epoc

In [51]:
# Train simple CNN (2 layer) + BiLSTM
time_start = time.time()
model = UniversalCNN_NER(
    # Size the embedding table from the vocabulary built from the training
    # data instead of a hard-coded constant, so every token id is in range.
    vocab_size=vocab_size,
    embedding_dim=50,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=[3, 5],
    num_filters=256,
    use_lstm=True
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

Epoch 1, Loss: 0.4488

Epoch 1/20 | Avg Loss: 0.6167
Epoch 2, Loss: 0.0782

Epoch 2/20 | Avg Loss: 0.1700
Epoch 3, Loss: 0.0127

Epoch 3/20 | Avg Loss: 0.0939
Epoch 4, Loss: 0.0311

Epoch 4/20 | Avg Loss: 0.0650
Epoch 5, Loss: 0.0129

Epoch 5/20 | Avg Loss: 0.0502
Epoch 6, Loss: 0.0024

Epoch 6/20 | Avg Loss: 0.0378
Epoch 7, Loss: 0.0400

Epoch 7/20 | Avg Loss: 0.0288
Epoch 8, Loss: 0.0006

Epoch 8/20 | Avg Loss: 0.0246
Epoch 9, Loss: 0.0236

Epoch 9/20 | Avg Loss: 0.0211
Epoch 10, Loss: 0.0147

Epoch 10/20 | Avg Loss: 0.0210
Epoch 11, Loss: 0.0003

Epoch 11/20 | Avg Loss: 0.0218
Epoch 12, Loss: 0.0095

Epoch 12/20 | Avg Loss: 0.0186
Epoch 13, Loss: 0.0119

Epoch 13/20 | Avg Loss: 0.0166
Epoch 14, Loss: 0.0088

Epoch 14/20 | Avg Loss: 0.0163
Epoch 15, Loss: 0.0897

Epoch 15/20 | Avg Loss: 0.0161
Epoch 16, Loss: 0.0007

Epoch 16/20 | Avg Loss: 0.0145
Epoch 17, Loss: 0.0050

Epoch 17/20 | Avg Loss: 0.0137
Epoch 18, Loss: 0.0095

Epoch 18/20 | Avg Loss: 0.0118
Epoch 19, Loss: 0.0008

Epoc

In [52]:
# Train simple CNN (2 layer) + LSTM
time_start = time.time()
model = UniversalCNN_NER(
    # Size the embedding table from the vocabulary built from the training
    # data instead of a hard-coded constant, so every token id is in range.
    vocab_size=vocab_size,
    embedding_dim=50,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=[3, 5],
    num_filters=256,
    use_lstm=True,
    bidirectional=False
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

Epoch 1, Loss: 0.0735

Epoch 1/20 | Avg Loss: 0.6616
Epoch 2, Loss: 0.2923

Epoch 2/20 | Avg Loss: 0.1981
Epoch 3, Loss: 0.0450

Epoch 3/20 | Avg Loss: 0.1191
Epoch 4, Loss: 0.0035

Epoch 4/20 | Avg Loss: 0.0823
Epoch 5, Loss: 0.1976

Epoch 5/20 | Avg Loss: 0.0619
Epoch 6, Loss: 0.0131

Epoch 6/20 | Avg Loss: 0.0520
Epoch 7, Loss: 0.0064

Epoch 7/20 | Avg Loss: 0.0422
Epoch 8, Loss: 0.0074

Epoch 8/20 | Avg Loss: 0.0341
Epoch 9, Loss: 0.0116

Epoch 9/20 | Avg Loss: 0.0300
Epoch 10, Loss: 0.0018

Epoch 10/20 | Avg Loss: 0.0259
Epoch 11, Loss: 0.2246

Epoch 11/20 | Avg Loss: 0.0249
Epoch 12, Loss: 0.0042

Epoch 12/20 | Avg Loss: 0.0253
Epoch 13, Loss: 0.0069

Epoch 13/20 | Avg Loss: 0.0229
Epoch 14, Loss: 0.0051

Epoch 14/20 | Avg Loss: 0.0212
Epoch 15, Loss: 0.0219

Epoch 15/20 | Avg Loss: 0.0197
Epoch 16, Loss: 0.1209

Epoch 16/20 | Avg Loss: 0.0204
Epoch 17, Loss: 0.0007

Epoch 17/20 | Avg Loss: 0.0162
Epoch 18, Loss: 0.0210

Epoch 18/20 | Avg Loss: 0.0174
Epoch 19, Loss: 0.0093

Epoc

In [53]:
# Train simple CNN (1 layer) + BiLSTM + Layernorm
time_start = time.time()
model = UniversalCNN_NER(
    # Size the embedding table from the vocabulary built from the training
    # data instead of a hard-coded constant, so every token id is in range.
    vocab_size=vocab_size,
    embedding_dim=50,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=[3],
    num_filters=256,
    use_lstm=True,
    use_layernorm=True
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

Epoch 1, Loss: 0.3519

Epoch 1/20 | Avg Loss: 0.6049
Epoch 2, Loss: 0.0776

Epoch 2/20 | Avg Loss: 0.1845
Epoch 3, Loss: 0.0254

Epoch 3/20 | Avg Loss: 0.1046
Epoch 4, Loss: 0.1597

Epoch 4/20 | Avg Loss: 0.0720
Epoch 5, Loss: 0.2079

Epoch 5/20 | Avg Loss: 0.0487
Epoch 6, Loss: 0.0152

Epoch 6/20 | Avg Loss: 0.0363
Epoch 7, Loss: 0.0726

Epoch 7/20 | Avg Loss: 0.0282
Epoch 8, Loss: 0.0023

Epoch 8/20 | Avg Loss: 0.0227
Epoch 9, Loss: 0.0384

Epoch 9/20 | Avg Loss: 0.0191
Epoch 10, Loss: 0.0063

Epoch 10/20 | Avg Loss: 0.0183
Epoch 11, Loss: 0.1301

Epoch 11/20 | Avg Loss: 0.0178
Epoch 12, Loss: 0.0016

Epoch 12/20 | Avg Loss: 0.0169
Epoch 13, Loss: 0.0026

Epoch 13/20 | Avg Loss: 0.0129
Epoch 14, Loss: 0.0045

Epoch 14/20 | Avg Loss: 0.0140
Epoch 15, Loss: 0.0142

Epoch 15/20 | Avg Loss: 0.0136
Epoch 16, Loss: 0.0859

Epoch 16/20 | Avg Loss: 0.0112
Epoch 17, Loss: 0.0074

Epoch 17/20 | Avg Loss: 0.0117
Epoch 18, Loss: 0.0010

Epoch 18/20 | Avg Loss: 0.0084
Epoch 19, Loss: 0.0195

Epoc

In [49]:
# Testing: run the tokenizer and the tagger on one hand-written example.
# NOTE(review): `model` is whichever model the most recently executed
# training cell left in the kernel - confirm it is the intended one.
test = 'квадрат 70 x 100 ст3 сп - 2, 6000 гост 2591 - 88 vvf'
test_sentence = tokenizer(test)
# predict() prints every extracted entity and returns the per-token tags.
predicted_tags = predict(model, test_sentence, vocab, tag_encoder)

квадрат -> product
70 -> width
100 -> height
ст3 -> mark_steal
сп -> mark_steal
- -> mark_steal
2 -> mark_steal
6000 -> length
гост -> standart_gost
2591 -> standart_gost
- -> standart_gost
88 -> standart_gost
