In [19]:
# Import all libs
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from seqeval.metrics import classification_report as seqeval_report
from typing import List, Union
import numpy as np
from collections import defaultdict
import numpy as np
import random
import json
import re
import time

In [20]:
# Fixing all random seeds
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_seed(42) 

In [21]:
# Tokenizer
def tokenizer(text):
    tokens = []    
    # RegEx
    word_re = re.compile(r'''
        \w+[-\w]*|      
        [^\w\s]|         
        \d+\.\d+|        
        \d+/\d+|        
        \d+              
    ''', re.VERBOSE)
    
    for match in word_re.finditer(text):
        tokens.append(match.group())
    
    return tokens

In [22]:
class NERDataset(Dataset):
    def __init__(self, data, vocab, tag_encoder):
        self.data = data
        self.vocab = vocab
        self.tag_encoder = tag_encoder
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        entry = self.data[idx]
        tokens = entry["tokens"]
        tags = entry["tags"]
        
        word_ids = [self.vocab.get(token, 1) for token in tokens]
        tag_ids = self.tag_encoder.transform(tags)
        
        return torch.tensor(word_ids, dtype=torch.long), torch.tensor(tag_ids, dtype=torch.long)

def collate_fn(batch):
    words, tags = zip(*batch)
    words_padded = pad_sequence(words, batch_first=True, padding_value=0)
    tags_padded = pad_sequence(tags, batch_first=True, padding_value=-100)
    return words_padded, tags_padded

In [33]:
class UniversalCNN_NER(nn.Module):
    def __init__(self, 
                 vocab_size: int,
                 num_tags: int,
                 embedding_dim: int = 100,
                 num_filters: int = 128,
                 filter_sizes: Union[int, List[int]] = [3, 5, 7],
                 use_lstm: bool = False,
                 lstm_hidden: int = 256,
                 lstm_num_layers: int = 1,
                 bidirectional: bool = True,
                 use_layernorm: bool = False,
                 use_attention: bool = False,  # Новый параметр
                 num_heads: int = 4,         # Количество голов внимания
                 attn_dropout: float = 0.1,  # Dropout для внимания
                 dropout: float = 0.3):
        super().__init__()
        
        # Эмбеддинг
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        
        # CNN слои
        if isinstance(filter_sizes, int):
            filter_sizes = [filter_sizes]
        self.num_conv_layers = len(filter_sizes)
        
        self.conv_layers = nn.ModuleList()
        in_channels = embedding_dim
        
        for i, fs in enumerate(filter_sizes):
            padding = fs // 2
            self.conv_layers.append(
                nn.Sequential(
                    nn.Conv1d(
                        in_channels=in_channels,
                        out_channels=num_filters,
                        kernel_size=fs,
                        padding=padding,
                        padding_mode='zeros'  # Можно изменить на 'reflect' или 'replicate'
                    ),
                nn.GELU(),
                nn.Dropout(dropout)
                ) 
            )
            in_channels = num_filters
        
        # LSTM слой
        self.use_lstm = use_lstm
        if use_lstm:
            self.lstm = nn.LSTM(
                input_size=num_filters,
                hidden_size=lstm_hidden,
                num_layers=lstm_num_layers,
                bidirectional=bidirectional,
                batch_first=True
            )
            lstm_output_size = lstm_hidden * 2 if bidirectional else lstm_hidden
        else:
            lstm_output_size = num_filters
        
        # Механизм внимания
        self.use_attention = use_attention
        if use_attention:
            self.attention = MultiHeadAttention(
                embed_dim=lstm_output_size,
                num_heads=num_heads,
                dropout=attn_dropout
            )
            self.attn_layer_norm = nn.LayerNorm(lstm_output_size) if use_layernorm else None
        
        # Нормализация
        self.use_layernorm = use_layernorm
        if use_layernorm:
            self.layernorm = nn.LayerNorm(lstm_output_size)
        
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(lstm_output_size, num_tags)
        
        self._init_weights()

    def _init_weights(self):
        # Инициализация CNN
        for layer in self.conv_layers:
            nn.init.kaiming_normal_(layer[0].weight, mode='fan_out', nonlinearity='relu')
            nn.init.constant_(layer[0].bias, 0)
        
        # Инициализация LSTM
        if hasattr(self, 'lstm'):
            for name, param in self.lstm.named_parameters():
                if 'weight' in name:
                    nn.init.xavier_normal_(param)
                else:
                    nn.init.constant_(param, 0)
        
        # Инициализация выходного слоя
        nn.init.xavier_normal_(self.fc.weight)
        nn.init.constant_(self.fc.bias, 0)

    def forward(self, x):
        # Эмбеддинг
        x = self.embedding(x)  
        x = x.permute(0, 2, 1)
        
        # CNN слои
        for conv in self.conv_layers:
            x = conv(x)
        
        x = x.permute(0, 2, 1)
        
        # LSTM слой
        if self.use_lstm:
            x, _ = self.lstm(x)
        
        # Механизм внимания
        if self.use_attention:
            residual = x
            x, _ = self.attention(x, x, x)  # Self-attention
            x = self.dropout(x)
            if self.attn_layer_norm is not None:
                x = self.attn_layer_norm(x + residual)
        
        # Нормализация
        if self.use_layernorm and not self.use_attention:  # Для attention уже есть своя нормализация
            x = self.layernorm(x)
        
        return self.fc(self.dropout(x))


class MultiHeadAttention(nn.Module):
    """Реализация Multi-Head Self-Attention"""
    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        
        assert self.head_dim * num_heads == embed_dim, "Embed dim must be divisible by num_heads"
        
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, query, key, value, key_padding_mask=None):
        batch_size = query.size(0)
        
        # Проецируем и разбиваем на головы
        q = self.q_proj(query).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(key).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(value).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        
        # Вычисляем attention scores
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        
        if key_padding_mask is not None:
            attn_scores = attn_scores.masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float('-inf')
            )
        
        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        # Применяем attention weights к values
        output = torch.matmul(attn_weights, v)
        
        # Собираем головы обратно
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.embed_dim)
        output = self.out_proj(output)
        
        return output, attn_weights
    
# Train
def train_model(model, dataloader, num_tags, epochs=20, lr=1e-3, model_save_path='best_model.pt'):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    #criterion = nn.CrossEntropyLoss(ignore_index=-100)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    best_loss = float('inf')
    
    for epoch in range(epochs):
        
        epoch_loss = 0.0
        model.train()
        
        for words, tags in dataloader:
            words = words.to(device)
            tags = tags.to(device)
            
            optimizer.zero_grad()
            # Forward pass
            outputs = model(words)
            
            # Правильный reshape
            outputs = outputs.reshape(-1, num_tags)
            tags = tags.reshape(-1) 
            
            loss = criterion(outputs, tags)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
            
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")
        avg_loss = epoch_loss / len(dataloader)
        
        torch.save(model.state_dict(), model_save_path)
        
        print(f'\nEpoch {epoch+1}/{epochs} | Avg Loss: {avg_loss:.4f}')
    return model

# Predicted
def predict(model, sentence, vocab, tag_encoder):
    model.eval()
    word_ids = [vocab.get(word, 1) for word in sentence]
    
    with torch.no_grad():
        outputs = model(torch.tensor([word_ids]))
        _, predicted = torch.max(outputs, 2)
    
    predicted_tags = tag_encoder.inverse_transform(predicted.squeeze().numpy())
    
    entities = []
    current_entity = None
    start_idx = 0
    
    for i, (word, tag) in enumerate(zip(sentence, predicted_tags)):
        if tag.startswith('B-'):
            if current_entity is not None:
                entities.append((start_idx, i, current_entity))
            current_entity = tag[2:]
            start_idx = i
        elif tag == 'O' and current_entity is not None:
            entities.append((start_idx, i, current_entity))
            current_entity = None
    
    if current_entity is not None:
        entities.append((start_idx, len(sentence), current_entity))
    
    # Return results
    for start, end, entity_type in entities:
        entity_text = ' '.join(sentence[start:end])
        print(f"{entity_text} -> {entity_type}")
    
    return predicted_tags


# 6. Предсказание
def evaluate_model(model, test_data, vocab, tag_encoder):
    model.eval()
    all_true_tags = []
    all_pred_tags = []
    
    for entry in test_data:
        tokens = entry["tokens"]
        true_tags = entry["tags"]
        
        word_ids = [vocab.get(word, 1) for word in tokens]
        with torch.no_grad():
            outputs = model(torch.tensor([word_ids]))
            _, predicted = torch.max(outputs, 2)
        
        pred_tags = tag_encoder.inverse_transform(predicted.squeeze().numpy())
        
        all_true_tags.append(true_tags)
        all_pred_tags.append(pred_tags.tolist())
    
    # Entity-level
    entity_report = seqeval_report(all_true_tags, all_pred_tags, zero_division=0)
    
    # Calculate metrics
    entity_types = set(tag[2:] for tags in all_true_tags for tag in tags if tag != 'O')
    entity_metrics = {}
    
    for entity in entity_types:
        tp = 0  # True Positives
        fp = 0  # False Positives
        fn = 0  # False Negatives
        
        for true_seq, pred_seq in zip(all_true_tags, all_pred_tags):
            true_entities = get_entities(true_seq)
            pred_entities = get_entities(pred_seq)
            
            true_set = set((start, end) for (start, end, e_type) in true_entities if e_type == entity)
            pred_set = set((start, end) for (start, end, e_type) in pred_entities if e_type == entity)
            
            tp += len(true_set & pred_set)
            fp += len(pred_set - true_set)
            fn += len(true_set - pred_set)
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        entity_metrics[entity] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': tp + fn
        }
    
    return {
        'entity_level': entity_report,
        'entity_metrics': entity_metrics
    }

def get_entities(tag_sequence):
    entities = []
    current_entity = None
    start_idx = 0
    
    for i, tag in enumerate(tag_sequence):
        if tag.startswith('B-'):
            if current_entity is not None:
                entities.append((start_idx, i, current_entity))
            current_entity = tag[2:]
            start_idx = i
        elif tag == 'O' and current_entity is not None:
            entities.append((start_idx, i, current_entity))
            current_entity = None
    
    if current_entity is not None:
        entities.append((start_idx, len(tag_sequence), current_entity))
    
    return entities    

In [24]:
# Read data
with open(r"..\..\data\raw\dataset_train.json", 'r', encoding='utf-8') as fp:
    #print(fp.read(4544900))
    sample_data_train = json.load(fp)
print(len(sample_data_train))

with open(r"..\..\data\raw\dataset_test.json", 'r', encoding='utf-8') as fp:
    sample_data_test = json.load(fp)
print(len(sample_data_test))

2051
879


In [25]:
# Create dictionary
word_counts = defaultdict(int)
all_tags = []

for entry in sample_data_train:
    for token in entry["tokens"]:
        word_counts[token] += 1
    all_tags.extend(entry["tags"])

vocab = {word: i+2 for i, word in enumerate(word_counts)}  # 0 для padding, 1 для UNK
vocab_size = len(vocab) + 2

tag_encoder = LabelEncoder()
tag_encoder.fit(all_tags)
num_tags = len(tag_encoder.classes_)

dataset = NERDataset(sample_data_train, vocab, tag_encoder)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)

In [28]:
# Train simple CNN (1 layer) + BiLSTM + Layernorm + Att
time_start = time.time()
model = UniversalCNN_NER(
    vocab_size=10000,
    embedding_dim=128,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=[3],
    num_filters=128,
    use_lstm=True,
    use_attention=True,
    num_heads=4,
    use_layernorm=True
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

Epoch 1, Loss: 0.1176

Epoch 1/20 | Avg Loss: 0.6190
Epoch 2, Loss: 0.0191

Epoch 2/20 | Avg Loss: 0.1794
Epoch 3, Loss: 0.1989

Epoch 3/20 | Avg Loss: 0.1082
Epoch 4, Loss: 0.0254

Epoch 4/20 | Avg Loss: 0.0806
Epoch 5, Loss: 0.0136

Epoch 5/20 | Avg Loss: 0.0674
Epoch 6, Loss: 0.0375

Epoch 6/20 | Avg Loss: 0.0532
Epoch 7, Loss: 0.0240

Epoch 7/20 | Avg Loss: 0.0442
Epoch 8, Loss: 0.0746

Epoch 8/20 | Avg Loss: 0.0391
Epoch 9, Loss: 0.0503

Epoch 9/20 | Avg Loss: 0.0321
Epoch 10, Loss: 0.0544

Epoch 10/20 | Avg Loss: 0.0407
Epoch 11, Loss: 0.0003

Epoch 11/20 | Avg Loss: 0.0508
Epoch 12, Loss: 0.0005

Epoch 12/20 | Avg Loss: 0.0269
Epoch 13, Loss: 0.0045

Epoch 13/20 | Avg Loss: 0.0217
Epoch 14, Loss: 0.0210

Epoch 14/20 | Avg Loss: 0.0249
Epoch 15, Loss: 0.0338

Epoch 15/20 | Avg Loss: 0.0425
Epoch 16, Loss: 0.0172

Epoch 16/20 | Avg Loss: 0.0234
Epoch 17, Loss: 0.0008

Epoch 17/20 | Avg Loss: 0.0184
Epoch 18, Loss: 0.0001

Epoch 18/20 | Avg Loss: 0.0158
Epoch 19, Loss: 0.0171

Epoc

In [32]:
# Train simple BiLSTM + Layernorm + Att
time_start = time.time()
model = UniversalCNN_NER(
    vocab_size=10000,
    embedding_dim=128,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=[],
    num_filters=128,
    use_lstm=True,
    use_attention=True,
    num_heads=4,
    use_layernorm=True
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

Epoch 1, Loss: 0.1315

Epoch 1/20 | Avg Loss: 0.4104
Epoch 2, Loss: 0.0461

Epoch 2/20 | Avg Loss: 0.0986
Epoch 3, Loss: 0.1434

Epoch 3/20 | Avg Loss: 0.0600
Epoch 4, Loss: 0.0125

Epoch 4/20 | Avg Loss: 0.0384
Epoch 5, Loss: 0.0204

Epoch 5/20 | Avg Loss: 0.0364
Epoch 6, Loss: 0.1405

Epoch 6/20 | Avg Loss: 0.1087
Epoch 7, Loss: 0.0017

Epoch 7/20 | Avg Loss: 0.0340
Epoch 8, Loss: 0.0001

Epoch 8/20 | Avg Loss: 0.0100
Epoch 9, Loss: 0.0009

Epoch 9/20 | Avg Loss: 0.0069
Epoch 10, Loss: 0.0208

Epoch 10/20 | Avg Loss: 0.0269
Epoch 11, Loss: 0.0192

Epoch 11/20 | Avg Loss: 0.0202
Epoch 12, Loss: 0.0003

Epoch 12/20 | Avg Loss: 0.0138
Epoch 13, Loss: 0.0274

Epoch 13/20 | Avg Loss: 0.0718
Epoch 14, Loss: 0.0015

Epoch 14/20 | Avg Loss: 0.0218
Epoch 15, Loss: 0.0002

Epoch 15/20 | Avg Loss: 0.0076
Epoch 16, Loss: 0.0069

Epoch 16/20 | Avg Loss: 0.0055
Epoch 17, Loss: 0.0509

Epoch 17/20 | Avg Loss: 0.0346
Epoch 18, Loss: 0.0816

Epoch 18/20 | Avg Loss: 0.0223
Epoch 19, Loss: 0.0001

Epoc

In [34]:
# Train simple BiLSTM 2 + Layernorm + Att
time_start = time.time()
model = UniversalCNN_NER(
    vocab_size=10000,
    embedding_dim=128,
    num_tags=len(tag_encoder.classes_),
    filter_sizes=[],
    num_filters=128,
    use_lstm=True,
    lstm_num_layers=2,
    use_attention=True,
    num_heads=4,
    use_layernorm=True
)

trained_model = train_model(
    model=model,
    dataloader=dataloader,
    num_tags=len(tag_encoder.classes_),
    epochs=20,
    lr=1e-3
)

print('Time train: ', time.time() - time_start, 'c')

time_start = time.time()
metrics = evaluate_model(model, sample_data_test, vocab, tag_encoder)

print("\n=== Entity-level Metrics ===")
print(metrics['entity_level'])

print("\n=== Per-entity Metrics ===")
for entity, scores in metrics['entity_metrics'].items():
    print(f"{entity}:")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-score:  {scores['f1']:.4f}")
    print(f"  Support:   {scores['support']}")
print("Time inference: ", time.time() - time_start, ' c')

Epoch 1, Loss: 0.1251

Epoch 1/20 | Avg Loss: 0.4756
Epoch 2, Loss: 0.0028

Epoch 2/20 | Avg Loss: 0.1089
Epoch 3, Loss: 0.0008

Epoch 3/20 | Avg Loss: 0.0572
Epoch 4, Loss: 0.0419

Epoch 4/20 | Avg Loss: 0.0334
Epoch 5, Loss: 0.0102

Epoch 5/20 | Avg Loss: 0.0203
Epoch 6, Loss: 0.0141

Epoch 6/20 | Avg Loss: 0.0168
Epoch 7, Loss: 0.0431

Epoch 7/20 | Avg Loss: 0.1329
Epoch 8, Loss: 0.0006

Epoch 8/20 | Avg Loss: 0.0246
Epoch 9, Loss: 0.0008

Epoch 9/20 | Avg Loss: 0.0081
Epoch 10, Loss: 0.0446

Epoch 10/20 | Avg Loss: 0.0050
Epoch 11, Loss: 0.0006

Epoch 11/20 | Avg Loss: 0.0101
Epoch 12, Loss: 0.0013

Epoch 12/20 | Avg Loss: 0.0144
Epoch 13, Loss: 0.0205

Epoch 13/20 | Avg Loss: 0.0363
Epoch 14, Loss: 0.0004

Epoch 14/20 | Avg Loss: 0.0180
Epoch 15, Loss: 0.0422

Epoch 15/20 | Avg Loss: 0.0104
Epoch 16, Loss: 0.0003

Epoch 16/20 | Avg Loss: 0.0041
Epoch 17, Loss: 0.0242

Epoch 17/20 | Avg Loss: 0.0043
Epoch 18, Loss: 0.0008

Epoch 18/20 | Avg Loss: 0.0124
Epoch 19, Loss: 0.0004

Epoc

In [49]:
# Тестирование
test = 'квадрат 70 x 100 ст3 сп - 2, 6000 гост 2591 - 88 vvf'
test_sentence = tokenizer(test)
predicted_tags = predict(model, test_sentence, vocab, tag_encoder)

квадрат -> product
70 -> width
100 -> height
ст3 -> mark_steal
сп -> mark_steal
- -> mark_steal
2 -> mark_steal
6000 -> length
гост -> standart_gost
2591 -> standart_gost
- -> standart_gost
88 -> standart_gost
