In [None]:
import ast
import os
import pickle
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF

# CSV files with tagged sentences; NerDataset reads them with sep=";" and
# expects `tokens` and `tags` columns holding Python-literal lists.
TRAIN_BIO_PATH = "data/processed/train_bio.csv"
VAL_BIO_PATH = "data/processed/validation_bio.csv"
# Pickled pretrained word vectors consumed by load_and_prepare_embeddings.
EMBEDDINGS_PATH = "data/external/ru_en_aligned.pkl"
# Output directory for the trained weights and the vocabulary artefacts.
MODELS_DIR = "models/iteration-1/"

In [None]:
# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Информация: Вычисления будут производиться на устройстве: {device}")

# Seed the torch and NumPy generators (and every CUDA device, when present)
# so that repeated runs start from the same random state.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
if device.type == "cuda":
    torch.cuda.manual_seed_all(SEED)

print(f"Информация: Random seed ({SEED}) установлен для обеспечения воспроизводимости.")

Гиперпараметры 

In [None]:
# Width of the word vectors; passed both to load_and_prepare_embeddings and
# to the model's word embedding layer, so the two must agree.
WORD_EMBEDDING_DIM = 300
# Size of each character embedding fed to the character-level BiLSTM.
CHAR_EMBEDDING_DIM = 50
# Hidden size per direction of the character BiLSTM (its output is 2x this).
CHAR_HIDDEN_DIM = 50

# Hidden size per direction of the word-level BiLSTM.
LSTM_HIDDEN_DIM = 256
# NOTE(review): this constant is not passed to the model anywhere in the
# visible cells — confirm whether it is meant to control the LSTM depth.
LSTM_NUM_LAYERS = 2

# Optimisation settings used by the DataLoaders, AdamW and the training loop.
BATCH_SIZE = 32
LEARNING_RATE = 1e-3
NUM_EPOCHS = 10
# Dropout probability handed to BiLSTMCrfForNer (overrides its 0.33 default).
DROPOUT_RATE = 0.5

Создание модели

In [None]:
class CharEmbedding(nn.Module):
    """Character-level word encoder: char embeddings -> BiLSTM -> max over time.

    Args:
        char_vocab_size: number of rows in the character embedding table.
        embedding_dim: size of each character embedding.
        hidden_dim: hidden size per direction of the character BiLSTM.
        dropout_rate: dropout applied to the character embeddings and to the
            pooled word representation.

    Index 0 is treated as the padding character (``padding_idx=0``).
    """

    def __init__(self, char_vocab_size, embedding_dim, hidden_dim, dropout_rate=0.25):
        super(CharEmbedding, self).__init__()
        self.embedding = nn.Embedding(char_vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            bidirectional=True,
            batch_first=True
        )
        self.dropout = nn.Dropout(dropout_rate)
        print("Информация: Модуль CharEmbedding успешно инициализирован.")

    def forward(self, x):
        """Encode every word of every sentence from its character ids.

        Args:
            x: integer tensor of shape (batch_size, seq_len, word_len) holding
                character ids, padded with ``padding_idx``.

        Returns:
            Float tensor of shape (batch_size, seq_len, 2 * hidden_dim). Words
            that consist only of padding are encoded as all-zero vectors.
        """
        batch_size, seq_len, word_len = x.size()
        feature_dim = 2 * self.lstm.hidden_size

        # reshape (rather than view) also accepts non-contiguous inputs.
        flat_chars = x.reshape(batch_size * seq_len, word_len)

        if flat_chars.numel() == 0:
            # Nothing to encode (no words, or only zero-length words).
            return self.embedding.weight.new_zeros((batch_size, seq_len, feature_dim))

        # Real length of each word. This counts non-padding ids, so it assumes
        # padding only ever follows the real characters (as collate_fn produces).
        lengths = (flat_chars != self.embedding.padding_idx).sum(dim=1)

        embedded = self.dropout(self.embedding(flat_chars))

        # Packing keeps the BiLSTM from running over padding, so a word's
        # encoding no longer depends on how wide the batch was padded.
        # pack_padded_sequence rejects zero lengths, hence the clamp; the
        # affected rows are zeroed out after pooling.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths.clamp(min=1).cpu(),
            batch_first=True,
            enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)

        # Padding positions come back as -inf so they can never win the max.
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out,
            batch_first=True,
            padding_value=float("-inf"),
            total_length=word_len
        )

        pooled = lstm_out.max(dim=1).values
        pooled = pooled.masked_fill((lengths == 0).unsqueeze(-1), 0.0)

        output = pooled.reshape(batch_size, seq_len, feature_dim)

        return self.dropout(output)

In [None]:
class BiLSTMCrfForNer(nn.Module):
    """Sequence tagger: word + character features -> BiLSTM -> linear -> CRF.

    Args:
        word_vocab_size: number of rows in the word embedding table.
        word_embedding_dim: size of each word embedding.
        char_vocab_size: number of distinct character ids.
        char_embedding_dim: size of each character embedding.
        char_hidden_dim: hidden size per direction of the character BiLSTM.
        lstm_hidden_dim: hidden size per direction of the word-level BiLSTM.
        num_tags: number of tag ids scored by the classifier and the CRF.
        dropout_rate: dropout used in the character encoder, on the combined
            embeddings and between stacked LSTM layers.
        padding_idx: word id reserved for padding.
        lstm_num_layers: depth of the word-level BiLSTM (defaults to 2, the
            value that used to be hard-coded).

    The word embedding weights are frozen (``requires_grad = False``).
    """

    def __init__(self,
                 word_vocab_size,
                 word_embedding_dim,
                 char_vocab_size,
                 char_embedding_dim,
                 char_hidden_dim,
                 lstm_hidden_dim,
                 num_tags,
                 dropout_rate=0.33,
                 padding_idx=0,
                 lstm_num_layers=2):
        super(BiLSTMCrfForNer, self).__init__()

        self.word_embedding = nn.Embedding(
            num_embeddings=word_vocab_size,
            embedding_dim=word_embedding_dim,
            padding_idx=padding_idx
        )
        self.word_embedding.weight.requires_grad = False

        self.char_embedding = CharEmbedding(
            char_vocab_size=char_vocab_size,
            embedding_dim=char_embedding_dim,
            hidden_dim=char_hidden_dim,
            dropout_rate=dropout_rate
        )

        self.embedding_dropout = nn.Dropout(dropout_rate)

        self.lstm = nn.LSTM(
            # Character features are bidirectional, hence 2 * char_hidden_dim.
            input_size=word_embedding_dim + (2 * char_hidden_dim),
            hidden_size=lstm_hidden_dim,
            num_layers=lstm_num_layers,
            bidirectional=True,
            batch_first=True,
            # nn.LSTM applies dropout only between layers, so it is
            # meaningless (and warns) for a single-layer network.
            dropout=dropout_rate if lstm_num_layers > 1 else 0.0
        )

        self.classifier = nn.Linear(2 * lstm_hidden_dim, num_tags)

        self.crf = CRF(num_tags=num_tags, batch_first=True)

        print("Информация: Основная модель BiLSTMCrfForNer успешно инициализирована.")

    def forward(self, word_ids, char_ids, mask, tags=None):
        """Compute the training loss or decode the best tag sequences.

        Args:
            word_ids: (batch_size, seq_len) word ids.
            char_ids: (batch_size, seq_len, word_len) character ids.
            mask: (batch_size, seq_len), truthy at real token positions.
            tags: optional (batch_size, seq_len) gold tag ids.

        Returns:
            When ``tags`` is given: the negated CRF log-likelihood averaged
            over the batch (a scalar tensor). Otherwise: the result of
            ``CRF.decode`` for the masked positions.
        """
        word_embeds = self.word_embedding(word_ids)
        char_embeds = self.char_embedding(char_ids)

        combined_embeds = torch.cat([word_embeds, char_embeds], dim=-1)
        combined_embeds = self.embedding_dropout(combined_embeds)

        lstm_out, _ = self.lstm(combined_embeds)

        emissions = self.classifier(lstm_out)

        # A boolean mask avoids the deprecated uint8-condition path of
        # torch.where, which the CRF uses internally.
        crf_mask = mask.bool()

        if tags is not None:
            # The CRF returns a log-likelihood; negate it to obtain a loss.
            return -self.crf(emissions, tags, mask=crf_mask, reduction='mean')

        return self.crf.decode(emissions, mask=crf_mask)

Подача данных

In [None]:
class NerDataset(Dataset):
    """Tagged sentences loaded from a ';'-separated CSV file.

    The file must provide ``tokens`` and ``tags`` columns whose cells are
    Python-literal lists. Vocabularies are built from the file unless
    ready-made mappings are supplied (e.g. those of the training split).

    Args:
        df_path: path to the CSV file.
        word2id: optional token -> id mapping to reuse.
        char2id: optional character -> id mapping to reuse.
        tag2id: optional tag -> id mapping to reuse.

    Raises:
        ValueError: if a row has a different number of tokens and tags.
    """

    def __init__(self, df_path, word2id=None, char2id=None, tag2id=None):
        self.df = pd.read_csv(df_path, sep=";")

        # Cells hold textual list literals; parse them into real lists.
        self.df['tokens'] = self.df['tokens'].apply(ast.literal_eval)
        self.df['tags'] = self.df['tags'].apply(ast.literal_eval)

        # Fail early: a misaligned row would otherwise only surface later as
        # an obscure ragged-tensor error while collating a batch.
        self._check_alignment()

        if word2id is None:
            self.word2id, self.id2word = self._build_vocab(self.df['tokens'])
        else:
            self.word2id, self.id2word = word2id, {v: k for k, v in word2id.items()}

        if char2id is None:
            all_chars = {char for tokens in self.df['tokens'] for token in tokens for char in token}
            self.char2id, self.id2char = self._build_char_vocab(all_chars)
        else:
            self.char2id, self.id2char = char2id, {v: k for k, v in char2id.items()}

        if tag2id is None:
            self.tag2id, self.id2tag = self._build_vocab(self.df['tags'])
        else:
            self.tag2id, self.id2tag = tag2id, {v: k for k, v in tag2id.items()}

        print(f"Информация: Dataset загружен. Размер: {len(self.df)} записей.")
        print(f"Информация: Размер словаря слов: {len(self.word2id)}")
        print(f"Информация: Размер словаря символов: {len(self.char2id)}")
        print(f"Информация: Размер словаря тегов: {len(self.tag2id)}")

    def _check_alignment(self):
        """Raise ValueError for the first row whose tokens and tags differ in length."""
        for row_idx, (tokens, tags) in enumerate(zip(self.df['tokens'], self.df['tags'])):
            if len(tokens) != len(tags):
                raise ValueError(
                    f"Строка {row_idx}: количество токенов ({len(tokens)}) "
                    f"не совпадает с количеством тегов ({len(tags)})."
                )

    def _build_vocab(self, data):
        """Map items to ids in order of first appearance; ids 0/1 are <PAD>/<UNK>."""
        vocab = {"<PAD>": 0, "<UNK>": 1}
        for sequence in data:
            for item in sequence:
                if item not in vocab:
                    vocab[item] = len(vocab)
        id2vocab = {v: k for k, v in vocab.items()}
        return vocab, id2vocab

    def _build_char_vocab(self, chars):
        """Map characters to ids in sorted order; ids 0/1 are <PAD>/<UNK>."""
        vocab = {"<PAD>": 0, "<UNK>": 1}
        for char in sorted(chars):
            if char not in vocab:
                vocab[char] = len(vocab)
        id2vocab = {v: k for k, v in vocab.items()}
        return vocab, id2vocab

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return one sentence as ``{"words", "chars", "tags"}`` lists of ids."""
        row = self.df.iloc[idx]
        tokens = row['tokens']
        tags = row['tags']

        word_ids = [self.word2id.get(token, self.word2id["<UNK>"]) for token in tokens]
        # Unknown tags fall back to id 0; this is not expected to happen.
        tag_ids = [self.tag2id.get(tag, 0) for tag in tags]

        char_ids = [
            [self.char2id.get(char, self.char2id["<UNK>"]) for char in token]
            for token in tokens
        ]

        return {"words": word_ids, "chars": char_ids, "tags": tag_ids}

def collate_fn(batch, word_pad_idx=0, char_pad_idx=0, tag_pad_idx=0):
    """Pad a list of dataset items into rectangular tensors.

    Args:
        batch: items shaped like ``{"words": [...], "chars": [[...], ...], "tags": [...]}``.
        word_pad_idx: id used to pad word sequences.
        char_pad_idx: id used to pad character sequences.
        tag_pad_idx: id used to pad tag sequences.

    Returns:
        Dict with long tensors ``words`` and ``tags`` (batch, max_seq_len),
        ``chars`` (batch, max_seq_len, max_word_len) and a boolean ``mask``
        (batch, max_seq_len) that is True at real token positions.
    """
    sentence_width = max(len(sample['words']) for sample in batch)
    # A sentence without any words contributes a width of 0.
    char_width = max(
        max((len(word_chars) for word_chars in sample['chars']), default=0)
        for sample in batch
    )

    word_rows, char_rows, tag_rows, mask_rows = [], [], [], []

    for sample in batch:
        n_tokens = len(sample['words'])
        n_missing = sentence_width - n_tokens

        word_rows.append(sample['words'] + [word_pad_idx] * n_missing)
        tag_rows.append(sample['tags'] + [tag_pad_idx] * n_missing)
        mask_rows.append([1] * n_tokens + [0] * n_missing)

        # Right-pad every word, then add all-padding words for the
        # sentence positions that do not exist in this sample.
        char_grid = [
            word_chars + [char_pad_idx] * (char_width - len(word_chars))
            for word_chars in sample['chars']
        ]
        char_grid.extend([char_pad_idx] * char_width for _ in range(n_missing))
        char_rows.append(char_grid)

    return {
        "words": torch.tensor(word_rows, dtype=torch.long),
        "chars": torch.tensor(char_rows, dtype=torch.long),
        "tags": torch.tensor(tag_rows, dtype=torch.long),
        "mask": torch.tensor(mask_rows, dtype=torch.bool)
    }

Цикл обучения и оценки

In [None]:
def calculate_class_weights(tags_series, tag2id):
    """Derive per-tag weights from inverse tag frequencies.

    Args:
        tags_series: iterable of tag sequences (one sequence per sentence).
        tag2id: mapping from tag to its index in the returned tensor.

    Returns:
        1-D tensor of length ``len(tag2id)`` created on the global ``device``.
        Tags from ``tag2id`` that never occur in ``tags_series`` keep their
        initial raw weight of 1.0 before normalisation.
    """
    frequencies = Counter(tag for sequence in tags_series for tag in sequence)

    weights = torch.ones(len(tag2id), device=device)
    for tag, count in frequencies.items():
        if tag not in tag2id:
            continue
        # Inverse frequency; the epsilon guards the division.
        weights[tag2id[tag]] = 1.0 / (count + 1e-6)

    # Normalise to sum to 1, then take the square root, which narrows the
    # ratio between the largest and the smallest weight.
    weights = (weights / weights.sum()).pow(0.5)

    print("Информация: Рассчитаны веса для классов:")
    for tag, i in tag2id.items():
        print(f"- {tag}: {weights[i]:.4f}")

    return weights

def train_epoch(model, dataloader, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: callable as ``model(words, chars, mask, tags)`` returning a
            scalar loss tensor.
        dataloader: yields dicts with ``words``, ``chars``, ``tags``, ``mask``.
        optimizer: optimiser stepping the model parameters.

    Returns:
        Mean of the per-batch losses.
    """
    model.train()
    running_loss = 0

    for batch in dataloader:
        # Move the whole batch to the compute device (global `device`).
        words, chars, tags, mask = (
            batch[key].to(device) for key in ('words', 'chars', 'tags', 'mask')
        )

        optimizer.zero_grad()
        loss = model(words, chars, mask, tags)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

def eval_epoch(model, dataloader, id2tag):
    """Evaluate the model token by token and build a classification report.

    Args:
        model: callable as ``model(words, chars, mask)`` returning one list of
            predicted tag ids per sentence.
        dataloader: yields dicts with ``words``, ``chars``, ``tags``, ``mask``.
        id2tag: mapping from tag id to tag string.

    Returns:
        The dict produced by ``classification_report(..., output_dict=True)``,
        restricted to real entity tags.
    """
    model.eval()
    all_true_tags = []
    all_pred_tags = []

    with torch.no_grad():
        for batch in dataloader:
            words = batch['words'].to(device)
            chars = batch['chars'].to(device)
            mask = batch['mask'].to(device)

            predictions = model(words, chars, mask)

            # Gold tags and lengths are converted to Python lists once per
            # batch, instead of one device synchronisation per sentence.
            gold_rows = batch['tags'].tolist()
            lengths = batch['mask'].sum(dim=1).tolist()

            # Flatten the batch into token lists, dropping the padding.
            for gold_row, pred_row, seq_len in zip(gold_rows, predictions, lengths):
                all_true_tags.extend(id2tag[tag_id] for tag_id in gold_row[:seq_len])
                all_pred_tags.extend(id2tag[tag_id] for tag_id in pred_row[:seq_len])

    # 'O' dominates the data and is left out on purpose. The vocabulary
    # pseudo-tags are excluded as well: every label listed here enters the
    # macro average, so a zero-support '<UNK>' class would drag macro-F1 down.
    excluded = {"O", "<PAD>", "<UNK>"}
    labels_to_include = [tag for tag in id2tag.values() if tag not in excluded]

    report = classification_report(
        all_true_tags,
        all_pred_tags,
        labels=labels_to_include,
        output_dict=True,
        zero_division=0
    )
    return report

Подготовка эмбеддингов

In [None]:
def load_and_prepare_embeddings(word2id, filepath, embedding_dim):
    """Build the initial word embedding matrix from pickled pretrained vectors.

    Args:
        word2id: token -> row index mapping; must contain ``"<PAD>"``.
        filepath: pickle file holding an object that supports ``word in obj``
            and ``obj[word]``.
        embedding_dim: width of the vectors.

    Returns:
        Float tensor of shape (len(word2id), embedding_dim). Tokens missing
        from the pretrained object keep a uniform(-0.05, 0.05) random row and
        the ``"<PAD>"`` row is all zeros.

    Note:
        Unpickling can execute arbitrary code; only load files from a
        trusted source.
    """
    with open(filepath, 'rb') as handle:
        pretrained = pickle.load(handle)

    vocab_size = len(word2id)
    matrix = np.random.uniform(-0.05, 0.05, (vocab_size, embedding_dim))

    hits = 0
    for word, row in word2id.items():
        if word not in pretrained:
            continue
        matrix[row] = pretrained[word]
        hits += 1

    print(f"Информация: Найдено {hits} из {len(word2id)} слов в предобученной модели ({hits / len(word2id) * 100:.2f}%).")

    # Padding must stay a zero vector whatever the lookup above produced.
    matrix[word2id["<PAD>"]] = 0.0

    return torch.tensor(matrix, dtype=torch.float)

ОБУЧЕНИЕ

Загрузка данных

In [None]:
# The word, character and tag vocabularies are built from the training split.
train_dataset = NerDataset(TRAIN_BIO_PATH)

# The validation split reuses those vocabularies so that its ids match the
# ones the model is trained on.
val_dataset = NerDataset(
    VAL_BIO_PATH,
    train_dataset.word2id,
    train_dataset.char2id,
    train_dataset.tag2id,
)

# Only the training batches are shuffled.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    collate_fn=collate_fn,
)

print("\nИнформация: Датасеты и даталоадеры успешно созданы.")

# Rows of the returned matrix follow the training word ids.
embedding_weights = load_and_prepare_embeddings(
    train_dataset.word2id,
    EMBEDDINGS_PATH,
    WORD_EMBEDDING_DIM,
)

Инициализация модели

In [None]:
# The model is only built when the embedding matrix is available.
if embedding_weights is not None:
    model = BiLSTMCrfForNer(
        word_vocab_size=len(train_dataset.word2id),
        word_embedding_dim=WORD_EMBEDDING_DIM,
        char_vocab_size=len(train_dataset.char2id),
        char_embedding_dim=CHAR_EMBEDDING_DIM,
        char_hidden_dim=CHAR_HIDDEN_DIM,
        lstm_hidden_dim=LSTM_HIDDEN_DIM,
        num_tags=len(train_dataset.tag2id),
        dropout_rate=DROPOUT_RATE,
        padding_idx=train_dataset.word2id["<PAD>"],
    )

    # Overwrite the randomly initialised word embeddings with the prepared
    # matrix, without recording the copy in autograd.
    with torch.no_grad():
        model.word_embedding.weight.copy_(embedding_weights)

    model = model.to(device)

    optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

    print("\nИнформация: Модель и оптимизатор инициализированы.")

Запуск обучения

In [None]:
# Start below any reachable score so that the first epoch is always recorded,
# even when its macro-F1 is exactly 0.
best_val_f1 = float("-inf")
best_model_state = None

print("\n--- Начало процесса обучения ---")
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = train_epoch(model, train_dataloader, optimizer)

    report = eval_epoch(model, val_dataloader, train_dataset.id2tag)
    val_f1_macro = report['macro avg']['f1-score']

    print(f"\nЭпоха {epoch}/{NUM_EPOCHS}:")
    print(f"  Потери на обучении (Train Loss): {train_loss:.4f}")
    print(f"  F1-macro на валидации: {val_f1_macro:.4f}")

    print("  Детальный отчет по классам:")
    for tag, metrics in report.items():
        # Scalar entries of the report (if any) are skipped.
        if isinstance(metrics, dict):
            print(f"    - {tag:<10}: F1={metrics['f1-score']:.4f}, Precision={metrics['precision']:.4f}, Recall={metrics['recall']:.4f}")

    if val_f1_macro > best_val_f1:
        best_val_f1 = val_f1_macro
        # state_dict().copy() is shallow: its tensors are the live parameters
        # and keep changing during later epochs. Clone every tensor so the
        # snapshot really holds the weights of the best epoch.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"  Новый лучший результат! Модель сохранена (F1-macro: {best_val_f1:.4f}).")

print("\n--- Обучение завершено ---")

Сохранение результатов

In [None]:
os.makedirs(MODELS_DIR, exist_ok=True)

# If no epoch ever produced a snapshot, fall back to the current weights
# instead of silently writing a checkpoint that contains None.
if best_model_state is None:
    print("Предупреждение: лучшее состояние модели не было зафиксировано, сохраняются текущие веса.")
    state_to_save = model.state_dict()
else:
    state_to_save = best_model_state

torch.save(state_to_save, os.path.join(MODELS_DIR, "bilstm_v1.pth"))

# Vocabularies needed to encode inputs and decode predictions at inference.
artefacts = {
    "word2id": train_dataset.word2id,
    "char2id": train_dataset.char2id,
    "tag2id": train_dataset.tag2id,
    "id2tag": train_dataset.id2tag
}
with open(os.path.join(MODELS_DIR, "artefacts_v1.pkl"), "wb") as f:
    pickle.dump(artefacts, f)

print(f"\nИнформация: Лучшая модель и артефакты сохранены в директорию: {MODELS_DIR}")