# Восстановление пропущенных пробелов в тексте с помощью NLP / DL / алгоритма

Абраменко Александр Родионович cfif12349@yandex.ru

Avito DS Internship 2025 // Тестовое задание 3

Я предлагаю следующее решение:

1) Разбить исходный текст на части с помощью регулярных выражений, и обрабатывать буквенные и численные символы по отдельности
2) Классический алгоритм с частотной оценкой слова. Динамическое программирование + словарь (wordfreq)
3) Нейросеть BiLSTM

    Для обучения использовать сгенерировать синтетические данные из частотного словаря, а так же реальные данные, размеченные предыдущим алгоритмом
4) Beam search для улучшения качества
5) Результат будет считаться голосованием по большинству среди предыдущих трех алгоритмов

##Загрузка данных и установка библиотек

In [1]:
import pandas as pd
import numpy as np

In [2]:
# загрузка тестовых данных
data = []
f = open('dataset_1937770_3.txt').readlines()[1:]
for line in f:
    id, text = line.strip().split(',', 1)
    data.append(text)
test_data = pd.Series(data, name='text')
test_data

Unnamed: 0,text
0,куплюайфон14про
1,ищудомвПодмосковье
2,сдаюквартирусмебельюитехникой
3,новыйдивандоставканедорого
4,отдамдаромкошку
...,...
1000,Янеусну.
1001,Весна-яуженегреюпио.
1002,Весна-скоровырастеттрава.
1003,"Весна-выпосмотрите,каккрасиво."


In [3]:
!pip install wordfreq



##Классический алгоритм

In [4]:
import os
import random
import math
import time
from wordfreq import top_n_list, zipf_frequency
import re


MAX_WORD_LEN = 30
LANGS = ["ru", "en"]
UNKNOWN_WORD_DENSITY_PENALTY = -7.0 # штраф за неизвестные словарю слова
# награда за более длинное слово,
#чтобы алгоритм не выделял короткие, которые являются частями более длинного слова
SCORE_LEN_POWER = 1.7

def get_word_score(word):
    if not word:
        return -1e9
    word_len = len(word)
    len_score = word_len ** SCORE_LEN_POWER
    w = word.lower()
    best_zipf = max(zipf_frequency(w, lang) for lang in LANGS)
    if best_zipf > 0.0:
        return best_zipf * len_score
    else:
        return UNKNOWN_WORD_DENSITY_PENALTY * len_score

def segment_one_alpha_token(token):
    """
    получить разбиение строки
    """
    n = len(token)
    if n == 0:
        return "", []
    dp = [float("-inf")] * (n + 1)
    back = [-1] * (n + 1)
    dp[0] = 0.0
    for i in range(1, n + 1):
        for j in range(max(0, i - MAX_WORD_LEN), i):
            word = token[j:i]
            score = get_word_score(word)
            if dp[j] + score > dp[i]:
                dp[i] = dp[j] + score
                back[i] = j
    if dp[n] == float("-inf"):
        return token, []
    words = []
    indices = []
    i = n
    while i > 0:
        j = back[i]
        if i == j:
            break
        if j > 0:
            indices.append(j)
        words.append(token[j:i])
        i = j
    words.reverse()
    indices.sort()
    return " ".join(words), indices

# разделяет строку на части, где каждая - либо подряд идущие символы,
# либо подряд идущие не-символы(числа, знаки)
RE_SPLIT = re.compile(r"([A-Za-zА-Яа-яЁё0-9]+|[^A-Za-zА-Яа-яЁё0-9]+)", flags=re.UNICODE)

def smart_segment_with_indices(text):
    """
    разделение с учетом регулярки
    """
    if not text:
        return "", []
    parts = RE_SPLIT.findall(text)
    out_parts = []
    all_indices = []
    current_offset = 0
    for part in parts:
        if re.search(r"[A-Za-zА-Яа-яЁё]", part):
            segmented_part, local_indices = segment_one_alpha_token(part)
            out_parts.append(segmented_part)
            for idx in local_indices:
                all_indices.append(current_offset + idx)
        else:
            out_parts.append(part)
        current_offset += len(part)
    return "".join(out_parts), all_indices

def smart_segment(text):
    res, _ = smart_segment_with_indices(text)
    return res


def build_wordfreq_dict(langs=('ru','en'), top_k=50000):
    """
    создает частотный словарь из top_k самых частых слов
    """
    wf = {}
    for lang in langs:
        try:
            words = top_n_list(lang, top_k)
        except Exception:
            words = []
        for w in words:
            wf[w.lower()] = zipf_frequency(w, lang)
    return wf

##BiLSTM

In [5]:
from typing import List, Tuple
from collections import Counter
import json
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

class CharDataset(Dataset):
    def __init__(self, pairs, char2idx):
        """
        pairs - список (string, labels)

        """

        self.pairs = pairs
        self.pad = "<PAD>"
        self.unk = "<UNK>"
        if char2idx is None:
            chars = Counter()
            for s, _ in self.pairs:
                chars.update(list(s))
            idx2 = [self.pad, self.unk] + sorted([c for c in chars if c not in (self.pad, self.unk)])
            self.idx2char = idx2
            self.char2idx = {c:i for i,c in enumerate(idx2)}
        else:
            self.char2idx = char2idx
            idx2 = [None]*len(char2idx)
            for c, i in char2idx.items():
                 idx2[i] = c
            self.idx2char = idx2

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        s, labels = self.pairs[idx]
        ids = [self.char2idx.get(c, self.char2idx.get(self.unk)) for c in s]
        return torch.tensor(ids, dtype=torch.long), torch.tensor(labels, dtype=torch.float), s

def collate_fn(batch):
    ids, labels, raws = zip(*batch)
    lengths = [len(x) for x in ids]
    maxlen = max(lengths)
    pad_id = batch[0][0].new_tensor([0]).item()
    ids_p = torch.full((len(ids), maxlen), pad_id, dtype=torch.long)
    labels_p = torch.zeros((len(ids), maxlen), dtype=torch.float)
    mask = torch.zeros((len(ids), maxlen), dtype=torch.bool)
    for i, (id_seq, lab_seq) in enumerate(zip(ids, labels)):
        L = id_seq.size(0)
        ids_p[i, :L] = id_seq
        labels_p[i, :L] = lab_seq
        mask[i, :L] = 1
    return ids_p, labels_p, mask, raws

class BiLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden_dim=256, n_layers=2, dropout=0.2):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=n_layers, batch_first=True,
                            bidirectional=True, dropout=dropout if n_layers>1 else 0.0)
        self.fc = nn.Linear(hidden_dim*2, 1)
        self.dropout = nn.Dropout(dropout)
    def forward(self, x, mask=None):
        h = self.emb(x)
        h = self.dropout(h)
        h, _ = self.lstm(h)
        logits = self.fc(h).squeeze(-1)
        return logits

def train_epoch(model, dataloader, optimizer, device, bce_loss, grad_clip=1.0):
    model.train()
    total_loss = 0.0
    total_masked = 0
    for ids_p, labels_p, mask, _ in dataloader:
        ids_p = ids_p.to(device)
        labels_p = labels_p.to(device)
        mask = mask.to(device)

        logits = model(ids_p, mask=mask)

        logits_masked = logits[mask]
        labels_masked = labels_p[mask]

        loss = bce_loss(logits_masked, labels_masked)

        optimizer.zero_grad()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()

        total_loss += loss.item() * labels_masked.size(0)
        total_masked += labels_masked.size(0)
    return total_loss / (total_masked + 1e-12)

def compute_prf_from_flat(probs_flat, gold_flat, mask_flat, threshold):
    preds = (probs_flat > threshold).astype(int)
    gold = gold_flat.astype(int)
    mask = mask_flat.astype(int)

    tp = int(((preds == 1) & (gold == 1) & (mask == 1)).sum())
    fp = int(((preds == 1) & (gold == 0) & (mask == 1)).sum())
    fn = int(((preds == 0) & (gold == 1) & (mask == 1)).sum())
    prec = tp / (tp + fp + 1e-12)
    rec = tp / (tp + fn + 1e-12)
    f1 = 2 * prec * rec / (prec + rec + 1e-12)
    return prec, rec, f1, tp, fp, fn

def eval_model(model, dataloader, device):
    model.eval()
    probs_list=[]
    gold_list=[]
    mask_list=[]

    with torch.no_grad():
        for ids_p, labels_p, mask, _ in dataloader:
            ids_p = ids_p.to(device)
            labels_p = labels_p.to(device)
            mask = mask.to(device)

            logits = model(ids_p, mask=mask)
            probs = torch.sigmoid(logits)

            probs_list.append(probs.cpu().numpy())
            gold_list.append(labels_p.cpu().numpy())
            mask_list.append(mask.cpu().numpy().astype(int))

    probs_flat = np.concatenate([p.reshape(-1) for p in probs_list])
    gold_flat = np.concatenate([g.reshape(-1) for g in gold_list])
    mask_flat = np.concatenate([m.reshape(-1) for m in mask_list])

    probs_masked = probs_flat[mask_flat==1]
    gold_masked = gold_flat[mask_flat==1]

    if probs_masked.size == 0:
        return 0.0,0.0,0.0,0.5

    # выбираем лучшее пороговое значение
    best_f1 = -1.0
    best_t = 0.5
    best_p = best_r = 0.0
    for thr in np.linspace(0.01,0.99,99):
        p, r, f1, *_ = compute_prf_from_flat(probs_masked, gold_masked, np.ones_like(gold_masked), thr)
        if f1 > best_f1:
            best_f1 = f1
            best_t = thr
            best_p = p
            best_r = r
    return best_p, best_r, best_f1, best_t

def predict_and_segment_threshold(model, device, raw_text, char2idx, threshold=0.5):
    model.eval()
    ids = torch.tensor([char2idx.get(c, char2idx.get("<UNK>")) for c in raw_text],
                       dtype=torch.long).unsqueeze(0).to(device)
    mask = torch.ones_like(ids, dtype=torch.bool).to(device)
    with torch.no_grad():
        logits = model(ids, mask=mask)
        probs = torch.sigmoid(logits).squeeze(0).cpu().numpy()
    labels = (probs > threshold).astype(int).tolist()
    out_chars=[]
    for ch, lb in zip(raw_text, labels):
        out_chars.append(ch)
        if lb==1:
            out_chars.append(" ")
    return "".join(out_chars), labels, probs



##Beam search

In [6]:
def beam_segment_from_probs(raw, probs,
                            beam_width=20, max_word_len=30, alpha=1.0, beta=1.5):
    """
    score combination: alpha * log(prob_at_boundary) + beta * dp_word_score
    """
    n = len(raw)
    beams = {0: [(0.0, [])]}  # pos -> list of (score, seg_list)
    for i in range(0, n):
        if i not in beams:
            continue
        states = beams[i]
        # перебираем кандидатов
        for score, seg in states:
            for L in range(1, min(max_word_len, n - i) + 1):
                j = i + L
                word = raw[i:j]
                # вероятность границы после j-1 символа
                p_bound = probs[j-1] if (j-1) < len(probs) else 0.0
                # избегаем log(0)
                log_p = math.log(max(p_bound, 1e-9))

                log_w = get_word_score(word)
                new_score = score + alpha * log_p + beta * log_w
                new_seg = seg + [word]
                if j not in beams:
                    beams[j] = []
                beams[j].append((new_score, new_seg))
        #  идем по всем позициям где есть кандидаты
        for pos in list(beams.keys()):
            if len(beams[pos]) > beam_width:
                # выбираем лучших по скору
                beams[pos] = sorted(beams[pos], key=lambda x: -x[0])[:beam_width]
    if n not in beams: # не нашелся ни один кандидат
        return raw
    best = max(beams[n], key=lambda x: x[0])
    return " ".join(best[1])


##Majority voting

In [7]:
def seg_to_boundary_vector(seg, raw):
    """
    seg: строка с пробелами
    raw: строка без пробелов
    возвращает список состоящий из 0/1 где 1 на i-том месте если после него стоит пробел
    """
    vec = [0]*len(raw)
    parts = seg.split()
    idx = 0
    for w in parts:
        idx += len(w)
        if idx-1 < len(raw):
            vec[idx-1] = 1
    return vec

def majority_vote_segment(cands, raw, threshold_votes=2):
    n = len(raw)
    votes = [0]*n
    for seg in cands:
        vec = seg_to_boundary_vector(seg, raw)
        for i,v in enumerate(vec):
            votes[i] += v
    final_vec = [1 if v >= threshold_votes else 0 for v in votes]
    out = []
    for i,ch in enumerate(raw):
        out.append(ch)
        if final_vec[i]==1:
            out.append(" ")
    return "".join(out)


##Разметка

In [8]:
def pseudo_label_file(file_path, top_percent=0.3, max_pseudo=80000):
    pseudo_all=[]
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            s = line.strip()
            raw = s.replace(" ", "")

            if not raw:
                 continue

            seg_str, indices = smart_segment_with_indices(raw)
            words = [w for w in seg_str.split(" ") if w != ""]
            if len(words)==0:
                 continue
            scores = [get_word_score(w) for w in words]
            avg_score = float(sum(scores)/len(scores))
            pseudo_all.append((raw, indices, avg_score))

    if not pseudo_all:
        return []

    pseudo_all_sorted = sorted(pseudo_all, key=lambda x: -x[2])
    take = min(int(len(pseudo_all_sorted)*top_percent), max_pseudo)
    taken = pseudo_all_sorted[:take]
    pseudo_pairs = []
    for raw, indices, score in taken:
        labels = [0] * len(raw)
        for idx in indices:
            if 0 <= idx < len(raw):
                labels[idx-1] = 1
        pseudo_pairs.append((raw, labels))
    return pseudo_pairs

##Конфиг

In [9]:
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

N_SYNTHETIC = 120000 # количество синтетических примеров для обучения
MAX_WORDS = 6 # максимальное число слов в синтетическом примере
BATCH_SIZE = 256
EPOCHS = 12
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SAVE_PATH = "best_lstm_weights.pt"
META_PATH = SAVE_PATH + ".meta.json"

UPLOADED_PATH = "dataset_1937770_3.txt"
PSEUDO_TOP_PERCENT = 0.30  # столько процентов топ псевдо лейблов берем
MAX_PSEUDO = 60000
PSEUDO_WEIGHT = 0.6

# beam/voting
BEAM_WIDTH = 30
BEAM_MAX_WORD_LEN = 30
BEAM_ALPHA = 1.0
BEAM_BETA = 1.5
VOTE_THRESHOLD = 2

##Генерация синтетики

In [10]:
def make_synthetic_sentences(n_sentences=20000, max_words=8, langs=('ru','en')):
    """
    генерирует n_sentences синтетических предложений из популярных слов
    и добавляет случайный шум
    """
    words = []
    per_lang = max(100000 // len(langs), 1000)
    for lang in langs:
        words += top_n_list(lang, per_lang)

    words = [w for w in words if " " not in w and len(w)<=25]

    if len(words) < 1000: # если получилось мало слов, то дублируем
        words = (words * 10)[:20000]

    sentences = []
    for _ in range(n_sentences):
        k = random.randint(1, max_words)
        chosen = random.choices(words, k=k)
        for i in range(len(chosen)):
            if random.random() < 0.06:
                chosen[i] = chosen[i].capitalize() # делаем буквы большими
            if random.random() < 0.03: # добавляем число к слову
                chosen[i] = chosen[i] + ' ' + str(random.randint(1, 999))
            if random.random() < 0.02 and i != len(chosen) - 1:
                chosen[i] = chosen[i] + "," # ставим запятую

        sentence = " ".join(chosen)
        if random.random() < 0.05:
            sentence += "."
        sentences.append(sentence)
    return sentences

def sentence_to_input_and_labels(sentence):
    """
    взять предложение с пробелами и вернуть предлоэение без пробелов с лейблами
    """
    input_text = sentence.replace(" ", "")
    labels = []
    for i, ch in enumerate(sentence):
        if ch == " ":
            continue
        next_is_space = (i + 1 < len(sentence) and sentence[i + 1] == " ")
        labels.append(1 if next_is_space else 0)
    assert len(input_text) == len(labels)
    return input_text, labels

##Обучение

In [11]:
print("Generating synthetic sentences...")
sentences = make_synthetic_sentences(n_sentences=N_SYNTHETIC, max_words=MAX_WORDS)
pairs = []
for s in sentences:
    inp, labs = sentence_to_input_and_labels(s)
    if len(inp) > 250:
         continue
    pairs.append((inp, labs))
print("Synthetic generated:", len(pairs))

Generating synthetic sentences...
Synthetic generated: 120000


In [12]:
pseudo_pairs = []
print("Pseudo-labeling file...")
pseudo_pairs = pseudo_label_file(UPLOADED_PATH, top_percent=PSEUDO_TOP_PERCENT, max_pseudo=MAX_PSEUDO)
print("Pseudo selected:", len(pseudo_pairs))


Pseudo-labeling file...
Pseudo selected: 301


In [13]:
combined = pairs.copy()
combined.extend(pseudo_pairs)
random.shuffle(combined)
n = len(combined)
n_train = int(0.85 * n)
n_val = int(0.07 * n)

train_pairs = combined[:n_train]
val_pairs = combined[n_train:n_train+n_val]
test_pairs = combined[n_train+n_val:]
print(f"Train/Val/Test sizes: {len(train_pairs)}/{len(val_pairs)}/{len(test_pairs)}")
print(f"Pseudo in pool: {len(pseudo_pairs)} (weight {PSEUDO_WEIGHT})")

Train/Val/Test sizes: 102255/8421/9625
Pseudo in pool: 301 (weight 0.6)


In [14]:
ds_train = CharDataset(train_pairs, char2idx=None)
char2idx = ds_train.char2idx
ds_val = CharDataset(val_pairs, char2idx=char2idx)
ds_test = CharDataset(test_pairs, char2idx=char2idx)

In [15]:
ds_all_pairs = train_pairs
ds_all = CharDataset(ds_all_pairs, char2idx=char2idx)
pseudo_set = set([p[0] for p in pseudo_pairs])
weights = [PSEUDO_WEIGHT if s in pseudo_set else 1.0 for s, _ in ds_all.pairs]
sampler = WeightedRandomSampler(weights, num_samples=len(weights), replacement=True)

In [16]:
train_loader = DataLoader(ds_all, batch_size=BATCH_SIZE, sampler=sampler, collate_fn=collate_fn)
val_loader = DataLoader(ds_val, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(ds_test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)


In [17]:
vocab_size = len(ds_train.idx2char)

model = BiLSTM(vocab_size=vocab_size, emb_dim=128, hidden_dim=256, n_layers=2, dropout=0.2)
model.to(DEVICE)

BiLSTM(
  (emb): Embedding(279, 128, padding_idx=0)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [18]:
num_pos = sum(sum(l) for _, l in train_pairs) # общее число 1 в трейне
num_total = sum(len(s) for s, _ in train_pairs) # всего токенов
num_neg = num_total - num_pos # общее число 0
pos_w = float(num_neg) / (num_pos + 1e-12) if num_pos > 0 else 1.0 # выбор веса для положительного класса
pos_weight = torch.tensor([pos_w], dtype=torch.float, device=DEVICE)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=1e-5)

best_val_f1 = -1.0
best_thresh = 0.5

In [19]:
for epoch in range(1, EPOCHS+1):
    t0 = time.time()
    train_loss = train_epoch(model, train_loader, optimizer, DEVICE, criterion, grad_clip=1.0)
    val_p, val_r, val_f1, val_t = eval_model(model, val_loader, DEVICE)
    scheduler.step()
    t1 = time.time()
    print(f"Epoch {epoch}/{EPOCHS} | train_loss={train_loss:.6f} | val P={val_p:.4f} R={val_r:.4f} F1={val_f1:.4f} T={val_t:.3f} | time={t1-t0:.1f}s")
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_thresh = val_t
        torch.save(model.state_dict(), SAVE_PATH)
        meta = {"char2idx": ds_train.char2idx, "idx2char": ds_train.idx2char, "best_thresh": float(best_thresh), "epoch": epoch, "val_f1": float(val_f1), "pseudo_used": len(pseudo_pairs)}
        with open(META_PATH, "w", encoding="utf-8") as f:
             json.dump(meta, f, ensure_ascii=False)
        print("Saved best model.")

Epoch 1/12 | train_loss=0.449700 | val P=0.8970 R=0.8559 F1=0.8760 T=0.840 | time=33.8s
Saved best model.
Epoch 2/12 | train_loss=0.231143 | val P=0.9306 R=0.8679 F1=0.8981 T=0.870 | time=32.0s
Saved best model.
Epoch 3/12 | train_loss=0.196656 | val P=0.9383 R=0.8820 F1=0.9093 T=0.920 | time=34.1s
Saved best model.
Epoch 4/12 | train_loss=0.174070 | val P=0.9458 R=0.8874 F1=0.9156 T=0.920 | time=34.7s
Saved best model.
Epoch 5/12 | train_loss=0.160042 | val P=0.9401 R=0.9004 F1=0.9198 T=0.880 | time=34.1s
Saved best model.
Epoch 6/12 | train_loss=0.149950 | val P=0.9361 R=0.9087 F1=0.9222 T=0.870 | time=34.0s
Saved best model.
Epoch 7/12 | train_loss=0.139334 | val P=0.9411 R=0.9120 F1=0.9263 T=0.890 | time=34.3s
Saved best model.
Epoch 8/12 | train_loss=0.135206 | val P=0.9437 R=0.9131 F1=0.9282 T=0.870 | time=34.6s
Saved best model.
Epoch 9/12 | train_loss=0.127822 | val P=0.9464 R=0.9142 F1=0.9300 T=0.880 | time=34.2s
Saved best model.
Epoch 10/12 | train_loss=0.124864 | val P=0.95

In [20]:
if os.path.exists(SAVE_PATH):
    state_dict = torch.load(SAVE_PATH, map_location=DEVICE)
    model.load_state_dict(state_dict)
if os.path.exists(META_PATH):
    with open(META_PATH, "r", encoding="utf-8") as f:
        meta = json.load(f)
    best_thresh = float(meta.get("best_thresh", best_thresh))

In [21]:
wf_dict = build_wordfreq_dict(langs=('ru','en'), top_k=50000)

In [22]:
model.eval()
results = []
with torch.no_grad():
    for ids_p, labels_p, mask, raws in test_loader:
        ids_p = ids_p.to(DEVICE)
        logits = model(ids_p, mask=mask.to(DEVICE))
        probs_batch = torch.sigmoid(logits).cpu().numpy()

        for raw, probs in zip(raws, probs_batch):
            raw_str = raw
            # первый кандидат
            cand_dp = smart_segment(raw_str)
            # второй кандидат
            labels_thr = (probs > best_thresh).astype(int).tolist()
            out_chars = []
            for ch, lb in zip(raw_str, labels_thr):
                out_chars.append(ch)
                if lb == 1:
                     out_chars.append(" ")
            cand_nn_thr = "".join(out_chars)
            # третий кандидат
            cand_nn_beam = beam_segment_from_probs(raw_str, probs.tolist(),
                                                    beam_width=BEAM_WIDTH,
                                                    max_word_len=BEAM_MAX_WORD_LEN,
                                                    alpha=BEAM_ALPHA, beta=BEAM_BETA)

            final_seg = majority_vote_segment([cand_dp, cand_nn_thr, cand_nn_beam],
                                              raw_str, threshold_votes=VOTE_THRESHOLD)
            results.append((raw_str, cand_dp, cand_nn_thr, cand_nn_beam, final_seg))


In [23]:
all_preds = []
all_golds = []

gold_map = {s: labs for s, labs in test_pairs}
preds_flat = []
golds_flat = []
for raw, cand_dp, cand_nn_thr, cand_nn_beam, final_seg in results:
    pred_vec = seg_to_boundary_vector(final_seg, raw)
    gold = gold_map.get(raw, None)

    if gold is None:
        continue

    preds_flat.extend(pred_vec)
    golds_flat.extend(gold)

preds_arr = np.array(preds_flat)
golds_arr = np.array(golds_flat)
tp = int(((preds_arr==1) & (golds_arr==1)).sum())
fp = int(((preds_arr==1) & (golds_arr==0)).sum())
fn = int(((preds_arr==0) & (golds_arr==1)).sum())
prec = tp / (tp + fp + 1e-12)
rec = tp / (tp + fn + 1e-12)
f1 = 2 * prec * rec / (prec + rec + 1e-12)
print(f"\nFINAL MAJORITY VOTE TEST => P={prec:.4f} R={rec:.4f} F1={f1:.4f}")


FINAL MAJORITY VOTE TEST => P=0.6412 R=0.9748 F1=0.7736


In [24]:
demo = [
    "Аятанцуюпьяныйподдождём",
    "Курилитолькосиги,есличё",
    "Нашобразжизнистанетпалачом",
    "Нискольконежалеюниочём",
    "Грустно,когдарасписанывсемаршруты",
    "Когданачасахважныминуты",
    "куплюайфон14про",
    "ищудомвПодмосковье",
    "сдаюквартирусмебельюитехникой",
    "новыйдивандоставканедорого",
    "отдамдаромкошку",
    "работавМосквеудаленно",
    "куплютелевизорPhilips",
    "ищугрузчиковдляпереезда"
]
for s in demo:
    raw = s.replace(" ", "")
    cand_dp = smart_segment(raw)
    cand_nn_thr, _, probs = predict_and_segment_threshold(model, DEVICE, raw, ds_train.char2idx, threshold=best_thresh)
    cand_nn_beam = beam_segment_from_probs(raw, probs.tolist(),
                                            beam_width=BEAM_WIDTH, max_word_len=BEAM_MAX_WORD_LEN,
                                            alpha=BEAM_ALPHA, beta=BEAM_BETA)
    final = majority_vote_segment([cand_dp, cand_nn_thr, cand_nn_beam], raw, threshold_votes=VOTE_THRESHOLD)
    print(f"RAW: {raw}")
    print("DP:        ", cand_dp)
    print("NN_thr:    ", cand_nn_thr)
    print("NN_beam:   ", cand_nn_beam)
    print("MAJORITY:  ", final)
    print("---")

RAW: Аятанцуюпьяныйподдождём
DP:         А я танцую пьяный под дождём
NN_thr:     Аятанцую пьяный поддождём 
NN_beam:    Ая танцую пьяный под дождём
MAJORITY:   Ая танцую пьяный под дождём 
---
RAW: Курилитолькосиги,есличё
DP:         Курили только с и г и,если чё
NN_thr:     Курили только сиги, есличё
NN_beam:    Курили только сиг и,если чё
MAJORITY:   Курили только сиг и,если чё 
---
RAW: Нашобразжизнистанетпалачом
DP:         Наш образ жизни станет палачом
NN_thr:     Нашобразжизнистанет палачом 
NN_beam:    Наш образ жизни станет палачом
MAJORITY:   Наш образ жизни станет палачом 
---
RAW: Нискольконежалеюниочём
DP:         Ни сколько не жалею ни о чём
NN_thr:     Нисколько нежале юниочём 
NN_beam:    Ни сколько не жалею ни о чём
MAJORITY:   Ни сколько не жалею ни о чём 
---
RAW: Грустно,когдарасписанывсемаршруты
DP:         Грустно,когда расписаны все маршруты
NN_thr:     Грустно, когда расписаны всемар шруты
NN_beam:    Грустно,когда расписаны все маршруты
MAJORITY:   Грустно,ког

##Экспорт данных

In [25]:
from typing import List

def indices_from_segmented(segmented):
    """
    принимает строку с пробелами
    возвращает список индексов в строке без пробелов, перед которыми вставляется пробел
    """
    if segmented is None:
        return []
    parts = [p for p in segmented.strip().split(" ") if p != ""]
    indices = []
    pos = 0
    for i, w in enumerate(parts):
        if i == 0:
            pos += len(w)
            continue
        indices.append(pos)
        pos += len(w)
    return indices

In [26]:
labels_res = []
for line in test_data:

    cand_dp = smart_segment(line)
    cand_nn_thr, _, probs = predict_and_segment_threshold(model, DEVICE, line, ds_train.char2idx, threshold=best_thresh)
    cand_nn_beam = beam_segment_from_probs(line, probs.tolist(),
                                            beam_width=BEAM_WIDTH, max_word_len=BEAM_MAX_WORD_LEN,
                                            alpha=BEAM_ALPHA, beta=BEAM_BETA)
    final = majority_vote_segment([cand_dp, cand_nn_thr, cand_nn_beam], line, threshold_votes=VOTE_THRESHOLD)
    labels_res.append(indices_from_segmented(final))
result = pd.DataFrame(columns=['id', 'predicted_positions'])
for i in range(len(labels_res)):
    result.loc[i] = [i, str(labels_res[i])]
result.to_csv('result.csv', index=False)
