# Загрузка библиотек и словарей

Предварительно поместите файл для теста в директорию с ноутбуком. Я использовал Google Colab и загружал файл в "Файлы" через кнопку "Загрузить в сессионное хранилище".

In [1]:
!pip install pymorphy3

Collecting pymorphy3
  Downloading pymorphy3-2.0.4-py3-none-any.whl.metadata (2.4 kB)
Collecting dawg2-python>=0.8.0 (from pymorphy3)
  Downloading dawg2_python-0.9.0-py3-none-any.whl.metadata (7.5 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.4-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.1/54.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dawg2_python-0.9.0-py3-none-any.whl (9.3 kB)
Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymorphy3-dicts-ru, dawg2-python, pymorphy3
Successfully installed dawg2-python-0.9.0 pymorphy3-2.0.4 pymorphy3-dicts-ru-2.4.417150.4580142


В качестве словаря для русских слов я использовал корпус OpenCorpora, для английского языка брал словарь из библиотеки nltk. Для того чтобы обрабатывать морфемы слов и приводить их к начальной форме я использовал pymorphy3

In [2]:
import requests
import re
from itertools import product
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from collections import defaultdict
def load_russian_words():
          url = "https://raw.githubusercontent.com/danakt/russian-words/master/russian.txt"
          response = requests.get(url)
          if response.status_code == 200:
              words = set()
              for word in response.text.split('\n'):
                  word = word.strip().lower()
                  if word and re.match('^[а-яё]+$', word) and len(word) > 1:
                      words.add(word)
              print(f"Загружено {len(words)} русских слов")
              return words
corpora = load_russian_words()

Загружено 1525391 русских слов


In [3]:
import nltk
nltk.download('words')
from nltk.corpus import words as nltk_words
import pymorphy3

morph=pymorphy3.MorphAnalyzer()
en_dict=set(nltk_words.words())

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


# Создание классов для сегментатора строки и валидатора слов

Проверка последовательности символов на английское слово, русское слово, словоформу русского слова

In [4]:
class WordValidator:
    def __init__(self, ru_dict, en_dict, morph):
        self.russian_words=ru_dict
        self.english_words=en_dict
        self.morph_analyzer = morph
        self.russian_normal_forms={self.morph_analyzer.parse(word)[0].normal_form for word in ru_dict}

    def is_valid_word(self, word):

        word_lower = word.lower()

        if word_lower in self.english_words:
            return True

        if word_lower in self.russian_words:
            return True

        if self.morph_analyzer and self.russian_normal_forms:
            try:
                parsed = self.morph_analyzer.parse(word_lower)
                if parsed:
                    normal_form = parsed[0].normal_form
                    return normal_form in self.russian_normal_forms
            except Exception:
                pass

        return False


validator=WordValidator(corpora, en_dict, morph)

Алгоритм: динамически разбивать строку и оценивать её разбиения с помощью perplexity (метрики естественности языка) и также построить разбиение с помощью жадного алгоритма. Для оценки перплексии я использовал такие модели как rugpt2 и SmolLM3.


In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from collections import defaultdict

class TextRestorer:
    def __init__(self, corpora, validator, model_name='sberbank-ai/rugpt3small_based_on_gpt2'):
      #HuggingFaceTB/SmolLM3-3B
      #sberbank-ai/rugpt3small_based_on_gpt2
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.corpora = set(corpora)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if self.device.type == "cuda" else torch.float32
        )
        self.model.to(self.device)
        self.model.eval()
        self.validator = validator


    def is_valid_word(self, word: str) -> bool:
        return self.validator.is_valid_word(word)

    def dp_segmentation(self, text):
        n = len(text)
        if n == 0:
            return []

        dp = [None] * (n + 1)
        dp[0] = []

        for i in range(1, n + 1):
            for j in range(max(0, i - 20), i):
                word = text[j:i]
                if dp[j] is not None and word in self.corpora:
                    current_segmentation = dp[j] + [word]

                    if dp[i] is None or self.is_better_segmentation(current_segmentation, dp[i]):
                        dp[i] = current_segmentation

        if dp[n] is not None:
            return dp[n]
        else:
            return self.greedy_segmentation(text)

    def is_better_segmentation(self, new_seg, old_seg):
        return len(new_seg) < len(old_seg) #Тут можно выбрать другую метрику, например оценить вероятности

    def attach_punctuation(self, segments):
        if not segments:
            return []

        result = []
        i = 0

        while i < len(segments):
            current_word = segments[i]
            if i < len(segments) - 1 and re.match(r'^[\W_]+$', segments[i+1]):
                punctuation_group = []
                j = i + 1
                while j < len(segments) and re.match(r'^[\W_]+$', segments[j]):
                    punctuation_group.append(segments[j])
                    j += 1

                combined = current_word + ''.join(punctuation_group)
                result.append(combined)
                i = j
            else:
                result.append(current_word)
                i += 1

        return result

    def get_phrase_probability(self, text):
        inputs = self.tokenizer(text, return_tensors="pt", max_length=128, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs, labels=inputs['input_ids'])
            return torch.exp(-outputs.loss).item()

    def greedy_segmentation(self, text):
        n = len(text)
        if n == 0:
            return []

        for length in range(min(20, n), 0, -1):
            word = text[:length]
            if word in self.corpora:
                remaining_segmentation = self.greedy_segmentation(text[length:])
                if remaining_segmentation is not None:
                    return [word] + remaining_segmentation

        return [text[0]] + self.greedy_segmentation(text[1:]) if n > 1 else [text]

    def restore_spaces(self, text):
        segments1 = self.dp_segmentation(text)
        segments2 = self.greedy_segmentation(text)

        segments1_with_punct = self.attach_punctuation(segments1)
        segments2_with_punct = self.attach_punctuation(segments2)

        restored1_text = ' '.join(segments1_with_punct)
        restored2_text = ' '.join(segments2_with_punct)

        probability1 = self.get_phrase_probability(restored1_text)
        probability2 = self.get_phrase_probability(restored2_text)

        probability = probability1 if probability1>probability2 else probability2
        restored_text=restored1_text if probability1>probability2 else restored2_text
        return {
            'restored_text': restored_text,
            'probability': probability
        }



In [6]:
restorer = TextRestorer(corpora,validator)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

# Тестирование

In [7]:
test_cases = [
      "новыйфильтрдляводы",
      "сдаюквартирусмебельюитехникой",
      "ищудомработницу,центр",
      "Лишьоднаона",
      "Имнеживётся"
      ]
for test_text in test_cases:
      result = restorer.restore_spaces(test_text)

      print(f"Исходный текст: {test_text}")
      print(f"Восстановленный: {result['restored_text']}")
      print(f"Вероятность: {result['probability']:.6f}")

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Исходный текст: новыйфильтрдляводы
Восстановленный: новый фильтр для воды
Вероятность: 0.022389
Исходный текст: сдаюквартирусмебельюитехникой
Восстановленный: сдаю квартиру сме белью ите х никой
Вероятность: 0.000094
Исходный текст: ищудомработницу,центр
Восстановленный: ищу домработницу, центр
Вероятность: 0.004076
Исходный текст: Лишьоднаона
Восстановленный: Л ишь одна она
Вероятность: 0.000197
Исходный текст: Имнеживётся
Восстановленный: И мне живётся
Вероятность: 0.003683


In [8]:
restorer.restore_spaces("Однаонаповсюду,гдебынескрылсяя")

{'restored_text': 'О дна она повсюду, где бы неск рылся я',
 'probability': 0.0002628164365887642}

In [9]:
import pandas as pd

def read_special_csv(filename):
    ids = []
    texts = []

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            comma_pos = line.find(',')
            if comma_pos != -1:
                id_val = line[:comma_pos]
                text_val = line[comma_pos+1:]
                ids.append(id_val)
                texts.append(text_val)
            else:
                ids.append(None)
                texts.append(line)

    return pd.DataFrame({'id': ids, 'text': texts})




In [11]:
df = read_special_csv('dataset_1937770_3.txt')[1:]

In [12]:
def find_space_indices(text):
    return [i for i, char in enumerate(text) if char == ' ']

df["text"]=df["text"].apply(lambda x:find_space_indices(restorer.restore_spaces(x)['restored_text'].lower()))

In [13]:
df=df.rename(columns={"text": "predicted_positions"})
df["predicted_positions"]=df["predicted_positions"].apply(lambda x:str(x))
df.sample(10)

Unnamed: 0,id,predicted_positions
684,683,"[1, 6, 12, 20, 25, 31, 35, 41, 46]"
798,797,"[1, 3, 6, 10, 14, 23, 30, 38]"
266,265,"[3, 12]"
574,573,"[1, 5, 13, 17, 27, 30, 34]"
307,306,"[5, 12]"
289,288,"[3, 13, 17]"
694,693,"[1, 5, 8]"
149,148,"[3, 13, 19, 23, 26]"
475,474,"[1, 6, 24]"
583,582,"[1, 4, 7, 16]"


In [14]:
df.to_csv("submission.csv",  index=False)