In [9]:
!pip install -q pytorch-crf

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import os

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Используется устройство: {device}")

[0mИспользуется устройство: cuda


In [12]:
DATA_PATH = "/kaggle/input/avito-task-segment/train.parquet"
SAMPLE_SIZE = 320000 
MAX_LEN = 128 

print(f"Загружаем parquet-файл '{DATA_PATH}'...")
full_df = pd.read_parquet(DATA_PATH)

print(f"Берем случайную выборку из {SAMPLE_SIZE} строк...")
df = full_df.sample(n=SAMPLE_SIZE, random_state=42)

def create_bio_labels_pd(row):
    text_no_spaces = row["sentence_without_spaces"]
    space_positions = set(pos - 1 for pos in row["true_positions"])
    labels = [1 if i in space_positions else 0 for i in range(len(text_no_spaces))]
    return text_no_spaces, labels

print("Создаем данные (текст, метки на символ)...")
processed_data = [create_bio_labels_pd(row) for index, row in tqdm(df.iterrows(), total=len(df))]
texts = [item[0] for item in processed_data]
labels = [item[1] for item in processed_data]

all_text = "".join(texts)
chars = sorted(list(set(all_text)))
char_to_id = {c: i + 2 for i, c in enumerate(chars)}
char_to_id["<PAD>"] = 0
char_to_id["<UNK>"] = 1
VOCAB_SIZE = len(char_to_id)
print(f"Размер словаря символов: {VOCAB_SIZE}")

print(f"Преобразуем тексты в последовательности ID и обрезаем до {MAX_LEN} символов...")
X = [[char_to_id.get(c, char_to_id["<UNK>"]) for c in text] for text in texts]
y = [[label for label in label_seq] for label_seq in labels]

X_padded = np.array([seq[:MAX_LEN] + [char_to_id["<PAD>"]] * (MAX_LEN - len(seq)) for seq in X])
y_padded = np.array([seq[:MAX_LEN] + [-100] * (MAX_LEN - len(seq)) for seq in y])

X_train, X_val, y_train, y_val = train_test_split(X_padded, y_padded, test_size=0.15, random_state=42)

class WordSegDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

train_dataset = WordSegDataset(X_train, y_train)
val_dataset = WordSegDataset(X_val, y_val)

BATCH_SIZE = 256
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

print(f"Размеры выборок: train={len(train_dataset)}, val={len(val_dataset)}")

Загружаем parquet-файл '/kaggle/input/avito-task-segment/train.parquet'...
Берем случайную выборку из 320000 строк...
Создаем данные (текст, метки на символ)...


  0%|          | 0/320000 [00:00<?, ?it/s]

Размер словаря символов: 675
Преобразуем тексты в последовательности ID и обрезаем до 128 символов...
Размеры выборок: train=272000, val=48000


In [13]:
EMBEDDING_DIM = 64
LSTM_UNITS = 128
EPOCHS = 3
LEARNING_RATE = 0.001

class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_labels):
        super(BiLSTM_CRF, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim * 2, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, x):
        mask = (x != 0).bool()
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        emissions = self.hidden2tag(lstm_out)
        return emissions, mask

    def loss(self, x, tags):
        emissions, mask = self.forward(x)
        active_tags = torch.where(mask, tags, torch.tensor(0).type_as(tags))
        return -self.crf(emissions, active_tags, mask=mask, reduction='mean')
    
    def predict(self, x):
        emissions, mask = self.forward(x)
        return self.crf.decode(emissions, mask=mask)

model_bilstm_crf = BiLSTM_CRF(VOCAB_SIZE, EMBEDDING_DIM, LSTM_UNITS, 2).to(device)
optimizer = torch.optim.Adam(model_bilstm_crf.parameters(), lr=LEARNING_RATE)

print("Начинаем обучение модели Bi-LSTM+CRF на PyTorch...")
for epoch in range(EPOCHS):
    model_bilstm_crf.train()
    train_loss = 0
    for texts, labels in tqdm(train_loader, desc=f"Эпоха {epoch+1}/{EPOCHS}"):
        texts, labels = texts.to(device), labels.to(device)
        
        mask = (labels != -100)
        if not mask.any(): continue

        optimizer.zero_grad()
        loss = model_bilstm_crf.loss(texts, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    model_bilstm_crf.eval()
    val_loss = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            mask = (labels != -100)
            if not mask.any(): continue
            loss = model_bilstm_crf.loss(texts, labels)
            val_loss += loss.item()
    
    print(f"Эпоха {epoch+1}: Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

print("\nОбучение завершено.")

Начинаем обучение модели Bi-LSTM+CRF на PyTorch...


Эпоха 1/3:   0%|          | 0/1063 [00:00<?, ?it/s]

Эпоха 1: Train Loss: 5.4869, Val Loss: 3.3527


Эпоха 2/3:   0%|          | 0/1063 [00:00<?, ?it/s]

Эпоха 2: Train Loss: 2.8883, Val Loss: 2.7102


Эпоха 3/3:   0%|          | 0/1063 [00:00<?, ?it/s]

Эпоха 3: Train Loss: 2.4602, Val Loss: 2.4700

Обучение завершено.


In [18]:
def predict_spaces(text: str, model, char_to_id_map, max_len, device):
    if not text or not isinstance(text, str):
        return []

    model.eval()
    
    token_ids = [char_to_id_map.get(c, char_to_id_map["<UNK>"]) for c in text[:max_len]]
    
    input_tensor = torch.tensor([token_ids], dtype=torch.long).to(device)
    
    with torch.no_grad():
        predicted_labels_list = model.predict(input_tensor)
        
    if not predicted_labels_list or not predicted_labels_list[0]:
        return []
    
    predicted_labels = predicted_labels_list[0] # Предсказания для первой (и единственной) строки

    positions = []
    for i in range(len(text[:max_len])):
        if i < len(predicted_labels) and predicted_labels[i] == 1: # 1 - метка "нужен пробел"
            positions.append(i + 1) # Позиция - это индекс СЛЕДУЮЩЕГО символа
            
    return positions

example_row = df.iloc[10] # iloc для pandas
sample_text = example_row['sentence_without_spaces']
true_pos = example_row['true_positions']

predicted_pos = predict_spaces(sample_text, model_bilstm_crf, char_to_id, MAX_LEN, device)

print(f"Пример текста: {sample_text}")
print(f"Истинные позиции: {true_pos}")
print(f"Предсказанные позиции: {predicted_pos}")
print("\nЯчейка 4: Функция инференса готова.")

Пример текста: нимолоденькиедевушки,ниихмамы...
Истинные позиции: [ 2 13 21 23 25]
Предсказанные позиции: [2, 13, 21, 25]

Ячейка 4: Функция инференса готова.


In [21]:
TEST_PATH = "/kaggle/input/avito-task-segment/test.parquet"
test_df_eval = pd.read_parquet(TEST_PATH)

print("Начинаем предсказание на тестовой выборке для оценки F1...")
sentences_to_eval = test_df_eval["sentence_without_spaces"].tolist()
true_positions_eval = test_df_eval["true_positions"].tolist()

predicted_positions_eval = [
    predict_spaces(s, model_bilstm_crf, char_to_id, MAX_LEN, device)
    for s in tqdm(sentences_to_eval, desc="Оценка модели")
]

def calculate_f1_corrected(predicted: list, true: list) -> float:
    if len(predicted) == 0 and len(true) == 0: return 1.0
    pred_set = set(predicted)
    true_set = set(true)
    tp = len(pred_set.intersection(true_set))
    precision = tp / len(pred_set) if pred_set else 0.0
    recall = tp / len(true_set) if true_set else 0.0
    if precision + recall == 0: return 0.0
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

f1_scores = [
    calculate_f1_corrected(pred, true)
    for pred, true in zip(predicted_positions_eval, true_positions_eval)
]

mean_f1 = sum(f1_scores) / len(f1_scores) if f1_scores else 0.0
print("-" * 50)
print(f"ИТОГОВЫЙ РЕЗУЛЬТАТ (Bi-LSTM+CRF):")
print(f"Средний F1-score на тестовой выборке: {mean_f1:.4f}")
print("-" * 50)

Начинаем предсказание на тестовой выборке для оценки F1...


Оценка модели:   0%|          | 0/37500 [00:00<?, ?it/s]

--------------------------------------------------
ИТОГОВЫЙ РЕЗУЛЬТАТ (Bi-LSTM+CRF):
Средний F1-score на тестовой выборке: 0.8734
--------------------------------------------------


In [23]:
SUBMISSION_TEMPLATE_PATH = "/kaggle/input/avito-task-segment/submission.parquet"
submission_df = pd.read_parquet(SUBMISSION_TEMPLATE_PATH)

predicted_positions_for_submission = []

print("Начинаем генерацию предсказаний для финального сабмита...")
for text in tqdm(submission_df['text_no_spaces'], desc="Генерация сабмита"):
    positions = predict_spaces(text, model_bilstm_crf, char_to_id, MAX_LEN, device)
    
    positions_str = str(sorted(positions))
    predicted_positions_for_submission.append(positions_str)

submission_df['predicted_positions'] = predicted_positions_for_submission
final_submission_df = submission_df[['id', 'predicted_positions']]
final_submission_df.to_csv('submission.csv', index=False)

print("\nФайл submission.csv успешно создан и готов к отправке!")
print("Пример содержимого:")
print(final_submission_df.head())

Начинаем генерацию предсказаний для финального сабмита...


Генерация сабмита:   0%|          | 0/1005 [00:00<?, ?it/s]


Файл submission.csv успешно создан и готов к отправке!
Пример содержимого:
   id      predicted_positions
0   0              [5, 10, 12]
1   1                   [6, 7]
2   2  [4, 12, 13, 20, 21, 29]
3   3          [5, 10, 18, 26]
4   4              [5, 10, 15]
