# Базовый подход v2.0: GTE с чанкингом на уровне слов

Результат (метрика Hit@5): 0.30

Улучшенная стратегия чанкинга: чанки по 350 слов с перекрытием 80% вместо разбиения по предложениям.

In [None]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import faiss
import re

# Prefer the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Устройство: {device}')

In [None]:
# Load the two input tables from the working directory. Later cells read the
# 'query' column of `questions` and the 'web_id', 'title' and 'text' columns
# of `websites`.
questions, websites = (
    pd.read_csv(csv_name)
    for csv_name in ('questions_clean.csv', 'websites_update.csv')
)
print(f'Вопросов: {len(questions)}, Сайтов: {len(websites)}')

In [None]:
def preprocess_text(text, max_words=500):
    """Normalise raw text and cap it at ``max_words`` words.

    Characters in the range U+1F300..U+1F9FF are removed, non-breaking
    spaces and newlines become ordinary spaces, and every run of whitespace
    collapses to a single space. Only the first ``max_words``
    whitespace-separated words are kept.

    Args:
        text: Value to clean. Anything that is not a ``str`` yields ``''``.
        max_words: Maximum number of words to keep.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ''
    without_pictographs = re.sub(r'[\U0001F300-\U0001F9FF]', '', text)
    flattened = without_pictographs.replace('\xa0', ' ').replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', flattened)
    tokens = collapsed.split()
    if len(tokens) <= max_words:
        return collapsed.strip()
    return ' '.join(tokens[:max_words]).strip()

def chunk_text(text, chunk_size=350, overlap=0.8):
    """Split ``text`` into overlapping windows of whitespace-separated words.

    Consecutive windows start ``round(chunk_size * (1 - overlap))`` words
    apart (at least 1). Sliding stops as soon as a window reaches the last
    word, so no chunk is merely a suffix of an earlier one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of words per chunk.
        overlap: Fraction of ``chunk_size`` shared by neighbouring chunks.

    Returns:
        A list of chunk strings. Text with fewer than ``chunk_size // 5``
        words (or text that produces no chunks) is returned unchanged as a
        single-element list.
    """
    words = text.split()
    if len(words) < chunk_size // 5:
        return [text]
    # round(), not int(): 350 * (1 - 0.8) evaluates to just under 70 in
    # floating point, and int() would truncate that to a step of 69.
    step = max(round(chunk_size * (1 - overlap)), 1)
    chunks = []
    total = len(words)
    for start in range(0, total, step):
        window = words[start:start + chunk_size]
        if window:
            chunks.append(' '.join(window))
            # This window already contains the final word; every later
            # window would be a strict suffix of it, so stop here instead
            # of emitting short, redundant fragments.
            if start + chunk_size >= total:
                break
    return chunks if chunks else [text]

# Build the chunk-level corpus as two parallel lists: doc_texts[i] is one
# chunk of a page and doc_ids[i] is the web_id of the page it came from.
doc_ids = []
doc_texts = []
for idx, row in websites.iterrows():
    # NOTE(review): the row labelled 0 is skipped, as in the original code;
    # presumably it is a placeholder/junk record — confirm against the data.
    if idx == 0:
        continue
    raw_title = row.get('title', '')
    raw_text = row.get('text', '')
    # Missing CSV cells are NaN; str(NaN) would put the literal word "nan"
    # into the indexed text, so treat missing values as empty strings.
    title = str(raw_title).strip() if pd.notna(raw_title) else ''
    text = preprocess_text(str(raw_text)) if pd.notna(raw_text) else ''
    # Join only the non-empty parts to avoid a stray leading/trailing space.
    combined = ' '.join(part for part in (title, text) if part)
    for chunk in chunk_text(combined):
        if chunk.strip():
            doc_ids.append(row['web_id'])
            doc_texts.append(chunk)

print(f'Всего чанков: {len(doc_texts)}')

In [None]:
# Load the multilingual GTE encoder onto the selected device.
# NOTE(review): the model card loads this checkpoint with
# trust_remote_code=True; confirm whether this environment needs it.
model = SentenceTransformer('Alibaba-NLP/gte-multilingual-base', device=device)

# Unit-normalise both documents and queries: the downstream index ranks by
# L2 distance, which matches cosine-similarity ranking only for unit-length
# vectors. Without normalisation the ranking is skewed by vector norm.
embeddings = model.encode(
    doc_texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
).astype('float32')
query_embeddings = model.encode(
    questions['query'].tolist(),
    batch_size=64,
    convert_to_numpy=True,
    normalize_embeddings=True,
).astype('float32')
print(f'Эмбеддинги: {embeddings.shape}')

In [None]:
# Exact (brute-force) L2 nearest-neighbour search over all chunk embeddings.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
# Over-fetch 100 chunks per query: several chunks of one page usually rank
# next to each other, so more than 5 hits are needed to find 5 distinct pages.
distances, indices = index.search(query_embeddings, 100)

doc_ids_array = np.array(doc_ids)
predictions = []
for chunk_hits in indices:
    # Walk the hits best-first and keep the first 5 *distinct* web_ids, so
    # the same page never occupies more than one of the 5 answer slots.
    top_5_web_ids = []
    for chunk_idx in chunk_hits:
        # FAISS pads the result with -1 when the index holds fewer than k
        # vectors; a negative index would silently pick the last document.
        if chunk_idx < 0:
            continue
        web_id = doc_ids_array[chunk_idx]
        if web_id in top_5_web_ids:
            continue
        top_5_web_ids.append(web_id)
        if len(top_5_web_ids) == 5:
            break
    # May hold fewer than 5 ids if the 100 hits cover fewer than 5 pages.
    predictions.append(top_5_web_ids)

# Render every prediction as the literal string "[id1, id2, ...]".
web_lists = [f'[{", ".join(map(str, p))}]' for p in predictions]
# NOTE(review): q_id is generated as 1..N in row order rather than read from
# the questions table — confirm this matches the expected submission ids.
q_ids = range(1, len(predictions) + 1)
results = pd.DataFrame({'q_id': q_ids, 'web_list': web_lists})

results.to_csv('submission_baseline_v2.csv', index=False)
# The score below is a hard-coded string; nothing in this code computes it.
print('Hit@5 = 0.30')