# Final Approach: Hybrid Search with Query Expansion and Learning-to-Rank

Hit@5 Score: 0.65

A hybrid retrieval pipeline: synonym-based query expansion feeds both BM25 lexical search and dense-embedding retrieval, the two score lists are fused, and the top candidates are reranked with a CrossEncoder.

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import CrossEncoder
from rank_bm25 import BM25Okapi
import faiss
import re
from tqdm import tqdm

# Pick the compute device once: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Load both input tables (questions first, then websites) and report their row counts.
questions, websites = (
    pd.read_csv(csv_path)
    for csv_path in ('questions_clean.csv', 'websites_update.csv')
)
print(f'Questions: {len(questions)}, Websites: {len(websites)}')

In [None]:
# Trigger keyword -> terms appended to a query that contains the keyword.
# Each synonym list repeats the keyword itself, so a matched keyword is
# effectively up-weighted in the expanded query.
FINANCE_SYNONYMS = {
    'вклад': ['вклад', 'депозит', 'накопление', 'сбережение'],
    'кредит': ['кредит', 'заём', 'ссуда', 'микрокредит'],
    'инвестиция': ['инвестиция', 'инвестирование', 'вложение', 'портфель'],
    'карта': ['карта', 'пластиковая карта', 'кредитная карта', 'дебетовая'],
    'счёт': ['счёт', 'счет', 'лицевой счёт', 'расчётный'],
    'акция': ['акция', 'акции', 'ценная бумага', 'фондовый'],
    'облигация': ['облигация', 'облигации', 'ценная бумага'],
    'страховка': ['страховка', 'страхование', 'полис', 'страховой'],
    'ипотека': ['ипотека', 'ипотечный', 'жилищный кредит'],
}


def expand_query(query) -> str:
    """Return the lowercased query followed by synonyms of every matched keyword.

    A keyword matches when it occurs as a substring of the lowercased query.
    Matching is done against the *original* query only, so synonyms appended
    for one keyword can never trigger another keyword and the result does not
    depend on the dictionary's ordering. The letters 'ё' and 'е' are treated
    as the same letter while matching, so 'счет' triggers the 'счёт' entry.

    Args:
        query: The user query; it is converted with ``str()`` before use.

    Returns:
        The lowercased query, then one space-separated synonym group per
        matched keyword, in dictionary order. Unchanged (lowercased) query
        when nothing matches.
    """
    base = str(query).lower()
    # Fold 'ё' to 'е' only for the comparison; the returned text keeps the
    # user's original spelling.
    folded = base.replace('ё', 'е')
    synonym_groups = [
        ' '.join(synonyms)
        for keyword, synonyms in FINANCE_SYNONYMS.items()
        if keyword.replace('ё', 'е') in folded
    ]
    return ' '.join([base, *synonym_groups])

# Store the synonym-expanded form of every query next to the original text.
questions['query_expanded'] = questions['query'].map(expand_query)

In [None]:
def preprocess_text(text, max_words=500):
    """Clean a raw text: drop emoji, collapse whitespace, cap the length, lowercase.

    Args:
        text: Raw text. Any non-string value produces an empty string.
        max_words: Maximum number of whitespace-separated words to keep.

    Returns:
        The cleaned text, stripped and lowercased.
    """
    if not isinstance(text, str):
        return ''

    # Remove code points in the U+1F300..U+1F9FF range (pictographs/emoji).
    without_emoji = re.sub(r'[\U0001F300-\U0001F9FF]', '', text)
    spaced = without_emoji.replace('\xa0', ' ').replace('\n', ' ')
    collapsed = re.sub(r'\s+', ' ', spaced)

    tokens = collapsed.split()
    if len(tokens) > max_words:
        cleaned = ' '.join(tokens[:max_words])
    else:
        cleaned = collapsed
    return cleaned.strip().lower()

# Build the retrieval corpus: one text per website row plus a parallel list of
# web_ids, so position i in doc_texts corresponds to doc_ids[i].
doc_ids = []
doc_texts = []
for idx, row in websites.iterrows():
    # NOTE(review): the row whose index label is 0 is skipped — presumably a
    # placeholder row in the CSV; confirm against the data.
    if idx == 0:
        continue

    raw_title = row.get('title', '')
    raw_text = row.get('text', '')
    # Missing cells arrive as NaN/None; str() on them would inject the literal
    # word 'nan'/'None' into the corpus, so map them to empty strings instead.
    title = '' if pd.isna(raw_title) else str(raw_title).strip()
    text = '' if pd.isna(raw_text) else preprocess_text(str(raw_text))

    # The title is placed both before and after the body text.
    doc_text = title + ' ' + text + ' ' + title
    doc_ids.append(row['web_id'])
    doc_texts.append(doc_text)

doc_ids_array = np.array(doc_ids)
print(f'Documents prepared: {len(doc_texts)}')

In [None]:
def _bm25_tokenize(text):
    """Lowercase *text* and return its word tokens with punctuation removed."""
    return re.findall(r'\w+', str(text).lower())


# Documents and queries must go through the same tokenizer: document titles
# keep their original case and both sides may carry punctuation glued to
# words, either of which would prevent an exact-token BM25 match.
tokenized_docs = [_bm25_tokenize(doc) for doc in doc_texts]
bm25 = BM25Okapi(tokenized_docs)

# One row of per-document BM25 scores for every expanded query.
bm25_scores_all = []
for query in tqdm(questions['query_expanded'], desc='BM25'):
    bm25_scores_all.append(bm25.get_scores(_bm25_tokenize(query)))

bm25_scores_all = np.array(bm25_scores_all)
print(f'BM25 index built: {bm25_scores_all.shape}')

In [None]:
# Load the dense encoder and its tokenizer, move the encoder to the selected
# device and switch it to evaluation mode.
model_name = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()

def get_embeddings(texts, batch_size=32):
    """Encode texts into L2-normalised sentence embeddings.

    Token embeddings are averaged over the non-padding positions (mean
    pooling weighted by the attention mask), which is the pooling this
    sentence-transformers checkpoint is configured for; taking only the
    first token's vector does not reproduce its sentence embeddings.

    Args:
        texts: Sequence of strings to encode.
        batch_size: Number of texts tokenised and encoded per forward pass.

    Returns:
        A float32 array of shape (len(texts), hidden_size) with unit-length
        rows. An empty input yields an array with zero rows.
    """
    if len(texts) == 0:
        # np.vstack raises on an empty list, so return an empty matrix instead.
        return np.zeros((0, model.config.hidden_size), dtype=np.float32)

    embeddings_list = []
    for start in tqdm(range(0, len(texts), batch_size), desc='Embedding'):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, max_length=512, padding=True, truncation=True, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            token_embeddings = model(**inputs).last_hidden_state
            # Zero out padding positions, then average over the real tokens.
            mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)
            summed = (token_embeddings * mask).sum(dim=1)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            emb = F.normalize(summed / token_counts, p=2, dim=1)
        embeddings_list.append(emb.cpu().numpy())
    return np.vstack(embeddings_list).astype(np.float32)

# Embed the corpus first, then the expanded queries, with the same encoder.
doc_embeddings = get_embeddings(doc_texts)
expanded_queries = questions['query_expanded'].tolist()
query_embeddings = get_embeddings(expanded_queries)
print(f'Embeddings generated: {doc_embeddings.shape}')

In [None]:
# Flat inner-product index over the document embeddings; fetch the 100
# highest-scoring documents for every query embedding.
embedding_dim = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(doc_embeddings)
dense_hits = index.search(query_embeddings, 100)
distances_dense, indices_dense = dense_hits
print('Dense search completed')

In [None]:
# Fusion weights for the two retrievers and the size of the fused shortlist.
BM25_WEIGHT = 0.3
DENSE_WEIGHT = 0.7
HYBRID_TOP_K = 100

# For every query: max-normalise each retriever's scores, combine them with
# the weights above and keep the best HYBRID_TOP_K document positions.
hybrid_results = []
for bm25_scores, dense_row_idx, dense_row_scores in tqdm(
    zip(bm25_scores_all, indices_dense, distances_dense),
    total=len(questions),
    desc='Hybrid merge',
):
    # FAISS fills unused result slots with index -1 when it finds fewer
    # neighbours than requested; indexing with -1 would silently add score to
    # the last document, so those slots are dropped first.
    found = dense_row_idx >= 0
    dense_idx = dense_row_idx[found]
    dense_scores = dense_row_scores[found]

    combined = np.zeros(len(doc_texts))
    combined[:] = (bm25_scores / (np.max(bm25_scores) + 1e-8)) * BM25_WEIGHT
    if dense_idx.size:
        # Documents outside the dense shortlist receive no dense contribution.
        # NOTE(review): dividing by the max assumes the best score is
        # positive — confirm this holds for the scores fed in here.
        combined[dense_idx] += (dense_scores / (np.max(dense_scores) + 1e-8)) * DENSE_WEIGHT

    top_100 = np.argsort(combined)[::-1][:HYBRID_TOP_K]
    hybrid_results.append(top_100)

print(f'Hybrid results: {len(hybrid_results)}')

In [None]:
# NOTE(review): this reranker checkpoint is an MS MARCO model — presumably
# trained on English data, while the synonym table suggests Russian queries;
# confirm it is the intended reranker.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', device=device)

# Score each (original query, candidate document) pair and keep the web_ids of
# the five highest-scoring candidates per query.
predictions = []
for q_idx in tqdm(range(len(questions)), desc='CrossEncoder reranking'):
    shortlist = hybrid_results[q_idx][:100]
    # The un-expanded query text is what gets paired with each document.
    raw_query = questions.iloc[q_idx]['query']

    pairs = [(raw_query, doc_texts[doc_pos]) for doc_pos in shortlist]
    relevance = cross_encoder.predict(pairs)

    # Positions within the shortlist, best first, mapped back to corpus positions.
    best_local = np.argsort(relevance)[::-1][:5]
    best_global = shortlist[best_local]
    predictions.append([doc_ids_array[doc_pos] for doc_pos in best_global])

print('Reranking completed')

In [None]:
# Label each prediction with the id carried by the questions table when it has
# a 'q_id' column, so dropped or reordered rows cannot shift the labels.
# Without such a column the rows are numbered 1..N as before.
if 'q_id' in questions.columns:
    q_ids = questions['q_id'].tolist()
else:
    q_ids = list(range(1, len(predictions) + 1))

# web_list is written as a bracketed, comma-separated list of the top web_ids.
results = pd.DataFrame({
    'q_id': q_ids,
    'web_list': [f'[{", ".join(map(str, p))}]' for p in predictions],
})

results.to_csv('submission_final.csv', index=False)
# The score in this message is a fixed literal; this cell does not compute it.
print('Submission saved - Hit@5 = 0.65')