In [None]:
!pip install -U FlagEmbedding

In [None]:
!pip install razdel

In [None]:
!pip install rank-bm25 pymorphy3 nltk

In [None]:
!pip install faiss-cpu

In [None]:
!pip install -U langchain-community

In [None]:
!pip install langchain-huggingface sentence-transformers

In [None]:
import pandas as pd
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from typing import Callable
from langchain_core.documents import Document
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
import time
import numpy as np
from rank_bm25 import BM25Okapi
import pymorphy3
from nltk.corpus import stopwords
import nltk
import re
from razdel import tokenize
import string

In [None]:
# Fetch the NLTK stop-word corpus that stopwords.words('russian') reads in the following cells.
nltk.download('stopwords')

In [None]:
# Morphological analyzer used by the lemmatization step of the BM25 preprocessing.
morph = pymorphy3.MorphAnalyzer()

# Generic Russian stop words shipped with NLTK.
russian_stopwords = set(stopwords.words('russian'))

# Domain-specific words to ignore. The trailing numbers come from the original
# notes — presumably corpus frequencies; TODO confirm.
stop_words_rag = [
    'который',      # 7220
    'клиент',       # 16358
    'услуга',       # 7711
    'договор',      # 10123
    'условие',      # 5132
    'случай',       # 3951
    'день',         # 5435
    'месяц',        # 5352
    'год',          # 4836
    'срок',         # 4081
    'альфа-банк'    # 3966
]

# Combined stop-word set: NLTK words plus the domain list.
all_stopwords = russian_stopwords | set(stop_words_rag)

In [None]:
def preprocess_text_for_bm25(text, stopwords, use_lemmatization=True, remove_numbers=True):
    """
    Normalize a (possibly Markdown-formatted) text into a space-joined token string for BM25.

    Args:
        text: source text; any value that is not a ``str`` yields ``""``.
        stopwords: container of lower-case words to drop. Both the surface
            token and, when lemmatizing, its lemma are checked against it.
        use_lemmatization: replace every token with its pymorphy3 normal form
            (uses the module-level ``morph`` analyzer).
        remove_numbers: replace runs of digits with a space.

    Returns:
        The surviving tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    # 1. Strip Markdown markup
    text = re.sub(r'#+\s+', ' ', text)  # headings
    text = re.sub(r'\*{1,2}(.*?)\*{1,2}', r'\1', text)  # bold / italic
    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)  # links [text](url)
    text = re.sub(r'`{1,3}(.*?)`{1,3}', r'\1', text)  # code
    text = re.sub(r'-\s*\[[x\s]\]\s*', ' ', text)  # checkboxes
    text = re.sub(r'\|.*?\|', ' ', text)  # tables (drops the text between two pipes)

    # 2. Lower-case
    text = text.lower()

    # 3. Remove emoji (astral-plane characters) and ASCII smileys
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)
    text = re.sub(r'[;:]-?[\)\(/\\|\[\]]', '', text)

    # 4. Remove punctuation (hyphens are kept so compound words survive)
    text = re.sub(r'[%s]' % re.escape(string.punctuation.replace('-', '') + '«»—'), ' ', text)

    # 5. Remove digits
    if remove_numbers:
        text = re.sub(r'\d+', ' ', text)

    # 6. Collapse whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # 7. Tokenize and filter
    processed_tokens = []

    for token in text.split():
        # Tokens shorter than 2 characters carry no signal
        if len(token) < 2:
            continue

        # The stop-word test must run BEFORE the token is emitted; testing it
        # after the append (as before) filtered nothing.
        if token in stopwords:
            continue

        if not use_lemmatization:
            processed_tokens.append(token)
            continue

        try:
            lemma = morph.parse(token)[0].normal_form
        except Exception:
            # Best effort: keep the surface form when the analyzer fails
            lemma = token

        # Stop lists are usually written as lemmas, so an inflected form must
        # be matched through its lemma as well.
        if len(lemma) < 2 or lemma in stopwords:
            continue

        processed_tokens.append(lemma)

    return ' '.join(processed_tokens)

In [None]:
# Load the three competition CSVs from the mounted Google Drive folder.
# NOTE(review): absolute Colab paths — the notebook only runs with this Drive layout mounted.
df_questions_clean = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Хакатоны/ALPHA - RAG/questions_clean.csv")
df_sample_submission = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Хакатоны/ALPHA - RAG/sample_submission.csv")
df_websites_updated = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Хакатоны/ALPHA - RAG/websites.csv")

In [None]:
# Length survey of the website texts (characters, after str() conversion).
# Iterating the column directly avoids relying on a default 0..n-1 index.
website_texts = [str(raw_text) for raw_text in df_websites_updated["text"]]

mx = [len(text) for text in website_texts]                      # every length
mx_texts = [text for text in website_texts if len(text) > 1024]  # long texts
sharp_texts = [text for text in website_texts if "#" in text]    # texts containing '#'
lens_sharp_texts = [len(text) for text in sharp_texts]

c = len(mx_texts)                                # texts longer than 1024 chars
k = sum(1 for length in mx if length > 512)      # texts longer than 512 chars

print("ЗАПИСЕЙ БОЛЬШЕ 1024 СИМВОЛОВ: ", c)
print("ЗАПИСЕЙ БОЛЬШЕ 512 СИМВОЛОВ: ", k)
print("ВСЕГО ЗАПИСЕЙ: ", len(df_websites_updated["text"]))

mx.sort()
print("МАКСИМАЛЬНЫЕ ДЛИНЫ ТЕКСТОВ: ")
# A bare expression in the middle of a cell is not displayed by Jupyter,
# so the ten largest lengths have to be printed explicitly.
print(mx[-10:])

print("ДЛИНЫ ТЕКСТОВ С #: ", lens_sharp_texts)

In [None]:
# Length survey of the user queries (characters, after str() conversion).
c = 0

# Column position 1 is read for every row — presumably the "query" column; verify.
mx = sorted(
    len(str(df_questions_clean.iloc[row, 1]))
    for row in range(len(df_questions_clean["query"]))
)

print("МАКСИМАЛЬНЫЕ ДЛИНЫ ЗАПРОСОВ: ")
mx[-10:]

In [None]:
# Work on a copy so the text preprocessing applied next leaves df_websites_updated untouched.
web_clone = df_websites_updated.copy()

In [None]:
# Replace each website text with its BM25-normalized form (NLTK + domain stop words).
web_clone['text'] = web_clone['text'].map(
    lambda raw_text: preprocess_text_for_bm25(raw_text, stopwords=all_stopwords)
)

In [None]:
from transformers import AutoTokenizer

# Tokenizer used only to measure chunk length in model tokens.
tokenizer = AutoTokenizer.from_pretrained("deepvk/USER-bge-m3")


def _count_tokens(text):
    """Return the number of ids tokenizer.encode produces for `text`."""
    return len(tokenizer.encode(text))


# Recursive splitter with a 400-token budget and 80-token overlap, measured by the tokenizer.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=80,
    length_function=_count_tokens,
    keep_separator=True,
)

In [None]:
# Wrap every website row in a LangChain Document carrying its id and title.
primary_chunks = []
titles = list(df_websites_updated["title"])
texts = df_websites_updated[["web_id", "text"]]

# Iterate over ALL rows: the previous range(len(texts) - 1) silently dropped
# the last website, so it could never be retrieved.
# NOTE(review): an index saved to disk from the old chunk list does not contain
# that last document — rebuild it after this fix.
for web_id, text, title in zip(texts["web_id"], texts["text"], titles):
    primary_chunks.append(
        Document(page_content=text, metadata={"id": int(web_id), "title": title})
    )

In [None]:
# Split every whole-page Document into overlapping chunks with the splitter configured earlier.
rec_chunks = splitter.split_documents(primary_chunks)

In [None]:
# Embedding model handed to FAISS when the vector index is built or loaded.
embedding_model = HuggingFaceEmbeddings(
    model_name = "deepvk/USER-bge-m3"
)

In [None]:
# One-off index build. It used to be parked inside a string literal, which made
# it dead code that was also echoed as the cell output. It is now real code
# behind a flag: leave the flag False to reuse the index saved on disk, set it
# to True to re-embed every chunk and overwrite that saved index.
REBUILD_FAISS_INDEX = False

if REBUILD_FAISS_INDEX:
    vectorstore = FAISS.from_documents(
        documents=rec_chunks,
        embedding=embedding_model,
    )

    vectorstore.save_local("/content/drive/MyDrive/Colab Notebooks/Хакатоны/ALPHA - RAG/faiss_index")

    print(f"БД создана, документов: {vectorstore.index.ntotal}")

In [None]:

# Load the previously saved FAISS index from Google Drive.
# SECURITY: allow_dangerous_deserialization=True makes load_local unpickle the
# stored docstore; only load index folders you created yourself.
vectorstore = FAISS.load_local(
    folder_path="/content/drive/MyDrive/Colab Notebooks/Хакатоны/ALPHA - RAG/faiss_index",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True
)

print(f"Документов: {vectorstore.index.ntotal}")

In [None]:
# Dense retriever over the FAISS index: similarity search returning 150 chunks per query.
retriever = vectorstore.as_retriever(search_type = "similarity", search_kwargs = {"k": 150})

In [None]:
russian_stopwords = set(stopwords.words("russian"))


def preprocess_text(text: str):
    """
    Tokenize `text` with razdel and return the lower-cased tokens kept for BM25.

    A token is kept when it is longer than one character, consists only of
    alphanumeric characters and is not in `russian_stopwords`.

    Tokens are lower-cased first: the stop list is lower-case and BM25 compares
    tokens exactly, so capitalized words previously bypassed the stop-word
    filter and could not match their lower-case form.

    Returns:
        list[str]: the kept tokens in their original order.
    """
    kept_tokens = []

    for token in tokenize(text):
        word = token.text.lower()

        # isalnum() already rejects pure punctuation, so no separate punctuation test is needed.
        if len(word) > 1 and word.isalnum() and word not in russian_stopwords:
            kept_tokens.append(word)

    return kept_tokens

In [None]:
# Tokenize every chunk with preprocess_text, then build the BM25 index over the token lists.
tokenized_corpus = list(
    map(preprocess_text, (chunk_doc.page_content for chunk_doc in rec_chunks))
)
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# Sanity check: score one sample query against the BM25 index.
query = "Где узнать бик и счёт"
tokenized_query = preprocess_text(query)

all_scores = bm25.get_scores(tokenized_query)

# Keep only chunks with a positive score, best first.
scored_docs = sorted(
    (
        {'document': rec_chunks[position], 'score': value, 'index': position}
        for position, value in enumerate(all_scores)
        if value > 0
    ),
    key=lambda entry: entry['score'],
    reverse=True,
)
top_docs_with_scores = scored_docs[:5]

top_docs_with_scores

In [None]:
from collections import defaultdict

# --- Retrieval budget (design notes) -----------------------------------------
# The ranges were originally written as bare "250–300" / "150–200" and the
# scoring rule as an assignment to score(doc); none of that is valid Python, so
# the whole cell failed with a SyntaxError. The fixed values stay as variables,
# the ranges and the formula are kept as comments.
bm25_k = 150
faiss_k = 150
# merged_k: 250–300
# reranker_k: 150–200
documents_out = 5

# Document-level score:
#   score(doc) = max(chunk_scores) + 0.1 * mean(top-3)


def rank_documents_from_chunks(query, chunks, reranker, top_chunk_k=100, top_doc_k=5):
    """
    Rerank candidate chunks and aggregate their scores into a per-document ranking.

    Args:
        query: query string paired with every chunk text.
        chunks: list of dicts with keys 'doc_id' and 'chunk' (an object exposing
            ``.page_content``). Every scored dict gets a 'rerank_score' key added.
        reranker: wrapper exposing ``model_type`` and ``reranker``. When
            ``model_type == "flag"`` the call is ``reranker.compute_score(pairs)``,
            otherwise ``reranker.predict(pairs)``.
        top_chunk_k: only the first ``top_chunk_k`` chunks are scored.
        top_doc_k: maximum number of documents returned.

    Returns:
        List of ``{"doc_id": ..., "score": ...}`` dicts sorted by score, best
        first. Empty when there are no candidate chunks.
    """
    # Step 1 — limit the candidates
    candidates = chunks[:top_chunk_k]
    if not candidates:
        # Nothing to score; do not call the model with an empty batch.
        return []

    # Step 2 — build (query, text) pairs
    pairs = [[query, ch["chunk"].page_content] for ch in candidates]

    # Step 3 — score the pairs
    if reranker.model_type == "flag":
        raw_scores = reranker.reranker.compute_score(pairs)
    else:
        raw_scores = reranker.reranker.predict(pairs)

    # A backend may return a bare number for a single pair; flatten to 1-D so
    # the scores can always be walked in step with the candidates.
    scores = np.asarray(raw_scores, dtype=float).reshape(-1)

    # Step 4 — group chunk scores by document
    doc_scores = defaultdict(list)
    for ch, score in zip(candidates, scores):
        ch["rerank_score"] = float(score)
        doc_scores[ch["doc_id"]].append(ch["rerank_score"])

    # Step 5 — aggregate: best chunk plus a small bonus for the top-3 mean
    final_docs = []
    for doc_id, scs in doc_scores.items():
        top3 = sorted(scs, reverse=True)[:3]
        mean_top3 = sum(top3) / len(top3)
        final_docs.append({"doc_id": doc_id, "score": top3[0] + 0.1 * mean_top3})

    # Step 6 — best documents first
    final_docs.sort(key=lambda item: item["score"], reverse=True)

    return final_docs[:top_doc_k]


# The loose loop over `reranked_chunks` that used to follow this function only
# repeated step 4 and read a name that is never defined in the notebook, so it
# has been dropped.

In [None]:
import torch

# Use the GPU when torch reports one as available, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [None]:
from FlagEmbedding import FlagReranker
from sentence_transformers import CrossEncoder


class BGEReranker:
    """
    Cross-encoder reranker that turns chunk-level scores into a document ranking.

    ``FlagReranker`` is tried first; if constructing it fails, the same model
    name is loaded through sentence-transformers ``CrossEncoder`` instead.
    ``model_type`` records which backend is active ("flag" or "crossencoder").
    """

    def __init__(self, model_name="BAAI/bge-reranker-v2-m3", device="cuda"):
        """
        Args:
            model_name: model identifier passed to the backend.
            device: device string passed to the backend.
        """
        try:
            self.reranker = FlagReranker(model_name, use_fp16=True, device=device)
            self.model_type = "flag"
        except Exception as flag_error:
            # Only ordinary errors trigger the fallback (a bare `except:` also
            # swallowed KeyboardInterrupt), and the reason is surfaced instead
            # of being silently discarded.
            import warnings

            warnings.warn(
                f"FlagReranker could not be created ({flag_error!r}); "
                "falling back to CrossEncoder."
            )
            self.reranker = CrossEncoder(model_name, device=device)
            self.model_type = "crossencoder"

    def rerank(self, query, chunks, top_k=150, top_doc_k=5):
        """
        Score every chunk against `query` and return the best documents.

        Args:
            query: query string paired with every chunk text.
            chunks: list of dicts whose 'chunk' value exposes ``.page_content``
                and ``.metadata["id"]``. Each dict gets a 'rerank_score' key added.
            top_k: accepted for backward compatibility; it does not influence
                the ranking (it never did).
            top_doc_k: maximum number of documents returned (previously a
                hard-coded 5).

        Returns:
            List of ``{"doc_id": ..., "score": ...}`` dicts, best first, where
            ``score = max(chunk scores) + 0.1 * mean(top-3 chunk scores)``.
            Empty when `chunks` is empty.
        """
        documents = chunks
        if not documents:
            # Nothing to score; do not call the model with an empty batch.
            return []

        pairs = [[query, doc["chunk"].page_content] for doc in documents]

        if self.model_type == "flag":
            raw_scores = self.reranker.compute_score(pairs)
        else:
            raw_scores = self.reranker.predict(pairs)

        # A backend may return a bare number for a single pair; flatten to 1-D.
        scores = np.asarray(raw_scores, dtype=float).reshape(-1)

        # Group chunk scores by the id of the source document.
        doc_scores = {}
        for doc, score in zip(documents, scores):
            doc["rerank_score"] = float(score)
            doc_scores.setdefault(doc["chunk"].metadata["id"], []).append(doc["rerank_score"])

        final_docs = []
        for doc_id, scs in doc_scores.items():
            top3 = sorted(scs, reverse=True)[:3]
            mean_top3 = sum(top3) / len(top3)
            final_docs.append({"doc_id": doc_id, "score": top3[0] + 0.1 * mean_top3})

        final_docs.sort(key=lambda item: item["score"], reverse=True)

        return final_docs[:top_doc_k]

In [None]:
# Pass the detected device explicitly: the constructor defaults to "cuda",
# which would be requested even on a CPU-only runtime.
model = BGEReranker(device=device)

In [None]:
def get_final_res(query, bm25_top_n=150, verbose=False):
    """
    Hybrid retrieval for one query: dense (FAISS) + BM25 candidates, then reranking.

    Uses the notebook-level ``retriever``, ``bm25``, ``rec_chunks``,
    ``preprocess_text`` and ``model`` objects.

    Args:
        query: user query string.
        bm25_top_n: number of chunks taken from BM25 (150, as before, by default).
        verbose: print the size of the merged candidate pool. Off by default so
            batch runs are not flooded with one line per query.

    Returns:
        Whatever ``model.rerank`` returns for the merged candidates.
    """
    vec_chunks = retriever.invoke(query)

    tokenized_query = preprocess_text(query)
    bm25_chunks = bm25.get_top_n(tokenized_query, rec_chunks, n=bm25_top_n)

    def _chunk_key(doc):
        # Chunk text plus source-document id identifies a chunk; a hashable key
        # replaces the quadratic `doc not in list` comparison of Document objects.
        return (doc.page_content, doc.metadata.get("id"))

    bm25_keys = {_chunk_key(doc) for doc in bm25_chunks}

    # Dense-only chunks first, then every BM25 chunk (same order as before).
    chunks = [doc for doc in vec_chunks if _chunk_key(doc) not in bm25_keys]
    chunks.extend(bm25_chunks)

    if verbose:
        print(len(chunks))

    n_chunks = [{"chunk": chunk} for chunk in chunks]

    return model.rerank(query, n_chunks)


In [None]:
# Smoke test of the full hybrid pipeline on one sample query.
res = get_final_res("Где узнать бик и счёт")

In [None]:
# Display the ranking returned for the sample query.
res

In [None]:
indexes = []

# The reranker returns {"doc_id", "score"} dicts (best first); there is no
# "chunk" key on them, so the id is read from "doc_id".
for elem in res:
    doc_id = elem["doc_id"]

    if doc_id not in indexes:
        indexes.append(doc_id)
    if len(indexes) >= 5:
        break

indexes



In [None]:
from tqdm import tqdm
import json

# Checkpoint file for this slice of the questions.
res_3_path = "/content/drive/MyDrive/Colab Notebooks/Хакатоны/ALPHA - RAG/res_3.json"

total_res = []
c = 0

for i in tqdm(df_questions_clean["query"][3500:5001]):

    c += 1

    res = get_final_res(i)

    total_res.append(res)

    # Checkpoint every 10 queries so an interrupted session loses little work.
    if c % 10 == 0:
        with open(res_3_path, "w") as f:
            json.dump(total_res, f)

# Final write: without it the results after the last multiple of 10 were never saved.
with open(res_3_path, "w") as f:
    json.dump(total_res, f)

In [None]:
# `total_indexes` was never defined anywhere in the notebook, so this cell
# raised NameError. It is rebuilt here from total_res as one list of document
# ids per query.
# NOTE(review): confirm that per-query id lists are the shape expected in res.json.
total_indexes = [[doc["doc_id"] for doc in query_res] for query_res in total_res]

with open("/content/drive/MyDrive/Colab Notebooks/Хакатоны/ALPHA - RAG/res.json", "w") as f:
    json.dump(total_indexes, f)