# PDF Loader

In [None]:
import pdfplumber

def load_pdf(path: str) -> str:
    text = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            text.append(page.extract_text() or "")
    return "\n".join(text)

# DOCX Loader

In [None]:
import docx

def load_docx(path: str) -> str:
    document = docx.Document(path)
    text = [p.text for p in document.paragraphs]
    return "\n".join(text)

# HTML Loader

In [None]:
from bs4 import BeautifulSoup

def load_html(path: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        html = f.read()

    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator="\n")

# Web Page Loader

In [None]:
import requests
from bs4 import BeautifulSoup

def load_webpage(url: str) -> str:
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    return soup.get_text(separator="\n")

# API Loader

In [None]:
import requests

def load_api(url: str, params=None, headers=None) -> str:
    response = requests.get(url, params=params, headers=headers, timeout=10)
    response.raise_for_status()

    # нормализация JSON → текст
    data = response.json()

    def flatten_json(obj):
        if isinstance(obj, dict):
            return "\n".join(f"{k}: {flatten_json(v)}" for k, v in obj.items())
        elif isinstance(obj, list):
            return "\n".join(flatten_json(x) for x in obj)
        return str(obj)

    return flatten_json(data)

# Database Loader

In [None]:
import sqlite3

def load_sqlite(db_path: str, query: str) -> str:
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute(query)

    rows = cur.fetchall()
    columns = [desc[0] for desc in cur.description]

    conn.close()

    lines = []
    for row in rows:
        lines.append(", ".join(f"{col}: {val}" for col, val in zip(columns, row)))

    return "\n".join(lines)

# Image Loader (OCR)

In [None]:
import pytesseract
from PIL import Image

def load_image(path: str) -> str:
    img = Image.open(path)
    text = pytesseract.image_to_string(img, lang="eng")
    return text

# Нормализация текста

In [None]:
import re

def normalize(text: str) -> str:
    # уменьшить повторяющиеся пробелы
    text = re.sub(r"\s+", " ", text)

    # trim
    return text.strip()

# Очистка текста

In [None]:
import re

def clean_text(text: str) -> str:
    # убрать HTML-мусор
    text = re.sub(r"<[^>]+>", "", text)

    # убрать не-ASCII (если нужно)
    # text = text.encode("ascii", "ignore").decode()

    # убрать повтор строк
    lines = list(dict.fromkeys(text.split("\n")))
    text = "\n".join(lines)

    return text.strip()

# модели

In [None]:
# ЛУЧШИЙ ВЫБОР - оптимальное качество/производительность
MODELS = {
    "qwen2.5-7b-instruct": "Qwen/Qwen2.5-7B-Instruct",  # Идеально для A100 20GB
    "qwen2.5-14b-instruct": "Qwen/Qwen2.5-14B-Instruct",  # Хорошо влезает
    "qwen2.5-1.5b-instruct": "Qwen/Qwen2.5-1.5B-Instruct",  # Супер быстрая
}

# Для RAG особенно хороши:
RAG_OPTIMIZED = "Qwen/Qwen2.5-7B-Instruct"  # Лучший баланс

In [None]:
LLAMA_MODELS = {
    "llama-3-8b-instruct": "meta-llama/Meta-Llama-3-8B-Instruct",  # Отлично влезает
    "llama-3-1-8b-instruct": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "llama-3-1-405b-instruct": "meta-llama/Meta-Llama-3.1-405B-Instruct",  # ТОЛЬКО через API
}

In [None]:
MISTRAL_MODELS = {
    "mistral-7b-instruct": "mistralai/Mistral-7B-Instruct-v0.3",
    "mixtral-8x7b-instruct": "mistralai/Mixtral-8x7B-Instruct-v0.1",  # Через quantization
}

In [None]:
MULTILINGUAL_MODELS = {
    "rugpt3-medium": "sberbank-ai/rugpt3medium_based_on_gpt2",  # Русская, компактная
    "rubert-tiny2": "cointegrated/rubert-tiny2",  # Русская, очень легкая
    "multilingual-e5": "intfloat/multilingual-e5-large",  # Для эмбеддингов
}

In [None]:
def setup_optimal_rag_system():
    """Настройка оптимальной RAG системы для A100 20GB"""
    
    config = {
        # Основная модель для ответов
        "answer_llm": "Qwen/Qwen2.5-7B-Instruct",
        
        # Модель для верификации (можно легче)
        "verification_llm": "Qwen/Qwen2.5-1.5B-Instruct", 
        
        # Модель для эмбеддингов
        "embedding_model": "intfloat/multilingual-e5-large",
        
        # Кросс-энкодер для реранкинга
        "cross_encoder": "cross-encoder/ms-marco-MiniLM-L-6-v2",
    }
    
    return config

# Альтернативные конфигурации
CONFIGURATIONS = {
    "quality_priority": {
        "answer_llm": "Qwen/Qwen2.5-14B-Instruct",
        "verification_llm": "Qwen/Qwen2.5-7B-Instruct",
        "embedding_model": "intfloat/multilingual-e5-large", 
        "cross_encoder": "cross-encoder/ms-marco-MiniLM-L-12-v2"
    },
    
    "speed_priority": {
        "answer_llm": "Qwen/Qwen2.5-1.5B-Instruct",
        "verification_llm": "Qwen/Qwen2.5-1.5B-Instruct",
        "embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
        "cross_encoder": "cross-encoder/ms-marco-TinyBERT-L-2-v2"
    },
    
    "multilingual": {
        "answer_llm": "Qwen/Qwen2.5-7B-Instruct",  # Хорошая поддержка языков
        "verification_llm": "Qwen/Qwen2.5-1.5B-Instruct",
        "embedding_model": "intfloat/multilingual-e5-large",
        "cross_encoder": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    }
}

# rag pipeline

In [1]:
import pandas as pd

In [2]:
import os
import json

In [3]:
sub =['convfinqa', 'financebench', 'finder', 'finqa', 'finqabench', 'multiheirtt', 'tatqa']

In [4]:
from pathlib import Path

In [5]:
def load_json(path):
    data = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return data

In [6]:
corpus = load_json('/kaggle/input/icaif-24-finance-rag-challenge/convfinqa_corpus.jsonl/corpus.jsonl')
query = load_json('/kaggle/input/icaif-24-finance-rag-challenge/convfinqa_queries.jsonl/queries.jsonl')
cor_df = pd.DataFrame(corpus)
que_df = pd.DataFrame(query)

In [13]:
que_df['title'].nunique()

1

In [7]:
sub

['convfinqa',
 'financebench',
 'finder',
 'finqa',
 'finqabench',
 'multiheirtt',
 'tatqa']

In [8]:
for i in sub[1:]:
    corpus = load_json(f'/kaggle/input/icaif-24-finance-rag-challenge/{i}_corpus.jsonl/corpus.jsonl')
    query = load_json(f'/kaggle/input/icaif-24-finance-rag-challenge/{i}_queries.jsonl/queries.jsonl')
    corpus_df = pd.DataFrame(corpus)
    query_df = pd.DataFrame(query)
    cor_df = pd.concat([corpus_df, cor_df], ignore_index=True)
    que_df = pd.concat([query_df, que_df], ignore_index=True)

In [9]:
cor_df

Unnamed: 0,_id,title,text
0,d1b2e74c0,,The following tables present the recorded inve...
1,d1b32cade,,NOTE 13. INCOME TAXES\nWe calculate our provis...
2,d1b38504e,,8. Earnings Per Share\nBasic earnings per shar...
3,d1b37e6a4,,AMERICAN TOWER CORPORATION AND SUBSIDIARIES NO...
4,d1b3a8e2c,,The tax effects of temporary differences that ...
...,...,...,...
32220,dd4c4dfcc,,item 2 .\nproperties our principal offices are...
32221,dd4c246b8,,the company 2019s stock performance the follow...
32222,dd4c57766,,undesignated hedges was $ 41.2 million and $ 4...
32223,dd4b8d39e,,the following table summarizes the changes in ...


In [9]:
cor_df['text+title'] = cor_df['title'] + ' ' + cor_df['text']

In [10]:
que_df['title+text'] = que_df['title'] + ' ' + que_df['text']
que_df

Unnamed: 0,_id,title,text,title+text
0,q1a73c1d4,,In which year was interest income greater than...,In which year was interest income greater tha...
1,q1a72ef98,,What was the Net Income (Loss) in 2019?,What was the Net Income (Loss) in 2019?
2,q1a716380,,What was the percentage of Plan Assets for Oth...,What was the percentage of Plan Assets for Ot...
3,q1a73f97e,,What was the service revenue from Americas in ...,What was the service revenue from Americas in...
4,q1a731ee6,,What was the Customer support renewal rate for...,What was the Customer support renewal rate fo...
...,...,...,...,...
4666,qd497c816,,what was the decrease amount on the net intere...,what was the decrease amount on the net inter...
4667,qd4988ecc,,what was the total of operating expenses in 2018?,what was the total of operating expenses in 2...
4668,qd4982266,,what is the current ratio of robert mondavi?,what is the current ratio of robert mondavi?
4669,qd4985e34,,what was the rent expense in 2007?,what was the rent expense in 2007?


In [11]:
import tqdm

In [12]:
!pip install qdrant_client

Collecting qdrant_client
  Downloading qdrant_client-1.16.0-py3-none-any.whl.metadata (11 kB)
Collecting portalocker<4.0,>=2.7.0 (from qdrant_client)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Downloading qdrant_client-1.16.0-py3-none-any.whl (328 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.6/328.6 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, qdrant_client
Successfully installed portalocker-3.2.0 qdrant_client-1.16.0


In [13]:
#построим базу данных
from qdrant_client import QdrantClient, models

True

In [14]:
import uuid

In [15]:
from transformers import pipeline
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(device)

generation_pipeline = pipeline(
    "text-generation",
    model="RefalMachine/ruadapt_qwen2.5_3B_ext_u48_instruct_v4",
    device=device,
    torch_dtype=torch.float16
)

2025-11-17 19:54:00.937398: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763409241.164480      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763409241.229052      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

cuda


config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]



model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/12.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/759 [00:00<?, ?B/s]

Device set to use cuda


In [16]:
messages = [
    {"role": "system", "content": "Ты полезный и дружелюбный помощник."},
    {"role": "user", "content": "Привет, напиши анекдот про домработниц"},
]

print(generation_pipeline(messages, max_new_tokens=256, do_sample=True, temperature=0.5, top_p=0.9)[0]['generated_text'][-1]['content'])

Здравствуйте! Вот вам забавный анекдот про домработницу:

Жена: "Дорогой, ты знаешь, почему я тебя люблю?"
Муж: "Потому что ты всегда меня кормишь?"
Жена: "Ну да, но еще и мою домашнюю уборщицу!"
Муж: "Ой, не надо было это говорить!"


что за пизд

In [17]:
from sentence_transformers import SentenceTransformer

In [18]:
#эмбеддинги модели 

embedding_model = SentenceTransformer("intfloat/multilingual-e5-large", model_kwargs={'torch_dtype': torch.float16})

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

In [20]:
import qdrant_client

In [21]:
# создаем базу данных, в которой представляем схожесть векторов через косинусное расстояние 
from qdrant_client import QdrantClient, models

client = QdrantClient(":memory:")

client.create_collection(
    collection_name="icaif",
    on_disk_payload=True,
    vectors_config=models.VectorParams(
        size=1024,
        distance=models.Distance.COSINE,
        on_disk=True
    ),
)

True

In [22]:
#чанкинг
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

In [23]:
cor_df['text+title'][1]

' NOTE 13. INCOME TAXES\nWe calculate our provision for federal and state income taxes based on current tax law. U.S. federal tax reform (Tax Act) was enacted on December 22, 2017, and has several key provisions impacting the accounting for and reporting of income taxes. The most significant provision reduced the U.S. corporate statutory tax rate from 35% to 21% beginning on January 1, 2018. We remeasured the applicable deferred tax assets and liabilities based on the rates at which they are expected to reverse. As a result, the gross deferred tax assets and liabilities were adjusted which resulted in an expense for income taxes of $7.1 million which was fully offset by a corresponding change to our valuation allowance in 2017. The Tax Act contains several base broadening provisions that became effective on January 1, 2018, that did not have a material impact on 2018 and 2019 earnings.\nDeferred tax asset (liability) is comprised of the following (in thousands):\nWe have determined it 

In [24]:
from tqdm import tqdm

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [26]:
text_chunks = [
    {cor_df['_id'].iloc[i]: text_splitter.split_text(cor_df['text+title'][i])}
    for i in tqdm(tqdm((range(len(cor_df)))))]

  0%|          | 0/32225 [00:00<?, ?it/s]
  2%|▏         | 684/32225 [00:00<00:04, 6830.87it/s]
  4%|▍         | 1394/32225 [00:00<00:04, 6971.89it/s][A
  7%|▋         | 2124/32225 [00:00<00:04, 7115.95it/s][A
  9%|▉         | 2885/32225 [00:00<00:04, 7308.97it/s][A
 14%|█▎        | 4378/32225 [00:00<00:02, 10053.98it/s][A
 18%|█▊        | 5807/32225 [00:00<00:02, 11491.38it/s][A
 22%|██▏       | 7213/32225 [00:00<00:02, 12329.89it/s][A
 27%|██▋       | 8615/32225 [00:00<00:01, 12866.29it/s][A
 31%|███       | 10002/32225 [00:00<00:01, 13177.43it/s][A
 36%|███▌      | 11452/32225 [00:01<00:01, 13575.09it/s][A
 40%|███▉      | 12870/32225 [00:01<00:01, 13756.71it/s][A
 44%|████▍     | 14246/32225 [00:01<00:01, 12037.90it/s][A
 48%|████▊     | 15488/32225 [00:01<00:01, 10687.41it/s][A
 52%|█████▏    | 16606/32225 [00:01<00:01, 10544.80it/s][A
 55%|█████▍    | 17694/32225 [00:01<00:01, 10554.62it/s][A
 59%|█████▊    | 18859/32225 [00:01<00:01, 10845.94it/s][A
 62%|██████▏   | 

In [27]:
text_chunks[0]

{'d1b2e74c0': ['The following tables present the recorded investment by portfolio segment and by class, excluding commercial financing receivables and other miscellaneous financing receivables at December 31, 2019 and 2018. Commercial financing receivables are excluded from the presentation of financing receivables by portfolio segment, as they are short term in nature and the current estimated risk of loss and resulting impact to the company’s financing results are not material.',
  'Write-offs of lease receivables and loan receivables were $16 million and $47 million, respectively, for the year ended December 31, 2019. Provisions for credit losses recorded for lease receivables and loan receivables were a release of $6 million and an addition of $2 million, respectively, for the year ended December 31, 2019.',
  'The average recorded investment of impaired leases and loans for Americas, EMEA and Asia Pacific was $138 million, $49 million and $45 million, respectively, for the year en

In [28]:
text_chunks_id = []

In [29]:
for chunk_dict in text_chunks:
    key, values = next(iter(chunk_dict.items()))

    for chunk in values:
        text_chunks_id.append((key, chunk))

In [30]:
text_chunks_id[0]

('d1b2e74c0',
 'The following tables present the recorded investment by portfolio segment and by class, excluding commercial financing receivables and other miscellaneous financing receivables at December 31, 2019 and 2018. Commercial financing receivables are excluded from the presentation of financing receivables by portfolio segment, as they are short term in nature and the current estimated risk of loss and resulting impact to the company’s financing results are not material.')

In [31]:
idx, text_chunks = zip(*text_chunks_id)

In [34]:
len(text_chunks)

182416

In [None]:
vectors = embedding_model.encode(text_chunks, batch_size=32, device=device, normalize_embeddings=True, show_progress_bar=True).tolist()

In [33]:
from sentence_transformers import SentenceTransformer
import torch

# Инициализация модели
embedding_model = SentenceTransformer("intfloat/multilingual-e5-large", device="cuda")

# Получение размерности
embedding_dim = embedding_model.get_sentence_embedding_dimension()
print(f"Размерность векторов: {embedding_dim}")

Размерность векторов: 1024


In [35]:
#чтобы не грузить vectors, просто не использовать 
import numpy as np
import random
from typing import List

def generate_random_vectors(num_vectors: int = 182416, vector_dim: int = 1024) -> List[List[float]]:
    """
    Генерация случайных векторов с нормальным распределением
    """
    print(f"Генерация {num_vectors:,} векторов размерности {vector_dim}...")
    
    # Используем нормальное распределение (более реалистично для эмбеддингов)
    vectors = np.random.normal(0, 1, (num_vectors, vector_dim))
    
    # Нормализуем векторы (как при normalize_embeddings=True)
    vectors = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    
    # Конвертируем в список списков
    vectors_list = vectors.tolist()
    
    print(f"Сгенерировано {len(vectors_list):,} векторов")
    print(f"Размерность каждого вектора: {len(vectors_list[0])}")
    
    return vectors_list

# Использование
vectors = generate_random_vectors(182416, 1024)

Генерация 182,416 векторов размерности 1024...
Сгенерировано 182,416 векторов
Размерность каждого вектора: 1024


In [36]:
import uuid

# может будет полезно: https://habr.com/ru/articles/966966/

In [None]:
for i in tqdm(range(len(vectors))):
    client.upsert(
        collection_name='icaif',
        points=[
            models.PointStruct(
                id=str(uuid.uuid4()),
                vector=vectors[i],
                payload={
                    'text': text_chunks[i],
                }
            )
        ]
    )

  client.upsert(
 97%|█████████▋| 176546/182416 [03:05<00:06, 852.73it/s]

In [36]:
que_df

Unnamed: 0,_id,title,text,title+text
0,q1a73c1d4,,In which year was interest income greater than...,In which year was interest income greater tha...
1,q1a72ef98,,What was the Net Income (Loss) in 2019?,What was the Net Income (Loss) in 2019?
2,q1a716380,,What was the percentage of Plan Assets for Oth...,What was the percentage of Plan Assets for Ot...
3,q1a73f97e,,What was the service revenue from Americas in ...,What was the service revenue from Americas in...
4,q1a731ee6,,What was the Customer support renewal rate for...,What was the Customer support renewal rate fo...
...,...,...,...,...
4666,qd497c816,,what was the decrease amount on the net intere...,what was the decrease amount on the net inter...
4667,qd4988ecc,,what was the total of operating expenses in 2018?,what was the total of operating expenses in 2...
4668,qd4982266,,what is the current ratio of robert mondavi?,what is the current ratio of robert mondavi?
4669,qd4985e34,,what was the rent expense in 2007?,what was the rent expense in 2007?


In [37]:
query_vector = embedding_model.encode(que_df['title+text'], normalize_embeddings=True, device=device).tolist()

In [41]:
hits = client.search(
    collection_name="icaif",
    query_vector=query_vector[0],
    limit=10)
[hit.payload for hit in hits]

  hits = client.search(


[{'text': 'Investment Income and Interest Expense 2007 compared to 2006 The increase in investment income of $13 million was primarily due to higher realized gains on sales of investments, but partially offset by a lower average balance of interest-earning investments and increased equitymethod losses on investment partnerships.\nInterest expense increased $35 million in 2007, primarily due to higher average debt balances outstanding, largely related to commercial paper.\nOur commercial paper balances increased in the fourth quarter of 2007, causing a corresponding increase in interest expense, as a result of the payment made to withdraw from the Central States Pension Fund.'},
 {'text': '|  | December 31, 2008 | December 31, 2007 |\n| (in thousands) | Interest Expense | Amortization | Interest Expense | Amortization |\n| May 1, 2006 term loan | $1,219 | $952 | $6,023 | $504 |\n| July 31, 2008 term loan | 6,524 | 573 | — | — |\n| Total | $7,743 | $1,525 | $6,023 | $504 |\nThe increased

In [48]:
text_chunks_id[1][0]

'd1b2e74c0'

In [49]:
data = pd.DataFrame(text_chunks_id)

In [50]:
data

Unnamed: 0,0,1
0,d1b2e74c0,The following tables present the recorded inve...
1,d1b2e74c0,The average recorded investment of impaired le...
2,d1b2e74c0,($ in millions) ...
3,d1b2e74c0,Allowance for credit losses ...
4,d1b2e74c0,"Ending balance at December 31, 2019 ..."
...,...,...
95147,dd4baa80e,atoi for this segment climbed $ 10 in 2014 com...
95148,dd4baa80e,| 2015 | 2014 | 2013 \n----------------- ...
95149,dd4baa80e,this segment represents a portion of alcoa 201...
95150,dd4baa80e,"dollar , the euro , and the brazilian real .\n..."


In [54]:
[hit.payload for hit in hits][0]['text']

'Investment Income and Interest Expense 2007 compared to 2006 The increase in investment income of $13 million was primarily due to higher realized gains on sales of investments, but partially offset by a lower average balance of interest-earning investments and increased equitymethod losses on investment partnerships.\nInterest expense increased $35 million in 2007, primarily due to higher average debt balances outstanding, largely related to commercial paper.\nOur commercial paper balances increased in the fourth quarter of 2007, causing a corresponding increase in interest expense, as a result of the payment made to withdraw from the Central States Pension Fund.'

In [None]:
value = [hit.payload for hit in hits][0]['text']

idx = data.index[data[1] == value].tolist()

In [72]:
val = data[0].iloc[idx]

In [73]:
val

732    d1a73fa6e
Name: 0, dtype: object

In [64]:
import pandas as pd

rows = []

for i in range(100):
    row = {"col1": i, "col2": i * 2}
    rows.append(row)

df = pd.DataFrame(rows)


In [75]:
que_df['_id'].iloc[1]

'q1a72ef98'

In [104]:
len(query_df)

1663

In [101]:
len(query_vector)

4671

In [119]:
rows = []
for i in tqdm(range(len(que_df))):
    hits = client.search(
    collection_name="icaif",
    query_vector=query_vector[i],
    limit=10)
    for j in range(10):
        value = [hit.payload for hit in hits][j]['text']
        idx = data.index[data[1] == value].tolist()
        val = data[0].iloc[idx]
        row = {'query_id': que_df['_id'].iloc[i], 'corpus_id': val}
        rows.append(row)
result = pd.DataFrame(rows)

  hits = client.search(
100%|██████████| 4671/4671 [39:20<00:00,  1.98it/s]


In [120]:
result

Unnamed: 0,query_id,corpus_id
0,q1a73c1d4,"32936 d87de6e90 Name: 0, dtype: object"
1,q1a73c1d4,"33678 d8c3f3a28 Name: 0, dtype: object"
2,q1a73c1d4,"17065 d87e41d36 Name: 0, dtype: object"
3,q1a73c1d4,"47379 d8b39b2a2 Name: 0, dtype: object"
4,q1a73c1d4,"25782 d8e99e52a Name: 0, dtype: object"
...,...,...
46705,qd4989714,"11195 d8e4cbf8e Name: 0, dtype: object"
46706,qd4989714,"37837 d8c2c6ef2 Name: 0, dtype: object"
46707,qd4989714,"21149 d8166161c Name: 0, dtype: object"
46708,qd4989714,"42558 d8ac57680 Name: 0, dtype: object"


In [121]:
result['corpus_id_clean'] = result['corpus_id'].apply(lambda x: list(x)[-1] if hasattr(x, "__iter__") else x)

In [122]:
result = result.drop('corpus_id', axis=1)

In [123]:
result.rename(columns={'corpus_id_clean': 'corpus_id'}, inplace=True)

In [126]:
result.to_csv('/kaggle/working/xz3.csv', index=False)

In [125]:
result

Unnamed: 0,query_id,corpus_id
0,q1a73c1d4,d87de6e90
1,q1a73c1d4,d8c3f3a28
2,q1a73c1d4,d87e41d36
3,q1a73c1d4,d8b39b2a2
4,q1a73c1d4,d8e99e52a
...,...,...
46705,qd4989714,d8e4cbf8e
46706,qd4989714,d8c2c6ef2
46707,qd4989714,d8166161c
46708,qd4989714,d8ac57680


скор: 0.20620

BM25 + Dense

In [129]:
pip install rank_bm25

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.


In [156]:
from rank_bm25 import BM25Okapi
import numpy as np

class BM25Retriever:
    def __init__(self, documents):
        self.docs = documents['text+title'].tolist()
        self.ids = documents['_id'].tolist()

        tokenized = [d.split() for d in self.docs]
        self.bm25 = BM25Okapi(tokenized)

    def retrieve(self, query, top_k=5):
        scores = self.bm25.get_scores(query.split())
        idx = np.argsort(scores)[::-1][:top_k]

        return [
            {
                "id": self.ids[i],
                "text": self.docs[i],
                "score": float(scores[i])
            }
            for i in idx
        ]


In [163]:
class DenseRetriever:
    def __init__(self, embedder, client, collection):
        self.embedder = embedder
        self.client = client
        self.collection = collection

    def retrieve(self, query, top_k=5, metadata_filter=None):
        q_emb = self.embedder.encode(query)

        res = self.client.query_points(
            collection_name=self.collection,
            query=q_emb,           
            query_filter=metadata_filter,   
            limit=top_k
        ).points

        return [
            {
                "id": r.id,
                "text": r.payload["text"],
                "score": r.score
            }
            for r in res
        ]


In [164]:
dense = DenseRetriever(embedding_model, client, "icaif")
bm25 = BM25Retriever(cor_df)

In [160]:
class HybridRetriever:
    def __init__(self, dense_retriever, bm25_retriever, alpha=0.5):
        self.dense = dense_retriever
        self.bm25 = bm25_retriever
        self.alpha = alpha

    def retrieve(self, query, top_k=10, metadata_filter=None):
        bm25 = self.bm25.retrieve(query, top_k=top_k)
        dense = self.dense.retrieve(query, top_k=top_k, metadata_filter=metadata_filter)

        combined = {}

        # Dense
        for d in dense:
            combined[d["id"]] = {
                "source": d,
                "dense": d["score"],
                "bm25": 0
            }

        # BM25
        for b in bm25:
            doc_id = b["id"]
            if doc_id not in combined:
                combined[doc_id] = {"source": b, "dense": 0, "bm25": b["score"]}
            else:
                combined[doc_id]["bm25"] = b["score"]

        # финальный скор
        for c in combined.values():
            c["final"] = self.alpha * c["dense"] + (1 - self.alpha) * c["bm25"]

        final = sorted(combined.values(), key=lambda x: x["final"], reverse=True)[:top_k]

        return [f["source"] for f in final]


In [141]:
que_df['title+text'][0]

' In which year was interest income greater than 7,000 thousands?'

In [165]:
hybrid = HybridRetriever(dense, bm25, alpha=0.5)

In [166]:
results = hybrid.retrieve(" In which year was interest income greater than 7,000 thousands?", top_k=10)

In [168]:
results[0]['id']

'd869ea586'

In [169]:
results[0]['text']

" | Named Exeutive Officer | Stock Options |\n| J. Wayne Leonard | 175,000 |\n| Leo P. Denault | 50,000 |\n| Richard J. Smith | 35,000 |\n| E. Renae Conley | 15,600 |\n| Hugh T. McDonald | 7,000 |\n| Haley Fisackerly | 5,000 |\n| Joseph F. Domino | 7,000 |\n| Roderick K. West | 8,000 |\n| Theodore H. Bunting, Jr. | 18,000 |\n| Carolyn Shanks | 7,000 |\nThe option grants awarded to the Named Executive Officers (other than Mr.  Leonard and Mr.  Lewis) ranged in amount between 5,000 and 50,000 shares.\nMr.  Lewis did not receive any stock option awards in 2008.\nIn the case of Mr.  Leonard, who received 175,000 stock options, the Committee took special note of his performance as Entergy Corporation's Chief Executive Officer.\nAmong other things, the Committee noted that"

In [172]:
len(results)

10

In [170]:
que_df['title+text'][1]

' What was the Net Income (Loss) in 2019?'

In [175]:
rows = []
for i in tqdm(range(len(que_df))):
    results = []
    results = hybrid.retrieve(que_df['title+text'][i], top_k=10)
    for j in range(len(results)):
        row = {'query_id': que_df['_id'][i], 'corpus_id': results[j]['id']}
        rows.append(row)

100%|██████████| 4671/4671 [49:01<00:00,  1.59it/s]


In [176]:
df = pd.DataFrame(rows)

In [178]:
df.to_csv('/kaggle/working/xz4.csv', index=False)

In [177]:
df

Unnamed: 0,query_id,corpus_id
0,q1a73c1d4,d869ea586
1,q1a73c1d4,d863d98cc
2,q1a73c1d4,JNJ20231807
3,q1a73c1d4,GOOGL20231108
4,q1a73c1d4,d8dc0635e
...,...,...
46705,qd4989714,d8ea8bcd0
46706,qd4989714,d8d2c9b56
46707,qd4989714,d88000c08
46708,qd4989714,d8e42d0aa


скор 0.22276

# Подключим LLM

In [None]:
def llm_answer(query, context):
    prompt = f"""Information from the book:
{context}

Question:
{query}"""
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt}, 
    ]
    output = generation_pipeline(messages, max_new_tokens=512, do_sample=True, temperature=TEMP, top_p=0.9)

    return output[0]['generated_text'][-1]['content'], context

In [None]:
def predict(query):
    selected_chunks = semantic_search(client, query)
    context = ' ; '.join([f"Book content piece: {chunk['text']}" for chunk in selected_chunks])
    pages = [chunk['page'] for chunk in selected_chunks]

    return (llm_answer(query, context), pages)

In [None]:
for i, query in enumerate(queries):
    print(i, query)
    ans = predict(query)
    submission_data.append({
        'ID': i+1,
        'context': str(ans[0][1]),
        'answer': str(ans[0][0]),
        'references': json.dumps({"sections": ["section_1"], "pages": ans[1]})
    })

# Добавим кросс-энкодер

In [181]:
from sentence_transformers import SentenceTransformer, CrossEncoder

In [None]:
#или 
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# --- Cross-Encoder Reranking Example ---
# This part can be used later during retrieval to refine similarity ranking
def rerank_with_cross_encoder(query, passages, top_k=5):
    """
    Given a query and a list of passage dicts (each with 'text' or 'content'),
    returns the top_k highest-ranked passages.
    """
    # Extract plain text for scoring
    texts = [p["text"] if "text" in p else p["content"] for p in passages]

    # Form query-passage pairs for the cross-encoder
    pairs = [(query, text) for text in texts]

    # Get relevance scores
    scores = cross_encoder.predict(pairs)

    # Sort by score
    ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)

    # Return the top_k *original* passage dicts
    return [passages[i] for i in ranked_indices[:top_k]]

In [179]:
#или
class CrossEncoderReranker:
    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2"):
        self.model = CrossEncoder(model_name)

    def rerank(self, query, candidates, top_k=5):
        """
        candidates — список dict: {"id", "text", "score"}
        """
        pairs = [(query, c["text"]) for c in candidates]

        scores = self.model.predict(pairs)

        reranked = sorted(
            [
                {**c, "rerank_score": float(s)}
                for c, s in zip(candidates, scores)
            ],
            key=lambda x: x["rerank_score"],
            reverse=True
        )

        return reranked[:top_k]


# -----------------------------
# 5) Общая система — двухступенчатый ретривал
# -----------------------------
class TwoStageRetriever:
    def __init__(self, hybrid, reranker):
        self.hybrid = hybrid
        self.reranker = reranker

    def retrieve(self, query, n_candidates=20, final_top_k=5):
        # Этап 1 — быстрый поиск
        stage1 = self.hybrid.retrieve(query, top_k=n_candidates)

        # Этап 2 — rerank через cross-encoder
        stage2 = self.reranker.rerank(query, stage1, top_k=final_top_k)

        return stage2


In [182]:
reranker = CrossEncoderReranker()

retriever = TwoStageRetriever(hybrid, reranker)

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [183]:
results = retriever.retrieve(' What was the Net Income (Loss) in 2019?', n_candidates=20, final_top_k=3)

results

[{'id': 'd1b35c0d6',
  'text': ' Net Income (Loss) per Common Share\nBasic net income per share is based on the weighted average common and Class A shares outstanding. Diluted net income per share includes any dilutive effects of stock options outstanding and unvested restricted shares.\nBasic net income per share was calculated by dividing net income by the weighted-average number of common and Class A shares outstanding during the period. Diluted net income per share was calculated by dividing net income by the weighted-average number of common shares outstanding during the period plus the dilutive effects of stock options and unvested restricted shares. Due to the net loss in the year ended June 3, 2017 restricted shares in the amount of 131,292 were excluded from the calculation of diluted earnings per share because their inclusion would have been\nantidilutive. The computations of basic net income per share and diluted net income per share are as follows (in thousands):\n\n       

In [184]:
rows = []
for i in tqdm(range(len(que_df))):
    results = []
    results = retriever.retrieve(que_df['title+text'][i], n_candidates=20, final_top_k=10)
    for j in range(len(results)):
        row = {'query_id': que_df['_id'][i], 'corpus_id': results[j]['id']}
        rows.append(row)

100%|██████████| 4671/4671 [59:58<00:00,  1.30it/s] 


In [185]:
res = pd.DataFrame(rows)
res

Unnamed: 0,query_id,corpus_id
0,q1a73c1d4,d893a246e
1,q1a73c1d4,d8dc0635e
2,q1a73c1d4,d8788d7c8
3,q1a73c1d4,dd2abb3e0
4,q1a73c1d4,d867936a2
...,...,...
46705,qd4989714,d8e42d0aa
46706,qd4989714,d882fa0a8
46707,qd4989714,d8b024a4c
46708,qd4989714,d8ea8bcd0


In [186]:
res.to_csv('/kaggle/working/xz5.csv', index=False)

скор: 0.22292

# Adaptive-K для Cross-Encoder

In [187]:
class AdaptiveKReranker:
    def __init__(self, model_name="cross-encoder/ms-marco-MiniLM-L-6-v2", threshold=0.3):
        """
        threshold — минимальный порог уверенности cross-encoder
        """
        self.model = CrossEncoder(model_name)
        self.threshold = threshold

    def rerank(self, query, candidates):
        """
        Возвращает все кандидаты с оценкой выше порога.
        """
        pairs = [(query, c["text"]) for c in candidates]
        scores = self.model.predict(pairs)

        reranked = [
            {**c, "score": float(s)}
            for c, s in zip(candidates, scores)
            if float(s) >= self.threshold
        ]

        # сортируем по score
        reranked = sorted(reranked, key=lambda x: x["score"], reverse=True)

        return reranked


In [188]:
class TwoStageAdaptiveRetriever:
    def __init__(self, retriever_stage1, adaptive_reranker):
        self.retriever_stage1 = retriever_stage1
        self.reranker = adaptive_reranker

    def retrieve(self, query, n_candidates=30):
        # Этап 1 — быстрый грубый поиск
        stage1 = self.retriever_stage1.retrieve(query, top_k=n_candidates)

        # Этап 2 — Adaptive K reranking
        stage2 = self.reranker.rerank(query, stage1)

        return stage2


# Модели Cross-encoder
английские модели
TOP-1 рекомендация
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"
Или
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-MiniLM-L-12-v2"
Для максимального качества
CROSS_ENCODER_MODEL = "cross-encoder/ms-marco-electra-base"

многоязычные модели 
Для мультиязычных данных
CROSS_ENCODER_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
Или
CROSS_ENCODER_MODEL = "sentence-transformers/distiluse-base-multilingual-cased-v2"

In [190]:
adaptive_reranker = AdaptiveKReranker(
    model_name="cross-encoder/ms-marco-MiniLM-L-6-v2,
    threshold=0.45   # выбираем порог
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at intfloat/multilingual-e5-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [192]:
retriever = TwoStageAdaptiveRetriever(hybrid, adaptive_reranker)

query = ' What was the Net Income (Loss) in 2019?'

results = retriever.retrieve(query, n_candidates=10)

results

[{'id': 'd873e2002',
  'text': ' CONSOLIDATED RESULTS Net sales and revenue and operating income numbers reported in our consolidated results do not include the activity of our discontinued operations: ?Containerboard, Packaging and Recycling operations (sold in August 2008); ?Australian operations (sold in July 2008); ?Trus Joist?Commercial division (held for sale as of December 2008); ?Fine Paper operations (divested in March 2007); ?Irish composite panel operations (sold in November 2006); and ?North American composite panel operations (sold in July 2006).\nWe report these activities and results as discontinued operations in our Consolidated Statement of Earnings.\nHowever, we include the results of these operations in the segment discussions that follow.\nSee Note 3: Discontinued Operations and Assets Held For Sale in the Notes to Consolidated Financial Statements for more information about our discontinued operations.\nHOW WE DID IN 2008 Net Sales and Revenues, Operating Income (L

In [193]:
rows = []
for i in tqdm(range(len(que_df))):
    results = []
    results = retriever.retrieve(que_df['title+text'][i], n_candidates=10)
    for j in range(len(results)):
        row = {'query_id': que_df['_id'][i], 'corpus_id': results[j]['id']}
        rows.append(row)

100%|██████████| 4671/4671 [1:57:28<00:00,  1.51s/it]  


In [194]:
res2 = pd.DataFrame(rows)

In [195]:
res2.to_csv('/kaggle/working/xz6.csv', index=False)

# Multi-hop/ Self-ask - разбивает запрос на части + Self-RAG + ReAct

In [None]:
import re
from typing import List, Dict, Any, Optional, Tuple
import json

In [None]:
class MultiHopReasoning:
    def __init__(self, llm):
        self.llm = llm
    
    def break_down_question(self, complex_question: str) -> List[str]:
        """
        Разбивка сложного вопроса на подвопросы
        """
        prompt = f"""Разбей сложный вопрос на последовательность простых подвопросов, 
        которые нужно ответить по очереди чтобы получить окончательный ответ.

        Сложный вопрос: {complex_question}

        Подвопросы:
        1."""

        response = self.llm.generate(prompt, max_length=300)
        
        # Парсинг подвопросов из ответа
        sub_questions = self._parse_subquestions(response)
        
        print(f"Сложный вопрос разбит на {len(sub_questions)} подвопросов:")
        for i, q in enumerate(sub_questions, 1):
            print(f"  {i}. {q}")
        
        return sub_questions
    
    def _parse_subquestions(self, text: str) -> List[str]:
        """Парсинг подвопросов из текста LLM"""
        lines = text.split('\n')
        questions = []
        
        for line in lines:
            line = line.strip()
            # Ищем строки с номерами (1., 2., и т.д.)
            if any(line.startswith(f"{i}.") for i in range(1, 10)):
                question = line.split('.', 1)[1].strip()
                if question and '?' in question:
                    questions.append(question)
        
        return questions if questions else [text]
    
    def solve_multi_hop(self, complex_question: str, rag_system, max_hops: int = 5) -> Dict:
        """
        Решение сложного вопроса через последовательность подвопросов
        """
        print(f"🧠 Multi-hop reasoning для: {complex_question}")
        
        # Разбиваем на подвопросы
        sub_questions = self.break_down_question(complex_question)
        sub_questions = sub_questions[:max_hops]  # Ограничиваем количество hops
        
        intermediate_answers = []
        context_chain = []
        
        for i, sub_q in enumerate(sub_questions, 1):
            print(f"\n🏃 Hop {i}/{len(sub_questions)}: {sub_q}")
            
            # Ищем релевантные документы для подвопроса
            search_results = rag_system.search_with_reranking(sub_q, vector_top_k=10, final_top_k=3)
            
            # Генерируем ответ на подвопрос
            context = "\n".join([doc['text'] for doc in search_results])
            answer = self.llm.generate(sub_q, context)
            
            intermediate_answers.append({
                'sub_question': sub_q,
                'answer': answer,
                'source_documents': search_results
            })
            
            # Сохраняем контекст для следующего шага
            context_chain.append(f"Q{i}: {sub_q}\nA{i}: {answer}")
            
            print(f"   Ответ: {answer[:100]}...")
        
        # Синтезируем финальный ответ на основе всех промежуточных ответов
        final_context = "\n\n".join(context_chain)
        final_prompt = f"""На основе следующих вопросов и ответов, дай окончательный ответ на исходный вопрос.

Промежуточные вопросы и ответы:
{final_context}

Исходный вопрос: {complex_question}

Окончательный ответ:"""

        final_answer = self.llm.generate(final_prompt, max_length=500)
        
        return {
            'final_answer': final_answer,
            'intermediate_answers': intermediate_answers,
            'reasoning_chain': context_chain
        }

In [None]:
class SelfRAG:
    def __init__(self, llm, vector_store, cross_encoder=None):
        self.llm = llm
        self.vector_store = vector_store
        self.cross_encoder = cross_encoder
    
    def retrieve_with_self_evaluation(self, query: str, max_docs: int = 10) -> List[Dict]:
        """
        Self-RAG: поиск с самооценкой релевантности
        """
        # Первоначальный поиск
        candidate_docs = self.vector_store.search(query, top_k=max_docs * 2)
        
        if not self.cross_encoder:
            return candidate_docs[:max_docs]
        
        # Self-evaluation релевантности
        evaluated_docs = []
        for doc in candidate_docs:
            relevance_score = self._evaluate_relevance(query, doc['text'])
            doc['self_rag_score'] = relevance_score
            evaluated_docs.append(doc)
        
        # Сортировка по self-evaluation score
        evaluated_docs.sort(key=lambda x: x.get('self_rag_score', 0), reverse=True)
        
        # Фильтрация низкорелевантных документов
        filtered_docs = [doc for doc in evaluated_docs 
                        if doc.get('self_rag_score', 0) > 0.3]
        
        return filtered_docs[:max_docs]
    
    def _evaluate_relevance(self, query: str, document: str) -> float:
        """
        Оценка релевантности документа запросу
        """
        prompt = f"""Оцени насколько следующий документ релевантен запросу.
        Верни только число от 0.0 (совсем не релевантен) до 1.0 (полностью релевантен).

        Запрос: {query}

        Документ: {document[:500]}

        Оценка релевантности:"""

        try:
            response = self.llm.generate(prompt, max_length=10)
            score = float(response.strip())
            return max(0.0, min(1.0, score))  # Клиппинг к [0, 1]
        except:
            return 0.5  # Дефолтная оценка при ошибке
    
    def generate_with_self_reflection(self, query: str, context: str) -> Dict:
        """
        Генерация с саморефлексией и проверкой фактов
        """
        # Генерация первоначального ответа
        initial_prompt = f"""Ответь на вопрос на основе предоставленного контекста.

        Контекст: {context}

        Вопрос: {query}

        Ответ:"""
        
        initial_answer = self.llm.generate(initial_prompt)
        
        # Self-reflection: проверка качества ответа
        reflection = self._reflect_on_answer(query, context, initial_answer)
        
        # Если ответ плохой, пытаемся улучшить
        if reflection.get('needs_improvement', True):
            improved_answer = self._improve_answer(query, context, initial_answer, reflection)
        else:
            improved_answer = initial_answer
        
        return {
            'initial_answer': initial_answer,
            'final_answer': improved_answer,
            'reflection': reflection,
            'is_verified': reflection.get('is_supported', False)
        }
    
    def _reflect_on_answer(self, query: str, context: str, answer: str) -> Dict:
        """
        Саморефлексия над сгенерированным ответом
        """
        prompt = f"""Проанализируй ответ на вопрос и оцени:
        1. Подтверждается ли ответ предоставленным контекстом?
        2. Является ли ответ полным и точным?
        3. Есть ли в ответе фактические ошибки?

        Вопрос: {query}
        Контекст: {context}
        Ответ: {answer}

        Верни ответ в формате:
        is_supported: [true/false]
        is_complete: [true/false] 
        has_errors: [true/false]
        needs_improvement: [true/false]
        critique: [краткая критика]"""

        response = self.llm.generate(prompt)
        
        # Парсинг ответа
        return self._parse_reflection(response)
    
    def _parse_reflection(self, text: str) -> Dict:
        """Парсинг self-reflection ответа"""
        result = {
            'is_supported': False,
            'is_complete': False,
            'has_errors': True,
            'needs_improvement': True,
            'critique': 'Не удалось проанализировать'
        }
        
        lines = text.split('\n')
        for line in lines:
            line = line.lower().strip()
            if 'is_supported:' in line:
                result['is_supported'] = 'true' in line
            elif 'is_complete:' in line:
                result['is_complete'] = 'true' in line
            elif 'has_errors:' in line:
                result['has_errors'] = 'true' in line
            elif 'needs_improvement:' in line:
                result['needs_improvement'] = 'true' in line
            elif 'critique:' in line:
                result['critique'] = line.split('critique:')[1].strip()
        
        return result
    
    def _improve_answer(self, query: str, context: str, initial_answer: str, reflection: Dict) -> str:
        """
        Улучшение ответа на основе саморефлексии
        """
        prompt = f"""Улучши следующий ответ на вопрос, учитывая критику.
        Используй только информацию из предоставленного контекста.

        Исходный вопрос: {query}
        Контекст: {context}
        Исходный ответ: {initial_answer}
        Критика: {reflection.get('critique', '')}

        Улучшенный ответ:"""

        return self.llm.generate(prompt)

In [None]:
class ReActAgent:
    def __init__(self, llm, rag_system, max_steps: int = 10):
        self.llm = llm
        self.rag_system = rag_system
        self.max_steps = max_steps
        self.actions = ['search', 'calculate', 'reason', 'final_answer']
    
    def solve_with_react(self, question: str) -> Dict:
        """
        ReAct подход: Reasoning + Acting
        """
        print(f"🤖 ReAct решение для: {question}")
        
        trajectory = []
        context = ""
        step = 0
        
        while step < self.max_steps:
            step += 1
            print(f"\n🔹 Step {step}:")
            
            # Генерация мыслей и действий
            react_response = self._generate_thought_and_action(question, context, trajectory)
            
            trajectory.append(react_response)
            action = react_response.get('action', 'reason')
            
            print(f"   Мысль: {react_response['thought'][:100]}...")
            print(f"   Действие: {action}")
            
            # Выполнение действия
            if action == 'search':
                search_query = react_response.get('query', question)
                search_results = self.rag_system.search_with_reranking(search_query, vector_top_k=5, final_top_k=2)
                new_context = "\n".join([doc['text'] for doc in search_results])
                context += f"\nНайденная информация: {new_context}"
                react_response['search_results'] = search_results
                
            elif action == 'final_answer':
                # Завершаем процесс
                react_response['is_final'] = True
                break
            
            # Если слишком долго думаем, принудительно завершаем
            if step >= self.max_steps:
                react_response['action'] = 'final_answer'
                react_response['thought'] = "Достигнут лимит шагов, формулирую итоговый ответ"
        
        # Генерация финального ответа
        final_answer = self._synthesize_final_answer(question, trajectory)
        
        return {
            'final_answer': final_answer,
            'trajectory': trajectory,
            'steps_taken': step
        }
    
    def _generate_thought_and_action(self, question: str, context: str, trajectory: List) -> Dict:
        """
        Генерация мысли и следующего действия
        """
        trajectory_text = "\n".join([
            f"Step {i+1}: {step['thought']} -> {step.get('action', 'reason')}"
            for i, step in enumerate(trajectory)
        ])
        
        prompt = f"""Ты - reasoning agent. Думай шаг за шагом и выбирай действия.

Доступные действия:
- search: поиск информации в базе знаний
- reason: логические размышления  
- calculate: вычисления (если нужны)
- final_answer: дать окончательный ответ

История:
{trajectory_text}

Текущий контекст: {context}

Вопрос: {question}

Сначала подумай (thought), затем выбери действие (action).
Если нужно искать информацию, укажи поисковый запрос (query).

Формат ответа:
Thought: [твои рассуждения]
Action: [search|reason|calculate|final_answer]
Query: [если action=search, укажи что искать]"""

        response = self.llm.generate(prompt)
        
        # Парсинг ответа
        return self._parse_react_response(response)
    
    def _parse_react_response(self, text: str) -> Dict:
        """Парсинг ReAct ответа"""
        result = {
            'thought': '',
            'action': 'reason',
            'query': ''
        }
        
        lines = text.split('\n')
        current_key = None
        
        for line in lines:
            line = line.strip()
            if line.startswith('Thought:'):
                current_key = 'thought'
                result['thought'] = line.replace('Thought:', '').strip()
            elif line.startswith('Action:'):
                current_key = 'action'
                action = line.replace('Action:', '').strip().lower()
                if action in self.actions:
                    result['action'] = action
            elif line.startswith('Query:'):
                current_key = 'query'
                result['query'] = line.replace('Query:', '').strip()
            elif current_key and line:
                result[current_key] += ' ' + line
        
        return result
    
    def _synthesize_final_answer(self, question: str, trajectory: List) -> str:
        """Синтез финального ответа из trajectory"""
        reasoning_chain = "\n".join([
            f"Шаг {i+1}: {step['thought']}"
            for i, step in enumerate(trajectory)
        ])
        
        prompt = f"""На основе цепочки рассуждений, дай окончательный ответ на вопрос.

Вопрос: {question}

Цепочка рассуждений:
{reasoning_chain}

Окончательный ответ:"""

        return self.llm.generate(prompt)

In [None]:
class AdvancedRAGSystem:
    def __init__(self, vector_store, llm, cross_encoder=None):
        self.vector_store = vector_store
        self.llm = llm
        self.cross_encoder = cross_encoder
        
        # Инициализация компонентов
        self.multi_hop = MultiHopReasoning(llm)
        self.self_rag = SelfRAG(llm, vector_store, cross_encoder)
        self.react_agent = ReActAgent(llm, self)
    
    def search_with_reranking(self, query: str, vector_top_k: int = 20, final_top_k: int = 5) -> List[Dict]:
        """Поиск с re-ranking"""
        # Векторный поиск
        vector_results = self.vector_store.search(query, top_k=vector_top_k)
        
        if self.cross_encoder and len(vector_results) > final_top_k:
            # Re-ranking с cross-encoder
            reranked = self.cross_encoder.rerank_documents(query, vector_results, top_k=final_top_k)
            return reranked
        else:
            return vector_results[:final_top_k]
    
    def query(self, 
             question: str, 
             method: str = "auto",
             use_self_reflection: bool = True) -> Dict:
        """
        Умный запрос с выбором метода reasoning
        
        Args:
            method: "multi_hop", "react", "self_rag", "auto"
        """
        print(f"🎯 Вопрос: {question}")
        print(f"📝 Метод: {method}")
        
        # Автоматический выбор метода
        if method == "auto":
            method = self._select_method(question)
            print(f"🤖 Автоматически выбран метод: {method}")
        
        if method == "multi_hop":
            result = self.multi_hop.solve_multi_hop(question, self)
            result['method'] = 'multi_hop'
            
        elif method == "react":
            result = self.react_agent.solve_with_react(question)
            result['method'] = 'react'
            
        elif method == "self_rag":
            # Self-RAG подход
            docs = self.self_rag.retrieve_with_self_evaluation(question)
            context = "\n".join([doc['text'] for doc in docs])
            
            if use_self_reflection:
                generation_result = self.self_rag.generate_with_self_reflection(question, context)
                result = {
                    'final_answer': generation_result['final_answer'],
                    'documents': docs,
                    'reflection': generation_result['reflection'],
                    'method': 'self_rag'
                }
            else:
                answer = self.llm.generate(question, context)
                result = {
                    'final_answer': answer,
                    'documents': docs,
                    'method': 'self_rag'
                }
        else:
            # Стандартный RAG
            docs = self.search_with_reranking(question)
            context = "\n".join([doc['text'] for doc in docs])
            answer = self.llm.generate(question, context)
            result = {
                'final_answer': answer,
                'documents': docs,
                'method': 'standard_rag'
            }
        
        return result
    
    def _select_method(self, question: str) -> str:
        """
        Автоматический выбор метода reasoning на основе вопроса
        """
        prompt = f"""Проанализируй вопрос и определи какой метод reasoning лучше подойдет:
        - multi_hop: для сложных вопросов требующих нескольких шагов
        - react: для вопросов требующих поиска и логических рассуждений  
        - self_rag: для точных ответов с проверкой фактов
        - standard_rag: для простых фактологических вопросов

        Вопрос: {question}

        Верни только название метода: multi_hop, react, self_rag или standard_rag"""

        response = self.llm.generate(prompt, max_length=20).strip().lower()
        
        if any(method in response for method in ['multi_hop', 'multi hop', 'hop']):
            return "multi_hop"
        elif any(method in response for method in ['react', 'reason']):
            return "react"
        elif any(method in response for method in ['self_rag', 'self', 'rag']):
            return "self_rag"
        else:
            return "standard_rag"

# Дополнительные утилиты
class ReasoningUtils:
    @staticmethod
    def visualize_reasoning(result: Dict):
        """Визуализация процесса reasoning"""
        method = result.get('method', 'unknown')
        print(f"\n{'='*60}")
        print(f"🧠 REASONING PROCESS: {method.upper()}")
        print(f"{'='*60}")
        
        if method == 'multi_hop':
            for i, step in enumerate(result.get('intermediate_answers', [])):
                print(f"🔹 Hop {i+1}: {step['sub_question']}")
                print(f"   Ответ: {step['answer'][:100]}...")
                
        elif method == 'react':
            for i, step in enumerate(result.get('trajectory', [])):
                print(f"🔹 Step {i+1}: {step['thought'][:80]}...")
                print(f"   Действие: {step.get('action', 'N/A')}")
                
        elif method == 'self_rag':
            reflection = result.get('reflection', {})
            print(f"✓ Подтверждено: {reflection.get('is_supported', False)}")
            print(f"✓ Полнота: {reflection.get('is_complete', False)}")
            print(f"📝 Критика: {reflection.get('critique', 'N/A')}")
        
        print(f"\n🎯 Финальный ответ: {result['final_answer']}")
        print(f"{'='*60}")

In [None]:
def demo_advanced_rag():
    """Демонстрация работы расширенной RAG системы"""
    
    # Инициализация компонентов
    vector_store = YourVectorStore()  # Ваша векторная БД
    llm = YourLLM()                  # Ваша LLM модель
    cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    
    # Создание системы
    advanced_rag = AdvancedRAGSystem(vector_store, llm, cross_encoder)
    
    # Тестовые вопросы
    test_questions = [
        "Как изменилась политика компании Apple в отношении прав разработчиков после 2020 года и какие это имело последствия для доходов приложений?",
        "Рассчитай среднюю скорость движения автомобиля, который проехал 150 км за 2 часа, а затем 100 км за 1.5 часа.",
        "Какие основные принципы квантовой механики отличают ее от классической физики?",
    ]
    
    for question in test_questions:
        print(f"\n{'#'*80}")
        print(f"ВОПРОС: {question}")
        print(f"{'#'*80}")
        
        # Автоматический выбор метода
        result = advanced_rag.query(question, method="auto")
        
        # Визуализация процесса
        ReasoningUtils.visualize_reasoning(result)
        
        print(f"\n📚 Использовано документов: {len(result.get('documents', []))}")
        print(f"⚡ Метод: {result.get('method', 'N/A')}")

if __name__ == "__main__":
    demo_advanced_rag()

# Суммаризация контекста

In [None]:
class ContextSummarizer:
    def __init__(self, llm):
        self.llm = llm
    
    def summarize_with_sources(self, documents: List[Dict], query: str = None, 
                             max_summary_length: int = 1000) -> Dict:
        """
        Суммаризация контекста с указанием источников для каждого факта
        """
        if not documents:
            return {
                'summary': 'Нет доступной информации для суммаризации',
                'source_mapping': {},
                'key_points': []
            }
        
        print(f"📝 Суммаризация {len(documents)} документов...")
        
        # Подготавливаем документы с ID
        prepared_docs = self._prepare_documents_with_ids(documents)
        
        # Генерируем суммаризацию с источниками
        summary_result = self._generate_attributed_summary(prepared_docs, query, max_summary_length)
        
        # Извлекаем ключевые точки с источниками
        key_points = self._extract_key_points_with_sources(prepared_docs, query)
        
        return {
            'summary': summary_result['summary'],
            'source_mapping': summary_result['source_mapping'],
            'key_points': key_points,
            'total_documents': len(documents),
            'documents_used': list(summary_result['source_mapping'].keys())
        }
    
    def _prepare_documents_with_ids(self, documents: List[Dict]) -> List[Dict]:
        """Подготовка документов с уникальными идентификаторами"""
        prepared_docs = []
        
        for i, doc in enumerate(documents):
            doc_id = doc.get('id', f"doc_{i}")
            prepared_docs.append({
                'id': doc_id,
                'text': doc['text'],
                'metadata': doc.get('metadata', {}),
                'score': doc.get('score', 0.0),
                'source': doc.get('metadata', {}).get('source', f'Документ {i+1}')
            })
        
        return prepared_docs
    
    def _generate_attributed_summary(self, documents: List[Dict], query: str = None, 
                                   max_length: int = 1000) -> Dict:
        """Генерация суммаризации с привязкой фактов к источникам"""
        
        # Сортируем документы по релевантности
        sorted_docs = sorted(documents, key=lambda x: x.get('score', 0), reverse=True)
        
        # Создаем промпт для суммаризации с источниками
        prompt = self._build_summarization_prompt(sorted_docs, query, max_length)
        
        # Генерируем суммаризацию
        summary = self.llm.generate(prompt, max_length=max_length + 500)
        
        # Парсим источники из суммаризации
        source_mapping = self._parse_sources_from_summary(summary, sorted_docs)
        
        # Очищаем суммаризацию от маркеров источников для финального вывода
        clean_summary = self._clean_summary_text(summary)
        
        return {
            'summary': clean_summary,
            'source_mapping': source_mapping,
            'raw_summary': summary
        }
    
    def _build_summarization_prompt(self, documents: List[Dict], query: str = None, 
                                  max_length: int = 1000) -> str:
        """Создание промпта для суммаризации с источниками"""
        
        docs_text = ""
        for i, doc in enumerate(documents[:10]):  # Ограничиваем количество документов
            docs_text += f"\n\n[Документ {doc['id']}]: {doc['text'][:800]}"
            if len(doc['text']) > 800:
                docs_text += "..."
        
        query_context = f" по запросу: '{query}'" if query else ""
        
        prompt = f"""Суммаризируй предоставленные документы{query_context}. 
        Для КАЖДОГО факта в суммаризации указывай источник в формате [Документ X].
        Будь точен и используй только информацию из предоставленных документов.

        Инструкции:
        1. Указывай источник для каждого утверждения: [Документ ID]
        2. Если информация повторяется в нескольких документах, укажи все источники: [Документ A, Документ B]
        3. Не добавляй информацию, которой нет в документах
        4. Максимальная длина: {max_length} символов

        Документы:{docs_text}

        Суммаризация с источниками:"""
        
        return prompt
    
    def _parse_sources_from_summary(self, summary: str, documents: List[Dict]) -> Dict[str, List[str]]:
        """Парсинг источников из суммаризации"""
        source_mapping = {}
        
        # Ищем паттерны [Документ X], [Документ X, Документ Y], etc.
        source_patterns = [
            r'\[Документ\s+([^]]+)\]',
            r'\[Источник:\s*([^]]+)\]',
            r'\[Doc\s*([^]]+)\]',
        ]
        
        for pattern in source_patterns:
            matches = re.findall(pattern, summary)
            for match in matches:
                # Обрабатываем множественные источники
                source_ids = [src.strip() for src in match.split(',')]
                
                for source_id in source_ids:
                    if source_id not in source_mapping:
                        source_mapping[source_id] = []
                    
                    # Находим соответствующий документ
                    doc = next((d for d in documents if d['id'] == source_id), None)
                    if doc:
                        source_info = {
                            'text_preview': doc['text'][:200] + "...",
                            'score': doc.get('score', 0.0),
                            'source': doc.get('source', 'Неизвестный источник')
                        }
                        source_mapping[source_id].append(source_info)
        
        return source_mapping
    
    def _clean_summary_text(self, summary: str) -> str:
        """Очистка текста суммаризации от маркеров источников для финального вывода"""
        # Убираем маркеры источников, но сохраняем читаемость
        cleaned = re.sub(r'\[Документ\s+[^]]+\]', '', summary)
        cleaned = re.sub(r'\[Источник:\s*[^]]+\]', '', cleaned)
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
        return cleaned

# Извлечение источников

In [None]:
class ContextSummarizer:
    def __init__(self, llm):
        self.llm = llm
    
    def _extract_key_points_with_sources(self, documents: List[Dict], query: str = None) -> List[Dict]:
        """Извлечение ключевых точек с привязкой к источникам"""
        
        prompt = self._build_key_points_prompt(documents, query)
        
        try:
            response = self.llm.generate(prompt, max_length=800)
            key_points = self._parse_key_points_response(response, documents)
            return key_points
        except Exception as e:
            print(f"Ошибка при извлечении ключевых точек: {e}")
            return self._fallback_key_points(documents)
    
    def _build_key_points_prompt(self, documents: List[Dict], query: str = None) -> str:
        """Создание промпта для извлечения ключевых точек"""
        
        docs_text = ""
        for i, doc in enumerate(documents[:8]):  # Ограничиваем для экономии токенов
            docs_text += f"\n\n[Документ {doc['id']}]: {doc['text'][:500]}"
            if len(doc['text']) > 500:
                docs_text += "..."
        
        query_context = f" по запросу: '{query}'" if query else ""
        
        prompt = f"""Извлеки 3-5 ключевых точек из предоставленных документов{query_context}. 
        Для каждой точки укажи источники в формате [Документ X].

        Формат:
        • Точка 1 [Документ A, Документ B]
        • Точка 2 [Документ C]
        • ...

        Документы:{docs_text}

        Ключевые точки:"""
        
        return prompt
    
    def _parse_key_points_response(self, response: str, documents: List[Dict]) -> List[Dict]:
        """Парсинг ключевых точек из ответа LLM"""
        key_points = []
        
        lines = response.split('\n')
        for line in lines:
            line = line.strip()
            if line.startswith(('•', '-', '*', '▸')) or re.match(r'^\d+\.', line):
                # Извлекаем текст точки и источники
                point_text = re.sub(r'^[•\-\*\d\.\s▸]+', '', line).strip()
                
                # Ищем источники в формате [Документ X]
                sources = re.findall(r'\[Документ\s+([^]]+)\]', point_text)
                
                # Очищаем текст от маркеров источников
                clean_text = re.sub(r'\[Документ\s+[^]]+\]', '', point_text).strip()
                
                if clean_text and sources:
                    # Получаем информацию об источниках
                    source_info = []
                    for source_id in sources:
                        doc = next((d for d in documents if d['id'] == source_id), None)
                        if doc:
                            source_info.append({
                                'doc_id': source_id,
                                'text_preview': doc['text'][:150] + "...",
                                'score': doc.get('score', 0.0),
                                'source': doc.get('source', 'Неизвестный источник')
                            })
                    
                    key_points.append({
                        'point': clean_text,
                        'sources': source_info,
                        'source_ids': sources
                    })
        
        return key_points if key_points else self._fallback_key_points(documents)
    
    def _fallback_key_points(self, documents: List[Dict]) -> List[Dict]:
        """Fallback метод извлечения ключевых точек"""
        key_points = []
        
        for i, doc in enumerate(documents[:5]):
            # Простая эвристика: берем первое предложение как ключевую точку
            first_sentence = doc['text'].split('.')[0] + '.'
            key_points.append({
                'point': first_sentence[:200],
                'sources': [{
                    'doc_id': doc['id'],
                    'text_preview': doc['text'][:150] + "...",
                    'score': doc.get('score', 0.0),
                    'source': doc.get('source', f'Документ {i+1}')
                }],
                'source_ids': [doc['id']]
            })
        
        return key_points

# Визуализация по источникам

In [None]:
class SummaryVisualizer:
    """Класс для визуализации суммаризации с источниками"""
    
    @staticmethod
    def display_summary_with_sources(summary_result: Dict, query: str = None):
        """Отображение суммаризации с источниками"""
        
        print("\n" + "="*80)
        print("📊 СУММАРИЗАЦИЯ КОНТЕКСТА С ИСТОЧНИКАМИ")
        print("="*80)
        
        if query:
            print(f"📝 Запрос: {query}")
        
        print(f"\n📄 Обработано документов: {summary_result['total_documents']}")
        print(f"🔗 Использовано источников: {len(summary_result['documents_used'])}")
        
        print(f"\n📋 СВОДКА:")
        print("-" * 40)
        print(summary_result['summary'])
        
        print(f"\n🎯 КЛЮЧЕВЫЕ ТОЧКИ:")
        print("-" * 40)
        for i, point in enumerate(summary_result['key_points'], 1):
            print(f"{i}. {point['point']}")
            for source in point['sources']:
                print(f"   📍 Источник: {source['source']} (сходство: {source['score']:.3f})")
        
        print(f"\n🔍 ДЕТАЛИ ИСТОЧНИКОВ:")
        print("-" * 40)
        for doc_id, sources in summary_result['source_mapping'].items():
            for source in sources:
                print(f"📄 {source['source']} (ID: {doc_id}):")
                print(f"   {source['text_preview']}")
                print(f"   ⭐ Релевантность: {source['score']:.3f}")
                print()
    
    @staticmethod
    def generate_source_report(summary_result: Dict, format: str = "text") -> str:
        """Генерация отчета об источниках"""
        
        if format == "text":
            report = "ОТЧЕТ ОБ ИСТОЧНИКАХ\n"
            report += "=" * 50 + "\n\n"
            
            for doc_id, sources in summary_result['source_mapping'].items():
                for source in sources:
                    report += f"ИСТОЧНИК: {source['source']}\n"
                    report += f"ID: {doc_id}\n"
                    report += f"Релевантность: {source['score']:.3f}\n"
                    report += f"Фрагмент: {source['text_preview']}\n"
                    report += "-" * 30 + "\n"
            
            return report
        
        elif format == "markdown":
            report = "# Отчет об источниках\n\n"
            
            for doc_id, sources in summary_result['source_mapping'].items():
                for source in sources:
                    report += f"## {source['source']}\n"
                    report += f"- **ID**: {doc_id}\n"
                    report += f"- **Релевантность**: {source['score']:.3f}\n"
                    report += f"- **Фрагмент**: {source['text_preview']}\n\n"
            
            return report
        
        return ""

# Интеграция с Rag системой

In [None]:
class EnhancedRAGWithSummarization:
    """RAG система с расширенной суммаризацией"""
    
    def __init__(self, vector_store, llm, cross_encoder=None):
        self.vector_store = vector_store
        self.llm = llm
        self.cross_encoder = cross_encoder
        self.summarizer = ContextSummarizer(llm)
        self.visualizer = SummaryVisualizer()
    
    def query_with_attributed_summary(self, question: str, 
                                    include_sources: bool = True,
                                    generate_summary: bool = True) -> Dict:
        """
        Запрос с суммаризацией и указанием источников
        """
        print(f"🔍 Поиск информации для: {question}")
        
        # Поиск релевантных документов
        search_results = self.vector_store.search(question, top_k=10)
        
        if not search_results:
            return {
                'answer': 'Не найдено релевантной информации',
                'summary': 'Нет данных для суммаризации',
                'documents': []
            }
        
        # Re-ranking если доступен cross-encoder
        if self.cross_encoder:
            search_results = self.cross_encoder.rerank_documents(question, search_results, top_k=7)
        
        # Генерация ответа
        context = "\n\n".join([doc['text'] for doc in search_results])
        answer = self.llm.generate(question, context)
        
        result = {
            'question': question,
            'answer': answer,
            'documents': search_results,
            'total_documents_found': len(search_results)
        }
        
        # Суммаризация если требуется
        if generate_summary:
            summary_result = self.summarizer.summarize_with_sources(search_results, question)
            result['summary'] = summary_result
            
            # Визуализация
            if include_sources:
                self.visualizer.display_summary_with_sources(summary_result, question)
        
        return result
    
    def generate_detailed_report(self, question: str, format: str = "text") -> Dict:
        """
        Генерация детального отчета с источниками
        """
        result = self.query_with_attributed_summary(question, include_sources=False, generate_summary=True)
        
        if 'summary' in result:
            source_report = self.visualizer.generate_source_report(result['summary'], format)
            result['source_report'] = source_report
        
        return result

# Пример использования

In [None]:
def demo_context_summarization():
    """Демонстрация суммаризации контекста с источниками"""
    
    # Мок данные для демонстрации
    mock_documents = [
        {
            'id': 'doc_1',
            'text': 'Машинное обучение - это область искусственного интеллекта, которая позволяет компьютерам обучаться на данных без явного программирования. Основные подходы включают обучение с учителем, без учителя и с подкреплением.',
            'metadata': {'source': 'Википедия', 'year': 2023},
            'score': 0.95
        },
        {
            'id': 'doc_2', 
            'text': 'Глубокое обучение использует нейронные сети с множеством слоев для извлечения признаков из данных. Этот метод особенно эффективен для обработки изображений, текста и речи.',
            'metadata': {'source': 'Научная статья', 'author': 'Иванов'},
            'score': 0.88
        },
        {
            'id': 'doc_3',
            'text': 'В 2020 году исследователи достигли значительных успехов в области трансферного обучения, что позволило использовать предобученные модели для различных задач с минимальной дообучкой.',
            'metadata': {'source': 'Исследовательский отчет', 'year': 2021},
            'score': 0.82
        }
    ]
    
    # Инициализация суммаризатора
    class MockLLM:
        def generate(self, prompt, max_length=1000):
            return """Машинное обучение позволяет компьютерам обучаться на данных без программирования [Документ doc_1]. 
            Глубокое обучение использует многослойные нейронные сети для обработки сложных данных [Документ doc_2]. 
            В последние годы трансферное обучение стало важным направлением [Документ doc_3]."""
    
    mock_llm = MockLLM()
    summarizer = ContextSummarizer(mock_llm)
    
    # Суммаризация
    query = "Что такое машинное обучение и его современные направления?"
    summary_result = summarizer.summarize_with_sources(mock_documents, query)
    
    # Визуализация
    visualizer = SummaryVisualizer()
    visualizer.display_summary_with_sources(summary_result, query)
    
    # Генерация отчета
    report = visualizer.generate_source_report(summary_result, "markdown")
    print("\n📄 ОТЧЕТ В MARKDOWN:")
    print(report)

if __name__ == "__main__":
    demo_context_summarization()

# Отдельная LLM проверяет соответствие ответ-контекст

In [None]:
class AnswerContextVerifier:
    def __init__(self, verification_llm, verification_mode: str = "strict"):
        """
        Инициализация верификатора
        
        Args:
            verification_llm: отдельная LLM для проверки
            verification_mode: режим проверки (strict, balanced, lenient)
        """
        self.verification_llm = verification_llm
        self.verification_mode = verification_mode
        self.verification_history = []
    
    def verify_answer_context_alignment(self, 
                                      question: str, 
                                      answer: str, 
                                      context: str,
                                      documents: List[Dict] = None) -> Dict[str, Any]:
        """
        Проверка соответствия ответа контексту
        
        Args:
            question: исходный вопрос
            answer: сгенерированный ответ
            context: исходный контекст
            documents: список документов с метаданными
            
        Returns:
            Результаты верификации
        """
        print("🔍 Проверка соответствия ответа контексту...")
        
        # Выбираем метод проверки в зависимости от режима
        if self.verification_mode == "strict":
            verification_result = self._strict_verification(question, answer, context, documents)
        elif self.verification_mode == "balanced":
            verification_result = self._balanced_verification(question, answer, context, documents)
        elif self.verification_mode == "lenient":
            verification_result = self._lenient_verification(question, answer, context, documents)
        else:
            verification_result = self._balanced_verification(question, answer, context, documents)
        
        # Сохраняем в историю
        self.verification_history.append({
            'timestamp': datetime.now().isoformat(),
            'question': question,
            'answer_preview': answer[:100] + "..." if len(answer) > 100 else answer,
            'verification_result': verification_result
        })
        
        return verification_result
    
    def _strict_verification(self, question: str, answer: str, context: str, 
                           documents: List[Dict]) -> Dict[str, Any]:
        """Строгая проверка - каждый факт должен быть подтвержден"""
        
        prompt = f"""Ты - строгий верификатор. Проверь, что КАЖДЫЙ факт в ответе 
        подтверждается предоставленным контекстом. Будь максимально строгим.

        ВОПРОС: {question}

        КОНТЕКСТ ДЛЯ ПРОВЕРКИ:
        {context[:2500]}

        ОТВЕТ ДЛЯ ВЕРИФИКАЦИИ:
        {answer}

        Проанализируй и верни JSON:

        {{
            "alignment_score": 0.85,
            "is_fully_supported": false,
            "supported_facts": [
                {{
                    "fact": "утверждение из ответа",
                    "confidence": 0.95,
                    "evidence": "подтверждающий фрагмент из контекста"
                }}
            ],
            "unsupported_facts": [
                {{
                    "fact": "утверждение без подтверждения", 
                    "reason": "почему не подтверждено",
                    "severity": "high/medium/low"
                }}
            ],
            "hallucinations": [
                "полностью выдуманные факты"
            ],
            "verdict": "approved/needs_revision/rejected",
            "explanation": "подробное объяснение решения"
        }}

        Критерии строгой проверки:
        - Факт должен быть явно подтвержден в контексте
        - Не принимай интерпретации или выводы
        - Отмечай любые расхождения как неподтвержденные"""

        try:
            response = self.verification_llm.generate(prompt, max_length=1500, temperature=0.1)
            return self._parse_verification_response(response, "strict")
        except Exception as e:
            print(f"❌ Ошибка при строгой верификации: {e}")
            return self._get_error_result()
    
    def _balanced_verification(self, question: str, answer: str, context: str,
                             documents: List[Dict]) -> Dict[str, Any]:
        """Сбалансированная проверка - допускает логические выводы"""
        
        prompt = f"""Проверь соответствие ответа контексту. Учитывай, что ответ 
        может содержать логические выводы на основе контекста.

        ВОПРОС: {question}

        КОНТЕКСТ:
        {context[:3000]}

        ОТВЕТ:
        {answer}

        Проанализируй и верни JSON:

        {{
            "alignment_score": 0.90,
            "is_context_aligned": true,
            "factual_accuracy": 0.85,
            "logical_inferences": [
                {{
                    "inference": "логический вывод",
                    "is_justified": true,
                    "basis": "на чем основан вывод"
                }}
            ],
            "factual_errors": [
                {{
                    "error": "фактическая ошибка",
                    "correct_information": "как должно быть",
                    "severity": "high/medium/low"
                }}
            ],
            "coverage_analysis": {{
                "answered_adequately": true,
                "missing_aspects": ["аспекты вопроса, которые не раскрыты"],
                "overclaimed": ["утверждения без достаточных оснований"]
            }},
            "recommendation": "accept/revise_minor/revise_major/reject"
        }}"""

        try:
            response = self.verification_llm.generate(prompt, max_length=1800, temperature=0.2)
            return self._parse_verification_response(response, "balanced")
        except Exception as e:
            print(f"❌ Ошибка при сбалансированной верификации: {e}")
            return self._get_error_result()
    
    def _lenient_verification(self, question: str, answer: str, context: str,
                            documents: List[Dict]) -> Dict[str, Any]:
        """Либеральная проверка - допускает интерпретации"""
        
        prompt = f"""Оцени, насколько хорошо ответ соответствует духу и содержанию контекста.
        Допускай разумные интерпретации и выводы.

        ВОПРОС: {question}
        КОНТЕКСТ: {context[:3500]}
        ОТВЕТ: {answer}

        Верни JSON:

        {{
            "semantic_alignment": 0.88,
            "faithfulness": 0.82,
            "completeness": 0.75,
            "strengths": [
                "сильные стороны ответа"
            ],
            "weaknesses": [
                "слабые стороны или неточности"
            ],
            "overall_quality": "excellent/good/adequate/poor",
            "trust_level": "high/medium/low"
        }}"""

        try:
            response = self.verification_llm.generate(prompt, max_length=1200, temperature=0.3)
            return self._parse_verification_response(response, "lenient")
        except Exception as e:
            print(f"❌ Ошибка при либеральной верификации: {e}")
            return self._get_error_result()

In [None]:
class AnswerContextVerifier:
    def __init__(self, verification_llm, verification_mode: str = "strict"):
        self.verification_llm = verification_llm
        self.verification_mode = verification_mode
        self.verification_history = []
    
    def _parse_verification_response(self, response: str, verification_type: str) -> Dict[str, Any]:
        """Парсинг ответа верификации"""
        try:
            # Пытаемся извлечь JSON
            json_match = re.search(r'\{[^{}]*\{[^{}]*\}[^{}]*\}|\{[^{}]*\}', response, re.DOTALL)
            if json_match:
                json_str = json_match.group()
                # Чистим JSON от возможных проблем
                json_str = self._clean_json_string(json_str)
                data = json.loads(json_str)
                return self._enrich_verification_data(data, verification_type)
            else:
                return self._parse_text_verification(response, verification_type)
                
        except json.JSONDecodeError as e:
            print(f"⚠️ Ошибка парсинга JSON: {e}")
            return self._parse_text_verification(response, verification_type)
        except Exception as e:
            print(f"❌ Неожиданная ошибка при парсинге: {e}")
            return self._get_error_result()
    
    def _clean_json_string(self, json_str: str) -> str:
        """Очистка JSON строки от распространенных проблем"""
        # Заменяем одинарные кавычки на двойные
        json_str = json_str.replace("'", '"')
        
        # Исправляем незакрытые кавычки
        json_str = re.sub(r'(\w+):\s*([^",}\s]+)(?=\s*[,}])', r'\1: "\2"', json_str)
        
        # Исправляем булевы значения
        json_str = re.sub(r':\s*true', ': true', json_str, flags=re.IGNORECASE)
        json_str = re.sub(r':\s*false', ': false', json_str, flags=re.IGNORECASE)
        
        # Удаляем лишние запятые
        json_str = re.sub(r',\s*}', '}', json_str)
        json_str = re.sub(r',\s*]', ']', json_str)
        
        return json_str
    
    def _parse_text_verification(self, response: str, verification_type: str) -> Dict[str, Any]:
        """Парсинг текстового ответа верификации"""
        base_result = {
            'verification_type': verification_type,
            'alignment_score': 0.5,
            'is_reliable': False,
            'parsing_method': 'text_heuristic',
            'raw_response': response[:500] + "..." if len(response) > 500 else response
        }
        
        response_lower = response.lower()
        
        # Эвристический анализ текста
        score_keywords = {
            'high': ['отличн', 'прекрасн', 'идеальн', 'полност', 'perfect', 'excellent', 'high'],
            'good': ['хорош', 'удовлетвор', 'достаточн', 'good', 'adequate', 'sufficient'],
            'poor': ['плох', 'слаб', 'неудовлетвор', 'poor', 'bad', 'inadequate']
        }
        
        # Определяем score по ключевым словам
        detected_score = 0.5
        for level, keywords in score_keywords.items():
            if any(keyword in response_lower for keyword in keywords):
                if level == 'high':
                    detected_score = 0.85
                elif level == 'good':
                    detected_score = 0.7
                elif level == 'poor':
                    detected_score = 0.3
                break
        
        base_result['alignment_score'] = detected_score
        base_result['is_reliable'] = detected_score > 0.6
        
        # Извлекаем основные проблемы
        problems = []
        problem_patterns = [
            (r'не подтвержд[а-я]+', 'Неподтвержденные утверждения'),
            (r'ошибк[а-я]+', 'Фактические ошибки'),
            (r'выдум[а-я]+', 'Выдуманные факты'),
            (r'отсутств[а-я]+', 'Отсутствующая информация')
        ]
        
        for pattern, description in problem_patterns:
            if re.search(pattern, response_lower):
                problems.append(description)
        
        if problems:
            base_result['detected_issues'] = problems
        
        return base_result
    
    def _enrich_verification_data(self, data: Dict, verification_type: str) -> Dict[str, Any]:
        """Обогащение данных верификации"""
        enriched = data.copy()
        
        # Добавляем метаданные
        enriched['verification_type'] = verification_type
        enriched['timestamp'] = datetime.now().isoformat()
        enriched['parsing_method'] = 'json_direct'
        
        # Нормализуем score если нужно
        if 'alignment_score' in enriched:
            enriched['alignment_score'] = float(enriched['alignment_score'])
        elif 'semantic_alignment' in enriched:
            enriched['alignment_score'] = float(enriched['semantic_alignment'])
        
        # Определяем надежность
        if 'alignment_score' in enriched:
            enriched['is_reliable'] = enriched['alignment_score'] > 0.7
        elif 'is_fully_supported' in enriched:
            enriched['is_reliable'] = enriched['is_fully_supported']
        elif 'is_context_aligned' in enriched:
            enriched['is_reliable'] = enriched['is_context_aligned']
        
        return enriched
    
    def _get_error_result(self) -> Dict[str, Any]:
        """Результат при ошибке верификации"""
        return {
            'alignment_score': 0.0,
            'is_reliable': False,
            'verification_type': self.verification_mode,
            'error': True,
            'message': 'Не удалось выполнить проверку',
            'timestamp': datetime.now().isoformat()
        }

# Визуализация

In [None]:
class VerificationVisualizer:
    """Визуализация результатов верификации"""
    
    @staticmethod
    def display_verification_report(verification_result: Dict, 
                                  question: str = None, 
                                  answer: str = None):
        """Отображение отчета о верификации"""
        
        print("\n" + "="*80)
        print("🔍 ОТЧЕТ О ВЕРИФИКАЦИИ: СООТВЕТСТВИЕ ОТВЕТА КОНТЕКСТУ")
        print("="*80)
        
        if question:
            print(f"📝 Вопрос: {question}")
        
        if answer:
            answer_preview = answer[:150] + "..." if len(answer) > 150 else answer
            print(f"💬 Ответ: {answer_preview}")
        
        # Основные метрики
        alignment_score = verification_result.get('alignment_score', 0)
        is_reliable = verification_result.get('is_reliable', False)
        verification_type = verification_result.get('verification_type', 'unknown')
        
        # Визуализация скора
        score_bar = VerificationVisualizer._create_score_bar(alignment_score)
        status_icon = "✅" if is_reliable else "⚠️" if alignment_score > 0.4 else "❌"
        
        print(f"\n{status_icon} ОЦЕНКА СООТВЕТСТВИЯ: {alignment_score:.2f}/1.0")
        print(f"📊 {score_bar}")
        print(f"🔧 Режим проверки: {verification_type}")
        print(f"🎯 Надежность: {'ВЫСОКАЯ' if is_reliable else 'СРЕДНЯЯ' if alignment_score > 0.4 else 'НИЗКАЯ'}")
        
        # Детальная информация в зависимости от типа проверки
        if verification_type == "strict":
            VerificationVisualizer._display_strict_verification_details(verification_result)
        elif verification_type == "balanced":
            VerificationVisualizer._display_balanced_verification_details(verification_result)
        elif verification_type == "lenient":
            VerificationVisualizer._display_lenient_verification_details(verification_result)
        
        # Рекомендации
        recommendation = verification_result.get('recommendation') or verification_result.get('verdict')
        if recommendation:
            print(f"\n💡 РЕКОМЕНДАЦИЯ: {recommendation.upper()}")
    
    @staticmethod
    def _create_score_bar(score: float, width: int = 20) -> str:
        """Создание визуальной шкалы оценки"""
        filled = int(score * width)
        bar = "█" * filled + "░" * (width - filled)
        return f"[{bar}]"
    
    @staticmethod
    def _display_strict_verification_details(result: Dict):
        """Детали строгой проверки"""
        supported_facts = result.get('supported_facts', [])
        unsupported_facts = result.get('unsupported_facts', [])
        hallucinations = result.get('hallucinations', [])
        
        if supported_facts:
            print(f"\n✅ ПОДТВЕРЖДЕННЫЕ ФАКТЫ ({len(supported_facts)}):")
            for i, fact in enumerate(supported_facts[:3], 1):  # Показываем первые 3
                confidence = fact.get('confidence', 0)
                confidence_icon = "🟢" if confidence > 0.8 else "🟡"
                print(f"   {confidence_icon} {fact.get('fact', 'N/A')[:80]}...")
        
        if unsupported_facts:
            print(f"\n❌ НЕПОДТВЕРЖДЕННЫЕ ФАКТЫ ({len(unsupported_facts)}):")
            for i, fact in enumerate(unsupported_facts[:3], 1):
                severity = fact.get('severity', 'medium')
                severity_icon = "🔴" if severity == 'high' else "🟡" if severity == 'medium' else "🔵"
                print(f"   {severity_icon} {fact.get('fact', 'N/A')[:80]}...")
                print(f"      Причина: {fact.get('reason', 'N/A')}")
        
        if hallucinations:
            print(f"\n🚫 ВЫДУМАННЫЕ ФАКТЫ ({len(hallucinations)}):")
            for i, hallu in enumerate(hallucinations[:3], 1):
                print(f"   ⚠️ {hallu[:80]}...")
    
    @staticmethod
    def _display_balanced_verification_details(result: Dict):
        """Детали сбалансированной проверки"""
        factual_accuracy = result.get('factual_accuracy', 0)
        logical_inferences = result.get('logical_inferences', [])
        factual_errors = result.get('factual_errors', [])
        
        print(f"\n📈 Фактическая точность: {factual_accuracy:.2f}")
        
        if logical_inferences:
            justified = sum(1 for inf in logical_inferences if inf.get('is_justified', False))
            print(f"🧠 Логические выводы: {justified}/{len(logical_inferences)} обоснованы")
        
        if factual_errors:
            print(f"\n📉 Фактические ошибки ({len(factual_errors)}):")
            for i, error in enumerate(factual_errors[:2], 1):
                severity = error.get('severity', 'medium')
                severity_icon = "🔴" if severity == 'high' else "🟡"
                print(f"   {severity_icon} {error.get('error', 'N/A')[:60]}...")
    
    @staticmethod
    def _display_lenient_verification_details(result: Dict):
        """Детали либеральной проверки"""
        faithfulness = result.get('faithfulness', 0)
        completeness = result.get('completeness', 0)
        strengths = result.get('strengths', [])
        weaknesses = result.get('weaknesses', [])
        
        print(f"\n🎯 Верность контексту: {faithfulness:.2f}")
        print(f"📋 Полнота ответа: {completeness:.2f}")
        
        if strengths:
            print(f"\n🌟 Сильные стороны:")
            for strength in strengths[:2]:
                print(f"   ✅ {strength[:80]}...")
        
        if weaknesses:
            print(f"\n📝 Области улучшения:")
            for weakness in weaknesses[:2]:
                print(f"   📝 {weakness[:80]}...")

# Интеграция с Rag

In [None]:
class VerifiedRAGSystem:
    """RAG система с верификацией ответов отдельной LLM"""
    
    def __init__(self, 
                 vector_store,
                 answer_llm,  # Основная LLM для генерации ответов
                 verification_llm,  # Отдельная LLM для проверки
                 verification_mode: str = "balanced"):
        
        self.vector_store = vector_store
        self.answer_llm = answer_llm
        self.verifier = AnswerContextVerifier(verification_llm, verification_mode)
        self.visualizer = VerificationVisualizer()
    
    def query_with_verification(self, 
                              question: str, 
                              max_retries: int = 2,
                              confidence_threshold: float = 0.7) -> Dict[str, Any]:
        """
        Запрос с верификацией ответа отдельной LLM
        
        Args:
            question: вопрос пользователя
            max_retries: максимальное количество попыток улучшения
            confidence_threshold: порог уверенности для принятия ответа
        """
        print(f"🎯 Запрос: {question}")
        
        # Поиск релевантных документов
        documents = self.vector_store.search(question, top_k=10)
        
        if not documents:
            return {
                'question': question,
                'answer': 'Не найдено релевантной информации',
                'verification_result': None,
                'documents': [],
                'status': 'no_documents'
            }
        
        # Генерация первоначального ответа
        context = "\n\n".join([doc['text'] for doc in documents])
        answer = self.answer_llm.generate(question, context)
        
        # Верификация ответа
        verification_result = self.verifier.verify_answer_context_alignment(
            question, answer, context, documents
        )
        
        # Попытки улучшения если нужно
        improved_answer = answer
        for attempt in range(max_retries):
            alignment_score = verification_result.get('alignment_score', 0)
            
            if alignment_score >= confidence_threshold:
                break  # Ответ достаточно хорош
                
            print(f"🔄 Попытка улучшения {attempt + 1}/{max_retries}")
            improved_answer = self._improve_answer_based_on_verification(
                question, improved_answer, context, verification_result
            )
            
            # Повторная верификация
            verification_result = self.verifier.verify_answer_context_alignment(
                question, improved_answer, context, documents
            )
        
        # Финальные результаты
        final_answer = improved_answer if improved_answer != answer else answer
        
        result = {
            'question': question,
            'answer': final_answer,
            'original_answer': answer,
            'verification_result': verification_result,
            'documents': documents,
            'context_preview': context[:300] + "..." if len(context) > 300 else context,
            'improvement_attempts': min(max_retries, 1 if improved_answer != answer else 0),
            'final_score': verification_result.get('alignment_score', 0)
        }
        
        # Визуализация результатов
        self.visualizer.display_verification_report(verification_result, question, final_answer)
        
        return result
    
    def _improve_answer_based_on_verification(self, 
                                            question: str, 
                                            current_answer: str,
                                            context: str,
                                            verification_result: Dict) -> str:
        """Улучшение ответа на основе результатов верификации"""
        
        issues = self._extract_issues_from_verification(verification_result)
        
        prompt = f"""Улучши следующий ответ, исправив выявленные проблемы. 
        Используй ТОЛЬКО информацию из предоставленного контекста.

        ВОПРОС: {question}

        КОНТЕКСТ:
        {context[:3500]}

        ТЕКУЩИЙ ОТВЕТ (требует улучшения):
        {current_answer}

        ВЫЯВЛЕННЫЕ ПРОБЛЕМЫ:
        {issues}

        УЛУЧШЕННЫЙ ОТВЕТ (будь точным и используй только подтвержденные факты):"""

        try:
            improved_answer = self.answer_llm.generate(prompt, max_length=1200)
            return improved_answer
        except Exception as e:
            print(f"⚠️ Не удалось улучшить ответ: {e}")
            return current_answer
    
    def _extract_issues_from_verification(self, verification_result: Dict) -> str:
        """Извлечение проблем из результатов верификации"""
        issues = []
        
        verification_type = verification_result.get('verification_type', '')
        
        if verification_type == "strict":
            unsupported = verification_result.get('unsupported_facts', [])
            hallucinations = verification_result.get('hallucinations', [])
            
            for fact in unsupported[:3]:  # Берем первые 3
                issues.append(f"Неподтвержденный факт: {fact.get('fact', 'N/A')}")
            
            for hallu in hallucinations[:2]:
                issues.append(f"Выдуманный факт: {hallu}")
                
        elif verification_type == "balanced":
            errors = verification_result.get('factual_errors', [])
            for error in errors[:3]:
                issues.append(f"Фактическая ошибка: {error.get('error', 'N/A')}")
        
        elif verification_type == "lenient":
            weaknesses = verification_result.get('weaknesses', [])
            for weakness in weaknesses[:3]:
                issues.append(f"Слабая сторона: {weakness}")
        
        return "\n".join(issues) if issues else "Конкретные проблемы не выявлены"
    
    def get_verification_statistics(self) -> Dict[str, Any]:
        """Статистика верификации"""
        if not self.verifier.verification_history:
            return {}
        
        history = self.verifier.verification_history
        
        total_verifications = len(history)
        reliable_count = sum(1 for item in history 
                           if item['verification_result'].get('is_reliable', False))
        
        scores = [item['verification_result'].get('alignment_score', 0) 
                 for item in history if 'alignment_score' in item['verification_result']]
        
        avg_score = np.mean(scores) if scores else 0
        
        return {
            'total_verifications': total_verifications,
            'reliable_answers': reliable_count,
            'reliability_rate': reliable_count / total_verifications,
            'average_alignment_score': avg_score,
            'verification_mode': self.verifier.verification_mode,
            'recent_verifications': history[-5:]  # Последние 5 проверок
        }

# Пример использваниям 

In [None]:
def demo_verification_system():
    """Демонстрация системы верификации"""
    
    # Мок LLM для демонстрации
    class MockLLM:
        def __init__(self, name="MockLLM"):
            self.name = name
        
        def generate(self, prompt, max_length=1000, temperature=0.1):
            if "проверь" in prompt.lower() or "verify" in prompt.lower():
                # Ответ верификатора
                return '''{
                    "alignment_score": 0.78,
                    "is_fully_supported": false,
                    "supported_facts": [
                        {
                            "fact": "Машинное обучение позволяет компьютерам обучаться на данных",
                            "confidence": 0.95,
                            "evidence": "Машинное обучение - это область искусственного интеллекта, которая позволяет компьютерам обучаться на данных"
                        }
                    ],
                    "unsupported_facts": [
                        {
                            "fact": "Машинное обучение было изобретено в 2020 году", 
                            "reason": "В контексте нет информации о дате изобретения",
                            "severity": "medium"
                        }
                    ],
                    "verdict": "needs_revision",
                    "explanation": "Ответ в основном корректен, но содержит одно неподтвержденное утверждение"
                }'''
            else:
                # Ответ основной LLM
                return "Машинное обучение позволяет компьютерам обучаться на данных без явного программирования. Эта технология была изобретена в 2020 году и с тех пор активно развивается."
    
    # Создание системы
    answer_llm = MockLLM("AnswerLLM")
    verification_llm = MockLLM("VerificationLLM")
    
    verified_rag = VerifiedRAGSystem(
        vector_store=None,  # В реальности здесь была бы векторная БД
        answer_llm=answer_llm,
        verification_llm=verification_llm,
        verification_mode="strict"
    )
    
    # Тестовый запрос
    question = "Что такое машинное обучение и когда оно было изобретено?"
    
    print("🚀 Запуск системы с верификацией ответов...")
    result = verified_rag.query_with_verification(question)
    
    # Статистика
    stats = verified_rag.get_verification_statistics()
    print(f"\n📊 Статистика верификации:")
    print(f"   Всего проверок: {stats.get('total_verifications', 0)}")
    print(f"   Надежных ответов: {stats.get('reliable_answers', 0)}")
    print(f"   Средняя оценка: {stats.get('average_alignment_score', 0):.2f}")

if __name__ == "__main__":
    demo_verification_system()

# дообучение cross-encoder

# Подготовка данных для обучения

In [None]:
import spacy
import random
from typing import List, Dict, Tuple
from datasets import Dataset
import torch
from transformers import AutoTokenizer
from sentence_transformers import InputExample

class CrossEncoderDataGenerator:
    def __init__(self, spacy_model: str = "ru_core_news_sm"):
        try:
            self.nlp = spacy.load(spacy_model)
        except OSError:
            print(f"⚠️ Модель Spacy {spacy_model} не найдена. Установите: python -m spacy download {spacy_model}")
            self.nlp = None
    
    def generate_chunk_triplets(self, documents: List[Dict], num_triplets: int = 1000) -> List[Tuple[str, str, str]]:
        """
        Генерация триплетов: anchor (фраза), positive (чанк), negative (случайный чанк)
        """
        triplets = []
        all_chunks = [doc['text'] for doc in documents]
        
        print(f"Генерация {num_triplets} триплетов из {len(documents)} документов...")
        
        for _ in range(num_triplets):
            # Выбираем случайный документ как positive
            positive_doc = random.choice(documents)
            positive_chunk = positive_doc['text']
            
            # Извлекаем случайную фразу из positive чанка как anchor
            anchor_phrase = self._extract_random_phrase(positive_chunk)
            if not anchor_phrase:
                continue
            
            # Выбираем случайный чанк из другого документа как negative
            negative_doc = random.choice([d for d in documents if d['id'] != positive_doc['id']])
            negative_chunk = negative_doc['text']
            
            triplets.append((anchor_phrase, positive_chunk, negative_chunk))
        
        return triplets
    
    def _extract_random_phrase(self, text: str, min_words: int = 3, max_words: int = 8) -> str:
        """Извлечение случайной фразы из текста"""
        words = text.split()
        if len(words) < min_words:
            return text
        
        # Выбираем случайную начальную позицию
        start_idx = random.randint(0, len(words) - min_words)
        end_idx = min(start_idx + random.randint(min_words, max_words), len(words))
        
        phrase = ' '.join(words[start_idx:end_idx])
        return phrase.strip()
    
    def generate_title_paragraph_pairs(self, structured_documents: List[Dict]) -> List[Tuple[str, str]]:
        """
        Генерация пар заголовок-параграф для документов со структурой
        """
        pairs = []
        
        for doc in structured_documents:
            if 'title' in doc and 'paragraphs' in doc:
                title = doc['title']
                for paragraph in doc['paragraphs']:
                    pairs.append((title, paragraph))
        
        return pairs
    
    def extract_keyword_paragraph_pairs(self, documents: List[Dict]) -> List[Tuple[str, str]]:
        """
        Извлечение пар ключевое слово-абзац с помощью Spacy
        """
        if not self.nlp:
            print("❌ Spacy модель не загружена")
            return []
        
        pairs = []
        
        for doc in documents:
            text = doc['text']
            
            # Разбиваем на предложения/абзацы
            paragraphs = self._split_into_paragraphs(text)
            
            for paragraph in paragraphs:
                if len(paragraph.split()) < 10:  # Слишком короткий абзац
                    continue
                
                # Извлекаем ключевые слова с помощью Spacy
                keywords = self._extract_keywords(paragraph, top_k=1)
                if keywords:
                    pairs.append((keywords[0], paragraph))
        
        return pairs
    
    def _split_into_paragraphs(self, text: str) -> List[str]:
        """Разбивка текста на абзацы"""
        # Простая разбивка по двойным переносам строк
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
        return paragraphs if paragraphs else [text]
    
    def _extract_keywords(self, text: str, top_k: int = 3) -> List[str]:
        """Извлечение ключевых слов с помощью Spacy"""
        doc = self.nlp(text)
        
        # Извлекаем существительные и собственные существительные
        keywords = []
        for token in doc:
            if (token.pos_ in ['NOUN', 'PROPN'] and 
                not token.is_stop and 
                len(token.text) > 2):
                keywords.append(token.lemma_)
        
        # Убираем дубликаты и возвращаем top_k
        unique_keywords = list(dict.fromkeys(keywords))
        return unique_keywords[:top_k]
    
    def generate_llm_question_answer_pairs(self, documents: List[Dict], llm, num_pairs: int = 500) -> List[Tuple[str, str]]:
        """
        Генерация пар вопрос-ответ с помощью LLM
        """
        pairs = []
        
        print(f"Генерация {num_pairs} пар вопрос-ответ с помощью LLM...")
        
        for doc in random.sample(documents, min(num_pairs, len(documents))):
            text = doc['text']
            
            prompt = f"""На основе следующего текста сгенерируй естественный вопрос и точный ответ на него.

Текст: {text[:1000]}

Верни в формате:
Вопрос: [сгенерированный вопрос]
Ответ: [точный ответ из текста]"""

            try:
                response = llm.generate(prompt, max_length=200)
                question, answer = self._parse_qa_response(response)
                if question and answer:
                    pairs.append((question, answer))
            except Exception as e:
                print(f"⚠️ Ошибка генерации QA пары: {e}")
        
        return pairs
    
    def _parse_qa_response(self, response: str) -> Tuple[str, str]:
        """Парсинг ответа LLM на вопрос и ответ"""
        question = None
        answer = None
        
        lines = response.split('\n')
        for line in lines:
            if line.startswith('Вопрос:') or line.startswith('Question:'):
                question = line.split(':', 1)[1].strip()
            elif line.startswith('Ответ:') or line.startswith('Answer:'):
                answer = line.split(':', 1)[1].strip()
        
        return question, answer

# подготовка датасета для обучения 

In [None]:
class CrossEncoderDatasetBuilder:
    def __init__(self, tokenizer_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    
    def prepare_training_dataset(self, positive_pairs: List[Tuple[str, str]], 
                               negative_pairs: List[Tuple[str, str]] = None,
                               test_size: float = 0.1) -> Dict:
        """
        Подготовка датасета для обучения кросс-энкодера
        """
        from sklearn.model_selection import train_test_split
        
        # Создаем примеры для обучения
        train_examples = []
        
        # Positive примеры
        for query, document in positive_pairs:
            train_examples.append(InputExample(
                texts=[query, document], 
                label=1.0  # Positive relevance
            ))
        
        # Negative примеры (если предоставлены)
        if negative_pairs:
            for query, document in negative_pairs:
                train_examples.append(InputExample(
                    texts=[query, document],
                    label=0.0  # Negative relevance
                ))
        else:
            # Автоматическая генерация negative примеров
            train_examples.extend(self._generate_negative_examples(positive_pairs))
        
        print(f"Создано {len(train_examples)} примеров для обучения")
        
        # Разделение на train/test
        train_examples, test_examples = train_test_split(
            train_examples, test_size=test_size, random_state=42
        )
        
        return {
            'train': train_examples,
            'test': test_examples,
            'total_positive': len(positive_pairs),
            'total_negative': len(train_examples) - len(positive_pairs)
        }
    
    def _generate_negative_examples(self, positive_pairs: List[Tuple[str, str]], 
                                  num_negative: int = None) -> List[InputExample]:
        """Генерация negative примеров через негативный сэмплинг"""
        if num_negative is None:
            num_negative = len(positive_pairs)  # Столько же negative, сколько positive
        
        negative_examples = []
        
        for _ in range(num_negative):
            # Берем случайный query
            query, _ = random.choice(positive_pairs)
            # Берем случайный document из другого query
            attempts = 0
            while attempts < 10:
                _, negative_doc = random.choice(positive_pairs)
                # Проверяем, что это действительно negative пример
                if not self._is_similar_pair((query, negative_doc), positive_pairs):
                    negative_examples.append(InputExample(
                        texts=[query, negative_doc],
                        label=0.0
                    ))
                    break
                attempts += 1
        
        return negative_examples
    
    def _is_similar_pair(self, pair: Tuple[str, str], positive_pairs: List[Tuple[str, str]]) -> bool:
        """Проверка, является ли пара похожей на positive пары"""
        query, doc = pair
        for pos_query, pos_doc in positive_pairs:
            if query == pos_query and doc == pos_doc:
                return True
        return False
    
    def create_huggingface_dataset(self, examples: List[InputExample]) -> Dataset:
        """Создание датасета в формате HuggingFace"""
        texts1 = []
        texts2 = []
        labels = []
        
        for example in examples:
            texts1.append(example.texts[0])
            texts2.append(example.texts[1])
            labels.append(example.label)
        
        return Dataset.from_dict({
            'text1': texts1,
            'text2': texts2,
            'label': labels
        })

# обучение кросс энкодера 

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
import logging

class CrossEncoderTrainer:
    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        self.model_name = model_name
        self.trainer = None
        
    def train_cross_encoder(self, 
                          train_examples: List[InputExample],
                          test_examples: List[InputExample] = None,
                          output_path: str = "./cross-encoder-model",
                          num_epochs: int = 3,
                          batch_size: int = 16,
                          warmup_steps: int = 100) -> CrossEncoder:
        """
        Обучение кросс-энкодера
        """
        print("🚀 Начало обучения кросс-энкодера...")
        
        # Инициализация модели
        model = CrossEncoder(self.model_name, num_labels=1)
        
        # Настройка логгера
        logging.basicConfig(format='%(asctime)s - %(message)s',
                           datefmt='%Y-%m-%d %H:%M:%S',
                           level=logging.INFO)
        
        # Подготовка данных для обучения
        train_dataloader = torch.utils.data.DataLoader(
            train_examples, 
            shuffle=True, 
            batch_size=batch_size
        )
        
        # Evaluator для валидации (если есть test данные)
        evaluator = None
        if test_examples:
            evaluator = CECorrelationEvaluator.from_input_examples(test_examples, name='test')
        
        # Обучение модели
        model.fit(train_dataloader=train_dataloader,
                 evaluator=evaluator,
                 epochs=num_epochs,
                 warmup_steps=warmup_steps,
                 output_path=output_path,
                 show_progress_bar=True)
        
        print(f"✅ Обучение завершено. Модель сохранена в: {output_path}")
        return model
    
    def evaluate_model(self, model: CrossEncoder, test_examples: List[InputExample]) -> Dict[str, float]:
        """Оценка модели на тестовых данных"""
        from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
        import numpy as np
        
        # Предсказания
        predictions = model.predict([(example.texts[0], example.texts[1]) 
                                   for example in test_examples])
        
        # Истинные метки
        true_labels = [example.label for example in test_examples]
        
        # Бинаризация предсказаний (порог 0.5)
        binary_predictions = [1 if pred > 0.5 else 0 for pred in predictions]
        
        # Метрики
        accuracy = accuracy_score(true_labels, binary_predictions)
        precision = precision_score(true_labels, binary_predictions, zero_division=0)
        recall = recall_score(true_labels, binary_predictions, zero_division=0)
        f1 = f1_score(true_labels, binary_predictions, zero_division=0)
        
        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'mean_prediction': float(np.mean(predictions)),
            'std_prediction': float(np.std(predictions))
        }

# полный пайплайн

In [None]:
class CrossEncoderTrainingPipeline:
    def __init__(self, documents: List[Dict], llm=None):
        self.documents = documents
        self.llm = llm
        self.data_generator = CrossEncoderDataGenerator()
        self.dataset_builder = CrossEncoderDatasetBuilder()
        self.trainer = CrossEncoderTrainer()
        
    def run_full_pipeline(self, output_path: str = "./trained-cross-encoder") -> CrossEncoder:
        """
        Полный пайплайн подготовки данных и обучения
        """
        print("🔧 Запуск полного пайплайна обучения кросс-энкодера...")
        
        # 1. Генерация различных типов данных
        all_positive_pairs = []
        
        # Триплеты: фраза - чанк (позитив) - случайный чанк (негатив)
        print("📊 Генерация триплетов...")
        triplets = self.data_generator.generate_chunk_triplets(self.documents, num_triplets=2000)
        for anchor, positive, negative in triplets:
            all_positive_pairs.append((anchor, positive))
        
        # Ключевые слова - абзацы (Spacy)
        print("🔑 Извлечение ключевых слов...")
        keyword_pairs = self.data_generator.extract_keyword_paragraph_pairs(self.documents)
        all_positive_pairs.extend(keyword_pairs)
        
        # Вопросы-ответы (LLM)
        if self.llm:
            print("🤖 Генерация вопросов-ответов...")
            qa_pairs = self.data_generator.generate_llm_question_answer_pairs(
                self.documents, self.llm, num_pairs=1000
            )
            all_positive_pairs.extend(qa_pairs)
        
        print(f"📈 Всего сгенерировано positive пар: {len(all_positive_pairs)}")
        
        # 2. Подготовка датасета
        print("📚 Подготовка датасета...")
        dataset = self.dataset_builder.prepare_training_dataset(
            positive_pairs=all_positive_pairs,
            test_size=0.15
        )
        
        # 3. Обучение модели
        print("🎯 Обучение модели...")
        model = self.trainer.train_cross_encoder(
            train_examples=dataset['train'],
            test_examples=dataset['test'],
            output_path=output_path,
            num_epochs=4,
            batch_size=16
        )
        
        # 4. Оценка модели
        if dataset['test']:
            metrics = self.trainer.evaluate_model(model, dataset['test'])
            print(f"📊 Результаты оценки:")
            for metric, value in metrics.items():
                print(f"   {metric}: {value:.4f}")
        
        return model
    
    def analyze_training_data(self, positive_pairs: List[Tuple[str, str]]) -> Dict[str, Any]:
        """Анализ сгенерированных данных для обучения"""
        query_lengths = [len(query.split()) for query, _ in positive_pairs]
        doc_lengths = [len(doc.split()) for _, doc in positive_pairs]
        
        return {
            'total_pairs': len(positive_pairs),
            'avg_query_length': sum(query_lengths) / len(query_lengths),
            'avg_doc_length': sum(doc_lengths) / len(doc_lengths),
            'max_query_length': max(query_lengths),
            'max_doc_length': max(doc_lengths),
            'min_query_length': min(query_lengths),
            'min_doc_length': min(doc_lengths)
        }

# использование обученной модели

In [None]:
class TrainedCrossEncoderRAG:
    def __init__(self, cross_encoder_model_path: str, vector_store):
        self.cross_encoder = CrossEncoder(cross_encoder_model_path)
        self.vector_store = vector_store
    
    def search_with_reranking(self, query: str, top_k: int = 10, 
                            vector_top_k: int = 50) -> List[Dict]:
        """
        Поиск с реранкингом с помощью обученного кросс-энкодера
        """
        # Первоначальный векторный поиск (берем больше кандидатов)
        candidate_docs = self.vector_store.search(query, top_k=vector_top_k)
        
        if not candidate_docs:
            return []
        
        # Подготовка пар для кросс-энкодера
        pairs = [(query, doc['text']) for doc in candidate_docs]
        
        # Получение скоринга от кросс-энкодера
        scores = self.cross_encoder.predict(pairs)
        
        # Объединение скоринга с документами
        for i, doc in enumerate(candidate_docs):
            doc['cross_encoder_score'] = float(scores[i])
        
        # Сортировка по cross-encoder score
        ranked_docs = sorted(candidate_docs, 
                           key=lambda x: x['cross_encoder_score'], 
                           reverse=True)
        
        return ranked_docs[:top_k]
    
    def evaluate_retrieval_quality(self, test_queries: List[str], 
                                 ground_truth: Dict[str, List[str]]) -> Dict[str, float]:
        """
        Оценка качества retrieval на тестовых запросах
        """
        from sklearn.metrics import ndcg_score
        
        ndcg_scores = []
        precision_scores = []
        recall_scores = []
        
        for query in test_queries:
            # Получаем предсказания
            results = self.search_with_reranking(query, top_k=10)
            predicted_doc_ids = [doc.get('id', '') for doc in results]
            
            # Получаем ground truth
            true_relevant_ids = ground_truth.get(query, [])
            
            if not true_relevant_ids:
                continue
            
            # Вычисляем метрики
            relevance_scores = [1 if doc_id in true_relevant_ids else 0 
                              for doc_id in predicted_doc_ids]
            
            # NDCG
            if sum(relevance_scores) > 0:
                true_ideal = [1] * len(true_relevant_ids) + [0] * (len(predicted_doc_ids) - len(true_relevant_ids))
                ndcg = ndcg_score([true_ideal], [relevance_scores])
                ndcg_scores.append(ndcg)
            
            # Precision@k
            precision = sum(relevance_scores[:5]) / 5  # Precision@5
            precision_scores.append(precision)
            
            # Recall@k
            recall = sum(relevance_scores) / len(true_relevant_ids)
            recall_scores.append(recall)
        
        return {
            'ndcg_mean': np.mean(ndcg_scores) if ndcg_scores else 0,
            'precision_mean': np.mean(precision_scores) if precision_scores else 0,
            'recall_mean': np.mean(recall_scores) if recall_scores else 0,
            'num_queries_evaluated': len(test_queries)
        }

# пример использования

In [None]:
def demo_cross_encoder_training():
    """Демонстрация обучения кросс-энкодера"""
    
    # Пример документов
    sample_documents = [
        {
            'id': 'doc1',
            'text': 'Машинное обучение - это область искусственного интеллекта. Алгоритмы машинного обучения позволяют компьютерам обучаться на данных. Существуют различные типы машинного обучения: с учителем, без учителя и с подкреплением.',
            'title': 'Введение в машинное обучение'
        },
        {
            'id': 'doc2', 
            'text': 'Глубокое обучение использует нейронные сети с множеством слоев. Этот подход особенно эффективен для обработки изображений и естественного языка. Сверточные сети используются для компьютерного зрения.',
            'title': 'Глубокое обучение'
        },
        {
            'id': 'doc3',
            'text': 'Трансформеры - это архитектура нейронных сетей для обработки последовательностей. Они используют механизм внимания и стали основой для современных языковых моделей like BERT и GPT.',
            'title': 'Архитектура трансформеров'
        }
    ]
    
    # Мок LLM для генерации вопросов
    class MockLLM:
        def generate(self, prompt, max_length=200):
            return """Вопрос: Что такое машинное обучение?
Ответ: Машинное обучение - это область искусственного интеллекта."""
    
    mock_llm = MockLLM()
    
    # Запуск пайплайна
    pipeline = CrossEncoderTrainingPipeline(sample_documents, mock_llm)
    
    # Анализ данных
    positive_pairs = []
    # Добавляем примеры для анализа
    data_generator = CrossEncoderDataGenerator()
    keyword_pairs = data_generator.extract_keyword_paragraph_pairs(sample_documents)
    positive_pairs.extend(keyword_pairs)
    
    analysis = pipeline.analyze_training_data(positive_pairs)
    print("📋 Анализ данных для обучения:")
    for key, value in analysis.items():
        print(f"   {key}: {value}")
    
    # Обучение модели (в реальном сценарии)
    # model = pipeline.run_full_pipeline("./my-cross-encoder")
    
    print("✅ Демонстрация завершена. В реальном сценарии запустите pipeline.run_full_pipeline()")

if __name__ == "__main__":
    demo_cross_encoder_training()