In [1]:
document = "doc/virus-book.pdf"

In [22]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(document)
doc = loader.load(
)

child_splitter = RecursiveCharacterTextSplitter(chunk_size = 200)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000)

embeddings = OllamaEmbeddings(model = "embeddinggemma:latest")

storage = InMemoryStore()

vectorstore = Chroma(embedding_function=embeddings)

parent_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=storage,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

parent_retriever.add_documents(doc)

result = parent_retriever.invoke("Characteristics of a Virus")
print(result[0].page_content)

Characteristics of a Virus
Viruses have four essential
characteristics.
SELF REPLICATION : First, viruses are
notable for the ability to replicate itself
to infect computers , much like its
biological counterpart. By replicating
itself it is able to spread across
computer systems and networks to
infect as much as it possibly can.


In [23]:
splitter = RecursiveCharacterTextSplitter(chunk_size = 1000)
red = splitter.split_documents(doc)

lol_vec = Chroma.from_documents(doc, embeddings)

res2 = lol_vec.similarity_search("Characteristics of a Virus")
print(res2[0].page_content)

Characteristics of a Virus
Viruses have four essential
characteristics.
SELF REPLICATION : First, viruses are
notable for the ability to replicate itself
to infect computers , much like its


In [29]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import HypotheticalDocumentEmbedder

# 1. Настройка моделей
llm = Ollama(model="llama3.2:latest")
base_embeddings = OllamaEmbeddings(model="embeddinggemma:latest")

# 2. Настройка HyDE
hyde_embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm, 
    base_embeddings, 
    prompt_key="web_search"
)

# 3. Обычная нарезка (только 200 символов, без Parent-структуры)
splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=20)
chunks = splitter.split_documents(doc) # Твой загруженный документ

# 4. Создаем обычную базу с HyDE-эмбеддингами
hyde_vectorstore = Chroma.from_documents(chunks, hyde_embeddings)

# 5. Поиск
query = "Characteristics of a Virus"
results = hyde_vectorstore.similarity_search(query, k=3)

print("--- РЕЗУЛЬТАТ: ТОЛЬКО HyDE (Chunk 200) ---")
print(results[0].page_content)

--- РЕЗУЛЬТАТ: ТОЛЬКО HyDE (Chunk 200) ---
Characteristics of a Virus
Viruses have four essential
characteristics.
SELF REPLICATION : First, viruses are
notable for the ability to replicate itself
to infect computers , much like its


In [28]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import HypotheticalDocumentEmbedder
from langchain.retrievers import EnsembleRetriever
# 1. Настройка моделей
llm = Ollama(model="llama3.2:latest")
base_embeddings = OllamaEmbeddings(model="embeddinggemma:latest")

# 2. Настройка HyDE
hyde_embeddings = HypotheticalDocumentEmbedder.from_llm(
    llm, 
    base_embeddings, 
    prompt_key="web_search"
)

 # Твой загруженный документ

# 4. Создаем обычную базу с HyDE-эмбеддингами


child_splitter = RecursiveCharacterTextSplitter(chunk_size = 200)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000)

hyde_vectorstore = Chroma(embedding_function=hyde_embeddings)
hyde_parent_retriever = ParentDocumentRetriever(
    vectorstore=hyde_vectorstore,
    docstore=storage,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# 3. ВТОРОЙ РЕТРИВЕР: Обычные эмбеддинги + Parent (Классический "Буквальный")
# Используем base_embeddings без HyDE
classic_vectorstore = Chroma(embedding_function=base_embeddings)
classic_parent_retriever = ParentDocumentRetriever(
    vectorstore=classic_vectorstore,
    docstore=storage,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# 4. Добавляем документы в ОБА (чтобы оба проиндексировали текст)
hyde_parent_retriever.add_documents(doc)
classic_parent_retriever.add_documents(doc)

# 5. СОЗДАЕМ АНСАМБЛЬ
# weights=[0.4, 0.6] означает, что мы чуть больше доверяем классике, 
# чтобы не пропускать такие куски как "Characteristics"
final_retriever = EnsembleRetriever(
    retrievers=[hyde_parent_retriever, classic_parent_retriever],
    weights=[0.4, 0.6] 
)

# 6. ТЕСТ
result = final_retriever.invoke("computer virus")


# 5. Поиск
query = "Characteristics of a Virus"
result = parent_retriever.invoke(query)

print("--- РЕЗУЛЬТАТ: ТОЛЬКО HyDE (Chunk 200) ---")
print(results[0].page_content)

--- РЕЗУЛЬТАТ: ТОЛЬКО HyDE (Chunk 200) ---
Characteristics of a Virus
Viruses have four essential
characteristics.
SELF REPLICATION : First, viruses are
notable for the ability to replicate itself
to infect computers , much like its



HYDE + PDR + EMBEDDINGSFILTER COMPRESSOR Когда вопросы задаются простыми словами, а информация в PDF сложная и техническая. Мы используем HyDE для понимания сути и PDR для выдачи полных ответов.


In [None]:
from langchain.retrievers import ParentDocumentRetriever, ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.storage import InMemoryStore
from langchain.chains import HypotheticalDocumentEmbedder


llm = Ollama(model="llama3.2:latest")
base_embeddings = OllamaEmbeddings(model="gemma")

hyde_embeddings = HypotheticalDocumentEmbedder(
    llm,
    base_embeddings,
    prompt_key="web_search"
)

child_splitter = RecursiveCharacterTextSplitter(chunk_size = 200, chunk_overlap=20)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)

store = InMemoryStore()
vectorstore = Chroma(embedding_function=hyde_embeddings)

parent_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=storage,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)

parent_retriever.add_documents(doc)


compressor = EmbeddingsFilter(embeddings=base_embeddings, similarity_threshold=0.75)

compressor_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=parent_retriever
)


"Максимальная чистота" (PDR + Ensemble + LLM Compressor)
Для чего: Это "тяжелая артиллерия". Используем ансамбль для поиска, а потом просим LLM вырезать из длинных документов PDR только самое важное.

In [None]:
from langchain.retrievers.document_compressors import LLMChainExtractor

# 1. Базовый ретривер - Ансамбль (уже созданный ранее)
base_retriever = EnsembleRetriever(
    retrievers=[pdr_retriever, bm25_retriever],
    weights=[0.5, 0.5]
)

# 2. Умный компрессор (LLMChainExtractor)
# Он прочитает найденные куски и удалит из них лишний текст (воду)
smart_compressor = LLMChainExtractor.from_llm(llm)

# 3. Финальный пайплайн
super_retriever = ContextualCompressionRetriever(
    base_compressor=smart_compressor,
    base_retriever=base_retriever
)

# Ответ будет очень коротким, точным и без лишнего мусора
clean_context = super_retriever.invoke("Какие 4 характеристики у вирусов?")

Буквальный и Смысловой Гибрид" (Hybrid + Ensemble)
Для чего: Чтобы находить и точные термины (как "XJ-99"), и общие понятия. Здесь мы объединяем классический поиск (BM25) и векторный.

In [None]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# 1. Ретривер для точных слов (Sparse)
bm25_retriever = BM25Retriever.from_documents(child_chunks)
bm25_retriever.k = 3

# 2. Ретривер для смысла (Dense)
vector_retriever = Chroma(embedding_function=base_embeddings).as_retriever(search_kwargs={"k": 3})

# 3. Ансамбль (Hybrid Search)
# Даем 70% веса смыслу и 30% точным словам
hybrid_ensemble = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever],
    weights=[0.3, 0.7]
)

# 4. Проверка
# Найдет "вирус XJ-99" даже если векторная модель не знает такого названия
results = hybrid_ensemble.invoke("расскажи про вирус XJ-99")