# Построение пайплайна для извлечения нужной информации из БД и генерации ответа на её основе

## Создание retriever

*Загрузим готовую БД и retriever с определенными ранее параметрами.*

In [2]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Directory that holds the persisted Chroma collection.
# NOTE(review): "artilcles" looks like a typo of "articles", but the value must
# match the directory name on disk — confirm before renaming.
bd_dir = "../db_collecting/chroma_artilcles"

# Embedding model used to embed queries; presumably the same model the
# collection was built with — TODO confirm against the DB-building notebook.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
# Open the persisted vector store from bd_dir.
vectordb = Chroma(persist_directory=bd_dir, embedding_function=embedding_model)

*Правильная настройка retriever позволяет улучшить релевантность выборки в целом. Параметр fetch_k задает размер предварительной выборки, k — количество итоговых документов, а lambda_mult регулирует баланс между релевантностью и разнообразием выдаваемых документов. После небольшого эксперимента с разными значениями этих параметров были выбраны следующие: из всех документов берется 35 наиболее релевантных, из них отбирается 5, чьи эмбеддинги достаточно далеки друг от друга. Это позволяет взять документы с разными смыслами, из-за чего нужные нам сущности точно попадут в БЯМ.*

In [4]:
# MMR retriever: take the 35 most similar chunks (fetch_k), then return 5 of
# them (k). lambda_mult=0.3 weights the MMR trade-off towards diversity rather
# than pure similarity.
retriever = vectordb.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "fetch_k": 35, "lambda_mult": 0.3},
)

In [5]:
# Smoke-test the retriever on a sample question and display the returned documents.
retriever.invoke("What are potential targets for Alzheimer's disease treatment?")

[Document(id='f9ad16ee-92ac-4237-934c-e37707a7081f', metadata={'file': '8.pdf', 'year': 2024, 'id': 25, 'type': 'introduction', 'title': "Associations of semaglutide with first‐time diagnosis of Alzheimer's disease in patients with type 2 diabetes: Target trial emulation using nationwide real‐world data in the US", 'link': 'https://doi.org/10.1002/alz.14313'}, page_content='An estimated 6.9 million Americans aged 65 and older will be living with Alzheimer’s disease (AD) in 2024, a number that is projected to increase to 13.8 million by 2060.1 AD has no cure, and about 40% of cases are linked to modifiable risk factors.2 Given its growing prevalence, profound societal and economic impact, and absence of a cure, targeting these modifiable risk factors is crucial to prevent or delay AD and related dementia. 3,4'),
 Document(id='96986312-5a98-4615-99d1-599c522885b6', metadata={'id': 35, 'year': 2021, 'type': 'result', 'title': "Targeting transthyretin in Alzheimer's disease: Drug discovery

In [6]:
# Same query, but show only the metadata dict of each retrieved document.
[d.metadata for d in retriever.invoke("What are potential targets for Alzheimer's disease treatment?")]

[{'file': '8.pdf',
  'title': "Associations of semaglutide with first‐time diagnosis of Alzheimer's disease in patients with type 2 diabetes: Target trial emulation using nationwide real‐world data in the US",
  'year': 2024,
  'type': 'introduction',
  'link': 'https://doi.org/10.1002/alz.14313',
  'id': 25},
 {'year': 2021,
  'file': '12.pdf',
  'link': 'https://doi.org/10.1016/j.ejmech.2021.113847',
  'title': "Targeting transthyretin in Alzheimer's disease: Drug discovery of small-molecule chaperones as disease-modifying drug candidates for Alzheimer's disease",
  'id': 35,
  'type': 'result'},
 {'title': "Natural acetylcholinesterase inhibitors: A multi-targeted therapeutic potential in Alzheimer's disease",
  'file': '2.pdf',
  'year': 2024,
  'id': 8,
  'type': 'result',
  'link': 'https://doi.org/10.1016/j.ejmcr.2024.100154'},
 {'id': 19,
  'title': "Intelligent lesion blood–brain barrier targeting nano-missiles for Alzheimer's disease treatment by anti-neuroinflammation and ne

## Подключение к моделям БЯМ для генерации ответов

*Попробуем подать в БЯМ запрос пользователя и извлеченные документы и сгенерировать ответ. Берется серверная модель, чтобы скрипт запускался на любом компьютере, хотя локальная модель отвечала бы быстрее. Сервис OpenRouter выдает API-токен для доступа к моделям, который позволяет отправлять запросы в модели из России. Сервис разрешает отправлять не более 20 запросов в минуту и не более 50 в день.*

In [7]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser

In [8]:
import os
from dotenv import load_dotenv

load_dotenv()

True

*Используем модель Gemma 3 27B для генерации ответа и Llama 3.3 70B Instruct, которая поддерживает tools, что может впоследствии пригодиться, для структурирования выхода. Ответ ограничим в токенах для избежания генерации больших текстов.*

In [9]:
# Connection settings shared by both OpenRouter-hosted models.
_openrouter_kwargs = {
    "max_retries": 3,
    "api_key": os.environ.get("OPENROUTER_API_KEY"),
    "base_url": "https://openrouter.ai/api/v1",
}

# Model that writes the free-text answer; output length is capped via max_tokens.
llm_generation = ChatOpenAI(
    model="google/gemma-3-27b-it:free",
    temperature=0.2,
    max_tokens=1500,
    **_openrouter_kwargs,
)

# Model that reshapes the answer into JSON; temperature 0 for stable formatting.
llm_structurizing = ChatOpenAI(
    model="meta-llama/llama-3.3-70b-instruct:free",
    temperature=0,
    **_openrouter_kwargs,
)

In [10]:
# Prompt for the answering model. Template variables:
#   {docs}     - the retrieved documents, inserted as context
#   {question} - the user question
# The rules restrict the model to the supplied context and require citations.
prompt_generation = ChatPromptTemplate.from_template(
"""You are a scientific assistant specializing in Alzheimer's disease research. Your task is to answer questions based ONLY on the provided context from scientific articles.

STRICT RULES:
1. Answer ONLY using information explicitly stated in the context below
2. If the context does not contain enough information to answer fully, say "The provided documents do not contain sufficient information about [topic]"
3. Never use external knowledge or make assumptions beyond the given text
4. Always cite the sources: include article titles and DOI/URLs when mentioning findings

CONTEXT:
{docs}

QUESTION: {question}

ANSWER (cite sources with titles and links):"""
)

# Prompt for the formatting model. Template variables:
#   {initial_answer} - the text produced by the answering model
#   {docs}           - the same retrieved documents that were given to the answering model
# The model must return JSON with "message" and "sources" keys.
# Literal braces in the example object are doubled ({{ }}) so the f-string style
# template does not treat them as variables; the rendered prompt shows a real
# JSON object instead of the invalid ["title": ...] bracket notation.
prompt_structurize = ChatPromptTemplate.from_template(
"""You are a formatting assistant. 
Input: 
1. An "Answer" text.
2. A list of "Source Documents" with metadata.

Task:
Return a JSON object with two keys:
- "message": The exact "Answer" text provided below.
- "sources": A list of objects {{"title": "...", "file": "..."}} for every document from the "Source Documents" list that supports the answer.

Answer:
{initial_answer}

Source Documents:
{docs}
"""
)

In [11]:
def retrAndGenerate(user_query):
    """Retrieve context for *user_query*, generate an answer and structure it.

    Uses the module-level ``retriever``, ``prompt_generation``,
    ``llm_generation``, ``prompt_structurize`` and ``llm_structurizing``.

    Args:
        user_query: The question text, used both for retrieval and in the
            answering prompt.

    Returns:
        Whatever ``JsonOutputParser`` parses from the formatting model's reply;
        per the prompt this is expected to be a dict with "message" and
        "sources" keys.
    """
    context_docs = retriever.invoke(user_query)

    # Step 1: free-text answer grounded in the retrieved documents.
    draft_answer = (prompt_generation | llm_generation | StrOutputParser()).invoke(
        {"docs": context_docs, "question": user_query}
    )

    # Step 2: the second model wraps the answer and its supporting sources into JSON.
    to_json = prompt_structurize | llm_structurizing | JsonOutputParser()
    return to_json.invoke({"initial_answer": draft_answer, "docs": context_docs})

*Если подавать второй модели только метаданные, то ухудшается ее работа, поэтому подаются те же документы полностью.*

In [80]:
# Run the full retrieve -> generate -> structure pipeline on a sample question.
user_query = "What are potential targets for Alzheimer's disease treatment?"
result = retrAndGenerate(user_query)

In [81]:
# Show the answer text returned under the "message" key.
print(dict(result)["message"])

According to the provided documents, potential therapeutic targets for Alzheimer's disease include: 
*   Microtubule-associated protein, anti-neuroinflammation, synaptic and neuroprotection, metabolism, neurogenesis, vascular system and epigenetic drugs ( "Intelligent lesion blood–brain barrier targeting nano-missiles for Alzheimer's disease treatment by anti-neuroinflammation and neuroprotection", https://doi.org/10.1016/j.apsb.2022.02.001) 
*   Transthyretin (TTR) and amyloid peptide (Ab) systems (“Targeting transthyretin in Alzheimer's disease: Drug discovery of small-molecule chaperones as disease-modifying drug candidates for Alzheimer's disease”, https://doi.org/10.1016/j.ejmech.2021.113847) 
*   Disease-related genes identified through transcriptomic data analysis (“Multiomics Identification of Potential Targets for Alzheimer Disease and Antrocin as a Therapeutic Candidate”, https://doi.org/10.3390/pharmaceutics13101555)


In [82]:
# Show the list of supporting sources returned under the "sources" key.
dict(result)["sources"]

[{'title': "Intelligent lesion blood–brain barrier targeting nano-missiles for Alzheimer's disease treatment by anti-neuroinflammation and neuroprotection",
  'file': '6.pdf'},
 {'title': "Targeting transthyretin in Alzheimer's disease: Drug discovery of small-molecule chaperones as disease-modifying drug candidates for Alzheimer's disease",
  'file': '12.pdf'},
 {'title': 'Multiomics Identification of Potential Targets for Alzheimer Disease and Antrocin as a Therapeutic Candidate',
  'file': '21.pdf'}]

In [84]:
# Map each cited file name to its article title; if a file appears more than
# once, the last title wins.
file_dict = {entry["file"]: entry["title"] for entry in dict(result)["sources"]}

file_dict

{'6.pdf': "Intelligent lesion blood–brain barrier targeting nano-missiles for Alzheimer's disease treatment by anti-neuroinflammation and neuroprotection",
 '12.pdf': "Targeting transthyretin in Alzheimer's disease: Drug discovery of small-molecule chaperones as disease-modifying drug candidates for Alzheimer's disease",
 '21.pdf': 'Multiomics Identification of Potential Targets for Alzheimer Disease and Antrocin as a Therapeutic Candidate'}

## Оценка системы

*Для оценки качества будет использоваться БЯМ, которая будет оценивать систему по трехбалльной шкале по трем критериям: Faithfulness (достоверность, отсутствие галлюцинаций), Answer Relevance (соответствие ответа вопросу), Context Precision (релевантность извлекаемых из базы документов).*

Итоговой оценкой будет являться среднее по всем запросам значение суммы баллов по трем критериям:
* 1-3 балла - система слабая,
* 4-6 - средняя,
* 7-8 - хорошая,
* 9 - отличная.

*Сперва с помощью Gemini 3 Pro составим список из 10 конкретных вопросов на основе случайных документов.*

In [12]:
# Evaluation questions, one per source article. The order matters: the list is
# later attached to the results positionally and compared against
# source_titles_list by position.
questions_list = [
    "What effect do TR-ZRA nanoparticles have on CD22 expression in BV2 cells after exposure to Aβ1-42, and what does the Western blotting result indicate about the delivered CD22shRNA?",
    "According to structural MRI studies, what is the relationship between the volume of the Ventral Tegmental Area (VTA) and the hippocampal formation in Alzheimer's disease patients?",
    "Compare the Aβ-degrading enzymes APN, APA, and DPP4: which one has the highest protein content, and which one exhibits the highest specific activity?",
    "What is the established role of Transthyretin (TTR) in Alzheimer's disease, and how are small-molecule chaperones expected to affect the interaction between TTR and Aβ peptide?",
    "List the kinases, other than GSK-3β and CDK5, whose inhibitors have been evaluated in AD mouse models according to the text.",
    "In Hsp27 transgenic mice, how are p-GSK3βTyr216 and p-GSK3βSer9 levels altered, and what pathological consequence does this alteration cause?",
    "What confidence score threshold was set in the STRING database to construct the PPI network for quercetin's common targets in Type 2 Diabetes and Alzheimer's disease?",
    "Out of 104 compounds inhibiting HDAC6 by at least 75%, how many were found to be specific for the histone H3 substrate in the second screening?",
    "To which family of deacetylases does SIRT1 belong, and upon which cofactor is its activity dependent?",
    "At what age do Tg2576 mice begin to show substantial apoptotic cell death of VTA dopamine neurons and significant local neuroinflammation?"
]

In [13]:
# Prompt for the LLM judge. Template variables: {question}, {answer}, {docs}.
# The judge scores each answer from 1 to 3 on faithfulness, answer relevance
# and context precision, and must reply with JSON only.
# NOTE(review): the output schema is written without surrounding braces; if
# literal braces are ever added they must be doubled ({{ }}), because single
# braces are parsed as template variables.
prompt_evaluation = ChatPromptTemplate.from_template(
"""You are an evaluator for a RAG (Retrieval-Augmented Generation) system.

INPUTS:
- Question: {question}
- AssistantAnswer: {answer}
- RetrievedDocuments: {docs}

TASK:
Evaluate the AssistantAnswer using ONLY the RetrievedDocuments and assign scores from 1 to 3 for:
1) Faithfulness (answer grounded in context, no hallucinations)
2) Answer Relevance (answer addresses the question)
3) Context Precision (retrieved docs are useful/supportive, little noise)

SCORING RUBRICS (1–3):
Faithfulness:
- 3 = All factual claims are supported by the documents; no contradictions.
- 2 = Mostly supported, but 1–2 minor claims are unsupported/unclear.
- 1 = Several unsupported claims OR contradictions with the documents OR answer not grounded.

Answer Relevance:
- 3 = Directly answers the question with the needed specifics.
- 2 = Partially answers / somewhat vague / misses key aspects.
- 1 = Mostly off-topic / does not answer the question.

Context Precision:
- 3 = Most retrieved documents are clearly relevant and support the answer (low noise).
- 2 = Mixed relevance: some helpful docs, some noise.
- 1 = Mostly irrelevant docs; little support for answering.

OUTPUT:
Return ONLY valid JSON (no markdown, no extra text) with this schema:
  "faithfulness": score between 1-3,
  "answer_relevance": score between 1-3,
  "context_precision": score between 1-3
"""
)

# Judge chain: prompt -> formatting model -> parsed JSON.
evaluation = prompt_evaluation | llm_structurizing | JsonOutputParser()

In [14]:
import pandas as pd
from tqdm import tqdm

In [15]:
# Collect, for every evaluation question, the retrieved documents and the final
# answer text, so that the judge model can score them afterwards.

# Build both chains once: they do not depend on the question, so re-creating
# them on every iteration is wasted work.
answer_chain = prompt_generation | llm_generation | StrOutputParser()
structurized_answer_chain = prompt_structurize | llm_structurizing | JsonOutputParser()

answers_eval = []
docs_eval = []

# NOTE: each question costs two LLM requests (answer + structuring), which
# counts against the OpenRouter request limits mentioned earlier in this notebook.
for question in questions_list:
    docs_retrieved = retriever.invoke(question)
    docs_eval.append(docs_retrieved)

    initial_answer = answer_chain.invoke({"docs": docs_retrieved, "question": question})
    result = structurized_answer_chain.invoke({"initial_answer": initial_answer, "docs": docs_retrieved})

    # Only the answer text is kept; the "sources" list is not needed for scoring.
    answers_eval.append(dict(result)["message"])

eval_df = pd.DataFrame({"answer": answers_eval, "docs": docs_eval})
eval_df

Unnamed: 0,answer,docs
0,The provided documents do not contain sufficie...,"[page_content='Figure 9c and Figure S17, Suppo..."
1,The VTA volume is strongly associated with the...,[page_content='structural MRI to link the volu...
2,"According to the provided documents, protein c...",[page_content='and dipeptidyl peptidase 4 (DPP...
3,**The established role of Transthyretin (TTR) ...,[page_content='Transthyretin (TTR) has a well-...
4,"Compounds targeting p38 MAPK, ERK1/2, JNK3 or ...",[page_content='to overcome these drawbacks [22...
5,"In Hsp27 transgenic mice, p-GSK3βTyr216 is upr...",[page_content='Heat shock protein 27 (Hsp27) t...
6,The provided documents do not contain sufficie...,[page_content='Quercetin has demonstrated anti...
7,"According to the provided documents, 23 molecu...",[page_content='confirmation rate is reasonable...
8,SIRT1 is a member of the Sirtuin family of NAD...,[page_content='treatments only offer limited s...
9,According to the article “The VTA dopaminergic...,[page_content='evidence linking the dopaminerg...


In [16]:
# Attach the questions positionally; rows were appended in questions_list order.
eval_df['question'] = questions_list
eval_df.head(3)

Unnamed: 0,answer,docs,question
0,The provided documents do not contain sufficie...,"[page_content='Figure 9c and Figure S17, Suppo...",What effect do TR-ZRA nanoparticles have on CD...
1,The VTA volume is strongly associated with the...,[page_content='structural MRI to link the volu...,"According to structural MRI studies, what is t..."
2,"According to the provided documents, protein c...",[page_content='and dipeptidyl peptidase 4 (DPP...,"Compare the Aβ-degrading enzymes APN, APA, and..."


In [18]:
# Ask the judge model to score every (question, answer, documents) triple.
result_evals = []
# iterrows() is a generator without a length, so pass total explicitly; otherwise
# tqdm can only show an iteration counter instead of a progress bar with an ETA.
for _, row in tqdm(eval_df.iterrows(), total=len(eval_df)):
    result_evals.append(dict(evaluation.invoke({"question": row["question"], "answer": row["answer"], "docs": row["docs"]})))

10it [00:41,  4.12s/it]


In [20]:
# One row per question, one column per key returned by the judge.
metrics_df = pd.DataFrame(result_evals)
metrics_df

Unnamed: 0,faithfulness,answer_relevance,context_precision
0,2,2,2
1,3,3,2
2,3,3,3
3,3,3,2
4,3,3,2
5,3,3,3
6,3,2,2
7,3,3,2
8,3,3,2
9,3,3,2


In [23]:
# Per-question total over the three criteria (nominally 3 to 9, given 1-3 per criterion).
metrics_df["sum"] = metrics_df["faithfulness"] + metrics_df["answer_relevance"] + metrics_df["context_precision"]

In [24]:
# Summary statistics; the mean of the "sum" column is the overall system score.
metrics_df.describe()

Unnamed: 0,faithfulness,answer_relevance,context_precision,sum
count,10.0,10.0,10.0,10.0
mean,2.9,2.8,2.2,7.9
std,0.316228,0.421637,0.421637,0.875595
min,2.0,2.0,2.0,6.0
25%,3.0,3.0,2.0,8.0
50%,3.0,3.0,2.0,8.0
75%,3.0,3.0,2.0,8.0
max,3.0,3.0,3.0,9.0


Оценка системы: 7.9 из 9

*Модель почти всегда строго следует контексту и отвечает именно на поставленный вопрос, однако в документах попадается информация, не соответствующая заданному вопросу. Можно уменьшить количество документов, но все же за счет этого количества и разброса по семантике документов повышается вероятность того, что хотя бы один из документов будет релевантным. Можно проверить, в скольких случаях из этих 10 в списке документов содержались тексты, на основе которых были придуманы вопросы.*

In [25]:
# Join answers/documents with their scores column-wise; both indexes are reset
# so rows are aligned by position.
# NOTE(review): eval_df is reassigned from itself, so re-running this cell
# appends the metric columns a second time.
eval_df = pd.concat([eval_df.reset_index(drop=True), metrics_df.reset_index(drop=True)], axis=1)
eval_df.head(3)

Unnamed: 0,answer,docs,question,faithfulness,answer_relevance,context_precision,sum
0,The provided documents do not contain sufficie...,"[page_content='Figure 9c and Figure S17, Suppo...",What effect do TR-ZRA nanoparticles have on CD...,2,2,2,6
1,The VTA volume is strongly associated with the...,[page_content='structural MRI to link the volu...,"According to structural MRI studies, what is t...",3,3,2,8
2,"According to the provided documents, protein c...",[page_content='and dipeptidyl peptidase 4 (DPP...,"Compare the Aβ-degrading enzymes APN, APA, and...",3,3,3,9


In [31]:
# Title of the article each evaluation question was written from, in the same
# order as questions_list (entry i belongs to question i). These are compared
# for exact equality with the 'title' metadata of the retrieved documents.
source_titles_list = [
    "Study on the Role of an Erythrocyte Membrane‐Coated Nanotheranostic System in Targeted Immune Regulation of Alzheimer's Disease",
    "The VTA dopaminergic system as diagnostic and therapeutical target for Alzheimer's disease",
    "Metabolic resistance of Aβ3pE-42, a target epitope of the anti-Alzheimer therapeutic antibody, donanemab",
    "Targeting transthyretin in Alzheimer's disease: Drug discovery of small-molecule chaperones as disease-modifying drug candidates for Alzheimer's disease",
    "Death-associated protein kinase 1 as a therapeutic target for Alzheimer's disease",
    "GSK3: A potential target and pending issues for treatment of Alzheimer's disease",
    "Mechanism of quercetin therapeutic targets for Alzheimer disease and type 2 diabetes mellitus",
    "A hybrid approach unveils drug repurposing candidates targeting an Alzheimer pathophysiology mechanism",
    "Identification of Sirtuin 1-Targeted Anti-Alzheimer Agents Using Structure-Based Drug Design and Multi-Database Screening",
    "The VTA dopaminergic system as diagnostic and therapeutical target for Alzheimer's disease"
]


In [33]:
# For every question, flag (1/0) whether the article the question was written
# from is among the retrieved documents. The row label is used as the position
# in source_titles_list.
target_article = [
    int(source_titles_list[i] in [doc.metadata.get('title') for doc in row['docs']])
    for i, row in eval_df.iterrows()
]

target_article

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

*Статьи, на основе которых были сгенерированы вопросы, каждый раз попадали в список извлеченных документов.*

In [37]:
# Store the per-question hit flag next to the scores.
eval_df["target_article"] = target_article

In [43]:
# Save the evaluation table to the current working directory. With pandas
# defaults the index is written as the first column.
eval_df.to_csv("eval_df.csv")

Система хорошо работает.

## Подготовка скрипта для вызова из main.py
*Попробуем упаковать это все в функции: первая должна инициализировать все, вторая — извлекать данные из БД, генерировать ответ, возвращать его и названия документов, которые нужны для отображения в интерфейсе. Кроме того, надо учесть, что пользователь может задать вопрос по предыдущему контексту. Для этого можно добавить еще один промпт, чтобы ассистент принимал на вход 5 предыдущих записей в истории чата, а затем возвращал один вопрос.*

In [None]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser

def chat_update(chat, role, text):
    """Append one message to *chat* in place and return the same list.

    Args:
        chat: List of message dicts; it is mutated.
        role: Value stored under the "role" key.
        text: Value stored under the "text" key.

    Returns:
        The same ``chat`` object that was passed in, now one entry longer.
    """
    entry = {"role": role, "text": text}
    chat.append(entry)
    return chat

def startRAG(bd_dir, api_key):
    """Build the retriever and the three LLM chains used to answer questions.

    Args:
        bd_dir: Directory of the persisted Chroma collection.
        api_key: API key passed to both OpenRouter-hosted chat models.

    Returns:
        A tuple ``(identified_answer_chain, answer_chain,
        structurized_answer_chain, retriever)``:

        * ``identified_answer_chain`` rewrites the latest user message into a
          standalone question using the chat history;
        * ``answer_chain`` answers a question from the retrieved documents;
        * ``structurized_answer_chain`` turns the answer into parsed JSON with
          "message" and "sources" keys;
        * ``retriever`` is the MMR retriever over the vector store.
    """
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    vectordb = Chroma(persist_directory=bd_dir, embedding_function=embedding_model)

    # MMR: take the 35 most similar chunks, return 5 of them; lambda_mult=0.3
    # weights the trade-off towards diversity.
    retriever = vectordb.as_retriever(
        search_type="mmr",
        search_kwargs={"k": 5, "fetch_k": 35, "lambda_mult": 0.3}
        )

    # NOTE(review): the exploratory cells earlier in this notebook use
    # max_tokens=1500 and max_retries=3; here max_tokens is 2000 and no
    # max_retries is passed. Confirm the difference is intended.
    llm_generation = ChatOpenAI(
        model="google/gemma-3-27b-it:free",
        temperature=0.2,
        max_tokens=2000,
        api_key=api_key,
        base_url="https://openrouter.ai/api/v1"
    )

    llm_structurizing = ChatOpenAI(
        model="meta-llama/llama-3.3-70b-instruct:free",
        temperature=0,
        api_key=api_key,
        base_url="https://openrouter.ai/api/v1"
    )

    # Rewrites a follow-up message into a question that needs no chat history.
    prompt_identify = ChatPromptTemplate.from_template(
    """Given a chat history and the latest user question which might reference context in the chat history,
    formulate a standalone question which can be understood without the chat history. Do NOT answer the question,
    just reformulate it if needed and otherwise return it as is.

        Chat History:
        {chat_history}

        Latest Question: 
        {question}

        Standalone Question:
    """
    )

    # Answers strictly from the retrieved context and asks for citations.
    prompt_generate = ChatPromptTemplate.from_template(
    """You are a scientific assistant specializing in Alzheimer's disease research. Your task is to answer questions based ONLY on the provided context from scientific articles.

    STRICT RULES:
    1. Answer ONLY using information explicitly stated in the context below
    2. If the context does not contain enough information to answer fully, say "The provided documents do not contain sufficient information about [topic]"
    3. Never use external knowledge or make assumptions beyond the given text
    4. Always cite the sources: include article titles and DOI/URLs when mentioning findings

    CONTEXT:
    {docs}

    QUESTION: {question}

    ANSWER (cite sources with titles and links):"""
    )

    # Literal braces in the example object are doubled ({{ }}) so the template
    # does not parse them as variables; the rendered prompt then shows a real
    # JSON object rather than the invalid ["title": ...] bracket notation.
    prompt_structurize = ChatPromptTemplate.from_template(
    """You are a formatting assistant. 
    Input: 
    1. An "Answer" text.
    2. A list of "Source Documents" with metadata.

    Task:
    Return a JSON object with two keys:
    - "message": The exact "Answer" text provided below.
    - "sources": A list of objects {{"title": "...", "file": "..."}} for every document from the "Source Documents" list that supports the answer.

    Answer:
    {initial_answer}

    Source Documents:
    {docs}
    """
    )

    identified_answer_chain = prompt_identify | llm_generation | StrOutputParser()
    answer_chain = prompt_generate | llm_generation | StrOutputParser()
    structurized_answer_chain = prompt_structurize | llm_structurizing | JsonOutputParser()

    return identified_answer_chain, answer_chain, structurized_answer_chain, retriever

def answer_question(identified_answer_chain, answer_chain, structurized_answer_chain, retriever, chat):
    """Answer the latest message in *chat* and append the reply to it.

    Args:
        identified_answer_chain: Chain that rewrites the latest message into a
            standalone question; invoked with "chat_history" and "question".
        answer_chain: Chain that answers a question; invoked with "docs" and
            "question".
        structurized_answer_chain: Chain that returns parsed JSON with
            "message" and "sources"; invoked with "initial_answer" and "docs".
        retriever: Object whose ``invoke(query)`` returns the context documents.
        chat: List of ``{"role": ..., "text": ...}`` dicts whose last entry is
            the question to answer. It is mutated: the reply is appended with
            role "ai".

    Returns:
        A tuple ``(chat, file_dict)`` where ``file_dict`` maps the file name of
        each supporting source to its title.

    Raises:
        IndexError: If *chat* is empty.
    """
    last_messages = chat[-6:]
    latest_question = last_messages[-1]["text"]
    # The history is the (up to five) entries before the latest question. The
    # question itself is passed separately, so it is not repeated in the history.
    chat_history = last_messages[:-1]

    user_query = identified_answer_chain.invoke(
        {"chat_history": str(chat_history), "question": latest_question}
    ).strip()

    docs_retrieved = retriever.invoke(user_query)

    initial_answer = answer_chain.invoke({"docs": docs_retrieved, "question": user_query})
    result = structurized_answer_chain.invoke({"initial_answer": initial_answer, "docs": docs_retrieved})

    # The JSON comes from an LLM, so keys may be missing or malformed. Fall back
    # to the unformatted answer and an empty source list instead of failing.
    if not isinstance(result, dict):
        result = {}
    message = result.get("message") or initial_answer
    sources_list = result.get("sources") or []

    file_dict = {
        source.get("file"): source.get("title")
        for source in sources_list
        if isinstance(source, dict)
    }

    chat = chat_update(chat, "ai", message)

    return chat, file_dict