In [1]:
import pandas as pd
import sqlite3
from langchain_community.document_loaders import DataFrameLoader
from langchain.docstore.document import Document

DB_NAME = "fightaging_articles.db"

def load_data_from_db(db_name):
    try:
        conn = sqlite3.connect(db_name)
        query = "SELECT url, publish_date, title, body FROM articles"
        df = pd.read_sql(query, conn)
        conn.close()

        df['full_text'] = df['title'] + ' \n\n' + df['body']
        print(f"✅ Successfully loaded {len(df)} articles from '{db_name}'.")

        return df
    
    except Exception as e:
            print(f"❌ Could not load data from the database. Error: {e}")
            return pd.DataFrame()
    
articles_df = load_data_from_db(DB_NAME)

if not articles_df.empty:
     loader = DataFrameLoader(articles_df, page_content_column="full_text")

     documents = loader.load()

     # Verification Step
     print(f"\n✅ Successfully loaded {len(documents)} documents into LangChain.")
     print("\n--- Example of the first document ---")
     # Using repr() provides a more detailed output of the object structure
     print(repr(documents[0]))

else:
    print("❌ DataFrame is empty. Cannot proceed.")

✅ Successfully loaded 18753 articles from 'fightaging_articles.db'.

✅ Successfully loaded 18753 documents into LangChain.

--- Example of the first document ---
Document(metadata={'url': 'https://www.fightaging.org/archives/2004/01/welcome-to-fight-aging/', 'publish_date': '2004-01-31', 'title': 'Welcome to Fight Aging!', 'body': "Welcome aboard! This new collaborative blog will extend the slightly bloggish daily news at the Longevity Meme into a more friendly and informative format. We will be bringing in informative, intelligent folks from the front lines in the fight against aging as authors, and plan to keep you educated and aware. As a society, we are on the verge of being able to understand, treat and ultimately prevent the degenerative conditions of aging. But we can't sit around and wait for this to happen! Join us in helping to support and document the advance of medicine for greatly extended healthy lifespans within our lifetime."}, page_content="Welcome to Fight Aging! \n\n

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

if 'documents' in locals() and documents:
    print(f"Original number of documents: {len(documents)}")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        add_start_index=True
    )

    docs_chunks = text_splitter.split_documents(documents)

    print(f"\n✅ Successfully split {len(documents)} documents into {len(docs_chunks)} smaller chunks.")
    
    print("\n--- Example of the first chunk ---")
    print(repr(docs_chunks[0]))

    print("\n--- Example of the second chunk from the same original document ---")
    print(repr(docs_chunks[1]))
else:
    print("❌ 'documents' list not found. Please run Cell 1 first to load the data.")

Original number of documents: 18753

✅ Successfully split 18753 documents into 90882 smaller chunks.

--- Example of the first chunk ---
Document(metadata={'url': 'https://www.fightaging.org/archives/2004/01/welcome-to-fight-aging/', 'publish_date': '2004-01-31', 'title': 'Welcome to Fight Aging!', 'body': "Welcome aboard! This new collaborative blog will extend the slightly bloggish daily news at the Longevity Meme into a more friendly and informative format. We will be bringing in informative, intelligent folks from the front lines in the fight against aging as authors, and plan to keep you educated and aware. As a society, we are on the verge of being able to understand, treat and ultimately prevent the degenerative conditions of aging. But we can't sit around and wait for this to happen! Join us in helping to support and document the advance of medicine for greatly extended healthy lifespans within our lifetime.", 'start_index': 0}, page_content="Welcome to Fight Aging! \n\nWelcome

In [3]:
# Cell 3B: Create Embeddings and the Vector Store
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
import os

# Check if the chunks exist from the previous cell
if 'docs_chunks' in locals() and docs_chunks:
    
    # 1. Define the path for the persistent vector store directory
    persist_directory = 'chroma_db'
    
    # 2. Define the embedding model we want to use
    # We'll use the same powerful and efficient model from your BERTopic analysis
    embedding_model_name = "all-MiniLM-L6-v2"
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    
    # 3. Create and persist the vector store
    # This is the most computationally expensive step. It will iterate through all chunks,
    # create an embedding for each one, and store it in the Chroma database.
    # We add a check to only run this if the database doesn't already exist.
    if not os.path.exists(persist_directory):
        print(f"Creating new vector store in '{persist_directory}'...")
        print("This will take a while, but you only have to do it once.")
        
        # This single command does all the work: embedding and storing
        vectorstore = Chroma.from_documents(
            documents=docs_chunks, 
            embedding=embedding_model,
            persist_directory=persist_directory
        )
        
        print("\n✅ Vector store created and persisted to disk.")
    else:
        print(f"✅ Vector store already exists in '{persist_directory}'. Loading is not needed in this step.")
        # If it already exists, we can simply load it in the next cell like this:
        # vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

else:
    print("❌ 'docs_chunks' not found. Please run Cell 2 first.")

  from .autonotebook import tqdm as notebook_tqdm


✅ Vector store already exists in 'chroma_db'. Loading is not needed in this step.


In [4]:
import getpass
import os
from langchain_deepseek.chat_models import ChatDeepSeek
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda  # Corrected: Added RunnableLambda
from langchain.schema.output_parser import StrOutputParser

# --- 1. SET UP API KEY ---
if 'DEEPSEEK_API_KEY' not in os.environ:
    os.environ['DEEPSEEK_API_KEY'] = getpass.getpass('Enter your DeepSeek API Key: ')

# --- 2. LOAD THE COMPONENTS ---
persist_directory = 'chroma_db'
embedding_model_name = "all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_model
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 50})
llm = ChatDeepSeek(model="deepseek-chat")

# --- 3. DEFINE THE PROMPT TEMPLATE ---
template = """
You are an expert assistant for answering questions about longevity and anti-aging research.
Use only the following context from the FightAging.org blog to answer the question.
If you don't know the answer from the context provided, just say that you don't know.
Keep the answer concise and based on the provided sources.

CONTEXT:
{context}

QUESTION:
{question}

ANSWER:
"""
prompt = PromptTemplate.from_template(template)

# --- DEBUGGING FUNCTIONS ---
def log_retrieved_documents(input_dict):
    """A simple function to print the retrieved documents."""
    print("--- 💻 Documents Retrieved by the Retriever ---")
    for i, doc in enumerate(input_dict['context']):
        print(f"Doc {i+1} | Source: {doc.metadata.get('url', 'N/A')}")
        print(f"Content Snippet: {doc.page_content[:250]}...\n")
    return input_dict

def log_final_prompt(prompt_value):
    """A simple function to print the final prompt sent to the LLM."""
    print("\n--- ➡️  Final Prompt Being Sent to the LLM ---")
    print(prompt_value.to_string())
    print("---------------------------------------------")
    return prompt_value

# --- 4. BUILD THE RAG CHAIN ---
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | RunnableLambda(log_retrieved_documents)
    | prompt
    | RunnableLambda(log_final_prompt)
    | llm
    | StrOutputParser()
)

# --- 5. INVOKE THE CHAIN ---
question = """

Based on the full content of the FightAging.org blog, provide a multi-part summary addressing the following:

1.  Describe the main evolution in research focus over the last two decades. Contrast the foundational topics that were dominant in the early years, such as the initial work of the SENS Research Foundation, with the more recent focus on translational science and emerging fields like the gut microbiome.

2.  Drilling down on the most significant modern topic, explain the specific molecular reasons why cellular senescence became so central to the conversation. What is the SASP, and how did its discovery shift the therapeutic focus?

3.  Finally, synthesizing this evolution, what are the most promising therapeutic strategies discussed on the blog today, particularly those that are mentioned in the context of moving towards human clinical trials?
"""


print(f"Asking the RAG chain: '{question}'")

answer = rag_chain.invoke(question)

print("\n--- ⬅️  Final Answer ---")
print(answer)

  vectorstore = Chroma(


Asking the RAG chain: '

Based on the full content of the FightAging.org blog, provide a multi-part summary addressing the following:

1.  Describe the main evolution in research focus over the last two decades. Contrast the foundational topics that were dominant in the early years, such as the initial work of the SENS Research Foundation, with the more recent focus on translational science and emerging fields like the gut microbiome.

2.  Drilling down on the most significant modern topic, explain the specific molecular reasons why cellular senescence became so central to the conversation. What is the SASP, and how did its discovery shift the therapeutic focus?

3.  Finally, synthesizing this evolution, what are the most promising therapeutic strategies discussed on the blog today, particularly those that are mentioned in the context of moving towards human clinical trials?
'
--- 💻 Documents Retrieved by the Retriever ---
Doc 1 | Source: https://www.fightaging.org/archives/2014/07/sum