# Memory project - Database vectorization

## Secrets

In [18]:
import os
from openai import OpenAI
from dotenv import load_dotenv
from pathlib import Path

# Location of the .env file is itself provided through the ENV_IH1 environment variable.
env_ih1 = os.getenv("ENV_IH1")
if not env_ih1:
    # Path(None) would raise an opaque TypeError; fail with an actionable message instead.
    raise RuntimeError(
        "Environment variable ENV_IH1 is not set; it must contain the path to the .env file."
    )

dotenv_path = Path(env_ih1)
load_dotenv(dotenv_path=dotenv_path)

# API keys read from the environment (populated by the .env file loaded above).
# Any key that is absent is simply None.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY= os.getenv('PINECONE_KEY')  # note: the variable is named PINECONE_KEY, not PINECONE_API_KEY
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
STEAMSHIP_API_KEY = os.getenv('STEAMSHIP_API_KEY')
LANGSMITH_API_KEY = os.getenv('LANGSMITH_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
GEMINI_KEY = os.getenv('GEMINI_KEY')

# Make /usr/bin reachable for subprocesses. Guarded so that re-running this cell
# does not append the same entry to PATH again and again.
if '/usr/bin' not in os.environ['PATH'].split(os.pathsep):
    os.environ['PATH'] += os.pathsep + '/usr/bin'

## Libraries

In [19]:
import os
import json
import pandas as pd
import shutil
import openai
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from pinecone import Index  # Import Index for Pinecone operations

# Install missing packages
%pip install sentence-transformers

from sentence_transformers import SentenceTransformer


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Family safe 

## Summarizer local


In [20]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub, using the
# HUGGINGFACEHUB_API_TOKEN value read from the environment in the Secrets cell.
login(token=HUGGINGFACEHUB_API_TOKEN)

## Read JSON files and build the local DataFrame / CSV

In [21]:
# V2

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

def process_json_files(data_dir=r'.\Family safe', processed_dir=r'.\Processed JSONs', output_file=r'.\combined_data.csv'):
    """
    Processes all JSON files in the specified directory, adds new data to an existing DataFrame,
    and ensures no duplicates.

    Each JSON file is expected to hold document-level fields ("Name", "Type",
    "Author", "Date") and a "Pages" list whose items carry "Page Number" and
    "Extracted Text". One output row is produced per page, and a "Summary"
    column is filled in by populate_summary_column.

    Source files are moved to ``processed_dir`` only after the combined CSV has
    been written, so a failure while summarizing or saving leaves them in
    ``data_dir`` for the next run.

    Args:
        data_dir: Directory containing JSON files.
        processed_dir: Directory where processed files will be moved.
        output_file: Filepath for the saved combined DataFrame.

    Returns:
        A DataFrame containing the updated processed data. When no pages were
        found, the existing CSV content is returned unchanged (or an empty
        DataFrame with the expected columns if there is no CSV yet).
    """
    columns = ["Doc name", "Type", "Author", "Date", "Text", "Page_number", "Summary"]
    all_data = []
    pending_moves = []  # (source, destination) pairs, applied once the data is safely saved

    # Ensure the processed directory exists
    os.makedirs(processed_dir, exist_ok=True)

    # Process each JSON file (sorted so the row order is reproducible between runs)
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(".json"):
            continue

        filepath = os.path.join(data_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        # Extract document-level metadata
        name = json_data.get("Name", "Unknown")
        doc_type = json_data.get("Type", "Unknown")
        author = json_data.get("Author", "Unknown")
        date = json_data.get("Date", "Unknown")

        # Process each page; `position` is the 1-based position of the page in the file
        for position, page in enumerate(json_data.get("Pages", []), start=1):
            page_number = page.get("Page Number", None)

            # Fall back to the page's position when the number is missing (None)
            # or NaN (json.load turns a bare NaN token into float('nan')).
            is_nan = isinstance(page_number, float) and page_number != page_number
            if page_number is None or is_nan:
                page_number = position

            all_data.append({
                "Doc name": name,
                "Type": doc_type,
                "Author": author,
                "Date": date,
                "Text": page.get("Extracted Text", ""),
                "Page_number": page_number,
            })

        pending_moves.append((filepath, os.path.join(processed_dir, filename)))

    if all_data:
        # Summarize the new pages, then put the columns in their canonical order
        new_df = populate_summary_column(pd.DataFrame(all_data))[columns]

        # Merge with previously saved data, if any
        if os.path.exists(output_file):
            existing_df = pd.read_csv(output_file)
            combined_df = pd.concat([existing_df, new_df], ignore_index=True).drop_duplicates()
        else:
            combined_df = new_df

        # Save the updated DataFrame
        combined_df.to_csv(output_file, index=False)
        print(f"Updated DataFrame saved to {output_file}")
    elif os.path.exists(output_file):
        # Nothing new to add: hand back what is already on disk without rewriting it
        combined_df = pd.read_csv(output_file)
        print(f"No new pages found in {data_dir}; {output_file} left unchanged")
    else:
        combined_df = pd.DataFrame(columns=columns)
        print(f"No new pages found in {data_dir}; nothing to save")

    # Move the processed files last, skipping the no-op case where the source
    # and processed directories are the same.
    for source, destination in pending_moves:
        if os.path.abspath(source) != os.path.abspath(destination):
            os.rename(source, destination)

    return combined_df

def populate_summary_column(df):
    """
    Fill the 'Summary' column of ``df`` with one summary per row of 'Text'.

    The summaries come from generate_text_summaries, called with summarization
    switched on and no tables. The column is written onto the DataFrame that
    was passed in, and that same object is returned.

    Args:
        df: DataFrame with a 'Text' column.

    Returns:
        The same DataFrame, now carrying a 'Summary' column.
    """
    page_texts = df["Text"].tolist()

    # Only the first element (text summaries) is needed; table summaries are discarded
    summaries = generate_text_summaries(page_texts, tables=None, summarize_texts=True)[0]

    df["Summary"] = summaries
    return df

def generate_text_summaries(texts, tables=None, summarize_texts=False):
    """
    Summarize text and table elements for retrieval.

    Args:
        texts: List of str. Summarized when ``summarize_texts`` is True,
            otherwise passed through unchanged (as a new list).
        tables: Optional list of str; always summarized when provided.
        summarize_texts: Bool to summarize texts.

    Returns:
        A ``(text_summaries, table_summaries)`` tuple of lists. Either list is
        empty when the corresponding input is empty or None.

    The LLM chain is only built when something actually has to be summarized,
    so the pass-through path needs neither OpenAI credentials nor network access.
    """

    # Prompt
    prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text or table elements. \
    Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {element} 
    Do not include "this is a summary" at the begining of the summary.
    The summary must be in French. """

    must_summarize_texts = bool(texts and summarize_texts)
    must_summarize_tables = bool(tables)

    summarize_chain = None
    if must_summarize_texts or must_summarize_tables:
        # Text summary chain: raw element -> prompt -> chat model -> plain string
        prompt = ChatPromptTemplate.from_template(prompt_text)
        model = ChatOpenAI(temperature=0, model="gpt-4o-mini")
        summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

    # Initialize empty summaries
    text_summaries = []
    table_summaries = []

    # Apply to text if texts are provided and summarization is requested
    if must_summarize_texts:
        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
    elif texts:
        # Pass-through: copy so callers cannot mutate the input through the result
        text_summaries = list(texts)

    # Apply to tables if tables are provided
    if must_summarize_tables:
        table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

    return text_summaries, table_summaries


In [22]:
# Build the folder path with os.path.join so it resolves on any OS (the previous
# backslash literal only worked on Windows). Source and processed folders are the
# same here, so the JSON files stay where they are after processing.
family_safe_dir = os.path.join("data", "Family safe")
df = process_json_files(data_dir=family_safe_dir, processed_dir=family_safe_dir)

Updated DataFrame saved to .\combined_data.csv


In [23]:
# # Run the function
# updated_df = process_json_files(data_dir=r'.\Family safe', processed_dir=r'.\Family safe')

# # Preview the updated DataFrame
# print(updated_df.head())

In [37]:
import pandas as pd

def split_text_dataframe(df, min_len=400, max_len=600, overlap=0.2):
    """
    Splits the 'Text' column in the DataFrame into overlapping character chunks.

    Texts of at most ``max_len`` characters are kept as a single chunk. Longer
    texts are cut into windows of ``max_len`` characters, each starting
    ``int(max_len * (1 - overlap))`` characters after the previous one (at least
    1, so the loop always advances). All other columns are copied onto every
    chunk row, except 'word_count', which is dropped if present.

    Args:
        df: DataFrame with 'Text', 'Doc name' and 'Page_number' columns.
        min_len: Currently unused; kept so existing calls remain valid.
        max_len: Maximum number of characters per chunk (must be > 0).
        overlap: Fraction of ``max_len`` shared by consecutive chunks.

    Returns:
        A new DataFrame with one row per chunk and an added 'Chunk_ID' column
        formatted as ``<Doc name>_Page<Page_number>_Chunk<n>`` (n starts at 1).
        Row labels are those of the source rows, so they repeat for multi-chunk pages.

    Raises:
        ValueError: If ``max_len`` is not positive.
    """
    if max_len <= 0:
        raise ValueError("max_len must be a positive number of characters")

    # Distance between chunk starts; clamped to 1 because overlap >= 1 would
    # otherwise give a step of 0 and the loop below would never terminate.
    step = max(1, int(max_len * (1 - overlap)))

    def split_text(text):
        """Helper function to split individual text entries."""
        if not isinstance(text, str):
            # Empty texts come back as NaN after a CSV round-trip; treat missing
            # values as an empty string and stringify anything else.
            is_missing = pd.api.types.is_scalar(text) and pd.isna(text)
            text = "" if is_missing else str(text)

        if len(text) <= max_len:
            return [text]  # Keep short texts as a single chunk

        chunks = []
        start = 0
        while start < len(text):
            end = min(start + max_len, len(text))
            chunks.append(text[start:end])
            if end == len(text):
                break  # Stop if the end of text is reached
            start += step  # Move forward with overlap
        return chunks

    # Expand DataFrame by splitting text
    expanded_rows = []
    for _, row in df.iterrows():
        # Drop word_count once per source row rather than once per chunk
        base_row = row.drop(labels=["word_count"], errors="ignore")
        for chunk_no, chunk in enumerate(split_text(row["Text"]), start=1):
            new_row = base_row.copy()
            new_row["Text"] = chunk
            new_row["Chunk_ID"] = f"{row['Doc name']}_Page{row['Page_number']}_Chunk{chunk_no}"
            expanded_rows.append(new_row)

    if not expanded_rows:
        # Keep the expected columns even when there is nothing to split
        out_columns = [col for col in df.columns if col != "word_count"]
        if "Chunk_ID" not in out_columns:
            out_columns.append("Chunk_ID")
        return pd.DataFrame(columns=out_columns)

    # Create a new DataFrame with the split text chunks
    return pd.DataFrame(expanded_rows)

# Split every page of `df` into overlapping chunks using the function's default
# sizes (max_len=600, overlap=0.2). `df_expanded` is what the chunk-level
# indexing cell ("Indexing v2") embeds and upserts.
df_expanded = split_text_dataframe(df)





In [38]:
# Preview the first 50 chunk rows, including the generated Chunk_ID column
df_expanded.head(50)

Unnamed: 0,Doc name,Type,Author,Date,Text,Page_number,Summary,Chunk_ID
0,Pdf img,scan,John Doe,2025-01-23 15:43:26,"C'est le dernier, Isaac, dit Hovel, le plus so...",118,"Isaac, souvent appelé Louis, est l'ancêtre dir...",Pdf img_Page118_Chunk1
0,Pdf img,scan,John Doe,2025-01-23 15:43:26,"ément, Marx, marchand de bétail, et Julie ex S...",118,"Isaac, souvent appelé Louis, est l'ancêtre dir...",Pdf img_Page118_Chunk2
0,Pdf img,scan,John Doe,2025-01-23 15:43:26,Adèle ont eu neuf enfants dont j'ai connaissan...,118,"Isaac, souvent appelé Louis, est l'ancêtre dir...",Pdf img_Page118_Chunk3
0,Pdf img,scan,John Doe,2025-01-23 15:43:26,"cle, il parle de certains sous un prénom, et r...",118,"Isaac, souvent appelé Louis, est l'ancêtre dir...",Pdf img_Page118_Chunk4
0,Pdf img,scan,John Doe,2025-01-23 15:43:26,"ail, car c'est une génération que mon père a c...",118,"Isaac, souvent appelé Louis, est l'ancêtre dir...",Pdf img_Page118_Chunk5
0,Pdf img,scan,John Doe,2025-01-23 15:43:26,e est de plus courte durée que celle de Maman ...,118,"Isaac, souvent appelé Louis, est l'ancêtre dir...",Pdf img_Page118_Chunk6
0,Pdf img,scan,John Doe,2025-01-23 15:43:26,voiture à deux chevaux assez spacieuse et con...,118,"Isaac, souvent appelé Louis, est l'ancêtre dir...",Pdf img_Page118_Chunk7
1,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,POUR LA MÉMOIRE\nFAMILIALE\n\nFAMILLE HISTOIRE...,1,"Mémoires familiales de Jean-Georges Lambert, V...",Pour la mémoire familiale 1-50_Page1_Chunk1
2,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,J'ai entrepris ce travail pour mes fils qui ti...,2,"L'auteur réalise ce travail pour ses fils, qui...",Pour la mémoire familiale 1-50_Page2_Chunk1
3,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,VOLUME 1\n\nPage\n6- Préambule.\n\nTABLE DES M...,3,VOLUME 1\n\nCe volume comprend un préambule et...,Pour la mémoire familiale 1-50_Page3_Chunk1


In [26]:
# Number of whitespace-separated tokens per page; values are stringified first,
# so non-string entries are counted on their text form.
df["word_count"] = [len(str(page_text).split()) for page_text in df["Text"]]
df.head(50)

Unnamed: 0,Doc name,Type,Author,Date,Text,Page_number,Summary,word_count
0,Pdf img,scan,John Doe,2025-01-23 15:43:26,"C'est le dernier, Isaac, dit Hovel, le plus so...",118,"Isaac, souvent appelé Louis, est l'ancêtre dir...",578
1,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,POUR LA MÉMOIRE\nFAMILIALE\n\nFAMILLE HISTOIRE...,1,"Mémoires familiales de Jean-Georges Lambert, V...",15
2,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,J'ai entrepris ce travail pour mes fils qui ti...,2,"L'auteur réalise ce travail pour ses fils, qui...",33
3,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,VOLUME 1\n\nPage\n6- Préambule.\n\nTABLE DES M...,3,VOLUME 1\n\nCe volume comprend un préambule et...,479
4,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,"-Les sentiments de Bonaparte, puis de Napoléon...",4,Les sentiments de Bonaparte et de Napoléon env...,523
5,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,sont pas seuls visés par cette campagne Mon pè...,5,Table des matières détaillant les chapitres et...,341
6,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,PREAMBULLE\n\nLa mort de ma mère m'a profondém...,6,La narrateur évoque la profonde douleur ressen...,175
7,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,"vieillots, et quand je les entendais d'une ore...",7,L'auteur évoque sa prise de conscience tardive...,237
8,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,"ans, Francis en avait dix, Jean-Paul huit et G...",8,"Francis avait dix souvenirs, Jean-Paul huit et...",248
9,Pour la mémoire familiale 1-50,Unknown,Jean Lambert,2025-01-28 16:01:49,"Maintenant avec la retraite, même en m'occupan...",9,"L'auteur, maintenant à la retraite, souhaite r...",84


In [27]:
# del df  (left disabled: `df` is still read by the page-level indexing cell further down)
# Plain-text dump of the page-level DataFrame (pandas truncates long frames with "..").
print(df)

                           Doc name     Type        Author  \
0                           Pdf img     scan      John Doe   
1    Pour la mémoire familiale 1-50  Unknown  Jean Lambert   
2    Pour la mémoire familiale 1-50  Unknown  Jean Lambert   
3    Pour la mémoire familiale 1-50  Unknown  Jean Lambert   
4    Pour la mémoire familiale 1-50  Unknown  Jean Lambert   
..                              ...      ...           ...   
100  Pour la mémoire familiale 1-50  Unknown  Jean Lambert   
101  Pour la mémoire familiale 1-50  Unknown  Jean Lambert   
102  Pour la mémoire familiale 1-50  Unknown  Jean Lambert   
103  Pour la mémoire familiale 1-50  Unknown  Jean Lambert   
105  Pour la mémoire familiale 1-50  Unknown  Jean Lambert   

                    Date                                               Text  \
0    2025-01-23 15:43:26  C'est le dernier, Isaac, dit Hovel, le plus so...   
1    2025-01-28 16:01:49  POUR LA MÉMOIRE\nFAMILIALE\n\nFAMILLE HISTOIRE...   
2    2025-01-28 16

## Creating Pinecone DB

In [28]:
from pinecone import Pinecone

In [29]:
import pinecone as pc
from pinecone import Pinecone, ServerlessSpec

# Serverless placement (cloud provider + region) used when the index is created
spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

# Create the Pinecone client.
# NOTE: this rebinds `pc`, replacing the module alias from `import pinecone as pc`
# above; from here on `pc` is the client instance.
# The legacy `environment=` argument is dropped: for serverless indexes the region
# is carried by `spec`, and the client constructor only needs the API key.
pc = Pinecone(
    api_key=PINECONE_API_KEY,
)

In [41]:
import time

index_name = "memory-project"
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# Upper bound on how long to wait for a freshly created index to become ready
INDEX_READY_TIMEOUT_S = 300

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # default output size of text-embedding-3-small, the model used for `embed`
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized, but give up after the timeout instead of
    # polling forever if index creation never completes
    ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
    while not pc.describe_index(index_name).status['ready']:
        if time.monotonic() > ready_deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after {INDEX_READY_TIMEOUT_S} seconds"
            )
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [31]:
# Install missing packages
%pip install langchain

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [32]:
import os
from getpass import getpass
import torch

# Use the maintained langchain_openai integration (already a dependency of this
# notebook via ChatOpenAI) instead of the deprecated langchain.embeddings.openai
# path, which triggers a deprecation warning when the class is instantiated.
from langchain_openai import OpenAIEmbeddings

# Reuse the key from the environment; prompt interactively only when it is missing
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
model_name = 'text-embedding-3-small'

# set device to GPU if available
# (kept for local models; it is not passed to the OpenAI embeddings client below)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY,
    # device=device -> Not compatible with OpenAI embeddings
)

  embed = OpenAIEmbeddings(


In [39]:
# Indexing v1

import uuid
from tqdm.auto import tqdm
import base64

batch_size = 100

# Page-level indexing: one vector per row of `df` (one row per page).
for i in tqdm(range(0, len(df), batch_size)):
    # Get end of batch
    i_end = min(len(df), i + batch_size)
    batch = df.iloc[i:i_end]

    ids = []
    metadata = []
    for _, record in batch.iterrows():
        # Deterministic ID derived from document + page: re-running this cell
        # overwrites the same vectors instead of adding duplicates under fresh
        # random UUIDs. Hashing also yields a compact ASCII identifier.
        page_key = f"{record['Doc name']}_Page{record['Page_number']}"
        ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, page_key)))

        fields = {
            'Doc name': record['Doc name'],
            'Author': record['Author'],
            'Date': record['Date'],
            'Page_number': record['Page_number'],
            'Summary': record['Summary'],
            'full_text': record['Text']
        }
        # Drop missing values (e.g. NaN left by a CSV round-trip) from the metadata.
        # NOTE(review): Pinecone is expected to reject null/NaN metadata - confirm.
        metadata.append({key: value for key, value in fields.items() if not pd.isna(value)})

    # Embed the full page text
    embeddings = embed.embed_documents(batch["Text"].tolist())

    # Upsert to Pinecone (materialized as a list of (id, values, metadata) tuples)
    index.upsert(vectors=list(zip(ids, embeddings, metadata)))


  0%|          | 0/1 [00:00<?, ?it/s]

In [51]:
# Indexing v2

import uuid
from tqdm.auto import tqdm
import base64

batch_size = 100

# Chunk-level indexing: one vector per row of `df_expanded` (one row per text chunk).
for i in tqdm(range(0, len(df_expanded), batch_size)):
    # Get end of batch
    i_end = min(len(df_expanded), i + batch_size)
    batch = df_expanded.iloc[i:i_end]

    ids = []
    metadata = []
    for _, record in batch.iterrows():
        # Deterministic ID derived from Chunk_ID: re-running this cell overwrites
        # the same vectors instead of adding duplicates under fresh random UUIDs.
        # Hashing also yields a compact ASCII identifier even though Chunk_ID
        # contains accented document names.
        ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, str(record['Chunk_ID']))))

        fields = {
            'Doc name': record['Doc name'],
            'Author': record['Author'],
            'Date': record['Date'],
            'Chunk_ID': record['Chunk_ID'],  # Updated from 'Chunks'
            'Page_number': record['Page_number'],
            'Summary': record['Summary'],
            'Text': record['Text']
        }
        # Drop missing values (e.g. NaN left by a CSV round-trip) from the metadata.
        # NOTE(review): Pinecone is expected to reject null/NaN metadata - confirm.
        metadata.append({key: value for key, value in fields.items() if not pd.isna(value)})

    # Embed the chunk text
    embeddings = embed.embed_documents(batch["Text"].tolist())

    # Upsert to Pinecone (materialized as a list of (id, values, metadata) tuples)
    index.upsert(vectors=list(zip(ids, embeddings, metadata)))


  0%|          | 0/6 [00:00<?, ?it/s]

## Querying the Vector Store

In [None]:
# # Install the langgraph package
# %pip install langgraph

In [49]:
# Import under an alias: the bare name `Pinecone` is already bound to the Pinecone
# client class (from `pinecone`), and re-binding it here would make the client
# cell fail if it were run again in the same kernel.
from langchain.vectorstores import Pinecone as PineconeVectorStore

# The metadata field returned as each document's page_content.
# NOTE(review): vectors are embedded from the chunk/page text, but what is returned
# to the QA chain is the page-level "Summary" - confirm this is the intended field.
text_field = "Summary"

# initialize the vector store object
vectorstore = PineconeVectorStore(
    index, embed.embed_query, text_field
)

In [50]:
# `query` is also reused by the retrieval-QA cell below
query = "What happened in 1963"

# Embed the query and fetch the nearest vectors; each result's page_content is
# read from the "Summary" metadata field configured as text_field.
vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

[Document(metadata={'Author': 'Jean Lambert', 'Chunk_ID': 'Pour la mémoire familiale 1-50_Page21_Chunk5', 'Date': '2025-01-28 16:01:49', 'Doc name': 'Pour la mémoire familiale 1-50', 'Page_number': 21.0}, page_content="Le texte aborde la complexité de l'antisémitisme et du génocide nazi, soulignant que le racisme anti-noir vise à dominer plutôt qu'à exterminer. Il met en avant l'importance unique du génocide juif, distinct des crimes de Staline, en raison de son caractère industriel et délibéré. Des auteurs comme Max I. Dimont et Raymond Aron évoquent les motivations derrière le nazisme, notamment une volonté de déicide et une lutte contre le christianisme. Le texte mentionne également le choc post-guerre face à l'Holocauste, qui a instauré un tabou sur l'antisémitisme, brisé par de Gaulle, dont les discours ont ravivé des attitudes traditionnelles envers les juifs. Raymond Aron critique cette évolution, soulignant que cela a ouvert la voie à une nouvelle ère d'antisémitisme, marquée p

In [None]:
# ChatOpenAI comes from langchain_openai (already used earlier in this notebook);
# importing the deprecated langchain.chat_models variant here would silently
# re-bind the name for every cell executed afterwards.
from langchain_openai import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

# chat completion llm (temperature 0 for repeatable answers)
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-4o-mini',
    temperature=0.0
)
# conversational memory: keeps the last 5 exchanges under the 'chat_history' key,
# returned as message objects
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)
# retrieval qa chain: retrieved documents are processed with the map_reduce strategy
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=vectorstore.as_retriever()
)

In [None]:
# Ask the retrieval-QA chain the current `query` (the 1963 question assigned
# in the similarity-search cell)
qa.run(query)

"I don't know."

## Tool query to be edited

In [50]:
from langchain.agents import Tool

# Single tool exposed to the agent: the retrieval-QA chain over the Pinecone index.
# The description is what the agent reads when deciding whether to call the tool
# and how to phrase its input. The previous wording ("general knowledge queries")
# led the agent to rewrite questions as generic ones (e.g. "Notable deaths in
# 1963"), so it now states what the index actually contains.
tools = [
    Tool(
        name='Knowledge Base',
        func=qa.run,
        description=(
            'use this tool to answer any question about the people, dates, places '
            'and events described in the indexed family history documents; '
            'pass the question as-is, keeping the names and dates it mentions'
        )
    )
]

In [51]:
from langchain.agents import initialize_agent

# Conversational ReAct-style agent wired to the knowledge-base tool and the
# windowed chat memory.
agent = initialize_agent(
    # what the agent can use
    tools=tools,
    llm=llm,
    memory=conversational_memory,
    # how the agent behaves
    agent='chat-conversational-react-description',
    max_iterations=3,
    early_stopping_method='generate',
    # log every thought / action / observation
    verbose=True,
)

  agent = initialize_agent(


In [None]:
# First turn of the conversation (chat_history is empty in the recorded run)
agent("Who si Rachel Gugenheim?")

  agent("Who si Rachel Gugenheim?")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "Rachel Gugenheim"
}
```[0m
Observation: [36;1m[1;3mJe ne sais pas.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "I couldn't find any information about Rachel Gugenheim."
}
```[0m

[1m> Finished chain.[0m


{'input': 'Who si Rachel Gugenheim?',
 'chat_history': [],
 'output': "I couldn't find any information about Rachel Gugenheim."}

In [None]:
# Follow-up turn: "her" can only be resolved through conversational_memory,
# which holds the previous question
agent("What can you tell me about her husband?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "Rachel Gugenheim husband"
}
```[0m
Observation: [36;1m[1;3mRachel Gugenheim was married to Isaac, often called Louis, who was born in 1766.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Rachel Gugenheim was married to Isaac, often called Louis, who was born in 1766."
}
```[0m

[1m> Finished chain.[0m


{'input': 'What can you tell me about her husband?',
 'chat_history': [HumanMessage(content='Who si Rachel Gugenheim?', additional_kwargs={}, response_metadata={}),
  AIMessage(content="I couldn't find any information about Rachel Gugenheim.", additional_kwargs={}, response_metadata={})],
 'output': 'Rachel Gugenheim was married to Isaac, often called Louis, who was born in 1766.'}

In [None]:
# Question linking two entities; in the recorded run the tool answered "I don't know."
agent("What's the common point between Jacques Dreyfus and Sleeping Beauty?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "common point between Jacques Dreyfus and Sleeping Beauty"
}
```[0m
Observation: [36;1m[1;3mI don't know.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "I couldn't find a common point between Jacques Dreyfus and Sleeping Beauty."
}
```[0m

[1m> Finished chain.[0m


{'input': "What's the common point between Jacques Dreyfus and Sleeping Beauty?",
 'chat_history': [HumanMessage(content='Who si Rachel Gugenheim?', additional_kwargs={}, response_metadata={}),
  AIMessage(content="I couldn't find any information about Rachel Gugenheim.", additional_kwargs={}, response_metadata={}),
  HumanMessage(content='What can you tell me about her husband?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Rachel Gugenheim was married to Isaac, often called Louis, who was born in 1766.', additional_kwargs={}, response_metadata={})],
 'output': "I couldn't find a common point between Jacques Dreyfus and Sleeping Beauty."}

In [56]:
# Date-based question; in the recorded run the agent rephrased it as
# "Notable deaths in 1963" before calling the tool
agent("Who died in 1963?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "Notable deaths in 1963"
}
```[0m
Observation: [36;1m[1;3mI don't know.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "I couldn't find specific information about notable deaths in 1963."
}
```[0m

[1m> Finished chain.[0m


{'input': 'Who died in 1963?',
 'chat_history': [HumanMessage(content='Who si Rachel Gugenheim?', additional_kwargs={}, response_metadata={}),
  AIMessage(content="I couldn't find any information about Rachel Gugenheim.", additional_kwargs={}, response_metadata={}),
  HumanMessage(content='What can you tell me about her husband?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Rachel Gugenheim was married to Isaac, often called Louis, who was born in 1766.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content="What's the common point between Jacques Dreyfus and Sleeping Beauty?", additional_kwargs={}, response_metadata={}),
  AIMessage(content="I couldn't find a common point between Jacques Dreyfus and Sleeping Beauty.", additional_kwargs={}, response_metadata={}),
  HumanMessage(content='Tell me all you know about Jacques Dreyfus.', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Jacques Dreyfus was born in Froeningen on January 9, 

In [None]:
# Convert vector store to retriever
# (the vector store object is named `vectorstore`; `vector_store` was never
# defined and raised a NameError)
retriever = vectorstore.as_retriever()


# Configure Retrieval QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff"  # Use "stuff" or "map_reduce" depending on complexity
)

# Query the chain
query = "Why is my agent not behaving like I want it? I thought coding was easy and magic!"
response = qa_chain.run(query)
print("Response:", response)