# Memory project - Database vectorization

## Secrets

In [18]:
import os
from openai import OpenAI
from dotenv import load_dotenv
from pathlib import Path

# Load path from the environment variable
env_ih1 = os.getenv("ENV_IH1")

dotenv_path = Path(env_ih1)
load_dotenv(dotenv_path=dotenv_path)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY= os.getenv('PINECONE_KEY')
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
STEAMSHIP_API_KEY = os.getenv('STEAMSHIP_API_KEY')
LANGSMITH_API_KEY = os.getenv('LANGSMITH_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
GEMINI_KEY = os.getenv('GEMINI_KEY')

os.environ['PATH'] += os.pathsep + '/usr/bin'

## Libraries

In [19]:
import os
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import shutil
from pinecone import Index  # Import Index for Pinecone operations
import openai


## Family safe 

## Summarizer local


In [20]:
from huggingface_hub import login

# Use your token here
login(token=HUGGINGFACEHUB_API_TOKEN)

## Read json + create local df/csv

In [21]:
# from langchain_core.output_parsers import StrOutputParser
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_openai import ChatOpenAI

# def process_json_files(data_dir=r'.\Family safe', processed_dir=r'.\Processed JSONs'):
#     """
#     Processes all JSON files in the specified directory and extracts relevant data.

#     Args:
#         data_dir: Directory containing JSON files.
#         processed_dir: Directory where processed files will be moved.

#     Returns:
#         A DataFrame containing the processed data.
#     """
#     all_data = []

#     # Ensure the processed directory exists
#     os.makedirs(processed_dir, exist_ok=True)

#     # Process each JSON file
#     for filename in os.listdir(data_dir):
#         if filename.endswith(".json"):
#             filepath = os.path.join(data_dir, filename)
#             with open(filepath, 'r', encoding='utf-8') as f:
#                 json_data = json.load(f)

#             # Extract document-level metadata
#             name = json_data.get("Name", "Unknown")
#             doc_type = json_data.get("Type", "Unknown")
#             author = json_data.get("Author", "Unknown")
#             date = json_data.get("Date", "Unknown")

#             # Process each page
#             for page in json_data.get("Pages", []):
#                 page_data = {
#                     "Doc name": name,
#                     "Type": doc_type,
#                     "Author": author,
#                     "Date": date,
#                     "Text": page.get("Extracted Text", ""),
#                     "Page number": page.get("Page Number", None),
#                 }
#                 all_data.append(page_data)

#             # Move the processed file
#             processed_filepath = os.path.join(processed_dir, filename)
#             os.rename(filepath, processed_filepath)

#     # Create a DataFrame
#     df = pd.DataFrame(all_data)

#     # Add summaries to the DataFrame
#     df = populate_summary_column(df)

#     # Reorder columns
#     df = df[["Doc name", "Type", "Author", "Date", "Text", "Page number", "Summary"]]

#     return df

# def populate_summary_column(df):
#     """
#     Populates the 'Summary' column in the DataFrame using generate_text_summaries.
    
#     Args:
#         df: The DataFrame containing the text data.

#     Returns:
#         The DataFrame with the 'Summary' column populated.
#     """
#     # Extract texts from the DataFrame
#     texts = df["Text"].tolist()

#     # Generate summaries for the texts
#     text_summaries, _ = generate_text_summaries(texts, tables=None, summarize_texts=True)

#     # Assign the summaries back to the DataFrame
#     df["Summary"] = text_summaries

#     return df

# def generate_text_summaries(texts, tables=None, summarize_texts=False):
#     """
#     Summarize text elements
#     texts: List of str
#     tables: List of str
#     summarize_texts: Bool to summarize texts
#     """

#     # Prompt
#     prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
#     These summaries will be embedded and used to retrieve the raw text or table elements. \
#     Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {element} 
#     Do not include "this is a summary" at the begining of the summary. """
#     prompt = ChatPromptTemplate.from_template(prompt_text)

#     # Text summary chain
#     model = ChatOpenAI(temperature=0, model="gpt-4o-mini")
#     summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

#     # Initialize empty summaries
#     text_summaries = []
#     table_summaries = []

#     # Apply to text if texts are provided and summarization is requested
#     if texts and summarize_texts:
#         text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
#     elif texts:
#         text_summaries = texts

#     # Apply to tables if tables are provided
#     if tables:
#         table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

#     return text_summaries, table_summaries


In [22]:
# V2

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

def process_json_files(data_dir=r'.\Family safe', processed_dir=r'.\Processed JSONs', output_file=r'.\combined_data.csv'):
    """
    Processes all JSON files in the specified directory, adds new data to an existing DataFrame, 
    and ensures no duplicates.

    Args:
        data_dir: Directory containing JSON files.
        processed_dir: Directory where processed files will be moved.
        output_file: Filepath for the saved combined DataFrame.

    Returns:
        A DataFrame containing the updated processed data.
    """
    all_data = []

    # Ensure the processed directory exists
    os.makedirs(processed_dir, exist_ok=True)

    # Process each JSON file
    for filename in os.listdir(data_dir):
        if filename.endswith(".json"):
            filepath = os.path.join(data_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            # Extract document-level metadata
            name = json_data.get("Name", "Unknown")
            doc_type = json_data.get("Type", "Unknown")
            author = json_data.get("Author", "Unknown")
            date = json_data.get("Date", "Unknown")

            # Initialize a page counter
            page_counter = 1

            # Process each page
            for page in json_data.get("Pages", []):
                page_number = page.get("Page Number", None)
                
                # Assign a sequential number if Page Number is NaN
                if page_number is None:
                    page_number = page_counter
                    page_counter += 1  # Increment the counter

                page_data = {
                    "Doc name": name,
                    "Type": doc_type,
                    "Author": author,
                    "Date": date,
                    "Text": page.get("Extracted Text", ""),
                    "Chunks": page_number,
                }
                all_data.append(page_data)


            # Move the processed file
            processed_filepath = os.path.join(processed_dir, filename)
            os.rename(filepath, processed_filepath)

    # Create a DataFrame from new data
    new_df = pd.DataFrame(all_data)

    # Add summaries to the new DataFrame
    new_df = populate_summary_column(new_df)

    # Reorder columns
    new_df = new_df[["Doc name", "Type", "Author", "Date", "Text", "Chunks", "Summary"]]

    # Check if an existing DataFrame exists
    if os.path.exists(output_file):
        existing_df = pd.read_csv(output_file)
        combined_df = pd.concat([existing_df, new_df], ignore_index=True).drop_duplicates()
    else:
        combined_df = new_df

    # Save the updated DataFrame
    combined_df.to_csv(output_file, index=False)
    print(f"Updated DataFrame saved to {output_file}")

    return combined_df

def populate_summary_column(df):
    """
    Populates the 'Summary' column in the DataFrame using generate_text_summaries.
    
    Args:
        df: The DataFrame containing the text data.

    Returns:
        The DataFrame with the 'Summary' column populated.
    """
    # Extract texts from the DataFrame
    texts = df["Text"].tolist()

    # Generate summaries for the texts
    text_summaries, _ = generate_text_summaries(texts, tables=None, summarize_texts=True)

    # Assign the summaries back to the DataFrame
    df["Summary"] = text_summaries

    return df

def generate_text_summaries(texts, tables=None, summarize_texts=False):
    """
    Summarize text elements
    texts: List of str
    tables: List of str
    summarize_texts: Bool to summarize texts
    """

    # Prompt
    prompt_text = """You are an assistant tasked with summarizing tables and text for retrieval. \
    These summaries will be embedded and used to retrieve the raw text or table elements. \
    Give a concise summary of the table or text that is well optimized for retrieval. Table or text: {element} 
    Do not include "this is a summary" at the begining of the summary. """
    prompt = ChatPromptTemplate.from_template(prompt_text)

    # Text summary chain
    model = ChatOpenAI(temperature=0, model="gpt-4o-mini")
    summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

    # Initialize empty summaries
    text_summaries = []
    table_summaries = []

    # Apply to text if texts are provided and summarization is requested
    if texts and summarize_texts:
        text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})
    elif texts:
        text_summaries = texts

    # Apply to tables if tables are provided
    if tables:
        table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

    return text_summaries, table_summaries


In [23]:
df=process_json_files(data_dir=r'data\Family safe', processed_dir=r'data\Family safe')

Updated DataFrame saved to .\combined_data.csv


In [24]:
# # Run the function
# updated_df = process_json_files(data_dir=r'.\Family safe', processed_dir=r'.\Family safe')

# # Preview the updated DataFrame
# print(updated_df.head())

In [1]:
df.head(50)

NameError: name 'df' is not defined

In [26]:
# del df
print(df)

                     Doc name  Type          Author                 Date  \
0                example_name  scan  example_author  2025-01-23 17:17:14   
1                example_name  scan  example_author  2025-01-23 17:17:14   
2                example_name  scan  example_author  2025-01-23 17:17:14   
3                example_name  scan  example_author  2025-01-23 17:17:14   
4                example_name  scan  example_author  2025-01-23 17:17:14   
..                        ...   ...             ...                  ...   
134              example_name  scan  example_author  2025-01-23 17:17:14   
137              example_name  scan  example_author  2025-01-23 17:17:14   
142  La belle au bois dormant  scan        Perrault  2025-01-24 12:16:38   
143  La belle au bois dormant  scan        Perrault  2025-01-24 12:16:38   
144  La belle au bois dormant  scan        Perrault  2025-01-24 12:16:38   

                                                  Text  Chunks  \
0    CC-OCR: A Compre

Pinecone DB

## Creating Pinecone DB

In [27]:
from pinecone import Pinecone

In [28]:
import pinecone as pc
from pinecone import Pinecone, ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

# connect to pinecone environment
pc = Pinecone(
    api_key = PINECONE_API_KEY,
    environment='us-east-1'  # find next to API key in console
)

In [29]:
import time

index_name = "memory-project"
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3}},
 'total_vector_count': 3}

In [30]:
import os
from getpass import getpass
from langchain.embeddings.openai import OpenAIEmbeddings
import torch

# get API key from top-right dropdown on OpenAI website
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
model_name = 'text-embedding-3-small'

# set device to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY,
    # device=device -> Not compatible with OpenAI embeddings
)

  embed = OpenAIEmbeddings(


In [31]:
# Indexing

from tqdm.auto import tqdm

batch_size = 100

texts = []
metadata = []

for i in tqdm(range(0, len(df), batch_size)):
    # get end of batch
    i_end = min(len(df), i+batch_size)
    batch = df.iloc[i:i_end]

    # first get metadata fields for this record
    metadata = [{
        'Doc name': record['Doc name'],
        'Author': record['Author'],
        'Date': record['Date'],
        'Chunks': record['Chunks'],
        'Summary': record['Summary'],

    } for j, record in batch.iterrows()]
    # Extract full text for embeddings
    embeddings = embed.embed_documents(batch["Text"].tolist())

    # Prepare IDs
    ids = batch["Doc name"].tolist()

    # Upsert to Pinecone
    index.upsert(vectors=zip(ids, embeddings, metadata))

  0%|          | 0/2 [00:00<?, ?it/s]

## Querying the Vector Store

In [32]:
from langchain.vectorstores import Pinecone

text_field = "Summary"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

  vectorstore = Pinecone(


In [33]:
query = "Qui est Rachel Gugenheim?"

vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

[Document(metadata={'Author': 'John Doe', 'Chunks': 118.0, 'Date': '2025-01-23 15:43:26', 'Doc name': 'Pdf img'}, page_content="Isaac, souvent appelé Louis, est l'ancêtre direct né en 1766 à Froeningen, marié à Rachel Gugenheim (1772-1843). Ils ont eu six enfants, dont Lehmann (Clément), né en 1800, qui a épousé Athelle Bloch en 1827. Ils ont eu neuf enfants, parmi lesquels Jacques Dreyfus, né le 9 janvier 1829, qui a épousé Catherine Lévy en 1860 et est décédé en 1893. Jacques, colporteur de tissus, a joué un rôle significatif dans la famille, voyageant pour approvisionner des revendeurs dans des bourgs non desservis."),
 Document(metadata={'Author': 'Perrault', 'Chunks': 3.0, 'Date': '2025-01-24 12:16:38', 'Doc name': 'La belle au bois dormant'}, page_content='In "La Belle au Bois Dormant," a princess encounters an old woman spinning. Curious, she tries to spin but pricks her finger on a spindle, falling into a deep sleep as foretold by the fairies. Despite efforts to revive her, she

In [34]:
print(vectorstore.similarity_search(query, k=3))

[Document(metadata={'Author': 'John Doe', 'Chunks': 118.0, 'Date': '2025-01-23 15:43:26', 'Doc name': 'Pdf img'}, page_content="Isaac, souvent appelé Louis, est l'ancêtre direct né en 1766 à Froeningen, marié à Rachel Gugenheim (1772-1843). Ils ont eu six enfants, dont Lehmann (Clément), né en 1800, qui a épousé Athelle Bloch en 1827. Ils ont eu neuf enfants, parmi lesquels Jacques Dreyfus, né le 9 janvier 1829, qui a épousé Catherine Lévy en 1860 et est décédé en 1893. Jacques, colporteur de tissus, a joué un rôle significatif dans la famille, voyageant pour approvisionner des revendeurs dans des bourgs non desservis."), Document(metadata={'Author': 'Perrault', 'Chunks': 3.0, 'Date': '2025-01-24 12:16:38', 'Doc name': 'La belle au bois dormant'}, page_content='In "La Belle au Bois Dormant," a princess encounters an old woman spinning. Curious, she tries to spin but pricks her finger on a spindle, falling into a deep sleep as foretold by the fairies. Despite efforts to revive her, she 

In [35]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA

# chat completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-4o-mini',
    temperature=0.0
)
# conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)
# retrieval qa chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

  llm = ChatOpenAI(
  conversational_memory = ConversationBufferWindowMemory(


In [36]:
qa.run(query)

  qa.run(query)


"Rachel Gugenheim est l'épouse d'Isaac, souvent appelé Louis. Elle est née en 1772 et est décédée en 1843. Ils ont eu six enfants ensemble."

## Tool query to be edited

In [37]:
from langchain.agents import Tool

tools = [
    Tool(
        name='Knowledge Base',
        func=qa.run,
        description=(
            'use this tool when answering general knowledge queries to get '
            'more information about the topic'
        )
    )
]

In [38]:
from langchain.agents import initialize_agent

agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conversational_memory
)

  agent = initialize_agent(


In [39]:
agent("Who si Rachel Gugenheim?")

  agent("Who si Rachel Gugenheim?")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "Rachel Gugenheim"
}
```[0m
Observation: [36;1m[1;3mRachel Gugenheim (1772-1843) était l'épouse d'Isaac, souvent appelé Louis, et ils ont eu six enfants ensemble.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Rachel Gugenheim (1772-1843) was the wife of Isaac, often referred to as Louis, and they had six children together."
}
```[0m

[1m> Finished chain.[0m


{'input': 'Who si Rachel Gugenheim?',
 'chat_history': [],
 'output': 'Rachel Gugenheim (1772-1843) was the wife of Isaac, often referred to as Louis, and they had six children together.'}

In [40]:
agent("What can you tell me about her husband?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "Isaac Gugenheim, husband of Rachel Gugenheim"
}
```[0m
Observation: [36;1m[1;3mIsaac Gugenheim, souvent appelé Louis, est l'ancêtre direct né en 1766 à Froeningen. Il était marié à Rachel Gugenheim, qui a vécu de 1772 à 1843. Ils ont eu six enfants ensemble.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Isaac Gugenheim, often referred to as Louis, was born in 1766 in Froeningen. He was married to Rachel Gugenheim, who lived from 1772 to 1843, and they had six children together."
}
```[0m

[1m> Finished chain.[0m


{'input': 'What can you tell me about her husband?',
 'chat_history': [HumanMessage(content='Who si Rachel Gugenheim?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Rachel Gugenheim (1772-1843) was the wife of Isaac, often referred to as Louis, and they had six children together.', additional_kwargs={}, response_metadata={})],
 'output': 'Isaac Gugenheim, often referred to as Louis, was born in 1766 in Froeningen. He was married to Rachel Gugenheim, who lived from 1772 to 1843, and they had six children together.'}

In [41]:
agent("What's the common point between Jacques Dreyfus and Sleeping Beauty?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "common point between Jacques Dreyfus and Sleeping Beauty"
}
```[0m
Observation: [36;1m[1;3mThe common point between Jacques Dreyfus and Sleeping Beauty is that both involve themes of waiting and transformation. In "Sleeping Beauty," the princess is in a deep sleep for a long period, awaiting a transformative awakening. Similarly, Jacques Dreyfus, as a traveling merchant, may have experienced periods of waiting during his journeys, and his life story reflects a transformation through his role in the family and community. Both narratives highlight the passage of time and the eventual change that comes with it.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "The common point between Jacques Dreyfus and Sleeping Beauty is that both involve themes of waiting and transformation. In 'Sleeping Beauty,' the princess is in a deep sle

{'input': "What's the common point between Jacques Dreyfus and Sleeping Beauty?",
 'chat_history': [HumanMessage(content='Who si Rachel Gugenheim?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Rachel Gugenheim (1772-1843) was the wife of Isaac, often referred to as Louis, and they had six children together.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='What can you tell me about her husband?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Isaac Gugenheim, often referred to as Louis, was born in 1766 in Froeningen. He was married to Rachel Gugenheim, who lived from 1772 to 1843, and they had six children together.', additional_kwargs={}, response_metadata={})],
 'output': "The common point between Jacques Dreyfus and Sleeping Beauty is that both involve themes of waiting and transformation. In 'Sleeping Beauty,' the princess is in a deep sleep for a long period, awaiting a transformative awakening. Similarly, Jacques Dre

In [42]:
agent("Tell me all you know about Jacques Dreyfus.")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Knowledge Base",
    "action_input": "Jacques Dreyfus"
}
```[0m
Observation: [36;1m[1;3mJacques Dreyfus, né le 9 janvier 1829, était un colporteur de tissus qui a joué un rôle significatif dans sa famille. Il a épousé Catherine Lévy en 1860 et est décédé en 1893. Il était l'un des neuf enfants de Lehmann (Clément) et Athelle Bloch.[0m
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Jacques Dreyfus, born on January 9, 1829, was a significant figure as a traveling fabric merchant. He married Catherine Lévy in 1860 and passed away in 1893. He was one of nine children of Lehmann (Clément) and Athelle Bloch."
}
```[0m

[1m> Finished chain.[0m


{'input': 'Tell me all you know about Jacques Dreyfus.',
 'chat_history': [HumanMessage(content='Who si Rachel Gugenheim?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Rachel Gugenheim (1772-1843) was the wife of Isaac, often referred to as Louis, and they had six children together.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='What can you tell me about her husband?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Isaac Gugenheim, often referred to as Louis, was born in 1766 in Froeningen. He was married to Rachel Gugenheim, who lived from 1772 to 1843, and they had six children together.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content="What's the common point between Jacques Dreyfus and Sleeping Beauty?", additional_kwargs={}, response_metadata={}),
  AIMessage(content="The common point between Jacques Dreyfus and Sleeping Beauty is that both involve themes of waiting and transformation. In 'Sleep