In [1]:
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from get_embedding_function import get_embedding_function
from langchain.vectorstores.chroma import Chroma

In [2]:
# Chunking parameters handed to RecursiveCharacterTextSplitter in split_documents.
# Lengths are measured with len(), i.e. in characters.
CHUNK_SIZE=800
CHUNK_OVERLAP=80

# Directory that PyPDFDirectoryLoader reads in load_documents.
DATA_PATH = "data/Galapagos/pdfs"

# Interpolated into the comment of the RChunk.embedding column as "hnsw(distance=...)".
SIMILARITY_METRIC = "cosine" # options noted by the author: l1 / l2 / cosine / negative inner product

## Populate the database using the PDFs

In [3]:
def load_documents():
    """Load everything PyPDFDirectoryLoader finds under DATA_PATH.

    Returns:
        The list produced by the loader's ``load()`` call. The notebook output
        shows each entry carrying ``source`` and ``page`` metadata.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

def split_documents(documents: list[Document]):
    """Split documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split (as returned by ``load_documents``).

    Returns:
        The chunks returned by the splitter's ``split_documents`` call.
    """
    # Sizes are measured with len(), so CHUNK_SIZE / CHUNK_OVERLAP are in characters.
    splitter_options = dict(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        is_separator_regex=False,
    )
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)


def calculate_chunk_ids(chunks):
    """Stamp every chunk with an ``id`` metadata entry, in place.

    IDs have the shape ``"<source>:<page>:<index>"``, for example
    ``"data/monopoly.pdf:6:2"``. ``index`` counts consecutive chunks that share
    the same source and page, starting at 0; it restarts whenever the
    ``source:page`` pair differs from the previous chunk's.

    Args:
        chunks: Iterable of objects exposing a dict-like ``metadata`` attribute.
            Missing ``source`` / ``page`` keys are rendered as ``None``.

    Returns:
        The same ``chunks`` object that was passed in.
    """
    previous_key = None
    position = 0

    for item in chunks:
        meta = item.metadata
        page_key = "{}:{}".format(meta.get("source"), meta.get("page"))

        # Same page as the previous chunk -> next index; otherwise start over.
        position = position + 1 if page_key == previous_key else 0

        meta["id"] = f"{page_key}:{position}"
        previous_key = page_key

    return chunks


In [4]:
documents = load_documents()

In [5]:
len(documents)

164

In [6]:
chunks = split_documents(documents)

In [7]:
len(chunks)

494

In [8]:
chunks[0].metadata

{'source': 'data/Galapagos/pdfs/CnE_GUI-CE-027 Guidance on Declaration of Interest (1).pdf',
 'page': 0}

In [9]:
chunks[0].page_content

'Do you need to\n(Also known as a Conflict of Interest)DECLARE AN INTEREST?\nYou may need to declare an interest where a potential conflict arises…\nA potential conflict of interest just means that your personal interests could conflict with your role and decisions at \nGalapagos. \nIt doesn’t mean that there is an actual conflict or that you’ve done anything wrong. In fact, conflicts are usually a \nresult of good things — like having good relationships through friendships or investments. \nBut they could also be perceived by someone else as impacting your judgment, or could harm the trust between \ncolleagues, and that’s why we need to be aware of them so we can take any steps we need to manage them. \nHere are some examples…\nWhat should I do if I think I might need to declare an interest?'

In [10]:
# Connect to DB

import os
import dotenv

from sqlalchemy import Column, Integer, create_engine, Text
from sqlalchemy.orm import declarative_base, Session
from tidb_vector.sqlalchemy import VectorType

from sqlalchemy import URL

# Load variables from a .env file before the os.environ lookups below.
dotenv.load_dotenv()

# Step 1: Connect to TiDB using SQLAlchemy.
# Username, password and host are read with os.environ[...], so a missing
# variable raises KeyError in this cell. Port and database are hard-coded.
# NOTE(review): SQLAlchemy documents URL.create() as the public constructor;
# if switching, confirm how the boolean query values below must be spelled.
tidb_connection_string = URL(
    "mysql+pymysql",
    username=os.environ['TIDB_USERNAME'],
    password=os.environ['TIDB_PASSWORD'],
    host=os.environ['TIDB_HOST'],
    port=4000,
    database="test",
    query={"ssl_verify_cert": True, "ssl_verify_identity": True},
)
engine = create_engine(tidb_connection_string)

# Step 2: Create the declarative base for the table class defined in the next
# cell, and a single module-level session that later cells share.
Base = declarative_base()
session = Session(engine)

# Issue : https://stackoverflow.com/questions/454854/no-module-named-mysqldb

In [11]:
# Create a demo class `RChunk` that stores each chunk's content and its indexed embedding

class RChunk(Base):
    """ORM mapping for the ``r_chunk`` table: one row per stored text chunk."""

    __tablename__ = "r_chunk"

    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # Chunk identifier stored as text.
    chunk_id = Column(Text)
    # Chunk text.
    content = Column(Text)
    # 1536-dimensional vector column; presumably sized for the embedding model
    # used elsewhere in this notebook -- confirm. The column comment carries
    # the configured SIMILARITY_METRIC.
    embedding = Column(
        VectorType(1536),
        comment=f"hnsw(distance={SIMILARITY_METRIC})",
    )

In [12]:
# Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

In [13]:
# Get text and embeddings
EMBEDDING_MODEL_ENDPOINT = "https://api.openai.com/v1/embeddings"
import os
import openai
from dotenv import load_dotenv
load_dotenv()


# Load environment variables.
# os.getenv returns None for an unset variable, so nothing fails on these two lines.
OPENAI_ORG = os.getenv('OPENAI_ORG')
OPENAI_APIKEY = os.getenv('OPENAI_APIKEY')

# Set the module-level attributes of the openai package.
openai.organization = OPENAI_ORG
openai.api_key = OPENAI_APIKEY


# Client used by get_embedding; only the API key is passed to it explicitly.
openai_client = openai.OpenAI(api_key=OPENAI_APIKEY)
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: String to embed; newlines are replaced with spaces before sending.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = text.replace("\n", " ")
    api_response = openai_client.embeddings.create(input=[single_line], model=model)
    first_item = api_response.data[0]
    return first_item.embedding

In [14]:
def add_to_db(chunks: list[Document]):
    """Embed and insert the chunks whose IDs are not yet stored in ``r_chunk``.

    Each chunk gets an ID from ``calculate_chunk_ids``; chunks whose ID already
    appears in ``RChunk.chunk_id`` are skipped. The remaining chunks are
    embedded with ``get_embedding`` and added through the module-level
    ``session``, then committed.

    Args:
        chunks: Chunks to store; their ``metadata`` is updated in place with
            an ``id`` entry.

    Raises:
        Exception: Anything raised while embedding or inserting is re-raised
            after the session has been rolled back, so a half-built batch is
            never flushed by a later commit.
    """
    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # The query yields one-element rows, not strings. Unpack them into a set so
    # the membership test below compares string to string (and is O(1)).
    existing_ids = {row[0] for row in session.query(RChunk.chunk_id).all()}

    # Only add documents that don't exist in the DB.
    new_chunks = [
        chunk for chunk in chunks_with_ids
        if chunk.metadata["id"] not in existing_ids
    ]

    try:
        if new_chunks:
            print(f"👉 Adding new documents: {len(new_chunks)}")
            for chunk in new_chunks:
                session.add(
                    RChunk(
                        chunk_id=chunk.metadata['id'],
                        content=chunk.page_content,
                        embedding=get_embedding(chunk.page_content),
                    )
                )
        else:
            print("✅ No new documents to add")
        session.commit()
    except Exception:
        session.rollback()
        raise


In [15]:
# add_to_db(chunks=chunks) # Uncomment this if no db setup in 'tidb'
# Might require modification to detect updates to an already-stored document, or maybe to replace the document under a new name

## Query Database

In [16]:
USER_QUERY = "How to behave during a meeting?"

In [17]:
user_query_embedding = get_embedding(USER_QUERY)

In [18]:
# Get the 5 nearest-neighbour chunks, ordered by cosine distance to the query embedding
distance1 = RChunk.embedding.cosine_distance(user_query_embedding).label('distance')
results1 = session.query(RChunk, distance1).order_by(distance1).limit(5).all()

# Each result row is (RChunk, distance). A separate loop name keeps the SQL
# expression `distance1` from being overwritten by a plain float.
for doc, doc_distance in results1:
    print(f'  - distance: {doc_distance}\n'
            f'    document: {doc.content}')

  - distance: 0.13562019500389022
    document: |   1ATTENDEES - Best meeting practices
•Decide if you are joining the meeting
RSVP (respond) to the meeting organizer as soon as possible
•Prepare: read the agenda and the pre- work
check tasks from last meeting, prepare insights and /or questions
•Be on time
•Be present, participate and avoid distractions
turn off notifications, put away your phone
•Make sure you are heard, even if your opinion is less popular 
•Reflect: did I participate, and did I share my opinion?
•Provide feedback to the meeting organizer for potential improvement
•Read and review the minutes 
•Check the follow -up tasks and act on yours
Before
During
After
  - distance: 0.14838117168087994
    document: |   1ORGANIZERS - Best meeting practices
•Decide if you really need the meeting and select correct meeting type
•Define a clear and timed agenda, including the meeting objective
•Use the Outlook Scheduling Assistant to plan at an appropriate time
•Carefully choos

In [19]:
# Get documents within a certain distance.
MAX_COSINE_DISTANCE = 0.2  # rows at or beyond this distance are filtered out

distance2 = RChunk.embedding.cosine_distance(user_query_embedding).label('distance')
results2 = session.query(
    RChunk, distance2
).filter(distance2 < MAX_COSINE_DISTANCE).order_by(distance2).limit(5).all()

# Each result row is (RChunk, distance). A separate loop name keeps the SQL
# expression `distance2` from being overwritten by a plain float.
for doc, doc_distance in results2:
    print(f'  - distance: {doc_distance}\n'
            f'    document: {doc.content}')


  - distance: 0.13562019500389022
    document: |   1ATTENDEES - Best meeting practices
•Decide if you are joining the meeting
RSVP (respond) to the meeting organizer as soon as possible
•Prepare: read the agenda and the pre- work
check tasks from last meeting, prepare insights and /or questions
•Be on time
•Be present, participate and avoid distractions
turn off notifications, put away your phone
•Make sure you are heard, even if your opinion is less popular 
•Reflect: did I participate, and did I share my opinion?
•Provide feedback to the meeting organizer for potential improvement
•Read and review the minutes 
•Check the follow -up tasks and act on yours
Before
During
After
  - distance: 0.14838117168087994
    document: |   1ORGANIZERS - Best meeting practices
•Decide if you really need the meeting and select correct meeting type
•Define a clear and timed agenda, including the meeting objective
•Use the Outlook Scheduling Assistant to plan at an appropriate time
•Carefully choos

In [20]:
# Concatenate the retrieved chunk texts (no separator, in result order) into
# one context string per query. Each result row is (RChunk, distance); the
# distance is not needed here. str.join avoids repeated string reallocation
# and leaves no loop variables behind in the notebook namespace.
results1_content = "".join(row[0].content for row in results1)

results2_content = "".join(row[0].content for row in results2)

In [21]:
results1_content

'|   1ATTENDEES - Best meeting practices\n•Decide if you are joining the meeting\n\uf0fcRSVP (respond) to the meeting organizer as soon as possible\n•Prepare: read the agenda and the pre- work\n\uf0fccheck tasks from last meeting, prepare insights and /or questions\n•Be on time\n•Be present, participate and avoid distractions\n\uf0fcturn off notifications, put away your phone\n•Make sure you are heard, even if your opinion is less popular \n•Reflect: did I participate, and did I share my opinion?\n•Provide feedback to the meeting organizer for potential improvement\n•Read and review the minutes \n•Check the follow -up tasks and act on yours\nBefore\nDuring\nAfter|   1ORGANIZERS - Best meeting practices\n•Decide if you really need the meeting and select correct meeting type\n•Define a clear and timed agenda, including the meeting objective\n•Use the Outlook Scheduling Assistant to plan at an appropriate time\n•Carefully choose the attendees, share the agenda and materials in advance\n•S

In [22]:
results2_content

'|   1ATTENDEES - Best meeting practices\n•Decide if you are joining the meeting\n\uf0fcRSVP (respond) to the meeting organizer as soon as possible\n•Prepare: read the agenda and the pre- work\n\uf0fccheck tasks from last meeting, prepare insights and /or questions\n•Be on time\n•Be present, participate and avoid distractions\n\uf0fcturn off notifications, put away your phone\n•Make sure you are heard, even if your opinion is less popular \n•Reflect: did I participate, and did I share my opinion?\n•Provide feedback to the meeting organizer for potential improvement\n•Read and review the minutes \n•Check the follow -up tasks and act on yours\nBefore\nDuring\nAfter|   1ORGANIZERS - Best meeting practices\n•Decide if you really need the meeting and select correct meeting type\n•Define a clear and timed agenda, including the meeting objective\n•Use the Outlook Scheduling Assistant to plan at an appropriate time\n•Carefully choose the attendees, share the agenda and materials in advance\n•S

## Refining Output

In [23]:
# User-message template. The retrieved context comes directly after the
# sentence that announces it, and the question follows, clearly separated.
# Fill it with .format(context=..., question=...).
PROMPT_USER_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Question: {question}
"""

# System message sent with every chat completion request.
SYSTEM_PROMPT = """
You are an ICF MCC certified coach who has a lot of experience with life coaching.
You are given certain context and a question. Use the context and output an answer that is precise and clear.

"""

In [24]:
formatted_content1 = PROMPT_USER_TEMPLATE.format(context=results1_content, question=USER_QUERY)
formatted_content2 = PROMPT_USER_TEMPLATE.format(context=results2_content, question=USER_QUERY)

In [25]:
# Ask the chat model to answer using the context built from results1.
MODEL = "gpt-4o"
client = openai.OpenAI(api_key=os.environ['OPENAI_APIKEY'])
response = client.chat.completions.create(
    messages=[
        dict(role="system", content=SYSTEM_PROMPT),
        dict(role="user", content=formatted_content1),
    ],
    model=MODEL,
)
# Keep the reply text and the total token count reported for the request.
assistant_message = response.choices[0].message.content
tokens = response.usage.total_tokens
assistant_message

'During a meeting, you should:\n\n- Be on time.\n- Be present and participate actively.\n- Avoid distractions by turning off notifications and putting away your phone.\n- Ensure your opinions are heard, even if they are less popular.\n- Reflect on your participation and whether you shared your opinion.\n- Provide feedback to the meeting organizer for potential improvement.'

In [26]:
# Same request as the previous cell, but with the distance-filtered context (results2).
response = client.chat.completions.create(
    messages=[
        dict(role="system", content=SYSTEM_PROMPT),
        dict(role="user", content=formatted_content2),
    ],
    model=MODEL,
)
# Keep the reply text and the total token count reported for the request.
assistant_message = response.choices[0].message.content
tokens = response.usage.total_tokens
assistant_message

"During a meeting, your behavior should align with the following practices:\n\n- Be on time.\n- Be present and participate actively while avoiding distractions (e.g., turn off notifications and put away your phone).\n- Ensure your opinions are heard, even if they are less popular.\n- Reflect on your participation and whether you've shared your opinion effectively.\n- Provide feedback to the meeting organizer for potential improvement."