In [34]:
# Docs : https://weaviate.io/developers/weaviate/client-libraries/python

In [1]:
# Run `docker-compose up -d` for local deployment

In [2]:
import os
import sys
sys.path.append("../..")
from utils import load_documents,split_documents,calculate_chunk_ids

import weaviate

In [3]:
EMBEDDING_MODEL = "text-embedding-ada-002"

In [4]:
from dotenv import load_dotenv
load_dotenv()

# # Option #1 - Self-hosted - Weaviate Open Source 
# client = weaviate.Client(
#     url="http://localhost:8080",
#     additional_headers={
#         "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")
#     }
# )
WEAVIATE_ADMIN_BEARER_TOKEN = os.getenv("WEAVIATE_ADMIN")
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
## Option #2 - SaaS - (Weaviate Cloud Service)
client = weaviate.Client(
    url= WEAVIATE_URL, # Expires in 14 days : Free Tier
    auth_client_secret=weaviate.auth.AuthApiKey(api_key=WEAVIATE_ADMIN_BEARER_TOKEN),
    additional_headers={
        "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")
    }
)

            your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.

            For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
            For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration
            


In [5]:
client.is_ready()

True

In [6]:
# Clear up the schema, so that we can recreate it
client.schema.delete_all()
client.schema.get()

{'classes': []}

In [7]:
chunk_schema = {
    "class": "Chunk",
    "description": "A collection of context and related embeddings",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
          "model": "ada",
          "modelVersion": "002",
          "type": "text"
        }
    },
    "properties": [
    {
        "name": "context",
        "description": "Context chunk from a pdf",
        "dataType": ["text"],
        "moduleConfig": { 
            "text2vec-openai": { 
                "skip": True 
            } 
        }
    }]
}

# add the Article schema
client.schema.create_class(chunk_schema)

# get the schema to make sure it worked
client.schema.get()

{'classes': [{'class': 'Chunk',
   'description': 'A collection of context and related embeddings',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'text2vec-openai': {'baseURL': 'https://api.openai.com',
     'model': 'ada',
     'modelVersion': '002',
     'type': 'text',
     'vectorizeClassName': True}},
   'multiTenancyConfig': {'autoTenantActivation': False,
    'autoTenantCreation': False,
    'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': 'Context chunk from a pdf',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-openai': {'skip': True,
       'vectorizePropertyName': False}},
     'name': 'context',
     'tokenization': 'word'}],
   'replicationConfig': {'factor': 1},
   'shardingConfig': {'actualCount': 1,
    'actualVirtualCount': 128,
    'desiredCount': 1,
    '

In [8]:
### Step 1 - configure Weaviate Batch, which optimizes CRUD operations in bulk
# - starting batch size of 100
# - dynamically increase/decrease based on performance
# - add timeout retries if something goes wrong

client.batch.configure(
    batch_size=5,
    dynamic=True,
    timeout_retries=3,
)

<weaviate.batch.crud_batch.Batch at 0x7c1cd5a19c90>

## Preprocessing the pdfs

In [9]:
documents = load_documents()

In [10]:
len(documents)

164

In [11]:
chunks = split_documents(documents)

In [12]:
chunks[0].metadata,chunks[0].page_content

({'source': '../../data/Galapagos/pdfs/CnE_GUI-CE-027 Guidance on Declaration of Interest (1).pdf',
  'page': 0},
 'Do you need to\n(Also known as a Conflict of Interest)DECLARE AN INTEREST?\nYou may need to declare an interest where a potential conflict arises…\nA potential conflict of interest just means that your personal interests could conflict with your role and decisions at \nGalapagos. \nIt doesn’t mean that there is an actual conflict or that you’ve done anything wrong. In fact, conflicts are usually a \nresult of good things — like having good relationships through friendships or investments. \nBut they could also be perceived by someone else as impacting your judgment, or could harm the trust between \ncolleagues, and that’s why we need to be aware of them so we can take any steps we need to manage them. \nHere are some examples…\nWhat should I do if I think I might need to declare an interest?')

In [13]:
chunks = calculate_chunk_ids(chunks)

In [14]:
chunks[0].metadata['id']

'../../data/Galapagos/pdfs/CnE_GUI-CE-027 Guidance on Declaration of Interest (1).pdf:0:0'

## Uploading the data

In [15]:
from dotenv import load_dotenv
load_dotenv()
import openai

# Load environment variables
OPENAI_ORG = os.getenv('OPENAI_ORG')
OPENAI_APIKEY = os.getenv('OPENAI_APIKEY')

openai.organization = OPENAI_ORG
openai.api_key = OPENAI_APIKEY


openai_client = openai.OpenAI(api_key=OPENAI_APIKEY)
def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai_client.embeddings.create(input = [text], model=model).data[0].embedding


In [16]:
# Uploading data with vectors to Chunk schema
counter=0

with client.batch as batch:
    for chunk in chunks:
        chunk_id = chunk.metadata['id']
        chunk_content = chunk.page_content
        properties = {
            "context": chunk_content
        }
        
        vector = get_embedding(chunk_content)
        
        batch.add_data_object(properties, "Chunk", None, vector)
        counter = counter+1

In [17]:
assert counter == len(chunks)
print(f"Importing ({len(chunks)}) chunks complete") 

Importing (494) chunks complete


In [18]:
result = (
    client.query.aggregate("Chunk")
    .with_fields("meta { count }")
    .do()
)
print("Object count: ", result["data"]["Aggregate"]["Chunk"])

Object count:  [{'meta': {'count': 494}}]


In [19]:
# Test one article has worked by checking one object
test_article = (
    client.query
    .get("Chunk", ["context", "_additional {id}"])
    .with_limit(1)
    .do()
)["data"]["Get"]["Chunk"][0]

print(test_article["_additional"]["id"])
print(test_article["context"])

000ca20d-e85c-406d-8223-92e5e3537901
. 
The individual circumstances will determine whether a conflict actually exists and how this is best managed. If you consider any other Personal Interest to be of relevance to Galapagos, this should be declared even if not listed above.  
2.3 Makin g a Declaration of Personal Interest  
Anyone within Galapagos who identifies a relevant Personal Interest should make a Declaration of Personal Interest. This will enable an objective assessment of the 
situation to be undertaken and, where necessary, steps to be put in place to manage any 
arising conflict. This should be done for existing situations when you join Galapagos but also for any new circumstances which arise during your relationship with Galapagos.


In [20]:
def query_weaviate(query, collection_name, top_k=5):

    # Creates embedding vector from user query
    embedded_query = get_embedding(query)
    
    near_vector = {"vector": embedded_query}

    # Queries input schema with vectorised user query
    query_result = (
        client.query
        .get(collection_name, ["context", "_additional {certainty distance}"])
        .with_near_vector(near_vector)
        .with_limit(top_k)
        .do()
    )
    
    return query_result


#  Possibility to "Let Weaviate handle vector embeddings"

def near_text_weaviate(query, collection_name):
    
    nearText = {
        "concepts": [query],
        "distance": 0.7,
    }

    properties = [
        "title", "context",
        "_additional {certainty distance}"
    ]

    query_result = (
        client.query
        .get(collection_name, properties)
        .with_near_text(nearText)
        .with_limit(20)
        .do()
    )["data"]["Get"][collection_name]
    
    print (f"Objects returned: {len(query_result)}")
    
    return query_result

In [21]:
query_result = query_weaviate("How to behave during a meeting?", "Chunk")

In [22]:
import json
with open("result.json","w") as f:
    json.dump(query_result,f)

In [23]:
def get_combined_results(query_result):
    combined_result = ''
    for result in query_result['data']['Get']['Chunk']:
        combined_result += result['context'] +"\n\n"
    return combined_result

In [24]:
related_context = get_combined_results(query_result)

In [25]:
related_context

'|   1ATTENDEES - Best meeting practices\n•Decide if you are joining the meeting\n\uf0fcRSVP (respond) to the meeting organizer as soon as possible\n•Prepare: read the agenda and the pre- work\n\uf0fccheck tasks from last meeting, prepare insights and /or questions\n•Be on time\n•Be present, participate and avoid distractions\n\uf0fcturn off notifications, put away your phone\n•Make sure you are heard, even if your opinion is less popular \n•Reflect: did I participate, and did I share my opinion?\n•Provide feedback to the meeting organizer for potential improvement\n•Read and review the minutes \n•Check the follow -up tasks and act on yours\nBefore\nDuring\nAfter\n\n|   1ORGANIZERS - Best meeting practices\n•Decide if you really need the meeting and select correct meeting type\n•Define a clear and timed agenda, including the meeting objective\n•Use the Outlook Scheduling Assistant to plan at an appropriate time\n•Carefully choose the attendees, share the agenda and materials in advance

## Refining Output

In [26]:
USER_QUERY = "How to behave during a meeting?"
RETREIVER_K = 5

In [27]:
PROMPT_USER_TEMPLATE = """
Answer the question based only on the following context:

Question :
{question}


Context :
{context}

"""

SYSTEM_PROMPT = """
You are an ICF MCC certified coach who has a lot of experience with life coaching.
You are give certain context and a question. Use the context and output an answer that is precise and clear.

"""

In [28]:
prompt_query = PROMPT_USER_TEMPLATE.format(context=related_context, question=USER_QUERY)

In [29]:
MODEL = "gpt-4o"
response = openai_client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt_query}
    ],
)
assistant_message = response.choices[0].message.content
tokens = response.usage.total_tokens
assistant_message

'During a meeting, you should:\n\n- Be on time.\n- Be present, participate, and avoid distractions (e.g., turn off notifications, put away your phone).\n- Make sure to share your opinions, even if they are less popular.\n- Reflect on your participation and whether you shared your opinion.\n- Provide feedback to the meeting organizer for potential improvement.'

In [30]:
related_context

'|   1ATTENDEES - Best meeting practices\n•Decide if you are joining the meeting\n\uf0fcRSVP (respond) to the meeting organizer as soon as possible\n•Prepare: read the agenda and the pre- work\n\uf0fccheck tasks from last meeting, prepare insights and /or questions\n•Be on time\n•Be present, participate and avoid distractions\n\uf0fcturn off notifications, put away your phone\n•Make sure you are heard, even if your opinion is less popular \n•Reflect: did I participate, and did I share my opinion?\n•Provide feedback to the meeting organizer for potential improvement\n•Read and review the minutes \n•Check the follow -up tasks and act on yours\nBefore\nDuring\nAfter\n\n|   1ORGANIZERS - Best meeting practices\n•Decide if you really need the meeting and select correct meeting type\n•Define a clear and timed agenda, including the meeting objective\n•Use the Outlook Scheduling Assistant to plan at an appropriate time\n•Carefully choose the attendees, share the agenda and materials in advance

In [31]:
""" 
'|   1ATTENDEES - Best meeting practices
•Decide if you are joining the meeting
\uf0fcRSVP (respond) to the meeting organizer as soon as possible
•Prepare: read the agenda and the pre- work
\uf0fccheck tasks from last meeting, prepare insights and /or questions
•Be on time
•Be present, participate and avoid distractions
\uf0fcturn off notifications, put away your phone
•Make sure you are heard, even if your opinion is less popular 
•Reflect: did I participate, and did I share my opinion?
•Provide feedback to the meeting organizer for potential improvement
•Read and review the minutes 
•Check the follow -up tasks and act on yours
Before
During
After

|   1ORGANIZERS - Best meeting practices
•Decide if you really need the meeting and select correct meeting type
•Define a clear and timed agenda, including the meeting objective
•Use the Outlook Scheduling Assistant to plan at an appropriate time
•Carefully choose the attendees, share the agenda and materials in advance
•Start by introducing people, stating the objective, and showing the agenda
•Assign note -taker and timekeeper, but record tasks yourself
•Moderate and ensure a safe space for everyone to get the word
•End meeting by summarizing the follow -up tasks for all attendees
•Share with all attendees the follow -up tasks and the meeting minutes
•Reflect: was the meeting goal achieved, and was everyone heard?
•Ensure follow -up and plan next meeting if necessary
Before
During
After

Tips for engaging and inclusive meetings
☑Build trust and encourage participation
|   1•assign minutes -taker and time -keeper roles
•send the agenda (and pre -work) to the attendees in advance
•keep it focused and engaging
\uf0d8be well -prepared to retain attendees’ attention 
\uf0d8keep the meeting as short as possible
\uf0d8use polls, ratings, and whiteboards
\uf0d8establish ground rules at the start of the meeting
•has everyone’s point of view been heard?
\uf0d8ask questions to prompt discussion and listen actively

Time Management tips for Effective Meetings
☑Respect other people’s time
|   1•respond to meeting invites as soon as you receive them
•reschedule timely to resolve meeting conflicts
•don’t be afraid to politely decline a meeting
(are you needed there? ask for an agenda, align expectations)
☑Save yourself some time
•share your calendar titles with your functional and TA teams
•book well in advance long, recurring, and team meetings
•respect time -zones and working hours, yours included!

Calendar Man agement tips for Effective Meetings
|   1☑Organ ize your calendar
•state your working hours (e.g. 09:00-1 7:00)
•change default meeting duration to 25’ instead of 30’
•book lunch and travel time as OOO
•book focus time as ‘tentative’ to avoid blocking your entire agenda
☑Organi ze y our calenda r for OOO days
•add public holidays for all major GLPG sites (single action)
•enter your vacation on Outlook well in advance
•reschedule (or decline) meetings happening during your absence

'
"""

" \n'|   1ATTENDEES - Best meeting practices\n•Decide if you are joining the meeting\n\uf0fcRSVP (respond) to the meeting organizer as soon as possible\n•Prepare: read the agenda and the pre- work\n\uf0fccheck tasks from last meeting, prepare insights and /or questions\n•Be on time\n•Be present, participate and avoid distractions\n\uf0fcturn off notifications, put away your phone\n•Make sure you are heard, even if your opinion is less popular \n•Reflect: did I participate, and did I share my opinion?\n•Provide feedback to the meeting organizer for potential improvement\n•Read and review the minutes \n•Check the follow -up tasks and act on yours\nBefore\nDuring\nAfter\n\n|   1ORGANIZERS - Best meeting practices\n•Decide if you really need the meeting and select correct meeting type\n•Define a clear and timed agenda, including the meeting objective\n•Use the Outlook Scheduling Assistant to plan at an appropriate time\n•Carefully choose the attendees, share the agenda and materials in adv

In [32]:
assistant_message

'During a meeting, you should:\n\n- Be on time.\n- Be present, participate, and avoid distractions (e.g., turn off notifications, put away your phone).\n- Make sure to share your opinions, even if they are less popular.\n- Reflect on your participation and whether you shared your opinion.\n- Provide feedback to the meeting organizer for potential improvement.'

In [33]:
""" 
'During a meeting, you should:

1. Be on time.
2. Be present and actively participate, avoiding distractions such as phone notifications.
3. Ensure you are heard, even if your opinions are less popular.
4. Reflect on your participation and whether you shared your opinion.
5. Provide feedback to the meeting organizer for potential improvements.
'
"""

" \n'During a meeting, you should:\n\n1. Be on time.\n2. Be present and actively participate, avoiding distractions such as phone notifications.\n3. Ensure you are heard, even if your opinions are less popular.\n4. Reflect on your participation and whether you shared your opinion.\n5. Provide feedback to the meeting organizer for potential improvements.\n'\n"