In [None]:
# Docs : https://weaviate.io/developers/weaviate/client-libraries/python

In [None]:
# Run `docker-compose up -d` for local deployment

In [2]:
import os
import sys
sys.path.append("../..")
from utils import load_documents,split_documents,calculate_chunk_ids

import weaviate

In [3]:
EMBEDDING_MODEL = "text-embedding-ada-002"

In [4]:
from dotenv import load_dotenv
load_dotenv()

# v3
# # Option #1 - Self-hosted - Weaviate Open Source 
# client = weaviate.Client(
#     url="http://localhost:8080",
#     additional_headers={
#         "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")
#     }
# )

# v4
client = weaviate.connect_to_local(
    host="localhost",
    port=8080,
) 

# WEAVIATE_ADMIN_BEARER_TOKEN = os.getenv("WEAVIATE_ADMIN")
# WEAVIATE_URL = os.getenv("WEAVIATE_URL")
# ## Option #2 - SaaS - (Weaviate Cloud Service)
# client = weaviate.Client(
#     url= WEAVIATE_URL, # Expires in 14 days : Free Tier
#     auth_client_secret=weaviate.auth.AuthApiKey(api_key=WEAVIATE_ADMIN_BEARER_TOKEN),
#     additional_headers={
#         "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")
#     }
# )

I0000 00:00:1721788533.744750   13998 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


In [5]:
client.is_ready()

True

In [6]:
chunk_schema = {
    "class": "Chunk",
    "description": "A collection of context and related embeddings",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
          "model": "ada",
          "modelVersion": "002",
          "type": "text"
        }
    },
    "properties": [
    {
        "name": "context",
        "description": "Context chunk from a pdf",
        "dataType": ["text"],
        "moduleConfig": { 
            "text2vec-openai": { 
                "skip": True 
            } 
        }
    }]
}

## Preprocessing the pdfs

In [7]:
documents = load_documents()


In [8]:
len(documents)

164

In [9]:
chunks = split_documents(documents)

In [11]:
chunks[0].metadata,chunks[0].page_content

({'source': '../../data/Galapagos/pdfs/CnE_GUI-CE-027 Guidance on Declaration of Interest (1).pdf',
  'page': 0},
 'Do you need to\n(Also known as a Conflict of Interest)DECLARE AN INTEREST?\nYou may need to declare an interest where a potential conflict arises…\nA potential conflict of interest just means that your personal interests could conflict with your role and decisions at \nGalapagos. \nIt doesn’t mean that there is an actual conflict or that you’ve done anything wrong. In fact, conflicts are usually a \nresult of good things — like having good relationships through friendships or investments. \nBut they could also be perceived by someone else as impacting your judgment, or could harm the trust between \ncolleagues, and that’s why we need to be aware of them so we can take any steps we need to manage them. \nHere are some examples…\nWhat should I do if I think I might need to declare an interest?')

In [12]:
chunks = calculate_chunk_ids(chunks)

In [13]:
chunks[0].metadata['id']

'../../data/Galapagos/pdfs/CnE_GUI-CE-027 Guidance on Declaration of Interest (1).pdf:0:0'

## Uploading the data

In [14]:
from dotenv import load_dotenv
load_dotenv()
import openai

# Load environment variables
OPENAI_ORG = os.getenv('OPENAI_ORG')
OPENAI_APIKEY = os.getenv('OPENAI_APIKEY')

openai.organization = OPENAI_ORG
openai.api_key = OPENAI_APIKEY


openai_client = openai.OpenAI(api_key=OPENAI_APIKEY)
def get_embedding(text, model="text-embedding-ada-002"):
   text = text.replace("\n", " ")
   return openai_client.embeddings.create(input = [text], model=model).data[0].embedding


In [16]:
# Uploading data with vectors to Chunk schema
counter=0
import uuid
with client.batch.dynamic() as batch:
    initial_uuid = None
    latest_uuid = None
    for chunk in chunks:
        chunk_id = chunk.metadata['id']
        chunk_content = chunk.page_content
        properties = {
            "context": chunk_content
        }
        _uuid = uuid.uuid4()
        if not initial_uuid:
            initial_uuid = _uuid
        latest_uuid = _uuid
        vector = get_embedding(chunk_content)
        batch.add_object(properties={
            chunk_schema['properties'][0]['name'] : chunk_content,
            },
            collection=chunk_schema['class'],
            uuid=_uuid
        )
        counter = counter+1
    batch.add_reference(from_collection=chunk_schema['class'], from_uuid=initial_uuid, from_property="linkedChunk", to=latest_uuid)


{'message': 'Failed to send 1 references in a batch of 1. Please inspect client.batch.failed_references or collection.batch.failed_references for the failed references.', 'errors': {0: ErrorReference(message='property linkedChunk does not exist for class Chunk', reference=_BatchReference(from_='weaviate://localhost/Chunk/c28f441b-9830-4d68-9d79-b3da72c65931/linkedChunk', to='weaviate://localhost/a5c666b4-e05c-438c-a13c-2290eace8d06', tenant=None, from_uuid='c28f441b-9830-4d68-9d79-b3da72c65931', to_uuid='a5c666b4-e05c-438c-a13c-2290eace8d06'))}}


In [17]:
assert counter == len(chunks)
print(f"Importing ({len(chunks)}) chunks complete") 

Importing (494) chunks complete


In [None]:
result = (
    client.query.aggregate("Chunk")
    .with_fields("meta { count }")
    .do()
)
print("Object count: ", result["data"]["Aggregate"]["Chunk"])

In [None]:
# Test one article has worked by checking one object
test_article = (
    client.query
    .get("Chunk", ["context", "_additional {id}"])
    .with_limit(1)
    .do()
)["data"]["Get"]["Chunk"][0]

print(test_article["_additional"]["id"])
print(test_article["context"])

In [None]:
def query_weaviate(query, collection_name, top_k=5):

    # Creates embedding vector from user query
    embedded_query = get_embedding(query)
    
    near_vector = {"vector": embedded_query}

    # Queries input schema with vectorised user query
    query_result = (
        client.query
        .get(collection_name, ["context", "_additional {certainty distance}"])
        .with_near_vector(near_vector)
        .with_limit(top_k)
        .do()
    )
    
    return query_result


#  Possibility to "Let Weaviate handle vector embeddings"

def near_text_weaviate(query, collection_name):
    
    nearText = {
        "concepts": [query],
        "distance": 0.7,
    }

    properties = [
        "title", "context",
        "_additional {certainty distance}"
    ]

    query_result = (
        client.query
        .get(collection_name, properties)
        .with_near_text(nearText)
        .with_limit(20)
        .do()
    )["data"]["Get"][collection_name]
    
    print (f"Objects returned: {len(query_result)}")
    
    return query_result

In [None]:
query_result = query_weaviate("How to behave during a meeting?", "Chunk")

In [None]:
import json
with open("result.json","w") as f:
    json.dump(query_result,f)

In [None]:
def get_combined_results(query_result):
    combined_result = ''
    for result in query_result['data']['Get']['Chunk']:
        combined_result += result['context'] +"\n\n"
    return combined_result

In [None]:
related_context = get_combined_results(query_result)

In [None]:
related_context

## Refining Output

In [None]:
USER_QUERY = "How to behave during a meeting?"
RETREIVER_K = 5

In [None]:
PROMPT_USER_TEMPLATE = """
Answer the question based only on the following context:

Question :
{question}


Context :
{context}

"""

SYSTEM_PROMPT = """
You are an ICF MCC certified coach who has a lot of experience with life coaching.
You are give certain context and a question. Use the context and output an answer that is precise and clear.

"""

In [None]:
prompt_query = PROMPT_USER_TEMPLATE.format(context=related_context, question=USER_QUERY)

In [None]:
MODEL = "gpt-4o"
response = openai_client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt_query}
    ],
)
assistant_message = response.choices[0].message.content
tokens = response.usage.total_tokens
assistant_message

In [None]:
related_context

In [None]:
""" 
'|   1ATTENDEES - Best meeting practices
•Decide if you are joining the meeting
\uf0fcRSVP (respond) to the meeting organizer as soon as possible
•Prepare: read the agenda and the pre- work
\uf0fccheck tasks from last meeting, prepare insights and /or questions
•Be on time
•Be present, participate and avoid distractions
\uf0fcturn off notifications, put away your phone
•Make sure you are heard, even if your opinion is less popular 
•Reflect: did I participate, and did I share my opinion?
•Provide feedback to the meeting organizer for potential improvement
•Read and review the minutes 
•Check the follow -up tasks and act on yours
Before
During
After

|   1ORGANIZERS - Best meeting practices
•Decide if you really need the meeting and select correct meeting type
•Define a clear and timed agenda, including the meeting objective
•Use the Outlook Scheduling Assistant to plan at an appropriate time
•Carefully choose the attendees, share the agenda and materials in advance
•Start by introducing people, stating the objective, and showing the agenda
•Assign note -taker and timekeeper, but record tasks yourself
•Moderate and ensure a safe space for everyone to get the word
•End meeting by summarizing the follow -up tasks for all attendees
•Share with all attendees the follow -up tasks and the meeting minutes
•Reflect: was the meeting goal achieved, and was everyone heard?
•Ensure follow -up and plan next meeting if necessary
Before
During
After

Tips for engaging and inclusive meetings
☑Build trust and encourage participation
|   1•assign minutes -taker and time -keeper roles
•send the agenda (and pre -work) to the attendees in advance
•keep it focused and engaging
\uf0d8be well -prepared to retain attendees’ attention 
\uf0d8keep the meeting as short as possible
\uf0d8use polls, ratings, and whiteboards
\uf0d8establish ground rules at the start of the meeting
•has everyone’s point of view been heard?
\uf0d8ask questions to prompt discussion and listen actively

Time Management tips for Effective Meetings
☑Respect other people’s time
|   1•respond to meeting invites as soon as you receive them
•reschedule timely to resolve meeting conflicts
•don’t be afraid to politely decline a meeting
(are you needed there? ask for an agenda, align expectations)
☑Save yourself some time
•share your calendar titles with your functional and TA teams
•book well in advance long, recurring, and team meetings
•respect time -zones and working hours, yours included!

Calendar Man agement tips for Effective Meetings
|   1☑Organ ize your calendar
•state your working hours (e.g. 09:00-1 7:00)
•change default meeting duration to 25’ instead of 30’
•book lunch and travel time as OOO
•book focus time as ‘tentative’ to avoid blocking your entire agenda
☑Organi ze y our calenda r for OOO days
•add public holidays for all major GLPG sites (single action)
•enter your vacation on Outlook well in advance
•reschedule (or decline) meetings happening during your absence

'
"""

In [None]:
assistant_message

In [None]:
""" 
'During a meeting, you should:

1. Be on time.
2. Be present and actively participate, avoiding distractions such as phone notifications.
3. Ensure you are heard, even if your opinions are less popular.
4. Reflect on your participation and whether you shared your opinion.
5. Provide feedback to the meeting organizer for potential improvements.
'
"""