In [None]:
import os
import sys

# Directory added to the import path; presumably the Klerk checkout that
# provides the `models` package imported later in this notebook (verify).
# Set the KLERK_SRC_DIR environment variable to override it on another machine.
KLERK_SRC_DIR = os.environ.get("KLERK_SRC_DIR", "c:\\Users\\baisingh\\go\\src\\Klerk")

print(sys.path)
# Guard the append so re-running this cell does not pile up duplicate entries.
if KLERK_SRC_DIR not in sys.path:
    sys.path.append(KLERK_SRC_DIR)
print(sys.path)

In [None]:
import os
import pymongo
import requests
from pymongo import UpdateOne, DeleteMany
from dotenv import load_dotenv
from models.karnatakaLegalBook import KarnatakaLegalBook, KarnatakaLegalBookList
from datetime import datetime, timezone
from tenacity import retry, wait_random_exponential, stop_after_attempt

In [None]:
# Load settings from a .env file (if one is found) into the process environment.
load_dotenv()
CONNECTION_STRING = os.environ.get("DB_CONNECTION_STRING")
EMBEDDINGS_DEPLOYMENT_NAME = os.environ.get("OPENAI_LLM_EMBEDDING")
AOAI_ENDPOINT = os.environ.get("OPENAI_API_ENDPOINT")
AOAI_KEY = os.environ.get("OPENAI_API_KEY")
AOAI_API_VERSION = "2023-05-15"
client = pymongo.MongoClient(CONNECTION_STRING)
# Do not echo the connection string itself: it typically embeds credentials and
# anything printed here is saved with the notebook output.
print("DB_CONNECTION_STRING loaded:", CONNECTION_STRING is not None)
# MongoDB will create the database "karnatakaLegalBook" if it does not exist
db = client.karnatakaLegalBook

In [None]:
# Add document registration data to the database using bulk_write and UpdateOne with upsert.
# The karnataka klerk document registration data is fetched from GitHub.
from bson.json_util import loads

# Raw-content URL of the JSON data file
kr_legal_raw_data = "https://raw.githubusercontent.com/Baijnath-Singh/Klerk/main/models/karnatakaLegalBook.json"

# Fetch the raw JSON data; the timeout keeps the cell from hanging indefinitely
# on a stalled connection.
response = requests.get(kr_legal_raw_data, timeout=30)
response.raise_for_status()  # Ensure we raise an error for bad status codes

# Parse the JSON data
data = response.json()

# Create a KarnatakaLegalBookList from the parsed JSON data
kr_legal_data = KarnatakaLegalBookList(items=[KarnatakaLegalBook(**item) for item in data])

# One upsert per record, keyed by the record id.
upsert_operations = [
    UpdateOne({"_id": dr.id}, {"$set": loads(dr.json(by_alias=True))}, upsert=True)
    for dr in kr_legal_data.items
]

# bulk_write rejects an empty list of operations, so only call it when there is work to do.
if upsert_operations:
    write_result = db.document_registration.bulk_write(upsert_operations)
    print(f"Upserted {write_result.upserted_count}, modified {write_result.modified_count} documents")
else:
    print("No records found in the source data; nothing written")

In [None]:
import json
# Peek at the stored data: print at most 10 documents, then how many were shown.
documents = db.document_registration.find().limit(10)
count = 0
for count, doc in enumerate(documents, start=1):
    # default=str stringifies values json cannot encode natively (e.g. ObjectId)
    print(json.dumps(doc, indent=4, default=str))

print(count)

In [None]:
from openai import AzureOpenAI

# Azure OpenAI client built from the endpoint, key and API version configured above.
ai_client = AzureOpenAI(
    api_key=AOAI_KEY,
    api_version=AOAI_API_VERSION,
    azure_endpoint=AOAI_ENDPOINT,
)

In [None]:
import time
@retry(stop=stop_after_attempt(3), wait=wait_random_exponential(min=1, max=20))
def generate_embeddings(text: str):
    """
    Return the embedding vector for ``text`` from the Azure OpenAI embeddings deployment.

    The same function vectorizes both stored documents and incoming user
    queries, so the two can be compared through the vector index.

    The call is attempted at most 3 times, with a randomized exponential
    wait (min=1, max=20) between attempts.
    """
    api_result = ai_client.embeddings.create(model=EMBEDDINGS_DEPLOYMENT_NAME, input=text)
    vector = api_result.data[0].embedding
    # Short pause after every successful call to avoid rate limiting on AOAI.
    time.sleep(0.1)
    return vector

In [None]:
# Demonstrate embeddings generation using a test string.
test = "hello, world"
test_embedding = generate_embeddings(test)
# Show the vector length and a short preview instead of dumping every value;
# the length should match the 'dimensions' of the vector index created later (1536).
print(f"dimensions: {len(test_embedding)}")
print(test_embedding[:5])

In [None]:
def add_collection_content_vector_field(collection_name: str):
    '''
    Add (or refresh) a "contentVector" field on every document of a collection.

    Each document is serialized to a JSON string (without any previous
    contentVector), embedded with generate_embeddings, and the resulting
    vector is written back with a single bulk write.

    collection_name: name of the collection in the module-level ``db``.
    Returns None. Does nothing when the collection has no documents.
    '''
    collection = db[collection_name]
    bulk_operations = []
    # The projection excludes any previous contentVector, so old embeddings are
    # neither transferred from the server nor fed into the new embedding.
    for doc in collection.find({}, {"contentVector": 0}):
        # generate embeddings for the document string representation
        content = json.dumps(doc, default=str)
        content_vector = generate_embeddings(content)

        # Plain update (no upsert): the document was just read, and an upsert
        # would create a stub holding only the vector if it vanished meanwhile.
        bulk_operations.append(pymongo.UpdateOne(
            {"_id": doc["_id"]},
            {"$set": {"contentVector": content_vector}}
        ))

    # bulk_write raises on an empty operation list, so skip empty collections.
    if not bulk_operations:
        return

    # execute bulk operations
    collection.bulk_write(bulk_operations)

In [None]:
# Add the contentVector field to every document in the document_registration collection.
# One embeddings request is made per document, so this can take a while
# (originally noted as approximately 3-5 minutes due to rate limiting).
add_collection_content_vector_field("document_registration")

In [None]:
# Definition of the vector index on the contentVector field.
vector_index_definition = {
    'name': 'VectorSearchIndex',
    # "cosmosSearch" is the special index type used for vector search
    'key': {"contentVector": "cosmosSearch"},
    'cosmosSearchOptions': {
        'kind': 'vector-ivf',   # IVF (Inverted File) vector index
        'numLists': 1,          # number of inverted lists used by the IVF index
        'similarity': 'COS',    # cosine similarity
        'dimensions': 1536      # dimensionality of the vectors being indexed
    }
}

# Create the index on the document_registration collection.
db.command({
    'createIndexes': 'document_registration',
    'indexes': [vector_index_definition]
})


In [None]:
def vector_search(collection_name, query, num_results=3):
    """
    Run a vector similarity search for ``query`` against a collection.

    The query text is embedded with generate_embeddings and matched against
    the "contentVector" field, asking for the ``num_results`` nearest documents.

    Returns the cursor produced by ``collection.aggregate``; every item has a
    'similarityScore' and the matched document under 'document'.
    """
    collection = db[collection_name]
    query_embedding = generate_embeddings(query)

    # Stage 1: nearest-neighbour lookup on the contentVector field.
    search_stage = {
        '$search': {
            "cosmosSearch": {
                "vector": query_embedding,
                "path": "contentVector",
                "k": num_results
            },
            "returnStoredSource": True
        }
    }
    # Stage 2: expose the search score next to the full matched document.
    project_stage = {
        '$project': {
            'similarityScore': {'$meta': 'searchScore'},
            'document': '$$ROOT'
        }
    }
    return collection.aggregate([search_stage, project_stage])

def print_product_search_result(result):
    '''
    Pretty-print one search hit.

    ``result`` must provide 'similarityScore' and a 'document' mapping with
    the keys 'category', 'question', 'answer' and '_id'. A blank line is
    printed after the _id to separate consecutive hits.
    '''
    print("Similarity Score: {}".format(result['similarityScore']))
    matched = result['document']
    for label, field in (("Category", "category"), ("Question", "question"), ("Answer", "answer")):
        print(f"{label}: {matched[field]}")
    print("_id: {}\n".format(matched['_id']))

In [None]:
# Example: fetch and print the single closest match for a sample question.
query = "How ownership of immovable property is acquired by a person?"
results = vector_search(collection_name="document_registration", query=query, num_results=1)
for hit in results:
    print_product_search_result(hit)

In [None]:
# Clean up: remove the collection and the database used by this notebook,
# then release the connection.
db.drop_collection("document_registration")
# Drop the database this notebook actually worked in (db is client.karnatakaLegalBook);
# the previously hard-coded name "karnataka_klerk" did not match it.
client.drop_database(db.name)
client.close()