In [1]:
import pandas as pd
from dotenv import load_dotenv
import os 
import weaviate
from weaviate.classes.init import Auth
from sentence_transformers import SentenceTransformer
import weaviate.classes as wvc
from weaviate.classes.config import Property, DataType, Configure, VectorDistances
from openai import OpenAI
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure
import os


In [2]:
load_dotenv()

True

In [3]:
base_url = os.getenv("MODEL_BASE_URL")
api_key = os.getenv("AZURE_OPENAI_API_KEY")
api_version = os.getenv("AZURE_OPENAI_API_VERSION")

In [4]:


# Best practice: store your credentials in environment variables
weaviate_url = os.environ["WEAVIATE_URL"]
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]

weaviate_client = weaviate.connect_to_weaviate_cloud(
    cluster_url=weaviate_url,                                    # Replace with your Weaviate Cloud URL
    auth_credentials=Auth.api_key(weaviate_api_key),             # Replace with your Weaviate Cloud key
)


In [5]:
# OpenAI client setup for Azure
embedding_client = OpenAI(
    api_key=api_key,
    base_url= base_url,
    default_headers={"api-key": api_key},
    default_query={"api-version": api_version}
)

# Embedding function
def get_embedding(text):
    response = embedding_client.embeddings.create(input=text, model="text-embedding-ada-002")
    return response.data[0].embedding 

In [6]:
import json

In [8]:
# --- Create schema if not exists ---
collection_name = "Docs"

if not weaviate_client.collections.exists(collection_name):
    weaviate_client.collections.create(
        name=collection_name,
        properties=[
            weaviate.classes.Property(name="doc_id", data_type=weaviate.classes.DataType.TEXT),
            weaviate.classes.Property(name="source", data_type=weaviate.classes.DataType.TEXT),
            weaviate.classes.Property(name="last_updated", data_type=weaviate.classes.DataType.TEXT),
        ],
        vectorizer_config=weaviate.classes.Configure.Vectorizer.none()  # because we're supplying vectors
    )

collection = weaviate_client.collections.get(collection_name)

# --- Load data from JSON ---
with open("self_critique_loop_dataset.json", "r") as f:
    records = json.load(f)

if isinstance(records, dict):  # in case it's a single record
    records = [records]

# --- Batch insert with embeddings ---
with collection.batch.dynamic() as batch:
    for item in records:        
        combined_text = f"{item['question']} {item['answer_snippet']}"
        vector = get_embedding(combined_text)
        metadata = { "doc_id" : item.get("doc_id"),
            "source": item.get("source"),
            "last_updated": item.get("last_updated")
        }
        batch.add_object(vector=vector, properties=metadata)

print("✅ All records embedded and uploaded to Weaviate.")

✅ All records embedded and uploaded to Weaviate.


In [19]:
# op = get_embedding(["I am angshuman","I love Biryani"])
op = get_embedding("I love Biryani")

In [20]:
# op.data[0].embedding
op.data

[Embedding(embedding=[-0.009589150547981262, -0.009378890506923199, -0.020605525001883507, -0.00021503909374587238, -0.01990465633571148, -0.0021806557197123766, -0.02426278218626976, -0.033335838466882706, -0.012220592238008976, -0.03234187886118889, 0.013303752057254314, -0.0018748223083093762, -0.012163248844444752, 0.0017059767851606011, -0.0036062852013856173, 0.006690105423331261, 0.040726810693740845, 0.005763047840446234, 0.03583347797393799, -0.013915418647229671, -0.014781947247684002, 0.002816215856000781, 0.0041701653972268105, -0.01197847444564104, 0.0056101311929523945, 0.025677261874079704, 0.012488196603953838, -0.01755993254482746, 0.00043167106923647225, -0.0006630372372455895, 0.01912732981145382, 0.020325176417827606, -0.007244428154081106, -0.016362085938453674, -0.028187643736600876, -0.0001933360763359815, -0.0060179089196026325, 0.01441239845007658, 0.010939914733171463, 0.0007056468166410923, 0.020745698362588882, 0.003235143842175603, -0.008212900720536709, 0.