In [None]:
# install if needed
# %pip install pypdf
# %pip install pandas
# %pip install -qU langchain-community 
# %pip install --upgrade --quiet azure-search-documents
# %pip install --upgrade --quiet azure-identity
# %pip install langchain_openai
# %pip install -U azure-search-documents

In [2]:
# libraries

import sys
import json
import requests
import pandas as pd

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers import AzureCognitiveSearchRetriever
from langchain.vectorstores.azuresearch import AzureSearch

import os

# seems useless
os.environ["AZURESEARCH_FIELDS_ID"] = "chunk_id"
os.environ["AZURESEARCH_FIELDS_CONTENT"] = "chunk"
os.environ["AZURESEARCH_FIELDS_CONTENT_VECTOR"] = "text_vector"
os.environ["AZURESEARCH_FIELDS_TAG"] = "metadata_storage_path"


##### Creating Index :: Azure Cognitive Search (Vector Store)

In [None]:
import os
from openai import AzureOpenAI
# get config from aoai_config.yaml

%run env_config_setting.ipynb

openai_api_version = openai.api_version
openai_api_key = openai.api_key
openai_api_endpoint = openai.api_base

print('openi config: ', deployment_name, embedding_name, openai_api_key, openai_api_endpoint, openai_api_version)
print('search config: ', search_service_name, search_key, search_endpoint)
print('storageaccnt: ', storageaccnt)

index_name = "azure-rag-test-index"

# Search Index Schema definition
index_schema = "./openai-search-demo.json"


In [4]:
from azure.search.documents.indexes.models import CorsOptions
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import SearchIndex 

## Class to create index ::

# Instantiate a client
class CreateClient(object):
    def __init__(self, endpoint, key, index_name):
        self.endpoint = endpoint
        self.index_name = index_name
        self.key = key
        self.credentials = AzureKeyCredential(key)

    # Create a SearchClient
    # Use this to upload docs to the Index
    def create_search_client(self):
        return SearchClient(
            endpoint=self.endpoint,
            index_name=self.index_name,
            credential=self.credentials,
        )

    # Create a SearchIndexClient
    # This is used to create, manage, and delete an index
    def create_admin_client(self):
        return SearchIndexClient(endpoint=self.endpoint, credential=self.credentials)


# Get Schema from File or URL
def get_schema_data(schema, url=False):
    if not url:
        with open(schema) as json_file:
            schema_data = json.load(json_file)
            return schema_data
    else:
        data_from_url = requests.get(schema)
        schema_data = json.loads(data_from_url.content)
        return schema_data

# Convert CSV data to JSON
def convert_csv_to_json(url):
    df = pd.read_csv(url)
    convert = df.to_json(orient="records")
    return json.loads(convert)

# Create Search Index from the schema
# If reading the schema from a URL, set url=True
def create_schema_from_json_and_upload(schema, index_name, admin_client, url=False):

    cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
    scoring_profiles = []
    schema_data = get_schema_data(schema, url)
    print("schema_data:", schema_data)
    # the schema_data does not define field 'content_vector'

    index = SearchIndex(
        name=index_name,
        fields=schema_data["fields"],
        scoring_profiles=scoring_profiles,
        suggesters=schema_data["suggesters"],
        cors_options=cors_options,
    )
    print("index:", index)
    
    try:
        print("call admin_client.create_index..")
        upload_schema = admin_client.create_index(index)
        if upload_schema:
            print(f"Schema uploaded; Index created for {index_name}.")
        else:
            exit(0)
    except:
        print("Unexpected error:", sys.exception()) #sys.exc_info()[0])



In [5]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

start_client = CreateClient(search_endpoint, search_key, index_name)

admin_client = start_client.create_admin_client()
search_client = start_client.create_search_client()


In [6]:
# check schema data
schema_data = get_schema_data(index_schema, url=False)
print(schema_data)

{'name': 'openai-search-demo', 'fields': [{'name': 'id', 'type': 'Edm.String', 'searchable': True, 'filterable': True, 'retrievable': True, 'stored': True, 'sortable': True, 'facetable': True, 'key': True, 'indexAnalyzer': None, 'searchAnalyzer': None, 'analyzer': 'keyword', 'normalizer': None, 'dimensions': None, 'vectorSearchProfile': None, 'vectorEncoding': None, 'synonymMaps': []}, {'name': 'content', 'type': 'Edm.String', 'facetable': False, 'filterable': False, 'key': False, 'retrievable': True, 'searchable': True, 'sortable': False, 'analyzer': 'standard.lucene', 'indexAnalyzer': None, 'searchAnalyzer': None, 'synonymMaps': [], 'fields': []}, {'name': 'metadata', 'type': 'Edm.String', 'searchable': False, 'filterable': False, 'retrievable': True, 'stored': True, 'sortable': False, 'facetable': False, 'key': False, 'indexAnalyzer': None, 'searchAnalyzer': None, 'analyzer': None, 'normalizer': None, 'dimensions': None, 'vectorSearchProfile': None, 'vectorEncoding': None, 'synonymM

In [7]:
# check scoring profiles
scoring_profiles = []
cors_options = CorsOptions(allowed_origins=["*"], max_age_in_seconds=60)
print(cors_options)

{'additional_properties': {}, 'allowed_origins': ['*'], 'max_age_in_seconds': 60}


In [8]:
# index openai-search-demo successfully created in s100-ai-search | Indexes
# if index already was created, skip this block !!

index = create_schema_from_json_and_upload(index_schema, index_name, admin_client)

schema_data: {'name': 'openai-search-demo', 'fields': [{'name': 'id', 'type': 'Edm.String', 'searchable': True, 'filterable': True, 'retrievable': True, 'stored': True, 'sortable': True, 'facetable': True, 'key': True, 'indexAnalyzer': None, 'searchAnalyzer': None, 'analyzer': 'keyword', 'normalizer': None, 'dimensions': None, 'vectorSearchProfile': None, 'vectorEncoding': None, 'synonymMaps': []}, {'name': 'content', 'type': 'Edm.String', 'facetable': False, 'filterable': False, 'key': False, 'retrievable': True, 'searchable': True, 'sortable': False, 'analyzer': 'standard.lucene', 'indexAnalyzer': None, 'searchAnalyzer': None, 'synonymMaps': [], 'fields': []}, {'name': 'metadata', 'type': 'Edm.String', 'searchable': False, 'filterable': False, 'retrievable': True, 'stored': True, 'sortable': False, 'facetable': False, 'key': False, 'indexAnalyzer': None, 'searchAnalyzer': None, 'analyzer': None, 'normalizer': None, 'dimensions': None, 'vectorSearchProfile': None, 'vectorEncoding': No

In [9]:
from langchain.document_loaders import PyPDFLoader
from tqdm import tqdm
import pickle
import os

pdf_folder_path = './pdfs/'
loaders = [PyPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in os.listdir(pdf_folder_path)]
documents = []
for loader in tqdm(loaders):
    try:
        documents.extend(loader.load())
    except:
        pass
with open('my_documents.pkl', 'wb') as f:
    pickle.dump(documents, f)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 24.31it/s]


In [10]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

In [11]:
# chunk_size=1000
# embeddings: OpenAIEmbeddings = OpenAIEmbeddings(model=model,deployment=deployment,
#                                    openai_api_base=openai.api_base,
#                                   openai_api_type = "azure",
#                                   chunk_size=1)
from langchain_openai import AzureOpenAIEmbeddings

embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    azure_deployment=embedding_name,
    openai_api_version=openai_api_version,
    azure_endpoint=openai_api_endpoint,
    api_key=openai_api_key,
)


In [12]:
# prepare data to fill in index
# need manually add filed content_vector

from operator import itemgetter
batch_array = []

for i,content in enumerate(texts):
    metad_str = str(content.metadata).replace('\'','"')
    print(i, metad_str)
    batch_array.append(
        {
            "id": str(i),
            "content": content.page_content,
            "metadata": metad_str, #json format like this  "metadata": "{\"source\": \"./pdfs/eda_llm_future.pdf\", \"page\": 1}",
            "category": "CATEGORY",
            "content_vector": embeddings.embed_query(content.page_content) 
        })
len(batch_array)

0 {"source": "./pdfs/eda_llm_future.pdf", "page": 0}
1 {"source": "./pdfs/eda_llm_future.pdf", "page": 1}


2

In [13]:
# upload data 

results = search_client.upload_documents(documents=batch_array)
results[0].as_dict()

{'key': '0', 'succeeded': True, 'status_code': 200}

In [15]:
# RAG search

result2 = search_client.search(search_text="circuit")
# print(result2)
for rs in result2:
    print(rs['@search.score'], rs['content'][:50])

0.48009038 Large Language Models for EDA: Future or Mirage?
Z
0.27359986 design automation, such as decoupled hardware temp


In [17]:
from langchain_community.vectorstores.azuresearch import AzureSearch
# langchain AzureSearch 會預設parse field content_vetor, metadata
# 如果沒有這兩個欄位 或資料格式不對 就會回error

# 即使上面 AZURESEARCH_FIELDS_CONTENT_VECTOR 改成 text_vector
# parse 的時候還是拿到 content_vector

vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=search_endpoint,
    azure_search_key=search_key,
    index_name= index_name, # use other index because above index does not have content_vector
    embedding_function=embeddings.embed_query,
    # Configure max retries for the Azure client
    additional_search_client_options={"retry_total": 4},
)

In [None]:
# Perform a similarity search

docs = vector_store.similarity_search(
    query="circuit",
    k=3,
    search_type="similarity",
)

# print(docs)

In [19]:
print(len(docs))
for doc in docs:
    print(doc.metadata, doc.page_content[:50])

2
{'id': '0', 'source': './pdfs/eda_llm_future.pdf', 'page': 0} Large Language Models for EDA: Future or Mirage?
Z
{'id': '1', 'source': './pdfs/eda_llm_future.pdf', 'page': 1} design automation, such as decoupled hardware temp


In [20]:
retriever = vector_store.as_retriever()

In [21]:
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

client = AzureChatOpenAI(
    azure_deployment=deployment_name,  # or your deployment
    api_version=openai_api_version,  # or your api version
    temperature=0,
    api_key=openai_api_key,
    azure_endpoint=openai_api_endpoint)

system_prompt = (
    "Use the given context to answer the question. "
    "If you don't know the answer, say you don't know. "
    "Use three sentence maximum and keep the answer concise. "
    "Context: {context}"
)
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
question_answer_chain = create_stuff_documents_chain(client, prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)

query = "What is chateda?"
chain.invoke({"input": query})


{'input': 'What is chateda?',
 'context': [Document(metadata={'id': '1', 'source': './pdfs/eda_llm_future.pdf', 'page': 1}, page_content='design automation, such as decoupled hardware template design, pri-\noritizing in-context learning given limited data, and proper prompt\nengineering. ChipChat [ 4] authors argue that when human feed-\nback is incorporated into the more advanced ChatGPT-4 model or\nused for co-design, the language model acts as a ’force multiplier’,\nenabling quick exploration and iteration of the design space.\nThe second interesting direction is to help the design flow with\nLLM. ChatEDA [ 8] addresses the challenge of integrating EDA tools\nto improve interoperability in circuit design, by leveraging LLM\ncapabilities in natural language processing and comprehension. An\nautonomous agent for EDA that utilizes the AutoMage LLM in con-\njunction with EDA tools as executors is proposed. It offers users\na conversational interface to interact with these tools. Users c