In [2]:
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment before the lookups below.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
AZURE_SEARCH_ENDPOINT = os.getenv("AZURE_SEARCH_ENDPOINT")
AZURE_SEARCH_KEY = os.getenv("AZURE_SEARCH_KEY")

# Fail fast and name the offending variables. An empty value is treated the
# same as an unset one, since it cannot serve as a key or an endpoint.
_missing_env_vars = [
    name
    for name, value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("AZURE_SEARCH_ENDPOINT", AZURE_SEARCH_ENDPOINT),
        ("AZURE_SEARCH_KEY", AZURE_SEARCH_KEY),
    )
    if not value
]
if _missing_env_vars:
    raise ValueError(
        "One or more environment variables are not set: "
        + ", ".join(_missing_env_vars)
    )

In [5]:
import openai
import os
import uuid
import json
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings 
from langchain.vectorstores.azuresearch import AzureSearch 
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient 
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import(
ComplexField, CorsOptions,
SearchIndex,
ScoringProfile,
SearchFieldDataType,
SimpleField,
SearchField,
SearchableField,
VectorSearch,
HnswVectorSearchAlgorithmConfiguration
)

# Hand the API key read from the environment to the openai module so that
# later openai.* calls in this notebook are authenticated.
openai.api_key = OPENAI_API_KEY

In [None]:
# HNSW settings for the vector index; the keys are passed through as given.
hnsw_parameters = {
    "m": 4,
    "efConstruction": 400,
    "efSearch": 500,
    "metric": "cosine",
}

# Named algorithm configuration; vector fields select it by this name.
hnsw_configuration = HnswVectorSearchAlgorithmConfiguration(
    name="my-vector-config",
    kind="hnsw",
    parameters=hnsw_parameters,
)

vector_search = VectorSearch(algorithm_configurations=[hnsw_configuration])

In [None]:
# Client used to manage index definitions on the search service.
client = SearchIndexClient(AZURE_SEARCH_ENDPOINT, AzureKeyCredential(AZURE_SEARCH_KEY))

# Index schema: a string key, the searchable text, and the embedding vector.
index_name = "notesslides"
fields = [
    SimpleField(
        name="documentId",
        type=SearchFieldDataType.String,
        filterable=True,
        sortable=True,
        key=True,
    ),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchField(
        name="embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        # presumably the vector length produced by the embedding model — confirm
        vector_search_dimensions=1536,
        # Must match the name of an algorithm configuration in `vector_search`.
        vector_search_configuration="my-vector-config",
    ),
]
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
)

# create_or_update_index (rather than create_index) keeps this cell re-runnable:
# create_index raises when an index with this name already exists.
result = client.create_or_update_index(index)

In [None]:
def generate_embeddings(text):
    """Return the embedding vector for ``text``.

    Calls ``openai.Embedding.create`` with the ``text-embedding-ada-002``
    model and returns the ``embedding`` entry of the first item in the
    response's ``data`` list.

    Args:
        text: The input to embed.

    Returns:
        The value of ``response["data"][0]["embedding"]``.
    """
    # Use `model=` (as the Completion call in this notebook does) instead of
    # the legacy `engine=` keyword.
    response = openai.Embedding.create(
        input=text,
        model="text-embedding-ada-002",
    )
    return response["data"][0]["embedding"]

In [None]:
# Load the source text file as LangChain documents.
# TODO: decide how the documents should be split before embedding.
path = "path to stored data.txt"
loader = TextLoader(file_path=path, encoding="utf-8")
documents = loader.load()

# Optional chunking step, currently disabled:
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
# documents = text_splitter.split_documents(documents)

In [None]:
# Build one index record per loaded document: a random id, the document text,
# and the embedding of that text.
docs = []
for doc in documents:
    docs.append(
        {
            "documentId": str(uuid.uuid4()),
            "content": doc.page_content,
            "embedding": generate_embeddings(doc.page_content),
        }
    )

# Serialize once, after the loop. Doing it inside the loop re-encoded the
# growing list on every iteration and left `json_data` undefined when
# `documents` was empty.
json_data = json.dumps(docs)
with open("HandbookContent.json", "w") as f:
    f.write(json_data)

In [None]:
# Upload docs to index
# NOTE: this rebinds `documents` from the loaded LangChain documents to the
# list of dicts read back from disk.
with open('HandbookContent.json', 'r') as f:
    documents = json.load(f)

search_client = SearchClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    index_name=index_name,
    credential=AzureKeyCredential(AZURE_SEARCH_KEY),
)
result = search_client.upload_documents(documents)

# upload_documents reports success per document; surface any failures
# instead of silently continuing with a partially populated index.
failed_keys = [item.key for item in result if not item.succeeded]
if failed_keys:
    raise RuntimeError(
        f"Failed to index {len(failed_keys)} of {len(result)} documents: {failed_keys}"
    )

In [None]:
# Test query: embed the question and retrieve the nearest stored chunks.
query = "What information does this handbook has?"
query_embedding = generate_embeddings(query)
vector = Vector(value=query_embedding, k=2, fields="embedding")  # top 2 results

# Pure vector search (no keyword text); only the "content" field is returned.
results = search_client.search(search_text=None, vectors=[vector], select=["content"])

# Concatenate the retrieved texts, each followed by a space, after a leading space.
retrieved_chunks = []
for result in results:
    retrieved_chunks.append(result['content'] + " ")
input_text = " " + "".join(retrieved_chunks)

In [None]:
# Ask the completion model to answer the query using the retrieved text.
completion_request = dict(
    model="gpt-3.5-turbo-instruct",
    prompt=f" Answer the question based on given input text. Input: {input_text}. Question: {query}",
    max_tokens=100,
    temperature=0,
)
# Left as the cell's final expression so the notebook displays the response.
openai.Completion.create(**completion_request)