# Overview

In this notebook, we will cover the steps required to build the logic that powers AskBSW. The key components are:
- Reading in the HR documents
- Generating embeddings for each chunk of the HR documents
- Uploading the embeddings into the vector DB
- Testing the vector DB search


### Install packages
- These packages are needed to parse the HR documents (`pypdf`, `python-docx`) and to work with Azure Cognitive Search (`azure-search-documents`)

In [6]:
!pip install pypdf
!pip install python-docx
!pip install azure-search-documents==11.4.0b8



In [7]:
from azure.storage.blob import BlobServiceClient
from pypdf import PdfReader
from io import BytesIO
from docx import Document
import openai
import re, sys
import requests
import pandas as pd
import numpy as np
from openai import AzureOpenAI
import time
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector 
from azure.search.documents import SearchIndexingBufferedSender
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,   
)

In [8]:
## CONFIG
import os

# Secrets are read from environment variables so they never live in the notebook.
# NOTE(review): a storage account key was previously hard-coded in this cell;
# that key should be treated as leaked and rotated.
connection_string = os.environ["AZURE_STORAGE_CONNECTION_STRING"]
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_name = "datascience"

# Azure OpenAI configuration (the key falls back to the placeholder if the variable is unset)
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY", "YOUR_OPENAI_KEY")
AZURE_OPENAI_ENDPOINT = "https://llm-dev-02.openai.azure.com/"
API_VERSION = "2023-05-15"
EMBEDDING_DEPLOYMENT_NAME = "text-embedding-ada02-askbsw"
openai_client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    api_version=API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

# Azure Cognitive Search Vector Store Configuration
search_service_endpoint: str = "https://cog-askbsw-dev.search.windows.net/"
search_service_api_key: str = os.getenv("AZURE_SEARCH_API_KEY", "YOUR_COGNITIVE_SEARCH_KEY")
index_name: str = os.getenv("AZURE_SEARCH_INDEX_NAME", "YOUR_INDEX_NAME")
credential = AzureKeyCredential(search_service_api_key)


In [11]:
## Helper Functions

def _chunk_words(text, num_tokens):
    """Split ``text`` on whitespace and return chunks of at most ``num_tokens`` words."""
    words = text.split()
    return [' '.join(words[start:start + num_tokens]) for start in range(0, len(words), num_tokens)]


def process_blobs(blob_service_client, container_name, directory_name="AskBSWH/HR", num_tokens=128, verbose=False):
    """
    Read every DOCX and PDF blob under a prefix of an Azure Blob Storage
    container and split its text into fixed-size word chunks.

    Parameters:
    - blob_service_client (BlobServiceClient): The Azure Blob Service Client.
    - container_name (str): The name of the container.
    - directory_name (str, optional): Blob-name prefix to read from.
      Defaults to "AskBSWH/HR".
    - num_tokens (int, optional): Maximum number of whitespace-separated words
      per chunk. Default is 128.
    - verbose (bool, optional): When True, print each file name together with
      its word count (DOCX) or page count (PDF). Default is False.

    Returns:
    tuple[list[str], list[str]]: ``(chunks, fnames)`` where ``fnames[i]`` is the
    blob name that ``chunks[i]`` was extracted from. DOCX files are chunked
    over the whole document; PDF files are chunked page by page.

    Raises:
    ValueError: If ``num_tokens`` is smaller than 1.
    """
    if num_tokens < 1:
        raise ValueError("num_tokens must be a positive integer")

    chunks = []
    fnames = []
    container_client = blob_service_client.get_container_client(container_name)

    for blob in container_client.list_blobs(name_starts_with=directory_name):
        blob_name = blob.name

        # Names ending in '/' are directory markers. The prefix listing already
        # yields the blobs nested below them, so recursing would only repeat
        # work (the earlier recursive call also discarded its result).
        if blob_name.endswith('/'):
            continue

        lowered_name = blob_name.lower()
        is_docx = lowered_name.endswith('.docx')
        is_pdf = lowered_name.endswith('.pdf')
        if not (is_docx or is_pdf):
            # Skip unsupported file types before paying for the download.
            continue

        blob_client = container_client.get_blob_client(blob_name)
        bytes_io = BytesIO(blob_client.download_blob().readall())

        if is_docx:
            doc = Document(bytes_io)
            text = ' '.join(para.text for para in doc.paragraphs)
            file_chunks = _chunk_words(text, num_tokens)
            if verbose:
                print(blob_name)
                print(f"Total number of words in document: {len(text.split())}")
        else:
            reader = PdfReader(bytes_io)
            if verbose:
                print(blob_name)
                print(f"Num pages: {len(reader.pages)}")
            file_chunks = []
            for page in reader.pages:
                # Guard against a page with no extractable text.
                page_text = page.extract_text() or ''
                file_chunks.extend(_chunk_words(page_text, num_tokens))

        chunks.extend(file_chunks)
        fnames.extend([blob_name] * len(file_chunks))

    return chunks, fnames

def generate_embeddings(text, model):
    """Return the embedding for ``text`` produced by the ``model`` deployment.

    The request goes through the module-level ``openai_client``; the
    ``embedding`` of the first item in the response's ``data`` is returned.
    """
    result = openai_client.embeddings.create(model=model, input=text)
    first_item = result.data[0]
    return first_item.embedding

def search_vector_db(user_input):
    """Search the index with ``user_input`` as both search text and vector query.

    The input is embedded with ``EMBEDDING_DEPLOYMENT_NAME`` and sent to the
    module-level ``search_client`` against the "vector" field (k=5, top=3).
    The "text" of every returned document is printed, and the documents are
    returned as a list.
    """
    query_vector = Vector(
        value=generate_embeddings(text=user_input, model=EMBEDDING_DEPLOYMENT_NAME),
        k=5,
        fields="vector",
    )
    hits = search_client.search(user_input, vectors=[query_vector], top=3)

    # Collect the results while echoing each document's text
    saved_results = []
    for hit in hits:
        saved_results.append(hit)
        print (hit["text"])
    return saved_results


In [12]:
# Read the HR documents from the default directory and split them into 512-word chunks
chunks, fnames = process_blobs(
    blob_service_client=blob_service_client,
    container_name=container_name,
    num_tokens=512,
)


In [51]:
# Embed every chunk, one request at a time, logging progress every 500 chunks
embeddings_array = []
start_time = time.time()

for index, chunk in enumerate(chunks):
    time.sleep(3)  # fixed pause before each embedding request
    embeddings_array.append(generate_embeddings(chunk, model=EMBEDDING_DEPLOYMENT_NAME))

    if (index + 1) % 500:
        continue
    elapsed_time = time.time() - start_time
    print(f"Processed {index + 1} records. Elapsed time: {elapsed_time:.2f} seconds")



Processed 500 records. Elapsed time: 1523.86 seconds


## Upload embeddings to Azure Cognitive Search

In [57]:
# Assemble one row per chunk: its embedding, ids, source text and source file
row_numbers = range(len(embeddings_array))
df = pd.DataFrame({
    'vector_id': row_numbers,
    'vector': embeddings_array,
    'id': row_numbers,
    'text': chunks,
    'filename': fnames,
})
df.head()

Unnamed: 0,vector_id,vector,id,text,filename
0,0,"[-0.012635222636163235, 0.01425582729279995, -...",0,Page 1 of 2 Title: Adoption Assistance Departm...,AskBSWH/HR/Benefits/Policies & Procedures/Adop...
1,1,"[-0.014502687379717827, 0.011701651848852634, ...",1,Adoption Assistance BSWH. HR.BNFT.001.P Page 2...,AskBSWH/HR/Benefits/Policies & Procedures/Adop...
2,2,"[-0.019268624484539032, 0.008453466929495335, ...",2,sCOPE This document applies to the Baylor Scot...,AskBSWH/HR/Benefits/Policies & Procedures/Empl...
3,3,"[-0.02202620729804039, -0.024142025038599968, ...",3,qualifying incidents occurring after the emplo...,AskBSWH/HR/Benefits/Policies & Procedures/Empl...
4,4,"[-0.023361289873719215, -0.009505768306553364,...",4,sCOPE This document applies to Baylor Scott & ...,AskBSWH/HR/Benefits/Policies & Procedures/Tuit...


In [59]:
index_client = SearchIndexClient(endpoint=search_service_endpoint, credential=credential)

# Index schema: "vector_id" is the key field; "vector" holds the 1536-dimension embedding
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String),
    SimpleField(name="vector_id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="text", type=SearchFieldDataType.String),
    SearchableField(name="filename", type=SearchFieldDataType.String),
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_configuration="ask-vector-config",
    ),
]

# HNSW configuration referenced by name from the "vector" field above
hnsw_algorithm = HnswVectorSearchAlgorithmConfiguration(
    name="ask-vector-config",
    kind="hnsw",
    parameters={"m": 4, "efConstruction": 400, "efSearch": 500, "metric": "cosine"},
)
vector_search = VectorSearch(algorithm_configurations=[hnsw_algorithm])

In [60]:
# Create the index, or update it in place if it already exists
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
)
result = index_client.create_or_update_index(index)
print(f'{result.name} created')

askms-embeddings-index-v2 created


In [61]:
# The index declares 'id' and 'vector_id' as strings ('vector_id' is the key field),
# so both columns are converted before upload.
df['id'] = df['id'].astype(str)
df['vector_id'] = df['vector_id'].astype(str)

# Convert the DataFrame to a list of dictionaries, one per document
documents = df.to_dict(orient='records')
# Upload in batches of 100 documents
documents_chunks = [documents[i:i + 100] for i in range(0, len(documents), 100)]
search_client = SearchClient(endpoint=search_service_endpoint, index_name=index_name, credential=credential)

# upload_documents returns one result per document; collect the keys that
# failed so partial failures are visible instead of silently ignored.
failed_keys = []
for chunk in documents_chunks:
    result = search_client.upload_documents(chunk)
    failed_keys.extend(item.key for item in result if not item.succeeded)

print(f"Uploaded {len(documents) - len(failed_keys)} of {len(documents)} documents")
if failed_keys:
    print(f"Failed document keys: {failed_keys}")


## Search the embeddings

In [17]:

# Pure Vector Search: no search text, only the embedded query
query = "Who can i add to my coverage?"

search_client = SearchClient(search_service_endpoint, index_name, AzureKeyCredential(search_service_api_key))
query_embedding = generate_embeddings(query, model=EMBEDDING_DEPLOYMENT_NAME)
vector = Vector(value=query_embedding, k=3, fields="vector")

results = search_client.search(search_text=None, vectors=[vector])

# Keep and print only the hits scoring at least 0.81
cached = []
for result in results:
    if not result["@search.score"] >= 0.81:
        continue
    cached.append(result)
    print(f"Text: {result['text']}")
    print(f"Source: {result['filename']}")
    print(f"Score: {result['@search.score']}")

Text: protection. Help with out-of- pocket expenses when you or your family may needit the mos t.›C ost-effective. You can sign up for this coverage at economical group rates, which means you may payless for your coverage . ›C onvenient. We make it easy. No copays, deductibles, coinsurance or network requirementsto worry about and your insurance premiums can beeasily deducted from your pay check. ›P ortable. You may be able to take your coverage with you if you leave your employer – benefits won’tchange if you port your coverage. 2 Is there such a thing as too much protection? Only you can answer that question. Think about your expenses, savings and plans for the future. Then, consider this: ›E very 40 seconds , a stroke occurs in the U.S.3
Source: AskBSWH/HR/Benefits/WebExtracts/Other Benefits/Accidental-Injury_Benefit-Summary.pdf
Score: 0.84724617
Text: protection. Help with out-of- pocket expenses when you or your family may needit the mos t.›C ost-effective. You can sign up for thi