## Generate Embeddings

#### Set environment variables

In [None]:
import os
from dotenv import load_dotenv

load_dotenv()


def _require_env(name):
    """Return the value of environment variable *name*.

    When the variable is unset or empty, print
    "<name> environment variable not set." and call ``exit()``, exactly as
    each hand-written check in this cell used to do.
    """
    value = os.getenv(name)
    if not value:  # true for both None (unset) and "" (set but empty)
        print(f"{name} environment variable not set.")
        exit()
    return value


# Azure OpenAI settings, read by the text and document embedding cells.
aoai_endpoint = _require_env("AOAI_ENDPOINT")
aoai_api_version = _require_env("AOAI_API_VERSION")
aoai_embedding_deployed_model = _require_env("AOAI_EMBEDDING_DEPLOYED_MODEL")
aoai_key = _require_env("AZURE_OPENAI_KEY")

# Computer Vision settings, read by the vectorizeText / vectorizeImage requests.
com_vision_endpoint = _require_env("COM_VISION_ENDPOINT")
com_vision_api_version = _require_env("COM_VISION_API_VERSION")
com_vision_key = _require_env("COMPUTER_VISION_KEY")

#### Helper methods

In [None]:
import requests
import json

def vectorize_text_com_vision(com_vision_endpoint, com_vision_key, query, timeout=30):
    """Return the embedding vector the vectorizeText endpoint produces for *query*.

    Args:
        com_vision_endpoint: Base URL of the service; the path
            ``/computervision/retrieval:vectorizeText`` is appended to it.
        com_vision_key: Key sent in the ``Ocp-Apim-Subscription-Key`` header.
        query: Text to vectorize, sent as the ``text`` field of the JSON body.
        timeout: Seconds to wait for the service before giving up. Pass
            ``None`` to wait indefinitely (the previous behavior).

    Returns:
        The value of the ``"vector"`` field of the JSON response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.

    Note:
        The API version is not a parameter; it is read from the notebook-level
        ``com_vision_api_version`` variable, which must be defined before this
        function is called.
    """
    vectorize_text_url = f"{com_vision_endpoint}/computervision/retrieval:vectorizeText"
    params = {"api-version": com_vision_api_version}
    headers = {
        "Content-Type": "application/json",
        "Ocp-Apim-Subscription-Key": com_vision_key,
    }
    data = {'text': query}

    response = requests.post(
        vectorize_text_url,
        params=params,
        headers=headers,
        json=data,
        timeout=timeout,
    )
    # Fail with the real HTTP error instead of a KeyError on "vector" below.
    response.raise_for_status()

    return response.json()["vector"]

def read_json_file(file_path):
    """Return the text content of the file at *file_path*.

    The content is returned as one string exactly as read; it is not parsed
    into Python objects. The file is decoded as UTF-8 so the result does not
    depend on the platform's default encoding.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

#### Create text embeddings

In [None]:
import openai
import pandas as pd
import json
from openai import AzureOpenAI

# Client through which every embedding request in this cell is sent.
azure_oai_client = AzureOpenAI(
    azure_endpoint=aoai_endpoint,
    api_version=aoai_api_version,
    api_key=aoai_key,
)

df = pd.read_json('../data/text/product_docs.json')

# Module-level openai settings.
# NOTE(review): every call below goes through azure_oai_client, so these look
# redundant - confirm before removing.
openai.api_type = "azure"
openai.api_key = aoai_key
openai.api_base = aoai_endpoint
openai.api_version = aoai_api_version


def _embed_value(value):
    """Request an embedding for one string and return the resulting vector."""
    response = azure_oai_client.embeddings.create(
        input=[value],
        model=aoai_embedding_deployed_model,
    )
    return response.data[0].embedding


# One request per row and per column: first the titles, then the contents.
df['title_vector'] = df['title'].apply(_embed_value)
df['content_vector'] = df['content'].apply(_embed_value)

df.to_json('../data/text/product_docs_embeddings.json', orient="records")

#### Create document embeddings

In [None]:
from PyPDF2 import PdfReader
import pandas as pd
from langchain.text_splitter import CharacterTextSplitter
import openai
import pandas as pd
import json
from openai import AzureOpenAI

# Client through which every embedding request in this cell is sent.
azure_oai_client = AzureOpenAI(
    azure_endpoint=aoai_endpoint,
    api_version=aoai_api_version,
    api_key=aoai_key,
)


def _embed_chunk(chunk_text):
    """Request an embedding for one chunk and return the resulting vector."""
    response = azure_oai_client.embeddings.create(
        input=[chunk_text],
        model=aoai_embedding_deployed_model,
    )
    return response.data[0].embedding


# Extract the text of every page and join the pages with single spaces.
pdf_reader = PdfReader('../data/docs/employee_handbook.pdf')
pages = [page.extract_text() for page in pdf_reader.pages]
text = " ".join(pages)

# Split on newlines into chunks of up to 1000 characters with 200 of overlap.
text_splitter = CharacterTextSplitter(
    length_function=len,
    chunk_overlap=200,
    chunk_size=1000,
    separator="\n",
)
chunks = text_splitter.split_text(text)

# One row per chunk; the id column mirrors the row index and goes first.
df = pd.DataFrame({"chunk_content": chunks})
df['chunk_content_vector'] = df['chunk_content'].apply(_embed_chunk)
df.insert(0, 'id', df.index)

df.to_json('../data/docs/employee_handbook_embeddings.json', orient="records")

#### Create chunk embeddings for the end-to-end evaluation sample

In [None]:
from langchain_community.document_loaders import DirectoryLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import AzureOpenAI
import json
from tqdm import tqdm

import os

# Output file for the embedded chunks.
EMBEDDING_FILE_PATH = "../data/chunking_evaluation/embeddings.json"

# Folder and glob pattern selecting the markdown documents to embed.
CHUNKING_DATA_PATH = "../data/chunking_evaluation/raw"
GLOB = "*.md"

# load the documents
loader = DirectoryLoader(CHUNKING_DATA_PATH, glob=GLOB, loader_cls=UnstructuredMarkdownLoader)
docs = loader.load()

# split the documents
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)
chunks = text_splitter.split_documents(docs)

# embed the documents
azure_oai_client = AzureOpenAI(
  api_key = aoai_key,
  api_version = aoai_api_version,
  azure_endpoint = aoai_endpoint
)

records = []
for i, chunk in enumerate(tqdm(chunks)):
    chunk_content = chunk.page_content
    chunk_content_vector = azure_oai_client.embeddings.create(input = [chunk_content], model=aoai_embedding_deployed_model).data[0].embedding
    # Note: this is the chunk's own metadata dict, so the update below also
    # changes chunk.metadata in place.
    metadata = chunk.metadata
    # Keep only the file name. os.path.basename understands the separators of
    # the platform the notebook runs on, whereas splitting on "/" left the
    # whole path in place for Windows-style paths.
    metadata['source'] = os.path.basename(metadata['source'])
    records.append({
        "id": str(i),
        "chunk_content": chunk_content,
        "chunk_content_vector": chunk_content_vector,
        # The metadata is stored as a JSON string, not as a nested object.
        "metadata": json.dumps(metadata)
    })

with open(EMBEDDING_FILE_PATH, "w") as f:
    json.dump(records, f)

#### Create image embeddings

In [None]:
import os
import requests
import pandas as pd

image_folder = "../data/images"

# The embeddings file written at the end of this cell lives in the same folder
# as the images. It must be skipped, otherwise running the cell a second time
# posts that JSON file to the image endpoint.
embeddings_file_name = "images_embeddings.json"

# Sorted so that ids are assigned in a reproducible order; sub-directories are
# skipped because they cannot be opened as image files.
image_list = sorted(
    name
    for name in os.listdir(image_folder)
    if name != embeddings_file_name
    and os.path.isfile(os.path.join(image_folder, name))
)

# Request settings are the same for every image, so build them once.
vectorize_img_url = f"{com_vision_endpoint}/computervision/retrieval:vectorizeImage"
params = {
    "api-version": com_vision_api_version
}
# NOTE(review): every file is sent as image/jpeg regardless of its extension -
# confirm the folder only holds JPEGs or that the service accepts this.
headers = {
    "Content-Type": "image/jpeg",
    "Ocp-Apim-Subscription-Key": com_vision_key
}

rows = []
for image_name in image_list:
    image_path = os.path.join(image_folder, image_name)

    # Read the bytes first so the file is closed before the network call.
    with open(image_path, "rb") as binary_file:
        binary_data = binary_file.read()

    response = requests.post(vectorize_img_url, params=params, headers=headers, data=binary_data)

    print(response)

    # Stop with the real HTTP error instead of a KeyError on "vector" below.
    response.raise_for_status()

    rows.append({'image': image_name, 'image_vector': response.json()["vector"]})

# Build the frame once; concatenating inside the loop copied it on every pass.
df = pd.DataFrame(rows, columns=['image', 'image_vector'])
df['id'] = df.index
df = df[['id', 'image', 'image_vector']]

df.to_json('../data/images/images_embeddings.json', orient="records")