# PDF -> Embeddings -> Azure 

# Install Azure Cognitive Search SDK

In [None]:
%pip install --index-url=https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/ azure-search-documents==11.4.0a20230509004
%pip install azure-identity
%pip install langchain===0.0.200
%pip install openai, tiktoken
%pip install pypdf
%pip install python-dotenv

## Import required libraries

In [None]:
import os, json
import openai
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.schema import BaseRetriever
from langchain.vectorstores.azuresearch import AzureSearch

## Configure OpenAI settings
Configure the OpenAI SDK to use Azure OpenAI. The API key is read from the `AZURE_OPENAI_API_KEY` environment variable (optionally loaded from a `.env` file).

In [None]:
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# None when AZURE_OPENAI_API_KEY is not set in the environment.
azure_openai_api_key: str = os.getenv('AZURE_OPENAI_API_KEY')

# Embedding model name, passed to OpenAIEmbeddings later in the notebook.
model: str = "text-embedding-ada-002"

# Point the openai SDK at the Azure OpenAI resource.
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"
openai.api_base = "https://verx-corp-ai.openai.azure.com/"
openai.api_key = azure_openai_api_key
# NOTE(review): `openai.openai_api_key` does not look like a setting the SDK reads
# (`openai.api_key` is set just above) - kept as-is; confirm whether it is needed.
openai.openai_api_key = azure_openai_api_key

## Configure vector store settings
 
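Set up the vector store settings. The search key is read from the `AZURE_VECTOR_STORE_PASSWORD` environment variable; the endpoint and index name are set directly in the cell below.
Change the index name to your own index. The same index is also used by the similarity search at the end of the notebook.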

In [None]:
# Search service endpoint and the index that documents are written to and queried from.
vector_store_address: str = "https://violet-vector.search.windows.net"
index_name: str = "et-docs-html"

# Key handed to AzureSearch as `azure_search_key`; None when the variable is unset.
vector_store_password: str = os.getenv("AZURE_VECTOR_STORE_PASSWORD")

## Create embeddings and vector store instances
 
Create instances of the OpenAIEmbeddings and AzureSearch classes:

In [None]:
# Embedding client bound to the Azure OpenAI deployment.
embeddings: OpenAIEmbeddings = OpenAIEmbeddings(
    deployment='VERX-CORP-ADA-002',
    model=model,
    openai_api_key=azure_openai_api_key,
    # NOTE(review): presumably 1 because of a per-request input limit on the Azure
    # embeddings endpoint - confirm before raising it.
    chunk_size=1,
)

# Vector store for the index named by `index_name`; `embeddings.embed_query`
# is supplied as the embedding function.
vector_store: AzureSearch = AzureSearch(
    embedding_function=embeddings.embed_query,
    index_name=index_name,
    azure_search_key=vector_store_password,
    azure_search_endpoint=vector_store_address,
)



## PDF to Text and embeddings into vector store

Load

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader

# Directory handed to the PDF directory loader.
pdf_directory = "../data/"

loader = PyPDFDirectoryLoader(pdf_directory)
docs = loader.load()

# Show how many documents the loader returned.
len(docs)


Split

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents with a chunk size of 1500 and no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1500,
)
split_docs = text_splitter.split_documents(docs)

# Show how many chunks were produced.
len(split_docs)


Store

In [None]:
# Add the chunks in `split_docs` to the vector store. As the cell's last
# expression, the call's return value is shown as the cell output.
vector_store.add_documents(documents=split_docs)

## Insert text and embeddings into vector store
 
Load a plain-text file, split it into chunks, and add the chunks to the vector store:

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the plain-text file as UTF-8.
loader = TextLoader('../data/mobydick.txt', encoding='utf-8')
documents = loader.load()

# Same chunking parameters as the PDF section: size 1500, no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1500,
)
docs = text_splitter.split_documents(documents)

# Add the chunks to the vector store; the return value is the cell output.
vector_store.add_documents(documents=docs)

## URL 
This covers how to load HTML documents from a list of URLs.

In [None]:
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the pages at the listed URLs as documents.
loader = WebBaseLoader(["https://www.espn.com/", "https://www.google.com"])
html_data = loader.load()

# Same chunking parameters as the other ingestion sections: size 1500, no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=0)
split_docs = text_splitter.split_documents(html_data)

# Only a cell's last expression is displayed, so a bare `len(split_docs)` here
# would be evaluated and silently discarded; print the chunk count instead.
print(len(split_docs))

# Add the chunks to the vector store; the return value is the cell output.
vector_store.add_documents(documents=split_docs)


## Perform a vector similarity search
 
Execute a pure vector similarity search using the similarity_search() method:

In [None]:
# Run a pure vector similarity search and request only the top match (k=1).
docs = vector_store.similarity_search(query="mock draft", k=1, search_type='similarity')

# The result list can be empty (for example, if nothing has been indexed yet);
# guard the lookup so the cell reports that instead of raising IndexError.
if docs:
    print(docs[0].page_content)
else:
    print("No matching documents found.")
