### Install libs

In [2]:
%pip install --no-build-isolation --force-reinstall \
    "boto3>=1.28.57" \
    "awscli>=1.29.57" \
    "botocore>=1.31.57"

%pip install -U opensearch-py==2.3.1 langchain==0.0.309 "pypdf>=3.8,<4" \
    apache-beam \
    datasets \
    tiktoken


Collecting boto3>=1.28.57
  Obtaining dependency information for boto3>=1.28.57 from https://files.pythonhosted.org/packages/ff/e9/8ece7607d288c1de22638a7223ae1dc41a34e8cd1511ebfc8171ac24db9d/boto3-1.28.73-py3-none-any.whl.metadata
  Downloading boto3-1.28.73-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli>=1.29.57
  Obtaining dependency information for awscli>=1.29.57 from https://files.pythonhosted.org/packages/f9/af/ac25b9236912890e5476b1153afdec2a0f4b95f26609db6dc541b93792a0/awscli-1.29.73-py3-none-any.whl.metadata
  Downloading awscli-1.29.73-py3-none-any.whl.metadata (11 kB)
Collecting botocore>=1.31.57
  Obtaining dependency information for botocore>=1.31.57 from https://files.pythonhosted.org/packages/fa/f3/c369381bf6b2913e201d91be68d34e1ad3dfe9163ae7e8203606d09ee07c/botocore-1.31.73-py3-none-any.whl.metadata
  Downloading botocore-1.31.73-py3-none-any.whl.metadata (6.1 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3>=1.28.57)
  Using cached jmespath-1.0.1-py3-none-any.w

In [2]:
0

0

In [3]:
import warnings
warnings.filterwarnings('ignore')
import json
import os
import sys

import boto3

module_path = ".."
sys.path.append(os.path.abspath(module_path))
from utils import bedrock, print_ww


# ---- ⚠️ Un-comment and edit the lines below as needed for your AWS setup ⚠️ ----

# os.environ["AWS_DEFAULT_REGION"] = "<REGION_NAME>"  # E.g. "us-east-1"
# os.environ["AWS_PROFILE"] = "<YOUR_PROFILE>"
# os.environ["BEDROCK_ASSUME_ROLE"] = "<YOUR_ROLE_ARN>"  # E.g. "arn:aws:..."

# The region is pinned here and read back from the environment when the
# Bedrock client is created.
os.environ["AWS_DEFAULT_REGION"] = "us-east-1"

# Create the Bedrock client through the project helper in `utils`.
# To assume a role, also pass:
#     assumed_role=os.environ.get("BEDROCK_ASSUME_ROLE", None)
boto3_bedrock = bedrock.get_bedrock_client(region=os.environ.get("AWS_DEFAULT_REGION"))


from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock
from langchain.load.dump import dumps

# Anthropic Claude v2 through Bedrock, reusing the client created above;
# model_kwargs passes max_tokens_to_sample=200 along with each request.
llm = Bedrock(
    model_id="anthropic.claude-v2",
    client=boto3_bedrock,
    model_kwargs={"max_tokens_to_sample": 200},
)



Create new client
  Using region: us-east-1
boto3 Bedrock client successfully created!
bedrock-runtime(https://bedrock-runtime.us-east-1.amazonaws.com)


### Data preparation

In [4]:
# DATA PREPARATION

import numpy as np
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader

# Read the PDF files located in ./vndata/.
loader = PyPDFDirectoryLoader("./vndata/")
documents = loader.load()

# Split the loaded documents with the recursive character splitter,
# using chunk_size=2000 and chunk_overlap=200.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
docs = text_splitter.split_documents(documents)



In [None]:
# Number of chunks returned by text_splitter.split_documents().
len(docs)

In [5]:
def avg_doc_length(documents):
    """Return the average ``page_content`` length of ``documents``, floored to an int.

    Args:
        documents: Sized collection of objects exposing a ``page_content`` string.

    Returns:
        ``sum(len(doc.page_content)) // len(documents)``, or ``0`` when
        ``documents`` is empty (instead of raising ``ZeroDivisionError``, which
        is what happens when no documents were loaded).
    """
    if not documents:
        return 0
    # Generator expression: no intermediate list is needed just to sum lengths.
    return sum(len(doc.page_content) for doc in documents) // len(documents)
# Average page_content length before (loader output) and after splitting.
avg_char_count_pre = avg_doc_length(documents)
avg_char_count_post = avg_doc_length(docs)

# One print call, one line per message.
print(
    f"Average length among {len(documents)} documents loaded is {avg_char_count_pre} characters.",
    f"After the split we have {len(docs)} documents more than the original {len(documents)}.",
    f"Average length among {len(docs)} documents (after split) is {avg_char_count_post} characters.",
    sep="\n",
)




Average length among 3 documents loaded is 1557 characters.
After the split we have 4 documents more than the original 3.
Average length among 4 documents (after split) is 1165 characters.


## OpenSearch setup

In [6]:
import boto3
import time
# Resource names used for the vector store and its index.
vector_store_name = "llm-rag"
index_name = "rag-sbert"

# Policy names (encryption, network, access).
encryption_policy_name = "llm-rag-sp"
network_policy_name = "llm-rag-np"
access_policy_name = "llm-rag-ap"

# ARN of the caller identity, as reported by STS.
identity = boto3.client("sts").get_caller_identity()["Arn"]

# boto3 client for the OpenSearch Serverless API.
aoss_client = boto3.client("opensearchserverless")


In [7]:
# TODO: Replace the value below with the URL of your own OpenSearch endpoint.

host = 'https://1n3li4pv4s7jhgykpmie.us-east-1.aoss.amazonaws.com:443'



## SageMaker embedding

In [8]:
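## TODO:
# Change endpoint_name (further down in this cell) to the name of your own deployed SageMaker SBERT endpoint.

## Custom content handler for SageMaker embeddings (see the LangChain documentation on customizing the SageMaker embedding class)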

from typing import Dict, List
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
import json


class ContentHandler(EmbeddingsContentHandler):
    """Serialize embedding requests for, and parse responses from, a SageMaker endpoint.

    Requests are encoded as a JSON object with an "inputs" key (plus any
    model kwargs); responses are read as a JSON object whose "vectors" key
    holds the embeddings.
    """

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, inputs: List[str], model_kwargs: Dict) -> bytes:
        """
        Transforms the input into bytes that can be consumed by SageMaker endpoint.
        Args:
            inputs: List of input strings.
            model_kwargs: Additional keyword arguments to be passed to the endpoint;
                they are merged into the top level of the JSON payload.
        Returns:
            The UTF-8 encoded JSON payload.
        """
        # Example: inference.py expects a JSON string with a "inputs" key:
        input_str = json.dumps({"inputs": inputs, **model_kwargs})
        return input_str.encode("utf-8")

    def transform_output(self, output: bytes) -> List[List[float]]:
        """
        Transforms the output from the endpoint into a list of embeddings.
        Args:
            output: The response body from the SageMaker endpoint, either raw
                bytes or a stream-like object exposing ``read()``.
        Returns:
            The transformed output - list of embeddings
        Raises:
            KeyError: If the decoded response has no "vectors" key.
        Note:
            The length of the outer list is the number of input strings.
            The length of the inner lists is the embedding dimension.
        """
        # Accept both a readable body and plain bytes, so the declared
        # ``bytes`` type works as well as a stream.
        raw = output.read() if hasattr(output, "read") else output
        # Example: inference.py returns a JSON string with the list of
        # embeddings in a "vectors" key:
        response_json = json.loads(raw.decode("utf-8"))
        return response_json["vectors"]


# Handler instance that encodes requests and decodes responses for the endpoint.
content_handler = ContentHandler()



# Embeddings object that sends texts to the SageMaker endpoint named below,
# using content_handler for (de)serialization.
sbert_batch_embeddings = SagemakerEndpointEmbeddings(
    # credentials_profile_name="credentials-profile-name",
    endpoint_name="huggingface-pytorch-inference-2023-10-20-04-45-11-397", # TODO: change this to the name of your own SBERT endpoint
    region_name="us-east-1",  # TODO: change this to the Region where your SBERT endpoint is deployed
    content_handler=content_handler,
)

In [11]:
# Smoke check against the live endpoint: embed one query string and a
# one-element batch, then print the sizes of what comes back.
res_1 = sbert_batch_embeddings.embed_query("foo")
print(len(res_1)) # length of the query embedding; expect 768

res_2 = sbert_batch_embeddings.embed_documents(["foo"])
print(len(res_2) ) # number of embeddings returned; expect 1 (one input string)
print(len(res_2[0]) ) # length of that embedding; expect 768

768
1
768


## Ingest into Amazon OpenSearch Serverless (AOSS)

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
from langchain.vectorstores import OpenSearchVectorSearch

import langchain 


# Request signer for the "aoss" service, built from the current boto3 session
# credentials and the region stored in AWS_DEFAULT_REGION.
service = "aoss"
credentials = boto3.Session().get_credentials()
auth = AWSV4SignerAuth(credentials, os.environ.get("AWS_DEFAULT_REGION"), service)

# Embed `docs` with the SageMaker embeddings and write them to `index_name`
# on the OpenSearch endpoint at `host`.
docsearch = OpenSearchVectorSearch.from_documents(
    docs,
    sbert_batch_embeddings,
    # connection settings
    opensearch_url=host,
    http_auth=auth,
    connection_class=RequestsHttpConnection,
    use_ssl=True,
    verify_certs=True,
    timeout=100,
    # indexing settings
    index_name=index_name,
    engine="faiss",
    bulk_size=1000,
)

