In [None]:
# Report the version of the interpreter that is running this kernel.
# A shell escape (`!python`) resolves through the shell PATH and can name a
# different interpreter than the one executing the notebook.
import sys

print(sys.version)

In [None]:
!pip install faiss-cpu

In [None]:
import sagemaker, boto3, json
from sagemaker.session import Session

# A single SageMaker session backs both names. `sess` is kept as an alias of
# `sagemaker_session` (instead of constructing a second, identical session)
# so code referring to either name keeps working.
sagemaker_session = Session()
sess = sagemaker_session

# Caller identity ARN as returned by the SageMaker session.
aws_role = sagemaker_session.get_caller_identity_arn()
# Region name taken from a default boto3 session.
aws_region = boto3.Session().region_name

# S3 client plus the bucket / object key of the source PDF. `key` is also the
# local file name handed to PyPDFLoader further down.
s3 = boto3.client('s3')
bucket = 'powersensor-data'
key = 'AboutPowersensor.pdf'

In [None]:
from typing import Dict, List
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.llms.sagemaker_endpoint import ContentHandlerBase
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
import json


class ContentHandler(EmbeddingsContentHandler):
    """Convert between lists of texts and the embedding endpoint's JSON payloads."""

    # Both values are "application/json".
    # NOTE(review): presumably forwarded by the base class as the request's
    # ContentType / Accept headers -- confirm against the langchain version in use.
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, inputs: List[str], model_kwargs: Dict) -> bytes:
        """Build the UTF-8 encoded JSON request body.

        Args:
            inputs: Texts to embed; placed under the ``"text_inputs"`` key.
            model_kwargs: Extra fields merged into the top level of the
                payload (``None`` is treated as no extra fields). A
                ``"text_inputs"`` entry here overrides ``inputs``.

        Returns:
            The JSON document encoded as UTF-8 bytes.
        """
        # typing.List is used (not list[str]) so the class definition also
        # evaluates on interpreters older than Python 3.9.
        payload = {"text_inputs": inputs, **(model_kwargs or {})}
        return json.dumps(payload).encode("utf-8")

    def transform_output(self, output: bytes) -> List[List[float]]:
        """Extract the embeddings from the endpoint response.

        Args:
            output: Response body. ``.read()`` is called on it, so despite
                the ``bytes`` annotation it must be a file-like object whose
                ``read()`` returns UTF-8 encoded JSON.

        Returns:
            The value stored under the ``"embedding"`` key of the response.

        Raises:
            KeyError: If the response has no ``"embedding"`` key.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        return response_json["embedding"]


# Name of the SageMaker endpoint used for text embeddings.
EMBEDDING_ENDPOINT_NAME = "hf-textembedding-all-minilm-l6-v2"

# Handler that serialises requests to / parses responses from that endpoint.
content_handler = ContentHandler()

# Embeddings object wired to the endpoint in the notebook's region.
embeddings = SagemakerEndpointEmbeddings(
    content_handler=content_handler,
    region_name=aws_region,
    endpoint_name=EMBEDDING_ENDPOINT_NAME,
)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS

from langchain.document_loaders import PyPDFLoader

import os

# PyPDFLoader reads the PDF from the local path `key`. When that file is not
# on disk yet, fetch it from the S3 bucket defined in the setup cell instead
# of failing on a fresh kernel.
if not os.path.exists(key):
    s3.download_file(bucket, key, key)

loader = PyPDFLoader(key)
documents = loader.load()

print(type(documents))
# TODO: tune chunk_size / chunk_overlap based on result quality and document length.
# NOTE(review): the embedding model behind the endpoint may truncate inputs
# well below 2000 characters -- confirm its maximum sequence length.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
    length_function=len,
)
chunks = text_splitter.split_documents(documents)

In [None]:
# Build the FAISS vector store from the chunks, using `embeddings` to
# vectorise them.
vector_db = FAISS.from_documents(documents=chunks, embedding=embeddings)

In [None]:
# Write the vector store to the local folder named by VECTOR_DB_DIR.
VECTOR_DB_DIR = "vector-db"
vector_db.save_local(folder_path=VECTOR_DB_DIR)

In [None]:
# Sample question; `docs` receives the matches returned by the vector store.
query = "Can I install this myself"
docs = vector_db.similarity_search(query=query)

In [None]:
# Show how many matches came back, then each match's text followed by two
# blank lines.
print(len(docs))
for doc in docs:
    print(doc.page_content, end='\n\n\n')

In [None]:
def query_endpoint(
    encoded_text,
    endpoint_name='jumpstart-dft-sentence-encoder-cmlm-en-large-1',
    client=None,
):
    """Invoke a SageMaker endpoint with a plain-text body and return the raw response.

    Args:
        encoded_text: Text to send. ``bytes`` are sent unchanged; a ``str`` is
            encoded as UTF-8 first so the request body is always bytes.
        endpoint_name: Endpoint to invoke. Defaults to the endpoint this
            function originally had hard-coded.
        client: Optional object exposing ``invoke_endpoint(**kwargs)``. When
            omitted, a new ``boto3.client('runtime.sagemaker')`` is created
            for the call. Passing one allows client reuse across calls.

    Returns:
        Whatever ``client.invoke_endpoint`` returns, unmodified.
    """
    if isinstance(encoded_text, str):
        encoded_text = encoded_text.encode('utf-8')
    if client is None:
        client = boto3.client('runtime.sagemaker')
    return client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/x-text',
        Body=encoded_text,
        Accept='application/json;verbose',
    )

In [None]:
# Smoke-test the endpoint with a fixed string and decode the JSON response body.
query_response = query_endpoint("this is a test string")
model_predictions = json.loads(query_response['Body'].read())

# Pull both fields out of the response before binding either name.
embedding, model_output = (
    model_predictions[field] for field in ('embedding', 'model_output')
)
print(embedding, model_output, sep='\n')