# Problem

##### Region = eu-west-1
##### Jumpstart Version = 2.0.4
##### Meta Llama-2-13b instance type = ml.g5.12xlarge
##### llama-index==0.8.35

## Pip install the following 
llama-index==0.8.35

pypdf==3.16.4

transformers==4.34.1

torchvision==0.16.0

langchain==0.0.317

langsmith==0.0.49

openai==0.28.0

In [23]:
from llama_index import SimpleDirectoryReader, ServiceContext, VectorStoreIndex, download_loader
from llama_index.llms import OpenAI
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.query_engine import SubQuestionQueryEngine

import nest_asyncio

import logging
import sys
import json
from langchain.llms.sagemaker_endpoint import LLMContentHandler
from langchain.llms.sagemaker_endpoint import SagemakerEndpoint

from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt

from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index import StorageContext, load_index_from_storage
from IPython.display import Markdown, display
from llama_index.prompts import PromptTemplate

logging.basicConfig(stream=sys.stdout, level=logging.INFO)  # Change INFO to DEBUG if you want more extensive logging
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

nest_asyncio.apply()

llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

# S3Reader = download_loader("S3Reader")
# loader = S3Reader(bucket='llm-sql', prefix='ec2_qbr_priv/financials-llama-index/data/')

loader = SimpleDirectoryReader('./data', recursive=True, exclude_hidden=True)

fin_docs = loader.load_data()

class ContentHandlerForTextGeneration(LLMContentHandler):
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: dict) -> bytes:
        input_str = json.dumps({"inputs": [[{"role": "user", "content": prompt},]],
                                  "parameters" : model_kwargs
                                  })
        return input_str.encode('utf-8')

    def transform_output(self, output: bytes) -> str:
        response_json = json.loads(output.read().decode("utf-8"))
        return response_json[0]['generation']['content']

parameters = {
    "max_new_tokens": 1024,
    "temperature": 0.1,}
region="eu-west-1"

endpoint_name = "meta-textgeneration-llama-2-13b-f-2024-04-11-10-59-12-609"
# endpoint_name= "huggingface-pytorch-tgi-inference-2024-04-11-11-15-17-687"
content_handler = ContentHandlerForTextGeneration()
llm = SagemakerEndpoint(
    endpoint_name=endpoint_name,
    region_name=region,
    model_kwargs=parameters,
    endpoint_kwargs={"CustomAttributes":"accept_eula=true"},
    content_handler=content_handler)
    

# ------------------------------------------------------------------------------------------
storage_directory = "index"
# chunk_size - It defines the size of the chunks (or nodes) that documents are broken into when they are indexed by LlamaIndex
service_context = ServiceContext.from_defaults(llm=llm, chunk_size=600,
                                               embed_model="local",
                                               callback_manager=callback_manager)

# Build the index
index = VectorStoreIndex.from_documents(fin_docs, service_context=service_context, show_progress=True)

# Persist the index to disk
index.storage_context.persist(persist_dir=storage_directory)

storage_context = StorageContext.from_defaults(persist_dir=storage_directory)
index = load_index_from_storage(storage_context, service_context=service_context)

query_engine = index.as_query_engine(service_context=service_context,
                                     similarity_top_k=3)
response = query_engine.query("Give me a summary of the document")
display(Markdown(f"<b>{response}</b>"))

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


Parsing documents into nodes:   0%|          | 0/2 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/60 [00:00<?, ?it/s]

**********
Trace: index_construction
    |_node_parsing ->  0.14815 seconds
      |_chunking ->  0.06846 seconds
      |_chunking ->  0.066087 seconds
    |_embedding ->  1.500552 seconds
    |_embedding ->  1.529617 seconds
    |_embedding ->  1.539742 seconds
    |_embedding ->  1.528634 seconds
    |_embedding ->  1.517231 seconds
    |_embedding ->  1.536484 seconds
**********
INFO:llama_index.indices.loading:Loading all indices.
Loading all indices.
Loading all indices.
Loading all indices.
Loading all indices.
Loading all indices.
Loading all indices.
Loading all indices.
Loading all indices.
**********
Trace: index_construction
**********
**********
Trace: query
    |_query ->  4.87616 seconds
      |_retrieve ->  0.031331 seconds
        |_embedding ->  0.027563 seconds
      |_synthesize ->  4.843677 seconds
        |_templating ->  3.5e-05 seconds
        |_llm ->  4.835328 seconds
**********


<b> Sure! Based on the context information provided, here's a summary of the document:

The document is a personal essay by Paul Graham, a well-known computer programmer and investor. He describes his experience working at Interleaf, a software company, and later co-founding Viaweb and Y Combinator, two successful startups.

Graham highlights the lessons he learned during his time at Interleaf, including the importance of being the "entry-level" option, the dangers of prestige, and the value of working in a small, agile team. He also discusses the challenges of working on Hacker News, a news aggregator for startup founders, which was a significant source of stress for him.

Overall, the document provides insight into Graham's career and the lessons he has learned along the way, as well as his thoughts on the tech industry and startup culture.</b>

In [28]:
query_engine = index.as_query_engine(service_context=service_context,
                                     similarity_top_k=3, streaming=True)

In [29]:
response = query_engine.query("Give me a summary of the document")
display(Markdown(f"<b>{response}</b>"))

**********
Trace: query
    |_query ->  0.037419 seconds
      |_retrieve ->  0.031565 seconds
        |_embedding ->  0.027714 seconds
      |_synthesize ->  0.00558 seconds
        |_templating ->  2.7e-05 seconds
        |_llm ->  0.0 seconds
    |_llm ->  0.0 seconds
**********


Exception in thread Thread-9 (wrapped_llm_predict):
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/langchain/llms/sagemaker_endpoint.py", line 342, in _call
    resp = json.loads(line)
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/json/__init__.py", line 346, in loads


<b>None</b>

    return _default_decoder.decode(s)
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 2 (char 1)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 761, in run_closure
    _threading_Thread_run(self)
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ec2-