# Retrieval Augmented Generation
Author: Cristian Velandia

Creation Date: 2024-03-02

RAG using the Pinecone vector DB, a local Llama 2 7B Chat model (via llama.cpp) and llama.cpp embeddings

In [1]:
from pinecone import Pinecone, ServerlessSpec
from llama_cpp import Llama
from tqdm.auto import tqdm
import pandas as pd
import json

from langchain.embeddings import LlamaCppEmbeddings
from llama_cpp import Llama


  from tqdm.autonotebook import tqdm


In [58]:
# Get API keys from a local JSON file so that no secret is hard-coded in the notebook.
# The file is opened in a `with` block so the handle is closed right after parsing.
with open('personal_creds.json', encoding='utf-8') as creds_file:
    creds = json.load(creds_file)

PINECONE_API_KEY = creds["PINECONE_API_KEY"]

## Setup Pinecone
### Log into API

In [3]:
# Authenticated Pinecone client, used by the following cells to manage and access indexes
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Define a unique and readable index name
INDEX_NAME = 'aws-docs-vdb-index'

### Create Index for Vector storage

In [4]:
# Start from a clean slate: if an index with this name already exists, drop it first.
existing_index_names = [existing.name for existing in pinecone.list_indexes()]
if INDEX_NAME in existing_index_names:
    pinecone.delete_index(INDEX_NAME)

# (Re)create the index; once created it becomes visible in the Pinecone console.
pinecone.create_index(
    name=INDEX_NAME,
    dimension=4096,  # vector length; presumably the size of the Llama embeddings — confirm
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'),
)

In [4]:
# Create the index object: the handle used later for upserting, describing and querying
index = pinecone.Index(INDEX_NAME)

### Load Previously Created Dataset
Here we load the dataset containing the corpus and its vectors so that they can be uploaded (upserted) into Pinecone afterwards.

In [5]:
# Read the previously created dataset (text chunks plus their vectors) from parquet.
# NOTE(review): this is an absolute, machine-specific path — the cell will not run on
# another machine; consider deriving it from a configurable data directory.
data_path = r"D:\Documents\GitHub\knowledge_pal_assistant\2_outputs\vectors_llama.parquet"
vectors = pd.read_parquet(data_path, engine = "pyarrow")

In [6]:
# Preview the first rows of the loaded dataset
vectors.head()

Unnamed: 0,index_x,id,page_content,metadata,tokens,nostopw_page_content,token_length,index_y,vect
0,0,0-0,# AWS::Events::Rule SageMakerPipelineParameter...,{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, AWS, :, :Events, :, :Rule, SageMakerPipeli...",# AWS : :Events : :Rule SageMakerPipelineParam...,39,0-0,"[0.884236752986908, 0.244439959526062, -0.5611..."
1,1,0-1,"## Syntax<a name=""aws-properties-events-rule-s...",{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, #, Syntax, <, a, name=, '', aws-properties...",# # Syntax < name= '' aws-properties-events-ru...,114,0-1,"[0.2725246548652649, -1.0039860010147095, 0.70..."
2,2,0-2,"## Properties<a name=""aws-properties-events-ru...",{'Header 1': 'AWS::Events::Rule SageMakerPipel...,"[#, #, Properties, <, a, name=, '', aws-proper...",# # Properties < name= '' aws-properties-event...,164,0-2,"[0.5978015661239624, -0.12955200672149658, -1...."
3,3,1-0,# Automating Amazon SageMaker with Amazon Even...,{'Header 1': 'Automating Amazon SageMaker with...,"[#, Automating, Amazon, SageMaker, with, Amazo...",# Automating Amazon SageMaker Amazon EventBrid...,315,1-0,"[-1.2290796041488647, -0.8195071816444397, -0...."
4,4,1-1,"## Training job state change<a name=""eventbrid...",{'Header 1': 'Automating Amazon SageMaker with...,"[#, #, Training, job, state, change, <, a, nam...",# # Training job state change < name= '' event...,445,1-1,"[1.082711935043335, 1.3398343324661255, -0.037..."


In [7]:
# Clean metadata: helper used to drop entries (the headers) that should not be stored
def pop_keys(d, key):
    """Return a shallow copy of ``d`` with ``key`` removed.

    The input dictionary is never modified. A ``key`` that is not present is
    silently ignored, so the clean-up can be applied again to metadata that
    was already cleaned (e.g. when the cell is re-run) without raising
    ``KeyError``.

    Parameters
    ----------
    d : dict
        Dictionary to copy.
    key : hashable
        Key to leave out of the copy.

    Returns
    -------
    dict
        Copy of ``d`` without ``key``.
    """
    tmp = d.copy()
    # Default value makes the removal a no-op when the key is absent.
    tmp.pop(key, None)
    return tmp

# Strip both header entries from every metadata dictionary, one key per pass.
for header_key in ("Header 1", "Header 2"):
    vectors["metadata"] = vectors["metadata"].apply(pop_keys, args=(header_key,))

In [8]:
# Inspect the cleaned metadata of the row labelled 0
vectors.loc[0, "metadata"]

{'chunk': 0,
 'source': 'D:\\Documents\\GitHub\\knowledge_pal_assistant\\0_data\\aws-properties-events-rule-sagemakerpipelineparameter.md',
 'text': '# AWS::Events::Rule SageMakerPipelineParameter<a name="aws-properties-events-rule-sagemakerpipelineparameter"></a>  \nName/Value pair of a parameter to start execution of a SageMaker Model Building Pipeline\\.\n \'data source =  D:\\Documents\\GitHub\\knowledge_pal_assistant\\0_data\\aws-properties-events-rule-sagemakerpipelineparameter.md\''}

### Upsert embeddings to Pinecone 

In [9]:
# Number of vectors sent to Pinecone in a single upsert request
BATCH_SIZE = 100

# Buffer holding the records of the batch currently being assembled
prepped = []

# Iterate through the data, one record per row
for i, row in tqdm(vectors.iterrows(), total=vectors.shape[0]):

    prepped.append({'id': row['id'], 'values': row['vect'], 'metadata': row['metadata']})

    # Flush the buffer as soon as a full batch has been collected
    if len(prepped) >= BATCH_SIZE:
        index.upsert(prepped)
        prepped = []

# Upload the last, partially filled batch. Without this step the trailing rows
# are silently dropped whenever the row count is not a multiple of BATCH_SIZE.
if prepped:
    index.upsert(prepped)
    prepped = []


100%|██████████| 800/800 [00:17<00:00, 46.74it/s]


In [9]:
# Describe the uploaded index (dimension, fullness and vector counts)
index.describe_index_stats()

{'dimension': 4096,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 800}},
 'total_vector_count': 800}

### Augment Llama Chat Queries 
In this section the query embeddings are computed and the LLM is called; afterwards, the prompts are built and tested. Here we test the RAG pipeline with the provided test questions.

In [2]:
# Location of the quantized model file on disk (Windows-style relative path)
models_folder = ".\\hf_models"
model_filename = "llama-2-7b-chat.ggmlv3.q4_K_M.bin"
model_name = "TheBloke/Llama-2-7B-Chat-GGML"  # presumably the source repo id; not referenced in this cell
llama_model_path = f"{models_folder}\\{model_filename}"

# The same model file backs both objects: one generates text, the other produces embeddings
llm_model = Llama(model_path=llama_model_path, n_ctx=4096)
emb_model = LlamaCppEmbeddings(
    model_path=llama_model_path,
    n_ctx=4096,
    n_gpu_layers=5,  # instantiate the embedding model with GPU offloading
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | VSX = 0 | 
AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | VSX = 0 | 


In [11]:
# First step: embed the question the user asks, so that the relevant information
# can be retrieved from the vector DB
query = "What are all AWS regions where SageMaker is available?"

embs = emb_model.embed_query(query)

In [48]:
# Ask the vector DB for the three entries closest to the query embedding
res = index.query(vector=embs, top_k=3, include_metadata=True)

# Keep only the chunk text stored in the metadata of each match
context = [match['metadata']['text'] for match in res['matches']]

# Visualize the retrieved passages, one after another
print(*context, sep='\n')


# AWS::SageMaker::ModelCard<a name="aws-resource-sagemaker-modelcard"></a>  
Creates an Amazon SageMaker Model Card\.  
For information about how to use model cards, see [Amazon SageMaker Model Card](https://docs.aws.amazon.com/sagemaker/latest/dg/model-cards.html)\.
 'data source =  D:\Documents\GitHub\knowledge_pal_assistant\0_data\aws-resource-sagemaker-modelcard.md'
## Syntax<a name="aws-properties-sagemaker-modelpackage-metadataproperties-syntax"></a>  
To declare this entity in your AWS CloudFormation template, use the following syntax:  
### JSON<a name="aws-properties-sagemaker-modelpackage-metadataproperties-syntax.json"></a>  
```
{
"[CommitId](#cfn-sagemaker-modelpackage-metadataproperties-commitid)" : String,
"[GeneratedBy](#cfn-sagemaker-modelpackage-metadataproperties-generatedby)" : String,
"[ProjectId](#cfn-sagemaker-modelpackage-metadataproperties-projectid)" : String,
"[Repository](#cfn-sagemaker-modelpackage-metadataproperties-repository)" : String
}
```  
### YAML<a

In [53]:
# Sequence delimiters
BOS = "<s>"
EOS = "</s>"

# Instruction delimiters
B_INST = "[INST]"
E_INST = "[/INST]"

# System-prompt delimiters
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"

# Behaviour description placed inside the system section of every prompt
SYSTEM_BEHAVIOR = (
    'You are "Knowledge pal", an assistant for developers. '
    "Your role consists on answering questions about cloud services, "
    "coding in different languages, and provide a detailed response everytime. "
    "You will provide the source of the answer you are giving and related sources. "
    "Those sources can be a path or URL. "
    "Answer the question by combining your knowledge with the context provided. "
    "If information is not clear or the context is not enough to give an answer, "
    "tell the user that you don't have the answer. \n\n "
)

In [54]:
# Assemble the prompt: system section (behaviour + retrieved context) followed by the question
prompt_parts = [
    B_INST,
    B_SYS,
    SYSTEM_BEHAVIOR,
    "Context: ",
    '\n'.join(context),
    E_SYS,
    "Question: ",
    query,
    " Answer: ",
    E_INST,
]
message = "".join(prompt_parts)
print(message)

[INST]<<SYS>>
You are "Knowledge pal", an assistant for developers. Your role consists on answering questions about cloud services, coding in different languages, and provide a detailed response everytime. You will provide the source of the answer you are giving and related sources. Those sources can be a path or URL. Answer the question by combining your knowledge with the context provided. If information is not clear or the context is not enough to give an answer, tell the user that you don't have the answer. 

 Context: # AWS::SageMaker::ModelCard<a name="aws-resource-sagemaker-modelcard"></a>  
Creates an Amazon SageMaker Model Card\.  
For information about how to use model cards, see [Amazon SageMaker Model Card](https://docs.aws.amazon.com/sagemaker/latest/dg/model-cards.html)\.
 'data source =  D:\Documents\GitHub\knowledge_pal_assistant\0_data\aws-resource-sagemaker-modelcard.md'
## Syntax<a name="aws-properties-sagemaker-modelpackage-metadataproperties-syntax"></a>  
To decla

In [55]:
# Run one non-streaming completion over the assembled prompt
knowledge_pal_response_gen = llm_model(
    prompt=message,
    stream=False,
    temperature=0.2,
    top_p=0.6,
    max_tokens=512,
    stop=None,
)

Llama.generate: prefix-match hit


In [56]:
# Print the generated text of the first completion choice
print(knowledge_pal_response_gen["choices"][0]["text"])

  As Knowledge Pal, I can provide you with a comprehensive list of AWS regions where Amazon SageMaker is available. Here are the regions where SageMaker is supported:
1. US East (N. Virginia)
2. US West (Oregon)
3. EU (Ireland)
4. EU (Frankfurt)
5. Asia Pacific (Singapore)
6. Asia Pacific (Tokyo)
7. Australia (Sydney)
8. Brazil (São Paulo)
9. Canada (Toronto)
10. China (Beijing)
11. India (Mumbai)
12. Japan (Tokyo)
13. Korea (Seoul)
14. Middle East (Bahrain)
15. Middle East (Dubai)
16. South Africa (Cape Town)
17. UK (London)
18. US East (N. Virginia)

Note that this list may not be exhaustive, and AWS may add or remove regions for SageMaker support in the future. It's always a good idea to check the official AWS documentation for the most up-to-date information on SageMaker availability in different regions.


In [80]:
# Collect the generated text for both call modes:
#   - stream=False returns a single response dictionary;
#   - stream=True returns an iterable of chunks that must be concatenated.
# Iterating a non-streaming response would walk its keys (strings) and fail on
# c["choices"], so the two cases are told apart explicitly.
if isinstance(knowledge_pal_response_gen, dict):
    knowledge_pal_response = knowledge_pal_response_gen["choices"][0]["text"]
else:
    knowledge_pal_response = "".join(
        chunk["choices"][0]["text"] for chunk in knowledge_pal_response_gen
    )
print('-' * 80)
print(knowledge_pal_response)

Llama.generate: prefix-match hit


--------------------------------------------------------------------------------
  Great, I'm glad you asked! To check if an endpoint is KMS encrypted in AWS SageMaker, you can use the `sagemaker-dataqualityjobdefinition` or `sagemaker-modelbiasjobdefinition` API.
Here's how to do it:
1. First, make sure that you have the necessary permissions to access the endpoint. You can check the permissions by using the `IAM` service in AWS CLI or SDKs.
2. Use the `sagemaker-dataqualityjobdefinition` or `sagemaker-modelbiasjobdefinition` API to create a data quality monitoring job or model bias job, respectively. You can use the `create_job()` method of the API to create the job.
3. In the request body of the API call, include the `EncryptionConfiguration` parameter with the `KmsKeyId` set to the ID of the KMS key that you want to use for encryption. You can find the KMS key ID in the AWS Management Console or using the `aws sagemaker describe-key` command.
4. Once the job is created, you can che

In [None]:
# Chat Helper for messaging