In [21]:
import os
from dotenv import load_dotenv
load_dotenv()

# load_dotenv() has already copied the .env entries into os.environ, so the
# values do not need to be re-assigned. What does need checking is that every
# credential is actually present: collect all missing (or empty) names first so
# a single run reports the complete list, instead of failing on the first
# absent key with an opaque "str expected, not NoneType" TypeError.
REQUIRED_ENV_VARS = (
    "OPENAI_API_KEY",
    "AWS_ACCESS_KEY_ID",
    "AWS_SECRET_ACCESS_KEY",
    "PINECONE_API",
)
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.environ.get(name)]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables: "
        + ", ".join(missing_env_vars)
        + ". Define them in the environment or in the .env file."
    )

# Kept as a module-level name because the Pinecone client is constructed with it.
pinecone_api_key = os.environ["PINECONE_API"]

# Data Ingestion Techniques


In [5]:
from langchain_community.document_loaders import S3FileLoader
# Location of the source PDF in S3.
S3_BUCKET = "rag-folder"
S3_KEY = "AI_Governance.pdf"

# Build the loader for that object and load its contents into `doc`.
reader = S3FileLoader(bucket=S3_BUCKET, key=S3_KEY)
doc = reader.load()

# Chunking or Splitting...

In [16]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Splitter settings, named so they are easy to tune in one place.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
# Split the loaded document(s) into chunks for embedding.
splitted_doc = text_splitter.split_documents(doc)


# Embeddings
Vector Store - Pinecone

In [None]:
from pinecone import Pinecone, ServerlessSpec
# Create the Pinecone client with the API key read during environment setup.
pc = Pinecone(
    api_key=pinecone_api_key,
)
# Left as the cell's last expression so the existing indexes are displayed.
pc.list_indexes()

In [24]:
# Name of the Pinecone index used by this notebook, and a handle to it.
index_name = "rag-index"
index = pc.Index(name=index_name)

In [None]:
# OpenAIEmbeddings is imported from langchain_openai (the package this notebook
# already uses for the LLM) rather than the deprecated langchain_community copy.
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

# Ingestion switch. It stays off by default so re-running the notebook does not
# insert the same chunks into the index again; set it to True for the run that
# should upload `splitted_doc`.
INGEST_DOCUMENTS = False

# Wrap the Pinecone index as a LangChain vector store backed by OpenAI embeddings.
docsearch = PineconeVectorStore(embedding=OpenAIEmbeddings(), index=index)
if INGEST_DOCUMENTS:
    docsearch.add_documents(documents=splitted_doc)

## Integrate LLM

In [26]:
from langchain_openai import OpenAI
# Completion-style OpenAI model, temperature 0, at most two retries on failure.
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0, max_retries=2)

## Design Prompt Template

In [27]:
from langchain_core.prompts import PromptTemplate

# The LLM in this notebook is a completion-style model (gpt-3.5-turbo-instruct),
# so a plain string PromptTemplate is used. A ChatPromptTemplate would be
# flattened to text with a "Human: " role prefix before reaching the model.
# {context} receives the retrieved documents; {input} receives the user question.
prompt = PromptTemplate.from_template("""
Answer the following question based on the provided context.
Think step by step before providing a detailed answer.
<context>
{context}
</context>
Question: {input}
""")

## Chain
create stuff document chain

In [28]:
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
# Chain that fills the prompt with the supplied documents and calls the LLM.
document_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt,
)

## Retrievers

In [29]:
# Expose the vector store as a retriever using the "mmr" search type.
retriever = docsearch.as_retriever(
    search_type="mmr",
)

### Retrieval chain(Retriever + Document chain):
    Retrieval chain takes in a user query, which is then passed to the retriever to fetch relevant documents. Those documents and original inputs are then passed to an LLM to generate a response.


In [30]:
from langchain.chains import create_retrieval_chain
# Combine the retriever with the document chain into one runnable.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=document_chain,
)

In [None]:
# Question sent through the retrieval chain under the "input" key.
question = "Give me summary of document"
response = retrieval_chain.invoke({"input": question})

In [40]:
# Display the generated answer from the chain's response.
response["answer"]

'Answer: The document discusses the importance of safeguarding AI development and the key rights that should be considered in this process. It also mentions various international and regional treaties and laws that protect human rights and how they are used by civil society to monitor state and corporate activities. The document emphasizes the need for governments to implement these rights within certain parameters.'