# Indexing with Retrieval QA - with Pinecone

https://www.youtube.com/watch?v=inAY6M6UUkk <br>
https://www.youtube.com/watch?v=cVA1RPsGQcw - embedding and vector db pinecone <br>
https://www.youtube.com/watch?v=DXmiJKrQIvg


In [None]:
import os
from dotenv import load_dotenv
# Load settings from a local .env file before anything else runs; the notebook
# later reads PINECONE_API_KEY, PINECONE_ENV and OPENAI_API_KEY from the
# environment.
load_dotenv()

In [None]:
import openai
import langchain
import pinecone

## Loading Document

In [None]:
# Relative path of the plain-text document that is loaded and indexed below.
filename = "data/authorize_doc/Kuiper_FCC-20-102A1.txt"

In [None]:
#loading the document
def import_document(filename):
    """Read a text file and return its contents as a string.

    The encoding is chosen as follows:

    * If the file starts with a Unicode byte-order mark (UTF-8, UTF-16 or
      UTF-32), the matching codec is used and the mark is stripped.
    * Otherwise UTF-8 is tried first, then cp1252, and finally ISO-8859-1.
      ISO-8859-1 accepts every byte sequence, so it has to come last;
      anything listed after it would never be tried.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text, or ``None`` if the file is missing, cannot be read
        or cannot be decoded. A message describing the problem is printed in
        those cases.
    """
    import codecs  # local import: only this function needs the BOM constants

    # UTF-32 marks are checked before UTF-16 because the UTF-32-LE mark
    # begins with the same two bytes as the UTF-16-LE mark.
    bom_encodings = [
        (codecs.BOM_UTF32_LE, 'utf-32'),
        (codecs.BOM_UTF32_BE, 'utf-32'),
        (codecs.BOM_UTF8, 'utf-8-sig'),
        (codecs.BOM_UTF16_LE, 'utf-16'),
        (codecs.BOM_UTF16_BE, 'utf-16'),
    ]
    try:
        with open(filename, 'rb') as file:
            head = file.read(4)

        # A byte-order mark is an explicit declaration, so only that codec is
        # tried; without one, fall back from strictest to most permissive.
        encodings = next(
            ([enc] for bom, enc in bom_encodings if head.startswith(bom)),
            ['utf-8', 'cp1252', 'ISO-8859-1'],
        )
        for enc in encodings:
            try:
                # Text mode keeps universal-newline handling.
                with open(filename, 'r', encoding=enc) as file:
                    return file.read()
            except UnicodeDecodeError:
                continue
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return None
    except Exception as e:
        print(f"Error occurred while importing the document: {e}")
        return None
    print(f"Error: Could not decode file with any of the tried encodings: {encodings}")
    return None

# Load the file and, when loading succeeded, echo the text so the decoding
# can be checked by eye.
document = import_document(filename)
if document is not None:
    print("Document content:", document, sep="\n")

## Chunking and overlapping

In [None]:
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# import_document() returns None when the file could not be read or decoded.
# Stop here with a clear message instead of failing inside Document().
if document is None:
    raise ValueError("No document text available: import_document() returned None.")

# Wrap the raw text in a LangChain Document so it can be passed to the splitter
doc = Document(page_content=document)

# Configure the splitter that cuts the text into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,      # maximum chunk length, as measured by length_function
    chunk_overlap=100,    # amount of text shared between consecutive chunks
    length_function=len,  # measure length in characters
    keep_separator=True,  # keep the separator text in the resulting chunks
)

# Split the document into smaller chunks
split_docs = text_splitter.split_documents([doc])



In [None]:
# Inspect the result: the document is now split into chunks, and consecutive chunks overlap
split_docs

## Embedding

Create embeddings for all the document chunks using OpenAI's "ada" model (`text-embedding-ada-002`) and store them in a Pinecone vector database.

In [None]:
from langchain.llms import OpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

In [None]:
# List the models that are available to this OpenAI account
openai.Model.list()

In [None]:
# Embedding model. The Python OpenAIEmbeddings field is named `model`;
# `modelName` is the LangChain.js spelling and is not a field of this class.
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Embed a throwaway string to find the vector dimension, which is needed
# when the Pinecone index is created.
query_result = embeddings.embed_query("MEOW_OW_OW_000")
len(query_result)

Here we use Pinecone as the vector store, with the dimension set to 1536 (the length of the embedding computed above) and cosine similarity as the metric.
Other vector stores, such as Chroma or a custom one, can be used instead.

In [None]:
# Pinecone credentials are read from the environment
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pinecone_env = os.environ.get("PINECONE_ENV")

# Both values are required; an empty string counts as missing
if not (pinecone_api_key and pinecone_env):
    raise ValueError("Environment variables not set.")

# Connect the Pinecone client
pinecone.init(api_key=pinecone_api_key, environment=pinecone_env)

# Name of the Pinecone index used by the rest of the notebook
index_name = "indexing"


In [None]:
# Make sure the target index exists. Its dimension is taken from the probe
# embedding computed earlier, so it matches the embedding model in use.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(name=index_name, dimension=len(query_result), metric='cosine')

# Embed every chunk and upload the vectors to the index.
# NOTE(review): re-running this cell presumably uploads the same chunks
# again — confirm whether duplicates are created in the index.
docsearch = Pinecone.from_documents(split_docs, embeddings, index_name=index_name)


You can now check your Pinecone index to see whether the vectors have been stored.

## Query the chunk with questions

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

In [None]:
# Read the OpenAI key from the environment and fail fast when it is absent
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY not set.")

# Settings for the chat model used to answer the questions
chat_settings = {
    "model_name": "gpt-3.5-turbo",
    "temperature": 0,  # Don't be creative and make up an answer
    "request_timeout": 120,
    "openai_api_key": openai_api_key,
}
llm = ChatOpenAI(**chat_settings)


In [None]:
# Setting up the retriever
# Alternative: only retrieve documents whose relevance score is above a certain threshold
#retriever = docsearch.as_retriever(search_type="similarity_score_threshold", search_kwargs={'score_threshold': 0.8})

# Retrieve the 2 most similar chunks from the vector store for each query (k=2)
retriever = docsearch.as_retriever(search_kwargs={'k': 2})


RESOURCE: creating a chain, retriever, and custom prompt - https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_qa
We can also modify the search by passing specific search arguments through the retriever to the search function, using the search_kwargs keyword argument.

- k defines how many documents are returned; defaults to 4.
- score_threshold allows you to set a minimum relevance for documents returned by the retriever, if you are using the "similarity_score_threshold" search type.
- fetch_k determines the amount of documents to pass to the MMR algorithm; defaults to 20.
- lambda_mult controls the diversity of results returned by the MMR algorithm, with 1 being minimum diversity and 0 being maximum. Defaults to 0.5.
- filter allows you to define a filter on what documents should be retrieved, based on the documents' metadata. This has no effect if the Vectorstore doesn't store any metadata.

In [None]:
# Prompt used by the question-answering chain
from langchain.prompts import PromptTemplate

# Plain template text; the {context} and {question} placeholders are filled
# in by PromptTemplate rather than by an f-string.
prompt_template = """
Follow exactly those 3 steps:
1. Read the context below and aggregate this data
Context: {context}
2. Answer the question using only this context
3. Show the source for your answers
User Question: {question}

If you don't have any context and are unsure of the answer, reply that you don't know about this topic.
"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Extra arguments handed to the chain so that it uses the custom prompt
chain_type_kwargs = dict(prompt=PROMPT)

In [None]:
# Build the retrieval QA chain: it uses the retriever to fetch chunks, the
# custom prompt and the chat model to answer, and it also returns the chunks
# it used as sources.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs,
    return_source_documents=True,
)

In [None]:
# Display the chain object to inspect how it is configured
qa

In [None]:
# Questions to ask about the document, keyed by a short identifier.
# Insertion order is the order in which they are asked and printed.
questions = {
    # Identification and dates
    "const_name": "What's the name of the satellite constellation the company seeks to deploy or operate?",
    "date_release": "On which date was the document released?",
    "date_50": "By which date must the company launch and operate half of its satellites?",
    "date_100": "By which date is the company expected to have all its satellites operational?",
    # Constellation size and orbital layout
    "total_sat_const": "How many satellites is the company authorized to deploy and operate for this constellation?",
    "altitude": "At which authorized altitudes will the company deploy its satellites?",
    "inclination": "What are the authorized satellite inclinations within the corresponding altitudes?",
    "number_orb_plane": "How many orbital planes, corresponding to given altitudes and inclinations, has the company been authorized for?",
    "total_sat_per_orb_plane": "How many satellites are allocated to each orbital plane?",
    "total_sat_per_alt_incl": "How many satellites, for each altitude and inclination, are there across all matching orbital planes?",
    # Lifetime
    "operational_lifetime": "What is the satellite's expected operational lifetime in years?",
}

# Answers and supporting source documents, both keyed like `questions`
results_content, source_documents = {}, {}

In [None]:
# Ask every question. The RetrievalQA chain runs the retriever itself, so
# only the "query" input is passed; fetching the documents separately would
# repeat the same retrieval for every question.
for key, query in questions.items():
    result = qa({"query": query})
    results_content[key] = result['result']
    # Available because the chain was built with return_source_documents=True
    source_documents[key] = result['source_documents']

# Print each question with its answer and the chunks the answer was based on
for key, answer in results_content.items():
    print(f"\n## Question ({key}): {questions[key]}\n")
    print(f"Answer: {answer}\n")
    print(f"Source Documents: {source_documents[key]}\n")
