# Part-1: pinecone

In [None]:
!pip install langchain-community langchain-core langchain
!pip install pinecone-client
!pip install pypdf
!pip install openai
!pip install tiktoken

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os

In [None]:
!mkdir pdfs

In [None]:
loader = PyPDFLoader("/content/pdfs/art of war -Sant tuz.pdf")
data = loader.load()
len(data)

258

In [None]:
# Tokenization
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
text_chunks = text_splitter.split_documents(data)
print(text_chunks[0].page_content)
print(len(text_chunks))

BILINGUAL  CHINESE   AND  ENGLISH  TEXT SUN TZU’S
Art of war_part1.indd   i 28/4/16   1:33 pm
1265


In [None]:
import os
# Set the key
os.getenv['OPENAI_API_KEY'] = 'some_key'

embeddings = OpenAIEmbeddings()
embeddings.embed("how are you") # Generates a list of embeddings

In [None]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', "the key")
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', "gcp-starter")

In [None]:
import pinecone
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)
index_name = "art-of-war"

In [None]:
# Name of your pinecone index
index_name="testing"
index = pinecone.Index('testing')


# Create Embeddings for each of the Text Chunk

In [None]:
docsearch = Pinecone.from_texts([t.page for t in text_chunks], embeddings, index_name=index_name)

In [None]:
query = "what are the five ways of attacking with fire?"

# Similarity search
docs = docsearch.similarity_search(query)
docs  # Embeddings

In [None]:
llm = OpenAI(temperature=0.9)
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())
qa.run(query)

In [None]:
import sys
while True:
  user_input = input("Enter a query: ")
  if user_input == "exit":
    break
  elif user_input == '':
    continue
  try:
    result = qa.run({'query':user_input})
    print(f'{result["result"]}')
  except Exception as e:
    print(f"Error: {e}")

In [None]:
!pip install sentence-transformers faiss-cpu

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14

In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
from google.colab import userdata

# Load PDF
loader = PyPDFLoader("/content/pdfs/art of war -Sant tuz.pdf")
documents = loader.load()

# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Create embeddings
embeddings = HuggingFaceEmbeddings()

# Create vector store
db = FAISS.from_documents(texts, embeddings)

# Initialize Hugging Face model
os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('HUGGING_FACE_BEARER')
os.environ["HF_TOKEN"] = userdata.get('HUGGING_FACE_BEARER')
llm = HuggingFaceHub(repo_id="google/flan-t5-large", model_kwargs={"temperature":0.5, "max_length":512})

# Create retrieval-based QA chain
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever())

# Chat loop
while True:
    query = input("Ask a question about your PDF (or type 'quit' to exit): ")
    if query.lower() == 'quit':
        break
    response = qa_chain.run(query)
    print(response)

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Ask a question about your PDF (or type 'quit' to exit): "what are the five ways of attacking with fire?"
The first is to burn soldiers in their camp;2 the second is to burn stores;3 the third is to burn baggage-trains;4 the fourth is to burn arsenals and magazines;5 the fifth is to hurl dropping fire amongst the enemy
Ask a question about your PDF (or type 'quit' to exit): who wrote the document you read? and what kind of document is it?
Neither of these questions is easy to answer. Indeed they may be the wrong questions. In many ways it is hardly a book at all.


KeyboardInterrupt: Interrupted by user

In [None]:
!pip install langchain-google-genai

Collecting langchain-google-genai
  Downloading langchain_google_genai-1.0.7-py3-none-any.whl (36 kB)
Collecting google-generativeai<0.8.0,>=0.7.0 (from langchain-google-genai)
  Downloading google_generativeai-0.7.1-py3-none-any.whl (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
Collecting google-ai-generativelanguage==0.6.6 (from google-generativeai<0.8.0,>=0.7.0->langchain-google-genai)
  Downloading google_ai_generativelanguage-0.6.6-py3-none-any.whl (718 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.3/718.3 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: google-ai-generativelanguage, google-generativeai, langchain-google-genai
  Attempting uninstall: google-ai-generativelanguage
    Found existing installation: google-ai-generativelanguage 0.6.4
    Uninstalling google-ai-generativelanguage-0.6.4:
      Successfully uninstalled google-ai-gener

In [None]:
# Gemini
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
from google.colab import userdata

# Load PDF
loader = PyPDFLoader("/content/pdfs/art of war -Sant tuz.pdf")
documents = loader.load()

# Split text into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Create embeddings
embeddings = HuggingFaceEmbeddings()

# Create vector store
db = FAISS.from_documents(texts, embeddings)

# Initialize Gemini model
os.environ["GOOGLE_API_KEY"] = userdata.get('GEMINI_API_KEY')
llm = ChatGoogleGenerativeAI(model="gemini-pro")

# Create retrieval-based QA chain
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever())

# Chat loop
while True:
    query = input("Ask a question about your PDF (or type 'quit' to exit): ")
    if query.lower() == 'quit':
        break
    response = qa_chain.run(query)
    print(response)

  warn_deprecated(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Tutorial
* [source: langchain quickstart](https://python.langchain.com/v0.1/docs/get_started/quickstart/)

In [None]:
!pip install langchain-cohere -q
!pip install langchain-community langchain-core langchain -q
!pip install beautifulsoup4 -q
!pip install faiss-cpu -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from google.colab import userdata
from langchain_cohere import ChatCohere

import os
os.environ["COHERE_API_KEY"] = str(userdata.get('COHERE_API_KEY'))
# llm = ChatCohere(cohere_api_key=userdata.get('COHERE_API_KEY'))  # userdata.get('COHERE_API_KEY')
llm = ChatCohere()

In [None]:
llm.invoke("how can langsmith help with testing?")

AIMessage(content="Langsmith can help with testing in a number of ways:\n\n1. **Automated Testing**: Langsmith can be used to automate the testing process. You can use Langsmith to generate test cases, execute tests, and verify results. This can save a significant amount of time and effort in the testing process.\n\n2. **Natural Language Processing**: Langsmith's natural language processing (NLP) capabilities can be leveraged to test natural language user interfaces, such as chatbots or voice-activated systems. You can use Langsmith to generate a variety of test inputs in natural language and validate the system's responses.\n\n3. **Translation Testing**: If your product or service involves multiple languages, Langsmith can help with translation testing. You can use Langsmith to translate your test cases into different languages and verify that your product or service performs correctly in each language.\n\n4. **User Interface Testing**: Langsmith can be used to test user interfaces by

In [None]:
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a world class technical documentation writer."),
    ("user", "{input}")
])

chain = prompt | llm

chain.invoke({"input": "how can langsmith help with testing?"})

AIMessage(content="Langsmith is a powerful tool that can greatly assist with testing and quality assurance processes. Here's how Langsmith can help:\n\n1. **Generate Test Cases**: Langsmith can be used to generate a variety of test cases by providing different inputs and scenarios. By using Langsmith to describe test cases in natural language, you can quickly create a diverse set of test inputs, covering a wide range of scenarios, including edge cases and negative tests. This helps improve test coverage and identify potential issues early in the development cycle.\n\n2. **Create Test Data**: Langsmith is capable of generating large volumes of realistic and diverse test data. By using Langsmith's advanced data generation capabilities, you can create test datasets that mimic real-world data, including personal information, financial data, or any custom data structures. This helps ensure that your tests are executed against meaningful and representative data, increasing the effectiveness 

# Retrieval Chain

In [None]:
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://docs.smith.langchain.com/user_guide")

docs = loader.load()



In [None]:
from langchain_cohere.embeddings import CohereEmbeddings

embeddings = CohereEmbeddings()

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)
vector = FAISS.from_documents(documents, embeddings)

In [None]:
from langchain.chains.combine_documents import create_stuff_documents_chain

prompt = ChatPromptTemplate.from_template("""Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)

In [None]:
# passing document directly
from langchain_core.documents import Document

document_chain.invoke({
    "input": "how can langsmith help with testing?",
    "context": [Document(page_content="langsmith can let you visualize test results")]
})

'Langsmith can help with testing by visualizing test results.'

# Cohere with pypdf

In [None]:
!pip install pypdf -q

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0


In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import CohereEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import Cohere

from google.colab import userdata

# Load PDF
print('loading PDF...')
loader = PyPDFLoader("/content/power-of-your-subconscious-mind-epdf.pub.pdf")
documents = loader.load() # Each document is seperate page of pdf
print(f"Number of documents: {len(documents)}")

# Split text into chunks
print('splitting text...')
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Set Cohere API key
os.environ["COHERE_API_KEY"] = userdata.get('COHERE_API_KEY')

# Create embeddings using Cohere
print('creating embedding...')
embeddings = CohereEmbeddings()

# Create vector store
print('creating vector store...')
db = FAISS.from_documents(texts, embeddings)

# Initialize Cohere model
print('initializing Cohere model...')
llm = Cohere(model="command")  # You can also use "command-light" for a smaller model

# Create retrieval-based QA chain
print('creating retrieval-based QA chain...')
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever())

# Chat loop
while True:
    query = input("Ask a question about your PDF (or type 'quit' to exit): ")
    if query.lower() == 'quit':
        break
    response = qa_chain.run(query)
    print(response)

# Gemini

In [None]:
!pip install --upgrade --quiet  langchain-google-genai pillow

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.9/163.9 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.3/718.3 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
imageio 2.31.6 requires pillow<10.1.0,>=8.3.2, but you have pillow 10.3.0 which is incompatible.[0m[31m
[0m

In [None]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
from google.colab import userdata

# Set Gemini API key
os.environ['GOOGLE_API_KEY']=userdata.get('GEMINI_API_KEY')

# load the models
print('loading models...')
llm = ChatGoogleGenerativeAI(model="gemini-pro")

# Create embeddings using Cohere
print('initializing embedding...')
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Load PDF
print('loading PDF...')
loader = PyPDFLoader("/content/power-of-your-subconscious-mind-epdf.pub.pdf")
documents = loader.load() # Each document is seperate page of pdf
print(f"Number of documents: {len(documents)}")

# Split text into chunks
print('splitting text...')
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# Create vector store
print('creating vector store...')
db = FAISS.from_documents(texts, embeddings)

# Initialize Cohere model
print('initializing Cohere model...')
llm = Cohere(model="command")  # You can also use "command-light" for a smaller model

# Create retrieval-based QA chain
print('creating retrieval-based QA chain...')
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=db.as_retriever())

# Chat loop
while True:
    query = input("Ask a question about your PDF (or type 'quit' to exit): ")
    if query.lower() == 'quit':
        break
    response = qa_chain.run(query)
    print(response)

loading models...
initializing embedding...
loading PDF...
Number of documents: 247
splitting text...
creating vector store...
initializing Cohere model...
creating retrieval-based QA chain...
Ask a question about your PDF (or type 'quit' to exit): list some amazing things about our sub conscious mind?
 The subconscious mind is an amazing entity that governs myriad vital processes and functions in the body, such as heartbeat, blood circulation, digestion, and elimination. It works tirelessly, never resting or sleeping, and can be harnessed to solve problems and effect miraculous changes in your life. 

One of the most remarkable aspects of the subconscious is its propensity to bring about tangible, material changes in the world through the power of belief and suggestion. Experiments have shown that by confidently suggesting to your subconscious mind that a specific goal has been achieved or an objective fulfilled, you can manifest that goal or objective in reality. 

For instance, if y

# Chromadb and gemini
* [reference](https://gist.github.com/janakiramm/6546d9734c7872f111b139cda1a8e0de)

In [None]:
### Install required modules and set the envvar for Gemini API Key
!pip install pypdf2 -q
!pip install chromadb -q
!pip install google.generativeai -q
!pip install langchain-google-genai -q
!pip install langchain -q
!pip install langchain_community -q
# !pip install jupyter

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m559.5/559.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.4/62.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.3/41.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.9/59.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
#Import Python modules
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.vectorstores import Chroma

import os
from google.colab import userdata

# Set Gemini API key
#export GOOGLE_API_KEY="YOUR_GOOGLE_API_KEY"
os.environ['GOOGLE_API_KEY']=userdata.get('GEMINI_API_KEY')

#Load the models
llm = ChatGoogleGenerativeAI(model="gemini-pro")
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

#Load the PDF and create chunks
loader = PyPDFLoader("/content/1706.03762v7.pdf")
text_splitter = CharacterTextSplitter(
    separator=".",
    chunk_size=250,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)
pages = loader.load_and_split(text_splitter)

#Turn the chunks into embeddings and store them in Chroma
persist_directory = "/content/embeddings_store"
vectordb=Chroma.from_documents(pages,embeddings, persist_directory=persist_directory)

#Configure Chroma as a retriever with top_k=5
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

#Create the retrieval chain
template = """
You are a helpful AI assistant.
Answer based on the context provided.
context: {context}
input: {input}
answer:
"""
prompt = PromptTemplate.from_template(template)
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)


# Chat loop
while True:
    query = input("Ask a question about your PDF (or type 'quit' to exit): ")
    if query.lower() == 'quit':
        break
    #Invoke the retrieval chain
    response=retrieval_chain.invoke({"input":query})  # what is attention mechanism?
    # Print the answer to the question
    print(response["answer"])


ImportError: pypdf package not found, please install it with `pip install pypdf`

In [None]:
#Import Python modules
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.vectorstores import Chroma
import os
from google.colab import userdata

# Set Gemini API key
#export GOOGLE_API_KEY="YOUR_GOOGLE_API_KEY"
os.environ['GOOGLE_API_KEY']=userdata.get('GEMINI_API_KEY')

#Load the models
llm = ChatGoogleGenerativeAI(model="gemini-pro")
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

# Define the persistence directory
persist_directory = "/content/embeddings_store"

# List of document paths
document_paths = ["/content/1706.03762v7.pdf", "/content/power-of-your-subconscious-mind-epdf.pub.pdf"]

# Function to get document hash (you may want to use a more robust hashing method)
def get_document_hash(file_path):
    return f"{os.path.basename(file_path)}_{os.path.getsize(file_path)}"

# Function to load and split a document
def load_and_split_document(file_path):
    loader = PyPDFLoader(file_path)
    text_splitter = CharacterTextSplitter(
        separator=".",
        chunk_size=250,
        chunk_overlap=50,
        length_function=len,
        is_separator_regex=False,
    )
    return loader.load_and_split(text_splitter)

# Initialize or load Chroma
if os.path.exists(persist_directory) and os.listdir(persist_directory):
    print("Loading existing embeddings...")
else:
    print("Initializing new Chroma database...")
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

# Process each document
for doc_path in document_paths:
    doc_hash = get_document_hash(doc_path)

    # Check if document is already in the database
    if vectordb.get(where={"source": doc_hash})['ids'] != []:
        print(f"Document {doc_path} already processed. Skipping.")
        continue

    print(f"Processing new document: {doc_path}")
    pages = load_and_split_document(doc_path)

    # Add new document to the database with its hash as metadata
    # Add new document to the database with its hash as metadata
    metadatas = [{"source": doc_hash} for _ in range(len(pages))]
    ids = [f"{doc_hash}_{i}" for i in range(len(pages))]
    # vectordb.add_documents(documents=pages, metadatas=metadatas, ids=ids)
    vectordb.add_documents(pages)

# Persist changes
vectordb.persist()

# Configure Chroma as a retriever with top_k=5
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

#Create the retrieval chain
template = """
You are a helpful AI assistant.
Answer based on the context provided.
context: {context}
input: {input}
answer:
"""
prompt = PromptTemplate.from_template(template)
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)


# Chat loop
while True:
    query = input("Ask a question about your PDF (or type 'quit' to exit): ")
    if query.lower() == 'quit':
        break
    #Invoke the retrieval chain
    response=retrieval_chain.invoke({"input":query})  # what is attention mechanism?
    # Print the answer to the question
    print(response["answer"])

Initializing new Chroma database...
Processing new document: /content/1706.03762v7.pdf




Processing new document: /content/power-of-your-subconscious-mind-epdf.pub.pdf


  warn_deprecated(


Ask a question about your PDF (or type 'quit' to exit): quit


In [None]:
doc_hash="power-of-your-subconscious-mind-epdf.pub.pdf_1191348"
vectordb.get(where={"source": '/content/power-of-your-subconscious-mind-epdf.pub.pdf'})

# vectordb.get(get_document_hash("/content/1706.03762v7.pdf"))

In [None]:
!rm -rf embeddings_store

In [None]:
# persist_directory="/content/embeddings_store"
# vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
# a=vectordb.get()
# a.keys()  # dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'uris', 'data', 'included'])
# a['metadatas']

doc_hash

'power-of-your-subconscious-mind-epdf.pub.pdf_1191348'

In [None]:
get_document_hash("/content/1706.03762v7.pdf")
get_document_hash("/content/power-of-your-subconscious-mind-epdf.pub.pdf")

'power-of-your-subconscious-mind-epdf.pub.pdf_1191348'

In [None]:
# vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
# vectordb.get(where={"source": '1706.03762v7.pdf_2215244'})
vectordb.get(where={"source": '/content/power-of-your-subconscious-mind-epdf.pub.pdf'})

In [None]:
a=[]
for t in vectordb.get()['metadatas']:
  a.extend(t.values())
  if '/content/1706.03762v7.pdf' in t.values():
    print(t)
    break
# set(list(a))

{'page': 0, 'source': '/content/1706.03762v7.pdf'}


{'/content/1706.03762v7.pdf',
 '/content/power-of-your-subconscious-mind-epdf.pub.pdf',
 0,
 2,
 6}

In [None]:
a=vectordb.get(where={"source": '/content/1706.03762v7.pdf'})
type(a)

dict

In [None]:
a.keys()  # dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'uris', 'data', 'included'])
a['ids']


In [None]:
vectordb.get().keys()

dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'uris', 'data', 'included'])

In [None]:
ids[:10]

['power-of-your-subconscious-mind-epdf.pub.pdf_1191348_0',
 'power-of-your-subconscious-mind-epdf.pub.pdf_1191348_1',
 'power-of-your-subconscious-mind-epdf.pub.pdf_1191348_2',
 'power-of-your-subconscious-mind-epdf.pub.pdf_1191348_3',
 'power-of-your-subconscious-mind-epdf.pub.pdf_1191348_4',
 'power-of-your-subconscious-mind-epdf.pub.pdf_1191348_5',
 'power-of-your-subconscious-mind-epdf.pub.pdf_1191348_6',
 'power-of-your-subconscious-mind-epdf.pub.pdf_1191348_7',
 'power-of-your-subconscious-mind-epdf.pub.pdf_1191348_8',
 'power-of-your-subconscious-mind-epdf.pub.pdf_1191348_9']

In [None]:
vectordb.add_documents(pages[:10], source2=['power']*10)

['3d71823a-becf-47f8-bd44-6253cc5bf635',
 '7ac0ed59-13cb-4db2-8a89-23421fe19e0d',
 'f8764911-dbf4-4943-9dbd-7467b95f980d',
 '7cf53c14-aab1-45dc-8bbf-71390f1720b5',
 '3dea608a-eb11-47aa-bc74-0efd39839c21',
 '40ca7929-ea7c-4dca-9ab2-011565d1f938',
 'cd463c33-6284-4228-9aac-0360f4b89f29',
 'b3e8786b-bddf-4bcc-8ccc-8bedca7c6e19',
 '22265a33-c9f6-4976-a783-79e573676a97',
 '0c46ea3c-9ab3-4654-87da-05faf65665de']

In [None]:
a=vectordb.get(where={"ids": '3d71823a-becf-47f8-bd44-6253cc5bf635'})
a

{'ids': [],
 'embeddings': None,
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents']}

# Pinecone
api key: [goto pinecone](https://app.pinecone.io/organizations/)
  * create new index (or you may create programatically refer to pinecone docs)
  * embedding size 768 for gemini
  * get api keys  from manage/API_KEYS

# References:
* [pinecone-langchain docs](https://docs.pinecone.io/integrations/langchain)
* [langchain-docs](https://python.langchain.com/v0.1/docs/integrations/chat/google_generative_ai/)



In [None]:
!pip install langchain-pinecone -q
!pip install pypdf -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [109]:
#Import Python modules
from langchain_pinecone import PineconeVectorStore
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from google.colab import userdata
import os

# Set pinecone api key
os.environ['PINECONE_API_KEY']  = userdata.get('PINECONE_API_KEY')

# Set Gemini API key
#export GOOGLE_API_KEY="YOUR_GOOGLE_API_KEY"
os.environ['GOOGLE_API_KEY']=userdata.get('GEMINI_API_KEY')

index_name = "pdf-index"
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)

#Load the models
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    safety_settings={
        HarmCategory.HARM_CATEGORY_DANGEROUS: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,

    },
)

from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.vectorstores import Chroma

import re

def remove_non_english(text):
    # This pattern keeps English letters, numbers, spaces, and basic punctuation
    pattern = re.compile(r'[^a-zA-Z0-9\s.,!?"-]')
    return pattern.sub('', text)

#Load the models
llm = ChatGoogleGenerativeAI(model="gemini-pro")
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

namespace = ["https://arxiv.org/pdf/1706.03762", "art-of-war"]
file_paths = ["/content/1706.03762v7.pdf", "/content/art of war -Sant tuz.pdf"]
for file_path, namespace in zip(file_paths, namespace):
    #Load the PDF and create chunks
    loader = PyPDFLoader(file_path)
    text_splitter = CharacterTextSplitter(
        separator=".",
        chunk_size=500,
        chunk_overlap=50,
        length_function=len,
        is_separator_regex=False,
    )
    docs = loader.load_and_split(text_splitter) # pages
    for d in docs:
      d.page_content = remove_non_english(d.page_content)


    # Initialize vector store
    vectorstore_from_docs = PineconeVectorStore.from_documents(
            docs,
            index_name=index_name,
            embedding=embeddings,
            namespace=namespace  # namespace to differentiate between two pdfs
    )
    # Add documents
    vectorstore_from_docs.add_documents(docs)

query1 ='what is transformers architecture based on given text?'
query2 ='What are five ways of attacking with fire based on given text?'
'''
# search by source
vectorstore_from_docs.similarity_search(query, k=5, namespace='art-of-war')
# Search by namespace
vectorstore_from_docs.similarity_search(query, k=5, namespace="https://arxiv.org/pdf/1706.03762")
'''




'\n# search by source\nvectorstore_from_docs.similarity_search(query, k=5, namespace=\'art-of-war\')\n# Search by namespace\nvectorstore_from_docs.similarity_search(query, k=5, namespace="https://arxiv.org/pdf/1706.03762")\n'

# Works at times
* sometimes it gives empty answer like for query: "what are five ways of attacking with fire."

In [11]:
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    HarmBlockThreshold,
    HarmCategory,
)
from langchain.text_splitter import TokenTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os
from google.colab import userdata

# Set pinecone api key
os.environ['PINECONE_API_KEY']  = userdata.get('PINECONE_API_KEY')

# Set Gemini API key
#export GOOGLE_API_KEY="YOUR_GOOGLE_API_KEY"
os.environ['GOOGLE_API_KEY']=userdata.get('GEMINI_API_KEY')

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

llm = ChatGoogleGenerativeAI(
    model= "gemini-pro",  # 'gemini-1.5-flash',
    safety_settings={
        HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DEROGATORY: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_TOXICITY: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_VIOLENCE: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUAL: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_MEDICAL: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,

    },
)

# Initialize Pinecone vector store
index_name = 'pdf-index'
vectorstore = PineconeVectorStore(
    index_name=index_name,
    embedding=embeddings,
    pinecone_api_key=userdata.get('PINECONE_API_KEY'),
    namespace='art-of-war'
)

# Set up the retriever with custom search parameters
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Create the prompt template
template = """
You are a helpful AI assistant. Answer based on the context provided.
If the answer cannot be found in the context, say "I don't have enough information to answer that question."

Context: {context}
Question: {input}

Answer:
"""
prompt = PromptTemplate.from_template(template)

# Create the retrieval chain
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# Chat loop
while True:
    query = input("Ask a question about your PDF (or type 'quit' to exit): ")
    if query.lower() in ['quit', 'q']:
        break

    try:
        response = retrieval_chain.invoke({"input": query})
        print("\nResponse:", response)
        print("\nAnswer:", response["answer"])
    except Exception as e:
        print(f"\nAn error occurred: {str(e)}")

Ask a question about your PDF (or type 'quit' to exit): what are considerations while attack by fire?

Response: {'input': 'what are considerations while attack by fire?', 'context': [Document(page_content='58 CHINESE TEXT AND ENGLISH TRANSLATION\n5. In attacking with fire, one should be prepared to meet \nfive possible developments12\n6. 1 When fire breaks out inside the enemys camp, re-\nspond at once13 with an attack from without.\n7. \n8. \n9. \n7. 2 If there is an outbreak of fire, but the enemys soldiers \nremain quiet, bide your time and do not attack.14\n8. 3 When force of the flames has reached its height, follow it \nup with an attack, if that is practicable if not, stay where you are', metadata={'page': 126.0, 'source': '/content/art of war -Sant tuz.pdf'}), Document(page_content='58 CHINESE TEXT AND ENGLISH TRANSLATION\n5. In attacking with fire, one should be prepared to meet \nfive possible developments12\n6. 1 When fire breaks out inside the enemys camp, re-\nspond at on

In [130]:
# direct llm response
query = "what are considerations while attack by fire? context : [Document(page_content='58 CHINESE TEXT AND ENGLISH TRANSLATION\n5. In attacking with fire, one should be prepared to meet \nfive possible developments12\n6. 1 When fire breaks out inside the enemys camp, re-\nspond at once13 with an attack from without.\n7. \n8. \n9. \n7. 2 If there is an outbreak of fire, but the enemys soldiers \nremain quiet, bide your time and do not attack.14\n8. 3 When force of the flames has reached its height, follow it \nup with an attack, if that is practicable if not, stay where you are', metadata={'page': 126.0, 'source': '/content/art of war -Sant tuz.pdf'}), Document(page_content='58 CHINESE TEXT AND ENGLISH TRANSLATION\n5. In attacking with fire, one should be prepared to meet \nfive possible developments12\n6. 1 When fire breaks out inside the enemys camp, re-\nspond at once13 with an attack from without.\n7. \n8. \n9. \n7. 2 If there is an outbreak of fire, but the enemys soldiers \nremain quiet, bide your time and do not attack.14\n8. 3 When force of the flames has reached its height, follow it \nup with an attack, if that is practicable if not, stay where you are', metadata={'page': 126.0, 'source': '/content/art of war -Sant tuz.pdf'}), Document(page_content='15\n9. 4 If it is possible to make an assault with fire from with-\nout, do not wait for it to break out within, but deliver your attack at a favorable moment.\n16\n10. \n11. \n12.  \n10. 5 When you start a fire, be to windward of it. Do not \nattack from the leeward.17\n11. A wind that rises in the daytime lasts long, but a night \nbreeze soon falls.18\n12'"
llm.invoke(query).content

"1. **When fire breaks out inside the enemy's camp, respond at once with an attack from without.** This is because the enemy will be disorganized and confused, and therefore more vulnerable to attack.\n2. **If there is an outbreak of fire, but the enemy's soldiers remain quiet, bide your time and do not attack.** This is because the enemy may be preparing a trap, or they may be waiting for reinforcements.\n3. **When the force of the flames has reached its height, follow it up with an attack, if that is practicable.** If it is not practicable, stay where you are. This is because the enemy will be more focused on fighting the fire than on defending themselves.\n4. **If it is possible to make an assault with fire from without, do not wait for it to break out within, but deliver your attack at a favorable moment.** This is because the enemy will be less prepared for an attack from outside.\n5. **When you start a fire, be to windward of it.** Do not attack from the leeward. This is because 

# Working at times

In [107]:
from langchain_pinecone import PineconeVectorStore
text_field = "text"
index_name='pdf-index'

vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings, pinecone_api_key=userdata.get('PINECONE_API_KEY'), namespace='art-of-war')
# vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings, pinecone_api_key=userdata.get('PINECONE_API_KEY'), namespace="https://arxiv.org/pdf/1706.03762")
retriever = vectorstore.as_retriever()
# vectorstore.similarity_search_with_score(query1, k=5)

#Configure Chroma as a retriever with top_k=5
# retriever = vectorstore.as_retriever()
# retriever = vectorstore.as_retriever(namespace="https://arxiv.org/pdf/1706.03762")
# retriever = vectorstore.as_retriever(namespace="art-of-war")

#Create the retrieval chain
template = """
You are a helpful AI assistant.
Answer based on the context provided.
context: {context}
input: {input}
answer:
"""
prompt = PromptTemplate.from_template(template)
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)
query3 =  "what does the text talks about? also what does it tells about five ways of attacking with fire?"

# Chat loop
while True:
    query = input("Ask a question about your PDF (or type 'quit' to exit): ")
    if query.lower() == 'quit' or query.lower() == 'q':
        break
    elif query=='1':
        query = query1
    elif query=='2':
        query = query2
    elif query=='3':
        query = query2
    #Invoke the retrieval chain
    response=retrieval_chain.invoke({"input":query})  # what is attention mechanism?
    # Print the answer to the question
    # print(response["answer"])
    print(response)



# from langchain.chains import RetrievalQA
# # completion llm

# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=vectorstore.as_retriever()
# )
# qa.run(query1)


Ask a question about your PDF (or type 'quit' to exit): 1
{'input': 'what is transformers architecture based on given text?', 'context': [Document(page_content='” He cautions us here not to pin our faith to abstract principles; “for, ” as Zhang Yu puts it, “while the main laws of strategy can be stated clearly enough for the benefit of all and sundry, you must be guided by the ac-tions of the enemy in attempting to secure a favorable position in actual warfare', metadata={'page': 136.0, 'source': '/content/art of war -Sant tuz.pdf'}), Document(page_content='” He cautions us here not to pin our faith to abstract principles; “for, ” as Zhang Yu puts it, “while the main laws of strategy can be stated clearly enough for the benefit of all and sundry, you must be guided by the ac-tions of the enemy in attempting to secure a favorable position in actual warfare', metadata={'page': 136.0, 'source': '/content/art of war -Sant tuz.pdf'}), Document(page_content='CHAPTER 6\nWeak Points And Strong

KeyboardInterrupt: Interrupted by user

In [108]:
# pip install pinecone-client[grpc]
from pinecone.grpc import PineconeGRPC as Pinecone

pc = Pinecone(api_key=userdata.get('PINECONE_API_KEY'))
index = pc.Index("pdf-index")

# index.delete(ids=["id-1", "id-2"], namespace='example-namespace')
# delete all
index.delete(delete_all=True, namespace="https://arxiv.org/pdf/1706.03762")
index.delete(delete_all=True, namespace="art-of-war")

