In [2]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken
!pip install dotenv
!pip install PyPDF2
!pip install unstructured
!pip install chromadb

Collecting unstructured
  Downloading unstructured-0.18.14-py3-none-any.whl.metadata (24 kB)
Collecting filetype (from unstructured)
  Using cached filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting lxml (from unstructured)
  Downloading lxml-6.0.1-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting nltk (from unstructured)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting beautifulsoup4 (from unstructured)
  Downloading beautifulsoup4-4.13.5-py3-none-any.whl.metadata (3.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect (from unstructured)
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━

In [None]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [None]:
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment, then look up the API keys
# this notebook expects. Each lookup yields None when the variable is not set.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
SERPAPI_API_KEY = os.environ.get("SERPAPI_API_KEY")

In [None]:
# Path of the PDF file to read. Change PDF_PATH to point at a different document.
PDF_PATH = 'budget_speech.pdf'
pdfreader = PdfReader(PDF_PATH)

In [None]:
from typing_extensions import Concatenate
# Read the text of every page in the PDF and concatenate it into one string.
# Pages for which extract_text() returns nothing (None or '') are skipped.
# The pieces are collected in a list and joined once at the end instead of
# growing a string with += on every iteration.
page_texts = []
for page in pdfreader.pages:
    content = page.extract_text()
    if content:
        page_texts.append(content)
raw_text = ''.join(page_texts)

In [None]:
# Preview only the beginning of the extracted text; echoing the whole document
# would flood the cell output.
raw_text[:1000]

In [None]:
# Split the extracted text on newlines into chunks targeting 800 characters
# (length is measured with len) that overlap by 200 characters, so that each
# piece handed to the embedding model stays small.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [None]:
# Number of chunks the splitter produced.
len(texts)

In [None]:
# Create the OpenAI embeddings object that is later used to vectorise the text chunks.
# NOTE(review): no arguments are passed, so the API key is presumably picked up
# from the environment — confirm.
embeddings = OpenAIEmbeddings()

In [None]:
# Build a FAISS vector store from the text chunks, embedding them with `embeddings`.
document_search = FAISS.from_texts(texts, embeddings)

In [None]:
# Display the vector store object as a quick check that it was created.
document_search

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

In [None]:
# Build a question-answering chain around the OpenAI LLM (default settings).
# chain_type="stuff" presumably places all supplied documents into a single
# prompt — see the LangChain documentation to confirm.
chain = load_qa_chain(OpenAI(), chain_type="stuff")

In [None]:
# Retrieve the chunks most similar to the query from the vector store, then run
# the QA chain over those chunks to answer the query.
query = "Vision for Amrit Kaal"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
# Same retrieve-then-answer flow as the previous query cell, with a new question.
# NOTE(review): this repeats the previous cell with only the query changed; a small
# helper function taking the question as a parameter would remove the duplication.
query = "How much the agriculture target will be increased to and what the focus will be"
docs = document_search.similarity_search(query)
chain.run(input_documents=docs, question=query)

In [None]:
from langchain.document_loaders import OnlinePDFLoader

In [None]:
# Create a loader for the PDF hosted at this arXiv URL.
loader = OnlinePDFLoader("https://arxiv.org/pdf/1706.03762.pdf")

In [None]:
# Load the PDF through the loader and keep the result in `data` for inspection.
# NOTE(review): presumably this performs a network download — confirm.
data = loader.load()

In [None]:
# Display everything the loader returned.
# NOTE(review): this may echo the full document text; consider showing only a slice.
data

In [None]:
# Create an OpenAI embeddings object (the same statement appears earlier in the notebook).
# NOTE(review): the index creation that follows is called without arguments and
# does not receive this object, so this reassignment looks redundant — confirm.
embeddings = OpenAIEmbeddings()

In [None]:
from langchain.indexes import VectorstoreIndexCreator
# Build an index from the loader using VectorstoreIndexCreator's default settings.
index = VectorstoreIndexCreator().from_loaders([loader])

In [None]:
# Ask a question against the index built from the loaded paper.
query = "Explain me about Attention is all you need"
index.query(query)