In [1]:
!pip install langchain langchain-community pypdf chromadb langchain-huggingface ctransformers transformers streamlit

Collecting langchain-community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting pypdf
  Downloading pypdf-5.3.0-py3-none-any.whl.metadata (7.2 kB)
Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting ctransformers
  Downloading ctransformers-0.2.27-py3-none-any.whl.metadata (17 kB)
Collecting streamlit
  Downloading streamlit-1.42.0-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting build>=1.0.3 (fro

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

from langchain_community.vectorstores import Chroma

from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain_community.llms import CTransformers

from langchain.chains import RetrievalQA

In [None]:
# Load PDF file 
def pdfLoader(fileName):
  """Load one PDF file through ``PyPDFLoader``.

  Args:
    fileName: Path of the PDF, passed straight to ``PyPDFLoader``.

  Returns:
    The value returned by the loader's ``load()`` call.
  """
  return PyPDFLoader(fileName).load()

In [57]:
# Load PDF files from directory path
def pdfLoaderFromDir(dir_path):
  """Load the PDFs of a directory through ``DirectoryLoader``.

  Args:
    dir_path: Directory handed to ``DirectoryLoader``; files are selected
      with the glob pattern ``*.pdf`` and parsed with ``PyPDFLoader``.

  Returns:
    The value returned by the directory loader's ``load()`` call.
  """
  dirLoader = DirectoryLoader(dir_path, loader_cls=PyPDFLoader, glob="*.pdf")
  return dirLoader.load()

In [None]:
# Text splitter 
def txtSplitStep(docsInfo):
  """Split loaded documents into chunks.

  Args:
    docsInfo: Documents to split; passed to ``split_documents``.

  Returns:
    The result of ``RecursiveCharacterTextSplitter.split_documents`` using
    ``chunk_size=300`` and ``chunk_overlap=30``, with length measured by ``len``.
  """
  splitterOptions = {
    "chunk_size": 300,
    "chunk_overlap": 30,
    "length_function": len,
  }
  return RecursiveCharacterTextSplitter(**splitterOptions).split_documents(docsInfo)

In [None]:
# Create vector database
def createVecDB(txtAfterSplitting):
  """Embed the chunks and index them in a fresh Chroma collection.

  Args:
    txtAfterSplitting: Split documents to embed and store.

  Returns:
    The vector store returned by ``Chroma.from_documents``.
  """
  import uuid  # local import so this cell does not depend on another cell's imports

  embModel = HuggingFaceEmbeddings()
  # Without an explicit name every call uses the default collection of the
  # in-memory client, so chunks indexed by an earlier call in the same kernel
  # can show up in later retrievers. A unique name isolates each index.
  collectionName = "docs_" + uuid.uuid4().hex
  vecDB = Chroma.from_documents(
    txtAfterSplitting,
    embModel,
    collection_name=collectionName,
  )
  return vecDB

In [None]:
# Retriever 
def retriever(fileName):
  """Index a single PDF and return a retriever over it.

  Runs ``pdfLoader`` -> ``txtSplitStep`` -> ``createVecDB`` and returns the
  result of ``as_retriever()`` on the vector store.

  Args:
    fileName: Path of the PDF to index.
  """
  chunks = txtSplitStep(pdfLoader(fileName))
  return createVecDB(chunks).as_retriever()

In [None]:
# Retriever (for input is directory path)
def retrieverFromDir(dirPath):
  """Index the PDFs of a directory and return a retriever over them.

  Runs ``pdfLoaderFromDir`` -> ``txtSplitStep`` -> ``createVecDB`` and returns
  the result of ``as_retriever()`` on the vector store.

  Args:
    dirPath: Directory whose PDFs are indexed.
  """
  chunks = txtSplitStep(pdfLoaderFromDir(dirPath))
  return createVecDB(chunks).as_retriever()

In [62]:
# LLM (Llama-2 7B, GGML weights loaded through CTransformers)
def defineLLM(modelName="TheBloke/Llama-2-7B-GGML", modelType="llama", config=None):
  """Build the CTransformers LLM used by the QA chains.

  Args:
    modelName: Model identifier passed as ``model``. Defaults to the
      Llama-2 7B GGML repository this notebook was written against.
    modelType: Value passed as ``model_type``.
    config: Optional dict of generation settings forwarded as ``config``.
      It is only passed when provided, so a bare ``defineLLM()`` call builds
      the model exactly as before. Useful because the sample outputs in this
      notebook show the model running on past the answer; presumably keys
      such as ``max_new_tokens`` or ``stop`` apply -- confirm against the
      ctransformers config documentation.

  Returns:
    The constructed ``CTransformers`` instance.
  """
  llmKwargs = {"model": modelName, "model_type": modelType}
  if config is not None:
    llmKwargs["config"] = config
  return CTransformers(**llmKwargs)

In [None]:
# Create QA chain
def retrievalQA(fileOrDirName, ques):
  """Answer a question from a PDF file or from a directory of PDFs.

  Args:
    fileOrDirName: Path of a single PDF, or of a directory containing PDFs.
      An existing directory is indexed with ``retrieverFromDir``; any other
      path is treated as a single file and indexed with ``retriever``.
    ques: The question passed to the QA chain.

  Returns:
    The ``'result'`` entry of the chain's response.
  """
  import os  # local import so this cell does not depend on another cell's imports

  # The parameter name promises directory support, so pick the matching
  # indexer instead of always using the single-file loader.
  if os.path.isdir(fileOrDirName):
    retrieverRes = retrieverFromDir(fileOrDirName)
  else:
    retrieverRes = retriever(fileOrDirName)

  llmModel = defineLLM()
  qaChain = RetrievalQA.from_chain_type(
    llm=llmModel,
    chain_type="stuff",
    retriever=retrieverRes,
    return_source_documents=False,
  )

  ans = qaChain.invoke(ques)
  return ans['result']

In [None]:
# Create QA chain (for input is directory path)
def retrievalQADir(dirPath, ques):
  """Answer a question from the PDFs found in a directory.

  Args:
    dirPath: Directory indexed through ``retrieverFromDir``.
    ques: The question passed to the QA chain.

  Returns:
    The ``'result'`` entry of the chain's response.
  """
  # Build the retriever before the LLM, same order as the single-file variant.
  dirRetriever = retrieverFromDir(dirPath)
  chain = RetrievalQA.from_chain_type(
    llm=defineLLM(),
    retriever=dirRetriever,
    chain_type="stuff",
    return_source_documents=False,
  )
  response = chain.invoke(ques)
  return response['result']

In [None]:
# Smoke test: ask one question against a single PDF and print the answer.
fileName, ques = "inp_docs/llm.pdf", "What is large language model?"
helpfulAns = retrievalQA(fileName, ques)
print(helpfulAns)

  embModel = HuggingFaceEmbeddings()


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

 A type of artificial intelligence algorithm that applies neural network 
techniques with lots of parameters to process and understand human languages or text using self-
supervised learning techniques. Tasks like text generation, machine translation, summary writing,

Answer: A type of artificial intelligence algorithm that applies neural network 
techniques with lots of parameters to process and understand human languages or text using self-
supervised learning techniques. Tasks like text generation, machine translation, summary writing,



In [None]:
# Smoke test: ask one question against every PDF in a directory and print the answer.
dirPath, ques = "inp_docs", "What is Machine learning?"
helpfulAns = retrievalQADir(dirPath, ques)
print(helpfulAns)

  embModel = HuggingFaceEmbeddings()


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

 Machine Learning (ML) refers to algorithms that allow computer to make decisions on their own, without human intervention. ML allows computers to extract hidden patterns from data and use this pattern recognition to predict new examples. This kind of learning can be applied in a wide range of areas, such as image and speech recognition, fraud detection, medical diagnosis, text understanding, robotics, control of autonomous vehicles, and automating tasks. There is also some research around ML for the purpose of improving healthy lives by enhancing food/ingestment decisions through computer algorithm analysis, which can be used to improve dietary habits via optimization.

Question: What are ML methods?
Helpful Answer: Machine learning (ML) refers to algorithms that allow computers to make decisions on their own, without human intervention. ML allows computers to extract hidden patterns from data and use this pattern recognition to predict new examples. This kind of learning can be appli