In [None]:
!pip install chromadb
!pip install langchain
!pip install transformers
!pip install torch
!pip install pypdf
!pip install langchain_community
!pip install -U langchain-huggingface

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch

In [None]:
# Hugging Face checkpoint identifier shared by the model and tokenizer loads.
model_name = "MBZUAI/LaMini-T5-738M"

# Load the seq2seq model and its matching tokenizer from the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Build the text2text generation pipeline and wrap it for LangChain.
#
# The already-loaded `model` object is passed in (instead of the checkpoint
# name) so the weights are not loaded a second time by `pipeline()`.
pipe = pipeline(
    'text2text-generation',
    model=model,
    tokenizer=tokenizer,
    device=device,
    # Output length / termination.
    max_length=1024,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    # Sampling settings. NOTE: with do_sample=True the generated text can
    # differ between runs unless a random seed is fixed beforehand.
    do_sample=True,
    top_k=10,
    top_p=0.9,
    temperature=0.7,
    repetition_penalty=1.2,
    num_return_sequences=1,
    # Beam search settings.
    num_beams=3,
    length_penalty=4.0,
    early_stopping=True,
)

# LangChain-compatible LLM backed by the pipeline above.
llm1 = HuggingFacePipeline(pipeline=pipe)

In [None]:
# Path of the PDF that is loaded and indexed further down.
# NOTE(review): `/content` looks like a hosted-notebook working directory --
# adjust this path when running elsewhere.
file = "/content/IITISOC_Proposal.pdf"

In [None]:
# Read the PDF at `file` into a list of LangChain documents.
loader = PyPDFLoader(file_path=file)
documents = loader.load()

In [None]:
# Sentence-embedding model used to vectorise the document chunks.
# The model name is spelled out (it matches the library default) so the
# vector index does not silently change if the default is ever updated.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

In [None]:
# Split the loaded documents into overlapping chunks for retrieval:
# each chunk is at most 600 units long and shares 200 with its neighbour.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=200,
)
text = text_splitter.split_documents(documents)

In [None]:
# Embed the chunks and store them in a Chroma vector store.
db = Chroma.from_documents(documents=text, embedding=embedding)

In [None]:
# Retrieval QA chain: fetches relevant chunks from `db` and answers with `llm1`.
#
# Notes carried over from the original cell:
#   * chain_type options: "stuff", "map_reduce", "refine", "map_rerank".
#   * search_type options: "mmr" or "similarity".
#   * To get source documents back (e.g. with multiple PDFs), pass
#     search_kwargs={"k": 2} to as_retriever() and
#     return_source_documents=True to from_chain_type().
#
# (A module-level `global chain` statement was removed: `global` has no effect
# outside a function, and `chain` is already a module-level name.)
chain = RetrievalQA.from_chain_type(
    llm=llm1,
    chain_type="refine",
    retriever=db.as_retriever(search_type="mmr"),
)

In [None]:
def answer_query(query):
    """Run ``query`` through the module-level ``chain`` and return its output.

    Args:
        query: Input forwarded unchanged to ``chain.invoke``.

    Returns:
        Whatever ``chain.invoke`` returns (presumably a dict containing the
        answer for a RetrievalQA chain -- TODO confirm).
    """
    response = chain.invoke(query)
    return response

In [None]:
# Sample question for the chain. The leading and trailing newlines are part
# of the query string and are kept as-is.
que = "\nwhat are the three projects that are mentioned?\n"

In [None]:
# Ask the sample question; as the cell's last expression, the returned value
# is displayed as the cell output.
answer_query(que)