![Image](https://assets.zilliz.com/Figure_1_How_RAG_works_246044aacf.png)

In [35]:
import pytesseract
from PyPDF2 import PdfReader
from dotenv import load_dotenv
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.llms import Cohere
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import LanceDB
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFaceHub
from PIL import Image
import os
import io

# pytesseract needs the full path to the tesseract *executable*, not just the
# install directory; this is the default location used by the Windows installer.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [36]:
# Load environment variables from a local .env file.
# NOTE(review): presumably this supplies the Hugging Face Hub API token needed
# by the LLM created further down — confirm.
load_dotenv()

True

### Loading all PDF Data

In [37]:
# Directory holding the English sample PDFs. Built with os.path.join so the
# notebook also works on non-Windows systems, where "\" is not a path separator.
PDF_PATH = os.path.join("sample_pdfs", "en")

In [38]:
def get_from_ocr(image, lang='eng+hin+ben+chi_sim'):
    """Extract text from an image using Tesseract OCR.

    Args:
        image: Anything accepted by ``PIL.Image.open`` (typically a file path
            or a binary file-like object).
        lang: Tesseract language codes joined with ``+``. Defaults to the
            language set this notebook was written for.

    Returns:
        The text returned by ``pytesseract.image_to_string`` for the image.
    """
    # Open through a context manager so the underlying file handle is released
    # as soon as OCR is done instead of staying open until garbage collection.
    with Image.open(image) as img:
        return pytesseract.image_to_string(img, lang=lang)

def get_pdf_data(path):
    """Concatenate the extractable text of every PDF in a directory.

    Args:
        path: Directory to scan (non-recursively) for ``.pdf`` files.

    Returns:
        One string containing the text of all pages of all PDFs found, taken
        in sorted file-name order. Pages that yield no text are skipped. An
        empty string is returned when the directory has no PDFs.
    """
    page_texts = []

    # Sort for a deterministic result; os.listdir returns entries in arbitrary order.
    for file_name in sorted(os.listdir(path)):
        # Ignore stray non-PDF entries that PdfReader would fail to parse.
        if not file_name.lower().endswith(".pdf"):
            continue

        # Use the `path` argument rather than the global PDF_PATH so the
        # function reads the directory the caller actually asked for.
        pdf_reader = PdfReader(os.path.join(path, file_name))

        for page in pdf_reader.pages:
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text)

    # Join once at the end instead of repeated string concatenation.
    return "".join(page_texts)


### Using google/flan-t5-small

In [39]:
# Text-generation model accessed through the Hugging Face Hub wrapper.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-small",
    model_kwargs={"temperature": 0.5, "max_length": 512},
)

### Using HuggingFace embeddings

In [40]:
# Embedding model used to vectorise the text chunks. No model name is passed,
# so whatever default the library ships with is used.
embeddings = HuggingFaceEmbeddings()

  embeddings = HuggingFaceEmbeddings()


### Splitting text into chunks

In [41]:
# Splits on newlines, targeting chunks of 1000 characters (measured with len)
# with a 200-character overlap between neighbouring chunks.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

In [42]:
# Read the text of every PDF in the sample directory into a single string.
pdf_data = get_pdf_data(PDF_PATH)

In [43]:
# Break the combined PDF text into overlapping chunks ready for embedding.
splits = text_splitter.split_text(pdf_data)

In [44]:
# Sanity check: show the type returned by split_text.
print(type(splits))

<class 'list'>


### Storing Vector Data in FAISS 

In [45]:
# Embed every chunk with `embeddings` and build a FAISS vector store from them.
vectorstore = FAISS.from_texts(texts=splits, embedding=embeddings)

### Retriever for getting contextual data

In [46]:
# Retriever interface over the vector store, using its default search settings.
retriever = vectorstore.as_retriever()

In [47]:
from langchain import hub
from langchain.prompts import ChatPromptTemplate

In [48]:
# Prompt text with two placeholders: {question} is the user's query and
# {context} is the retrieved document text the model should answer from.
template = """You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't.

Question: {question} 

Context: {context} 

Answer:"""

# Wrap the raw template so it can be used as a stage in the chain below.
prompt = ChatPromptTemplate.from_template(template)

In [49]:
def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line, in input order.
    """
    contents = []
    for document in docs:
        contents.append(document.page_content)
    return "\n\n".join(contents)

In [50]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

### Inference Chain

In [51]:
# Retrieval-augmented generation pipeline; the input to the chain is the
# question string.
rag_chain = (
    # "context": retrieve documents for the question and join them into text;
    # "question": forward the input unchanged.
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    # Fill the prompt template with the context and question.
    | prompt
    # Generate an answer with the language model.
    | llm
    # Reduce the model output to a plain string.
    | StrOutputParser()
)

### Inference

In [60]:
# Example question to send through the RAG chain.
QUESTION = "Whos is writer of Blue ocean ?"

In [61]:
# Run retrieval and generation for QUESTION; the result is shown as the cell output.
rag_chain.invoke(QUESTION)

'Kim'