In [3]:
from langchain import PromptTemplate
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from dotenv import load_dotenv

In [4]:
load_dotenv()

True

# Extract data from pdf

In [5]:
def load_pdf(data):
    loader = DirectoryLoader(data,glob='*.pdf',loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [7]:
extracted_text = load_pdf('../data/')

# Create chunks

In [8]:
def text_split(extracted_text):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_text)
    return text_chunks

In [9]:
text_chunks = text_split(extracted_text)

In [10]:
len(text_chunks)

7020

# Download Embeddings

In [11]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [12]:
embeddings = download_hugging_face_embeddings()

In [13]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [18]:
text_chunks = text_chunks[:100]

In [19]:
from langchain_community.vectorstores import FAISS

In [21]:
faiss = FAISS.from_texts([text.page_content for text in text_chunks],embeddings)

In [24]:
faiss.similarity_search("introduction")

[Document(page_content='individuals are highlighted as sidebar biographies thataccompany the main topical essays. Articles follow astandardized format that provides information at aglance. Rubrics include:\nDisorders/Conditions Tests/Treatments\nDefinition Definition\nDescription Purpose\nCauses and symptoms Precautions\nDiagnosis DescriptionTreatment Preparation\nAlternative treatment Aftercare\nPrognosis Risks\nPrevention Normal/Abnormal results\nResources ResourcesKey terms Key terms'),
 Document(page_content='copyright laws, as well as by misappropriation, trade secret, unfair com-petition, and other applicable laws. The authors and editor of this workhave added value to the underlying factual material herein through oneor more of the following: unique and original selection, coordination,expression, arrangement, and classification of the information.\nGale Group and design is a trademark used herein under license.All rights to this publication will be vigorously defended.Copyright

In [25]:
prompt_template = """
Use the following pieces of information to answer user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: 
<context>
{context}
</context>
Question: {input}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [26]:
from langchain_community.llms import Ollama
llm = Ollama(model="llama2")

In [27]:
from langchain_core.prompts import ChatPromptTemplate
prompt = ChatPromptTemplate.from_template(prompt_template)

In [28]:
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain

In [30]:
document_chain = create_stuff_documents_chain(llm,prompt)

In [31]:
retriever = faiss.as_retriever()

In [32]:
from langchain.chains import create_retrieval_chain
retrieval_chain = create_retrieval_chain(retriever,document_chain)

In [33]:
response = retrieval_chain.invoke({"input" : "introduction"})

In [35]:
response

{'input': 'introduction',
 'context': [Document(page_content='individuals are highlighted as sidebar biographies thataccompany the main topical essays. Articles follow astandardized format that provides information at aglance. Rubrics include:\nDisorders/Conditions Tests/Treatments\nDefinition Definition\nDescription Purpose\nCauses and symptoms Precautions\nDiagnosis DescriptionTreatment Preparation\nAlternative treatment Aftercare\nPrognosis Risks\nPrevention Normal/Abnormal results\nResources ResourcesKey terms Key terms'),
  Document(page_content='copyright laws, as well as by misappropriation, trade secret, unfair com-petition, and other applicable laws. The authors and editor of this workhave added value to the underlying factual material herein through oneor more of the following: unique and original selection, coordination,expression, arrangement, and classification of the information.\nGale Group and design is a trademark used herein under license.All rights to this publicatio

In [36]:
print(response['answer'])

The Gale Encyclopedia of Medicine 2 is designed to provide ready reference for users, with information organized in a straight alphabetical arrangement and bold-faced terms functioning as print hyperlinks to related entries in the encyclopedia.
