In [1]:
%pwd

'd:\\Personal\\Projects\\MedicalBot_01\\research'

In [2]:
import os

# Move from the notebook folder up to the project root so that relative paths
# such as "Data/" resolve. The guard keeps the cell idempotent: an
# unconditional os.chdir("../") climbs one more level on every re-run.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
%pwd

'd:\\Personal\\Projects\\MedicalBot_01'

In [4]:

from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
# Extract the data from the PDF File
def load_pdf_file(data):
    """Load the PDF files matched by ``*.pdf`` inside a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``, with each
        matched file parsed by ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Load the PDFs stored under Data/ (resolved against the current working directory).
extracted_data = load_pdf_file("Data/")

In [7]:
# Split the data into Text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split, as returned by the PDF loader.
        chunk_size: Maximum chunk length passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that was previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter. Defaults to 20, the value that was previously
            hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [8]:

# Chunk the loaded documents and report how many pieces were produced.
text_chunks = text_split(extracted_data)
print(f'Length of text chunks {len(text_chunks)}')

Length of text chunks 5860


In [14]:

from langchain.embeddings import HuggingFaceEmbeddings

In [15]:
# Sentence-transformers model used to embed the document chunks and the queries.
# The Hugging Face Hub repository id ends in a lowercase "v2"; use the
# canonical spelling so the id matches the Hub and the local cache entry.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [16]:
# Sanity check: embed a short string and print the length of the resulting vector.
query_results = embeddings.embed_query("Hello world")
print(f"Length: {len(query_results)}")

Length: 384


In [37]:
# Load variables from a .env file via python-dotenv; the API keys read from
# os.environ later in the notebook are expected to come from it.
from dotenv import load_dotenv
load_dotenv() 

True

In [38]:
# Read the service credentials from the environment (None when a key is absent).
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [26]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# Create the index only when it is missing: create_index() fails if the name
# is already taken, which would break every re-run of this cell.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,  # length of the vectors produced by `embeddings`
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Show the index description as the cell output, whether the index was just
# created or already existed.
pc.describe_index(index_name)

{
    "name": "medicalbot",
    "metric": "cosine",
    "host": "medicalbot-wj64fa2.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [39]:
import os

# Re-export the keys so libraries that read them from the environment can find
# them. os.environ only accepts strings, so a missing key (None) is reported
# with an explicit message instead of an opaque TypeError.
if PINECONE_API_KEY is None or OPENAI_API_KEY is None:
    raise RuntimeError(
        "PINECONE_API_KEY and OPENAI_API_KEY must both be set "
        "(in the environment or the .env file) before continuing."
    )
os.environ['PINECONE_API_KEY'] = PINECONE_API_KEY
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

In [29]:
# Embed every chunk and upsert the resulting vectors into the Pinecone index.
# NOTE(review): re-running this cell presumably upserts the same chunks again;
# confirm whether that duplicates records before re-executing.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    embedding=embeddings,
    index_name=index_name,
    documents=text_chunks,
)

In [30]:
# Connect to the existing index; no documents are passed, so this only
# attaches the embedding model to the index for querying.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [31]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x111c2427ad0>

In [32]:
# Similarity-search retriever that requests the top 3 matches per query.
retriever = docsearch.as_retriever(
    search_kwargs={'k': 3},
    search_type='similarity',
)

In [33]:
# Smoke-test the retriever with a sample question before wiring it into a chain.
retrieved_docs = retriever.invoke('What is an acne?')

In [34]:
retrieved_docs

[Document(id='98e44bc0-996b-48fa-9f2f-8148f843d54b', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='235f417c-6a05-45ab-a649-2f1b3a3629f1', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed.(Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM -

In [45]:
from langchain_openai import OpenAI

# LLM used to generate the answers, configured with temperature 0.4 and a
# 1000-token limit.
llm = OpenAI(
    max_tokens=1000,
    temperature=0.4,
)

In [46]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering model. Adjacent string literals are
# concatenated verbatim, so each sentence carries its own trailing space;
# without it the sentences run together (e.g. "tasks.Use").
# `{context}` is the placeholder that receives the retrieved documents.
system_prompt = (
    'You are an assistant for question-answering tasks. '
    'Use the following pieces of retrieved context to answer the question. '
    'If you do not know the answer, politely say that you do not know. '
    'Use three sentences maximum and keep the answer clear and concise.'
    '\n\n'
    '{context}'
)

# Chat prompt: the system instructions followed by the user's question,
# which is substituted into `{input}`.
prompt = ChatPromptTemplate.from_messages([
    ('system', system_prompt),
    ('human', '{input}'),
])


In [47]:
# Chain that fills the prompt with the retrieved documents and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: fetch documents with the retriever, then pass them to the
# question-answering chain.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [48]:
# Ask a question the indexed book covers and print only the generated answer.
response = rag_chain.invoke({'input': 'What is Acne?'})
print(response['answer'])



Acne is a skin disorder that causes pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. It is a common condition that affects many people in the United States.


In [49]:
# Ask an off-topic question to check that the chain falls back to saying it
# does not know, as the system prompt instructs.
response = rag_chain.invoke({'input': 'What is stats?'})
print(response['answer'])


I am an assistant for question answering tasks and I do not have the ability to know everything. I apologize, but I do not have enough information to answer your question about stats.
