In [36]:
from langchain.chains import RetrievalQA
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_community.llms import CTransformers



In [37]:
try:
    import fitz
    print("fitz (PyMuPDF) imported successfully!")
except ImportError:
    print("Error: fitz (PyMuPDF) not found.")


fitz (PyMuPDF) imported successfully!


In [38]:
#PINECONE_API_KEY = "9ea3155f-be6e-4c0f-aa59-a6ae0d1e19b4"
#PINECONE_API_ENV = "gcp-starter"

In [39]:
# loading the pdf
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob = "*.pdf",
                    loader_cls = PyMuPDFLoader )
    
    docs = loader.load()
    return docs


In [40]:
# to extract the data
import fitz
extracted_data = load_pdf(r"F:\GenerativeAI_iNeuronCourse\medCHATBOT\data")  # having multiple pdfs in the folder will also work those will 
#also be loaded

In [41]:
#now according to architecture need to convert the data to text chunks
def text_splitter(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks



In [42]:
text_chunks=text_splitter(extracted_data)
len(text_chunks)

5779

In [43]:
text_chunks[1023].page_content

'head on the spine and the resulting release of the\nerector muscles of the back and legs which estab-\nlish improved coordination.\nHabit—Referring to the particular set of physical\nand mental tensions present in any individual.\nInhibition—Referring to the moment in an Alexan-\nder lesson when the student refrains from begin-\nning a movement in order to avoid tensing of the\nmuscles.\nSensory awareness—Bringing attention to the sen-\nsations of tension and/or release in the muscles.'

In [44]:
#now we need to create embeddings
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [45]:
embeddings=download_hugging_face_embeddings()



In [46]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [47]:
query_result = embeddings.embed_query("hello world") # embed_query:  vector representation of hello world  so the dimension of the vector is 384
len(query_result)

384

In [48]:
import chromadb


In [49]:
#using the vector db
#taking all the chunks and embedding model and applying it all and creating embeddings and stored in its database

#from langchain_chroma import Chroma

persist_directory = 'med_db1'
vectordb = Chroma.from_documents(text_chunks,
                                 embedding = embeddings,
                                 persist_directory =  persist_directory)   #embedding means we have given the embedding model
#Now we can load the persisted database from disk, and use it normally
vectordb=None
vectordb = Chroma( persist_directory =  persist_directory, embedding_function = embeddings)

#vector_db =  Chroma.from_documents(text_chunks,
                               #  embedding = embeddings,
                                # persist_directory =  persist_directory)


KeyboardInterrupt: 

In [None]:
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x248b12e9370>

In [None]:
retriever = vectordb.as_retriever()

In [None]:
answer = retriever.invoke("what are allergies")

In [None]:
retriever= vectordb.as_retriever(search_kwargs={"k":2})

In [None]:
docs = retriever.invoke("what are allergies")

In [None]:
docs[0].page_content

'to commonly encountered environmental substances.\nPurpose\nAllergy is a reaction of the immune system. Nor-\nmally, the immune system responds to foreign microor-\nganisms and particles, like pollen or dust, by producing\nspecific proteins called antibodies that are capable of\nbinding to identifying molecules, or antigens, on the\nforeign organisms. This reaction between antibody and\nantigen sets off a series of reactions designed to protect\nthe body from infection. Sometimes, this same series of'

In [None]:
prompt_template = """
use following infor to answer the question below
if u dont know the answer just say i dont know dont try to make up the answer

context: {context}
question: {question}
only return the helpful answer below
helpful answer:

"""

In [None]:
PROMPT = PromptTemplate(template = prompt_template, input_variables=["context","question"])
chain_type_kwargs={"prompt":PROMPT}   #used because we r using QnA chain concept 

In [None]:
llm = CTransformers(model = "TheBloke/Llama-2-7B-GGML",
                    model_type = "llama",
                    config = {'max_new_tokens':512,
                              'temperature':0.8})

# not able to use it directly from the downloaded version so copuied the link from huggingface TheBloke thing itself only

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 1 files: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
Fetching 1 files: 100%|██████████| 1/1 [05:35<00:00, 335.70s/it]


In [None]:
llm1 = CTransformers(model = "F:\GenerativeAI_iNeuronCourse\medCHATBOT\model\llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type = "llama",
                    config = {'max_new_tokens':512,
                              'temperature':0.8})

  llm1 = CTransformers(model = "F:\GenerativeAI_iNeuronCourse\medCHATBOT\model\llama-2-7b-chat.ggmlv3.q4_0.bin",


In [50]:
qa = RetrievalQA.from_chain_type(   #questions answering object
    llm=llm1,
    chain_type = "stuff",
    retriever = retriever,
    return_source_documents=True,
    chain_type_kwargs = chain_type_kwargs
)

In [51]:
while True:
    user_input = input(f"Input Prompt: ")
    result = qa({"query":user_input})  # stores the result of the query
    print("Response: ",result["result"])  #result of the result variable displayed

  warn_deprecated(


Response:  Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria.
Response:  Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria.
Response:  Diseases are conditions that affect the health and functioning of living organisms. In medicine, a disease can be defined as any abnormal condition that affects the body or mind, such as infection, inflammation, autoimmune disorders, genetic disorders, and neurological disorders.

In Chinese medicine, diseases are seen not just as problems with germs or viruses, but also as a weakness in the energy of the body, which allows sickness to occur. The identification of the species of bacteria or other pathogens involved in an illness can help determine treatment. However, Chinese medicine also recognizes that