In [1]:
from pathlib import Path

from langchain.chains import retrieval_qa
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_community.llms import CTransformers



In [2]:
# Verify that PyMuPDF (imported under the module name `fitz`) is available
# and report the outcome.
try:
    import fitz
except ImportError:
    print("Error: fitz (PyMuPDF) not found.")
else:
    print("fitz (PyMuPDF) imported successfully!")


fitz (PyMuPDF) imported successfully!


In [3]:
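# Never hardcode credentials in the notebook (even commented out): read them from the environment.
# The key that was previously written here should be treated as leaked and rotated.
#PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
#PINECONE_API_ENV = "gcp-starter"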

In [4]:
# loading the pdf
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are
            selected with the glob ``*.pdf`` and read with ``PyMuPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory
        (presumably a list of LangChain documents — confirm against the
        installed LangChain version).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyMuPDFLoader)
    return pdf_loader.load()


In [5]:
# Extract the data: load_pdf uses the glob "*.pdf", so every PDF placed in this
# folder is loaded, not just a single file.
# NOTE(review): this is an absolute path on one specific machine — consider a
# configurable data directory so the notebook can run elsewhere.
import fitz  # already imported (guarded by try/except) in an earlier cell
extracted_data = load_pdf(r"F:\GenerativeAI_iNeuronCourse\medCHATBOT\data")

In [6]:
#now according to architecture need to convert the data to text chunks
def text_splitter(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller text chunks.

    Args:
        extracted_data: The documents to split (e.g. the result of
            ``load_pdf``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value previously hard-coded.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents``
        applied to ``extracted_data``.
    """
    # Named `splitter` so the local no longer shadows this function's own name.
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                              chunk_overlap=chunk_overlap)
    return splitter.split_documents(extracted_data)



In [7]:
# Split the loaded documents into chunks, then display how many chunks resulted.
text_chunks=text_splitter(extracted_data)
len(text_chunks)

5779

In [8]:
# Spot-check the text content of a single chunk (index 1023).
text_chunks[1023].page_content

'head on the spine and the resulting release of the\nerector muscles of the back and legs which estab-\nlish improved coordination.\nHabit—Referring to the particular set of physical\nand mental tensions present in any individual.\nInhibition—Referring to the moment in an Alexan-\nder lesson when the student refrains from begin-\nning a movement in order to avoid tensing of the\nmuscles.\nSensory awareness—Bringing attention to the sen-\nsations of tension and/or release in the muscles.'

In [9]:
#now we need to create embeddings
def download_hugging_face_embeddings():
    """Create the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance constructed with
        ``model_name="sentence-transformers/all-MiniLM-L6-v2"``.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [10]:
# Instantiate the embedding model once; later cells reuse `embeddings`.
embeddings=download_hugging_face_embeddings()

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [11]:
# Display the embedding object's repr to inspect the model configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [12]:
# embed_query returns the vector representation of the query text; its length
# is the embedding dimension (384 for this model, per the cell output).
query_result = embeddings.embed_query("hello world")
len(query_result)

384

In [13]:
import chromadb


In [14]:
# Vector database: embed every text chunk with the embedding model and store
# the vectors in a Chroma database persisted under `persist_directory`.
#
# Chroma.from_documents embeds all chunks and adds them to the persisted
# collection. Re-running it against an existing directory would embed
# everything again and append the same chunks a second time, so the store is
# only built when nothing has been persisted yet; otherwise the existing
# database is loaded from disk. Delete the directory to force a rebuild
# (e.g. after changing the PDFs or the chunking parameters).
persist_directory = 'med_db1'
_persist_path = Path(persist_directory)

if _persist_path.is_dir() and any(_persist_path.iterdir()):
    # Reuse the database persisted by an earlier run.
    vectordb = Chroma(persist_directory=persist_directory,
                      embedding_function=embeddings)
else:
    # First run: `embedding` is the embedding model used to vectorize the chunks.
    vectordb = Chroma.from_documents(text_chunks,
                                     embedding=embeddings,
                                     persist_directory=persist_directory)


  warn_deprecated(


In [15]:
# Display the vector store object to confirm it was created.
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x248b12e9370>

In [16]:
# Wrap the vector store in a retriever using its default search settings.
# NOTE: `retriever` is rebound with search_kwargs={"k":2} in a later cell.
retriever = vectordb.as_retriever()

In [17]:
# Run a sample query through the default retriever.
# NOTE(review): `answer` is not used by any later cell visible here.
answer = retriever.invoke("what are allergies")

In [18]:
# Rebuild the retriever with search_kwargs={"k":2}
# (presumably limiting each query to the 2 best-matching chunks — confirm).
retriever= vectordb.as_retriever(search_kwargs={"k":2})

In [19]:
# Run the same sample query through the k=2 retriever.
docs = retriever.invoke("what are allergies")

In [20]:
# Show the text of the first retrieved document.
docs[0].page_content

'to commonly encountered environmental substances.\nPurpose\nAllergy is a reaction of the immune system. Nor-\nmally, the immune system responds to foreign microor-\nganisms and particles, like pollen or dust, by producing\nspecific proteins called antibodies that are capable of\nbinding to identifying molecules, or antigens, on the\nforeign organisms. This reaction between antibody and\nantigen sets off a series of reactions designed to protect\nthe body from infection. Sometimes, this same series of'