In [1]:
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
import chromadb

In [2]:
#Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [3]:
extracted_data = load_pdf("data/")

In [6]:
print(extracted_data[-2].page_content)

OTHER
“Bursitis.” HealthAnswers.com. 1998 <http://www.health
answers.com>.
Karen Ericson, RN
Bypass surgery seeCoronary artery bypass
graft surgery
Byssinosis
Definition
Byssinosis is a chronic, asthma-like narrowing of the
airways. Also called brown lung disease, byssinosis resultsfrom inhaling particles of cotton, flax, hemp, or jute.
Description
Although inhaling cotton dust was identified as a
source of respiratory disease more than 300 years ago,byssinosis has been recognized as an occupational haz-ard for textile workers for less than 50 years. More than800,000 workers in the cotton, flax, and rope-makingindustries are exposed in the workplace to airborne parti-cles that can cause byssinosis. Only workers in mills thatmanufacture yarn, thread, or fabric have a significant riskof dying of this disease.
In the United States, byssinosis is almost completely
limited to workers who handle unprocessed cotton. Morethan 35,000 textile workers have been disabled by byssi-nosis and 183 die

In [7]:
len(extracted_data)

637

In [5]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [6]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 7020


In [8]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [9]:
embeddings = download_hugging_face_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

In [10]:
#text_chunks[1].metadata

In [10]:
text_chunks[20].page_content

'The Gale Encyclopedia of Medicine 2 (GEM2) is a\none-stop source for medical information on nearly 1,700common medical disorders, conditions, tests, and treat-ments, including high-profile diseases such as AIDS,Alzheimer’s disease, cancer, and heart attack. This ency-clopedia avoids medical jargon and uses language thatlaypersons can understand, while still providing thor-ough coverage of each topic. The Gale Encyclopedia of\nMedicine 2 fills a gap between basic consumer health'

In [23]:
import pickle

# Your vector creation logic
vectors = []
for i in range(len(text_chunks)):
    text = text_chunks[i].page_content
    values = embeddings.embed_query(text)
    metadata = text_chunks[i].metadata
    vectors.append({"id": str(f"vec{i}"), "values": values, "text": text, "metadata": metadata})

# Save the vectors list as a binary pickle file
file_path = 'my_list.pkl'
with open(file_path, 'wb') as file:
    pickle.dump(vectors, file)

print(f"Vectors saved to {file_path}")


Vectors saved to my_list.pkl


In [11]:
import pickle

# Load vectors from the binary pickle file
file_path = 'my_list.pkl'
with open(file_path, 'rb') as file:
    vectors = pickle.load(file)


In [2]:
#index.upsert(
 #   vectors=vectors
#)

##############################################################################################################################

## ChromeBD

In [12]:
import chromadb
client = chromadb.PersistentClient(path="data/")

In [13]:
client.heartbeat()

1708435068859383000

In [8]:
#client.delete_collection("medchat")

In [25]:
client.list_collections()

[Collection(name=medchat)]

In [33]:
collection_names = [collection.name for collection in collections]
for collection_name in collection_names:
    print(collection_name)

medchat


In [24]:
collection = client.create_collection(name="medchat")

In [25]:
vector_values = [vector["values"] for vector in vectors]
vector_texts = [vector["text"] for vector in vectors]
vector_ids = [vector["id"] for vector in vectors]
vector_metadata = [vector["metadata"] for vector in vectors]

# Add vectors to the collection
collection.add(
    embeddings=vector_values,
    documents=vector_texts,
    metadatas=vector_metadata,
    ids=vector_ids
)

In [9]:
collection = client.get_collection(name="medchat")

In [10]:
query = "What are Prognosis"
result = collection.query(
    query_embeddings=embeddings.embed_query(query),
    n_results=3
)
context = result["documents"][0]
print(context)

['Prognosis\nA good prognosis is dependent on the ability to treat', 'Prognosis\nA-T is a fatal condition. Children with A-T become\nphysically disabled by their early teens and typically dieby their early 20s, usually from the associated blood can-cers and malignancies. In very rare cases, individualswith A-T may experience slower progression and aslightly longer life span, surviving into their 30s. A-Tcarriers have a five-fold higher risk than non-carriers ofdeveloping certain cancers, especially breast cancer .', 'disease at the time of presentation, the better the long-\nterm outcome after treatment, or prognosis, becomes.\nWhen a patient has physical signs or symptoms\nA very common finding that leads to diagnosis is the']


In [32]:
for i in range(len(result["documents"][0])):
   print(result["documents"][0][i])

Prognosis
A good prognosis is dependent on the ability to treat
Prognosis
A-T is a fatal condition. Children with A-T become
physically disabled by their early teens and typically dieby their early 20s, usually from the associated blood can-cers and malignancies. In very rare cases, individualswith A-T may experience slower progression and aslightly longer life span, surviving into their 30s. A-Tcarriers have a five-fold higher risk than non-carriers ofdeveloping certain cancers, especially breast cancer .


In [11]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [13]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])

In [12]:
llm=CTransformers(model=r"E:\generative_ai\4. med_chat_bot\models\llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.5})

In [14]:
def embedding_query(query):
    result = collection.query(
        query_embeddings=embeddings.embed_query(query),
        n_results=3
    )
    context = result["documents"][0]
    return context

In [15]:
context = embedding_query(query)

In [16]:
context

['Prognosis\nA good prognosis is dependent on the ability to treat',
 'Prognosis\nA-T is a fatal condition. Children with A-T become\nphysically disabled by their early teens and typically dieby their early 20s, usually from the associated blood can-cers and malignancies. In very rare cases, individualswith A-T may experience slower progression and aslightly longer life span, surviving into their 30s. A-Tcarriers have a five-fold higher risk than non-carriers ofdeveloping certain cancers, especially breast cancer .',
 'disease at the time of presentation, the better the long-\nterm outcome after treatment, or prognosis, becomes.\nWhen a patient has physical signs or symptoms\nA very common finding that leads to diagnosis is the']

In [17]:
llm_chian = LLMChain(prompt=PROMPT, llm=llm)

In [None]:
llm_chain = LLMChain(prompt=PROMPT, llm=llm)

while True:
    user_input = input("Input Prompt:")
    
    # Assuming you want to use user_input as the question
    question = user_input
    
    # Retrieve context based on user input
    context = embedding_query(question)
    
    # Make sure to use the correct keys for the input dictionary
    inputs = {"context": context, "question": question}
    
    result = llm_chain(inputs)
    print("Response : ", result["text"])

Input Prompt: What are Prognosis


Response :  The prognosis for individuals with A-T depends on the severity of the condition at the time of presentation. Those who experience slower progression and have a later age of onset tend to have a better long-term outcome after treatment, or prognosis. However, in general, children with A-T become physically disabled by their early teens and typically die by their early 20s, usually from the associated blood cancers and malignancies. A-T carriers have a five-fold higher risk than non-carriers of developing certain cancers, especially breast cancer.


Input Prompt: 1


In [47]:
result

{'context': ['When thisoccurs, an allergy develops against the offending sub-stance (an allergen.)',
  'the itchy, scratchy nose, eyes, and throat common inallergic rhinitis .\nThe particular allergens to which a person is sensi-',
  "GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies\nAllergic rhinitis is commonly triggered by\nexposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.\nThe presence of an allergen causes the\nbody's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.\nIgE molecules attach to mast\ncells, which contain histamine.HistaminePollen grains\nLymphocyte\nFIRST EXPOSURE"],
 'question': 'allergy',
 'text': 'It is not possible to determine the specific allergen that caused an allergy without more information. The development of an allergy is a complex process involving multiple factors, including genetics, environment, and expo