In [None]:
# --- 0. Setup ---
!pip install llama-cloud-services llama-index-core llama-index-readers-file python-dotenv chromadb langchain-community sentence-transformers
!pip install nest_asyncio



In [None]:
# --- 1. Parse Document using LlamaParse ---
# Sends the PDF to LlamaParse and keeps the returned documents (markdown text)
# in `parsed_docs` for the chunking cell.
from llama_cloud_services import LlamaParse
from google.colab import userdata
import nest_asyncio

# Allow nested event loops inside the notebook's already-running loop
# (presumably needed because `load_data` drives async code - confirm).
nest_asyncio.apply()

# The API key is read from the Colab secret named 'Lamaparser'.
parser = LlamaParse(api_key=userdata.get('Lamaparser'), result_type="markdown", premium_mode=True)

# NOTE: despite its name, this is a local Colab file path, not a URL.
file_url = "/content/September 2023 Report.pdf"

parsed_docs = parser.load_data(file_url)

# Preview one parsed document. Index 5 is used when it exists; for shorter
# results the index is clamped so the preview cannot raise IndexError.
PREVIEW_DOC_INDEX = 5
if parsed_docs:
    print(parsed_docs[min(PREVIEW_DOC_INDEX, len(parsed_docs) - 1)].text)
else:
    print("LlamaParse returned no documents; check the file path and API key.")

Started parsing the file under job_id 0ee3a45b-38d0-4a37-9e71-a741cc34d5b8
# DISEASE OUTBREAKS OF PREVIOUS WEEKS REPORTED LATE

| Unique ID.         | Name of State/UT | Name of District | Disease/ Illness | No. of Cases | No. of Deaths | Date of Start Outbreak | Current Status     | Comments/ Action Taken                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            |
| ------------------ | 

In [None]:
# --- 2. Chunking the Document ---
# Splits every parsed document on its markdown headings and collects the
# resulting pieces in `chunks` for the embedding cell.
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.schema import Document

# Wrap the text of each parsed document in a LangChain Document.
docs = [Document(page_content=doc.text) for doc in parsed_docs]

# Split on heading levels 1-3; the second tuple item is the metadata key
# under which the matching heading text is stored on each chunk.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=[
    ("#", "Heading1"),
    ("##", "Heading2"),
    ("###", "Heading3"),
])

chunks = []
for doc in docs:
    chunks.extend(markdown_splitter.split_text(doc.page_content))

print(f"Number of markdown chunks: {len(chunks)}\n")

# Preview one chunk. Index 12 is used when it exists; the index is clamped
# (and the empty case skipped) so shorter documents do not raise IndexError.
PREVIEW_CHUNK_INDEX = 12
if chunks:
    print(chunks[min(PREVIEW_CHUNK_INDEX, len(chunks) - 1)].page_content)

Number of markdown chunks: 13

COVID-19 was declared as a pandemic on 11ᵗʰ March, 2020. The States have been reporting the COVID-19 cases and deaths on a real-time basis and the same was being updated on the website of Ministry of Health and Family Welfare. This is made available on public domain and can be accessed through https://www.mohfw.gov.in/


In [None]:
# --- 3. Generate Embeddings & Create Chroma Vector DB ---
# Embeds every chunk with a sentence-transformers model and stores the vectors
# in a Chroma collection persisted under the local "db" directory.
# Both classes are imported from `langchain_community` (installed in the setup
# cell) instead of the deprecated `langchain.embeddings` /
# `langchain.vectorstores` re-export paths.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# With `persist_directory` set, current Chroma versions write to disk
# automatically; the explicit `vector_store.persist()` call is deprecated
# (it only emitted a warning) and has been dropped.
# NOTE(review): re-running this cell against an existing "db" directory
# appears to add the same chunks again - confirm, and clear the directory
# before re-indexing if duplicates show up in retrieval results.
vector_store = Chroma.from_documents(documents=chunks, embedding=embedding_model, persist_directory="db")

  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  vector_store.persist()


In [None]:
# Sanity-check what was written to the vector store: print the first few
# stored documents (truncated to 500 characters) and then their metadata.
# The collection is fetched once and reused, rather than calling
# `vector_store.get()` separately for documents and metadata.
stored_records = vector_store.get()
all_docs = stored_records["documents"]

for i, doc in enumerate(all_docs[:5]):
    print(f"\n--- Document {i+1} ---")
    print(doc[:500])

# Metadata
all_metas = stored_records["metadatas"]

for i, meta in enumerate(all_metas[:5]):
    print(f"\n--- Metadata for Doc {i+1} ---")
    print(meta)


--- Document 1 ---
Disease Alerts/Outbreaks reported and responded to by States/UTs through Integrated Disease Surveillance Program (IDSP)
(27th February 2023 to 5th March 2023)

--- Document 2 ---
District wise disease alerts/outbreaks reported in the 9th week 2023

--- Document 3 ---
[Map of India with disease outbreaks marked]  
Raisen- Chickenpox  
Durg- Fever with Rash  
Aurangabad- Measles  
Ahmednagar- Fever  
Cuttack- Food Poisoning  
Malappuram- Cholera  
Rayagada- ADD  
Thrissur- Adeno Virus  
Palakkad- Chickenpox(2), Food Poisoning  
Ernakulam- Adeno Virus  
Tiruvannamalai- Chickenpox  
Salem- Chickenpox  
Kottayam- Adeno Virus

--- Document 4 ---
| Description                                                          | Number |
| -------------------------------------------------------------------- | ------ |
| No. of States/UT's Submitted outbreak report ( including NIL report) | 15     |
| No. of States/UT's Submitted 'NIL' outbreak report                   | 03     |  
IN

In [None]:
# @title
# Build a similarity retriever over the vector store (top 3 matches) and show
# the chunks it returns for a sample query.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

query = "How many instances of chickenpox were observed"
# `invoke` is the supported entry point; `get_relevant_documents` is
# deprecated and emitted a warning when this cell was run.
relevant_chunks = retriever.invoke(query)

for i, chunk in enumerate(relevant_chunks):
    print(f"\nChunk {i+1}:")
    print(chunk.page_content)


Chunk 1:
| Disease/alerts          | No. of alerts/outbreaks |
| ----------------------- | ----------------------- |
| Chickenpox              | 5                       |
| Adeno Virus             | 3                       |
| Food Poisoning          | 2                       |
| Acute Diarrheal Disease | 1                       |
| Fever                   | 1                       |
| Fever with Rash         | 1                       |
| Measles                 | 1                       |
| Cholera                 | 1                       |

Chunk 2:
[Map of India with disease outbreaks marked]  
Raisen- Chickenpox  
Durg- Fever with Rash  
Aurangabad- Measles  
Ahmednagar- Fever  
Cuttack- Food Poisoning  
Malappuram- Cholera  
Rayagada- ADD  
Thrissur- Adeno Virus  
Palakkad- Chickenpox(2), Food Poisoning  
Ernakulam- Adeno Virus  
Tiruvannamalai- Chickenpox  
Salem- Chickenpox  
Kottayam- Adeno Virus

Chunk 3:
| MP/RSN/2023/09/199 | Madhya Pradesh | Raisen   | Chickenpox         

  relevant_chunks = retriever.get_relevant_documents(query)


In [None]:
!pip install huggingface-hub llama-cpp-python



In [None]:
# Answer a question with a local GGUF model, grounding it in chunks retrieved
# from the vector store for that same question.
from llama_cpp import Llama

# Context window in tokens. The default of 512 (reported in the model-load
# warning) is too small once several retrieved chunks are placed in the prompt.
N_CTX = 4096
# Upper bound on generated tokens; the previous limit of 32 cut the answer off
# mid-sentence (finish_reason == 'length').
MAX_ANSWER_TOKENS = 128

llm = Llama.from_pretrained(
    repo_id="prithivMLmods/Llama-Sentient-3.2-3B-Instruct-GGUF",
    filename="Llama-Sentient-3.2-3B-Instruct.Q5_K_M.gguf",
    n_ctx=N_CTX,
    verbose=False
)

question = "From which disease individuals were most affected?"

# Retrieve context for THIS question. Reusing `relevant_chunks` would feed the
# model context that was fetched for a different query in an earlier cell.
context_chunks = retriever.invoke(question)
context = "\n\n".join(chunk.page_content for chunk in context_chunks)

# Build the prompt without source-code indentation and end it right after
# "Answer:" so the model's first generated tokens are the answer itself.
prompt = (
    "You are a helpful assistant. Use the context below to answer the question.\n\n"
    f"Context:\n{context}\n\n"
    f"Question:\n{question}\n\n"
    "Answer:"
)

output = llm(
    prompt=prompt,
    max_tokens=MAX_ANSWER_TOKENS,
    stop=["Q:", "\n"],  # stop at the end of the first answer line
)

# Show only the generated answer; the full completion dict stays in `output`.
print(output["choices"][0]["text"].strip())

(…)ama-Sentient-3.2-3B-Instruct.Q5_K_M.gguf:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

llama_init_from_model: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


{'id': 'cmpl-9f59ee58-765f-4d81-a4a7-0b97f0f0be2f', 'object': 'text_completion', 'created': 1744874022, 'model': '/root/.cache/huggingface/hub/models--prithivMLmods--Llama-Sentient-3.2-3B-Instruct-GGUF/snapshots/011b69eb6ceecb2ead6f1609fff329c8fe8a780a/./Llama-Sentient-3.2-3B-Instruct.Q5_K_M.gguf', 'choices': [{'text': ' To determine the age of the individuals most affected, we need to look for the disease with the most number of outbreaks. In this case, Chickenpox has', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 144, 'completion_tokens': 32, 'total_tokens': 176}}
