In [None]:
from google import genai
from google.genai.types import HttpOptions
import os
from google.colab import userdata

# Copy the Colab secret into the process environment so SDK calls can find it.
os.environ.update({"GOOGLE_API_KEY": userdata.get("GOOGLE_API_KEY")})

# Gemini client pinned to the "v1" API version, authenticated with the key above.
client = genai.Client(
    http_options=HttpOptions(api_version="v1"),
    api_key=os.environ["GOOGLE_API_KEY"],
)

# Source PDF and model identifier kept as notebook-level settings.
pdf_path = "/content/20200311-sitrep-51-covid-19.pdf"
model_id = "gemini-1.5-pro"

In [None]:
from google import genai
from google.genai import types
import pathlib

# Client created without explicit arguments.
# NOTE(review): presumably picks up GOOGLE_API_KEY from the environment - confirm.
client = genai.Client()

filepath = pathlib.Path('/content/20200311-sitrep-51-covid-19.pdf')

# Instructions sent together with the PDF: ask for text, tables and image
# descriptions wrapped in a single JSON object.
prompt = """
You are a document parsing assistant.
Please extract and return:
- The full text of the document in markdown,
- Any tables in valid JSON format (including headers and rows),
- Describe any embedded images (e.g., captions or content summary).
Respond as a valid JSON object with fields: 'text', 'tables', 'images'"""

# The PDF is passed inline as raw bytes, followed by the textual prompt.
pdf_part = types.Part.from_bytes(
    data=filepath.read_bytes(),
    mime_type='application/pdf',
)
json_config = types.GenerateContentConfig(response_mime_type="application/json")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[pdf_part, prompt],
    config=json_config,
)


In [None]:
# Show the raw model reply (requested as application/json) before parsing it.
print(response.text)

{
  "text": "# World Health Organization\n# Coronavirus disease 2019 (COVID-19) Situation Report – 51\n\n**Data as reported by national authorities by 10 AM CET 11 March 2020**\n\n## HIGHLIGHTS\n\n*   WHO Director-General in his regular media briefing today stated that WHO has been assessing this outbreak around the clock and we are deeply concerned both by the alarming levels of spread and severity, and by the alarming levels of inaction. WHO therefore have made the assessment that COVID-19 can be characterized as a pandemic. For detailed information, please see [here](https://www.who.int/dg/speeches/detail/who-director-general-s-opening-remarks-at-the-media-briefing-on-covid-19---11-march-2020).\n\n*   Four new countries/territories/areas (Bolivia [Plurinational State of], Jamaica, Burkina Faso and Democratic Republic of the Congo) have reported cases of COVID-19 in the past 24 hours.\n\n*   The COVID-19 virus infects people of all ages. However, evidence to date suggests that two gr

In [None]:
import json5

# Normalise the model reply before parsing: trim whitespace first so that a
# Markdown code fence is detected even when it is preceded by blank lines.
# `response.text or ""` keeps an empty reply on the "Parsing failed" path
# instead of raising AttributeError.
text = (response.text or "").strip()

# Drop an opening fence. Slice by the fence's real length: "```json" is seven
# characters, so a hard-coded 6 would leave a stray "n" in front of the JSON.
for fence in ("```json", "```"):
    if text.startswith(fence):
        text = text[len(fence):]
        break

# Drop a closing fence, if any.
if text.endswith("```"):
    text = text[:-3]

text = text.strip()

try:
    json_data = json5.loads(text)
    print("✅ Parsed successfully")
    print(json_data.keys())
except ValueError as e:
    # Best effort: report the problem and leave `json_data` untouched.
    print("Parsing failed:", e)


✅ Parsed successfully
dict_keys(['text', 'tables', 'images'])


In [None]:
# Persist the parsed result so it can be reloaded without calling the API again.
# json5.dump emits JSON5 by default (identifier-like keys are left unquoted),
# which strict JSON tools cannot read; quote_keys=True / trailing_commas=False
# make the ".json" file valid JSON while json5.load can still read it back.
with open('Parser_pdf_output.json', 'w', encoding='utf-8') as f:
    json5.dump(json_data, f, quote_keys=True, trailing_commas=False)

In [None]:
!pip install -q langchain langchain_community langgraph

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m444.0/444.0 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.9/43.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
pip install -U jq


Collecting jq
  Downloading jq-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Downloading jq-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m757.1/757.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jq
Successfully installed jq-1.10.0


In [None]:
# Reload the parsed document from the file written earlier in the notebook.
with open('Parser_pdf_output.json') as parsed_file:
    json_data = json5.load(parsed_file)

In [None]:
from langchain.docstore.document import Document

# Start the document list with the full markdown text of the report.
docs = [
    Document(
        page_content=json_data["text"],
        metadata={"type": "text"},
    )
]


In [None]:
# Inspect the image entries of the parsed result (empty list when the key is absent).
json_data.get("images", [])

[{'caption': 'Figure 1. Countries, territories or areas with reported confirmed cases of COVID-19, 11 March 2020',
  'content_summary': 'A world map showing the distribution of COVID-19 cases as of March 11, 2020. Countries with confirmed cases are shaded based on the number of cases, ranging from 1-2 (lightest shade) to >11000 (darkest red). Major outbreak areas include China, Republic of Korea, and Italy, with numerous other countries reporting cases. A note specifies that 696 cases are identified on a cruise ship in Japanese territorial waters.'},
 {'caption': 'Figure 2. Epidemic curve of confirmed COVID-19 cases reported outside of China, by date of report and WHO region through 11 March 2020',
  'content_summary': 'A stacked bar chart illustrating the daily number of new confirmed COVID-19 cases reported outside of China from December 30, 2019, to March 9, 2020. The bars are segmented by WHO region (Eastern Mediterranean, Europe, Other, Americas, South-East Asia, Western Pacific, 

In [None]:
# Turn every extracted table into its own Document, keeping the caption in
# the metadata.
for table in json_data.get('tables', []):
  caption = table['caption']
  table_content = f"table(caption:{caption}):{table['headers']} | Rows: {table['rows']}"
  table_doc = Document(
      page_content=table_content,
      metadata={"type": "table", "caption": caption},
  )
  docs.append(table_doc)

In [None]:
# Turn every image description into its own Document. The metadata type is
# "image" (not "table") so image entries can be told apart from table entries.
for image in json_data.get("images", []):
    image_content = f"image (caption {image['caption']}):  | content summary: {image['content_summary']}"
    docs.append(Document(page_content=image_content, metadata={"type": "image", "caption": image["caption"]}))

In [None]:
# Number of Document objects collected so far (text + tables + images).
len(docs)

5

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Splitter configured with chunk_size=1000 and chunk_overlap=200.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

# Split every collected document into chunks for indexing.
chunks = splitter.split_documents(docs)

In [None]:
# Number of chunks produced by the splitter.
len(chunks)

63

In [None]:
!pip install -q faiss_cpu langchain_huggingface

In [None]:
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers embedding model, run on CPU, with embedding
# normalisation explicitly switched off.
embedding_model = HuggingFaceEmbeddings(
    encode_kwargs=dict(normalize_embeddings=False),
    model_kwargs=dict(device='cpu'),
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from typing_extensions import TypedDict
from typing import List,Annotated
from langchain_core.messages import BaseMessage
from langgraph.graph.message import add_messages


class chatState(TypedDict):
  """State passed between the nodes of the graph.

  messages: conversation history; annotated with langgraph's `add_messages`
    so that node updates are combined with the existing list (per langgraph's
    reducer convention) rather than replacing it.
  Retrieval_docs: documents selected by the retrieval step for the current
    query.
  """
  # `Annotated` must be subscripted with [...]; calling it like a function
  # raises TypeError as soon as the class body is executed.
  messages:Annotated[List[BaseMessage], add_messages]
  # NOTE(review): annotated as List[str], but the retrieval node stores the
  # objects returned by the retriever here - confirm the intended element type.
  Retrieval_docs:List[str]


In [None]:
!pip install -q sentence-transformers

In [None]:
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
# Cross-encoder ("BAAI/bge-reranker-base") that is handed to CrossEncoderReranker
# later in the notebook.
model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")



In [None]:
from langchain.retrievers.document_compressors import CrossEncoderReranker
from sentence_transformers import CrossEncoder


# Reranker built on the cross-encoder above.
# NOTE(review): top_n=3 presumably caps the reranked output at three documents - confirm.
compressor = CrossEncoderReranker(model=model, top_n=3)


In [None]:
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker


def retrieval(state:chatState):
  """Graph node: fetch the documents most relevant to the latest message.

  Uses the content of the last entry in state["messages"] as the query,
  combines an MMR vector search (FAISS) with BM25 keyword search, reranks the
  merged candidates with the module-level `compressor`, and returns them
  under the `Retrieval_docs` key declared in `chatState`.

  Args:
    state: current graph state; must contain at least one message.

  Returns:
    dict with a single key, 'Retrieval_docs', holding the reranked documents.
  """
  query = state["messages"][-1].content

  index_dir = "faiss_index"
  vectorstore = None
  if os.path.isdir(index_dir):
    try:
      # load_local unpickles the stored docstore, so LangChain requires an
      # explicit opt-in. Only acceptable because this notebook wrote the index
      # itself; never enable it for an index obtained from an untrusted source.
      vectorstore = FAISS.load_local(
          index_dir, embedding_model, allow_dangerous_deserialization=True
      )
    except Exception as err:
      # Keep the original best-effort fallback, but say why it happened
      # instead of swallowing every error silently.
      print(f"Could not load '{index_dir}' ({err}); rebuilding the index.")

  if vectorstore is None:
    vectorstore = FAISS.from_documents(chunks, embedding_model)
    vectorstore.save_local(index_dir)

  semantic_retrieval = vectorstore.as_retriever(search_type='mmr',search_kwargs={"k": 5})
  lexical_retrieval = BM25Retriever.from_documents(chunks)
  # Named `hybrid_retriever` so the function's own name is not shadowed.
  hybrid_retriever = EnsembleRetriever(
      retrievers=[semantic_retrieval, lexical_retrieval], weights=[0.7, 0.3]
  )

  compression_retriever = ContextualCompressionRetriever(
      base_compressor=compressor, base_retriever=hybrid_retriever
  )
  # `invoke` is the current retriever entry point; `get_relevant_documents`
  # is deprecated.
  docs = compression_retriever.invoke(query)

  # The key must match the field declared in chatState ('Retrieval_docs');
  # a differently spelled key would not reach the next node.
  return {'Retrieval_docs':docs}



In [None]:
!pip install -q langchain_ollama

In [None]:
from langchain_ollama import ChatOllama

# Chat model used for answer generation ("mistral" via Ollama, temperature 0.7).
# NOTE(review): presumably needs a reachable Ollama server with the model pulled - confirm for this runtime.
llm = ChatOllama(model="mistral", temperature=0.7)


In [None]:
from langchain.prompts import PromptTemplate
def generate_content(state:chatState):
  """Graph node: answer the latest user message from the retrieved documents.

  Builds a numbered context block from state['Retrieval_docs'], fills the
  prompt template with that context and the content of the last message, and
  sends the resulting prompt to the module-level `llm`.

  Args:
    state: current graph state; must contain at least one message. A missing
      or empty 'Retrieval_docs' entry results in an empty context.

  Returns:
    dict with key 'messages' holding a one-element list with the model reply.
  """
  # Read the field declared in chatState; fall back to no context when the
  # retrieval step produced nothing.
  docs = state.get('Retrieval_docs') or []

  # Number each snippet so the model has something concrete to cite, as the
  # prompt below asks for citation numbers such as [1].
  context = "\n\n".join(
      f"[{position}] {doc.page_content}" for position, doc in enumerate(docs, start=1)
  )
  query = state['messages'][-1].content

  # Placeholders must use {name}; square brackets are left as literal text.
  prompt = PromptTemplate(
      input_variables=["context", "query"],
      template='''
      You are a helpful assistant whose primary goal is to answer the user's query based only on the provided context.

Follow these rules strictly:

Answer the query concisely using only the information in the provided context.

You must cite the source of your information by including a citation number, for example [1] or [table_1] or [figure 2], at the end of each sentence or fact that comes from a specific source.

If the provided context is not enough to answer the query, you must respond with the exact phrase "I don't know because I don't find relevant content yet."

Context:
{context}

User Query:
{query}

Answer:
      '''
  )

  # Render the template to a string first; the chat model is given the
  # finished prompt text rather than a raw dict of variables.
  response = llm.invoke(prompt.format(context=context, query=query))

  return {'messages':[response]}



In [None]:
!pip install -q langgraph-checkpoint-sqlite

In [None]:
from langgraph.graph import StateGraph,START,END
import sqlite3
# SqliteSaver is provided by the langgraph-checkpoint-sqlite package under
# langgraph.checkpoint.sqlite; langgraph.checkpoint.memory does not export it.
from langgraph.checkpoint.sqlite import SqliteSaver

# Two-step pipeline: retrieve supporting documents, then generate the answer.
graph = StateGraph(state_schema=chatState)

graph.add_node('retriever',retrieval)
graph.add_node('generate_content',generate_content)

graph.add_edge(START,'retriever')
graph.add_edge('retriever','generate_content')
graph.add_edge('generate_content',END)

# check_same_thread=False lets the connection be used from threads other than
# the one that created it.
conn = sqlite3.connect("checkpoints.db", check_same_thread=False)
checkpointer = SqliteSaver(conn)

# Compile with the SQLite checkpointer so graph state is stored in checkpoints.db.
workflow=graph.compile(checkpointer=checkpointer)