# GEO Curator

Making an agent that could find the single cell RNA dataset and find the h5ad, h5Seurat, csv files and downloaded if approved by the user

Agent loop through:
1. Uses Qwen locally
2. Lets the model decide to call a tool
3. Executes geo_search
4. Feeds results back
5. Optionally downloads a dataset

User question → LLM thinks → decides: "I need GEO" → LLM outputs TOOL_CALL(JSON) → Python executes geo_search() → Tool result injected back into LLM → LLM continues reasoning


In [31]:
# Defining a tool
import json
from Bio import Entrez

Entrez.email = "abhinavjj@gmail.com"

def geo_search(query: str, retmax: int = 5) -> str:
    """
    Search GEO DataSets (GDS) using a query string.
    Returns a JSON string of GEO IDs.
    """
    handle = Entrez.esearch(
        db="gds",
        term=query,
        retmax=retmax
    )
    record = Entrez.read(handle)
    handle.close()
    return json.dumps(record["IdList"])


Since LLM is hard constraints we tell Qwen: If you want to call a tool, output EXACTLY this JSON

In [2]:
{
  "tool_call": {
    "name": "geo_search",
    "arguments": {
      "query": "string"
    }
  }
}
# Other wise Normal text


{'tool_call': {'name': 'geo_search', 'arguments': {'query': 'string'}}}

In [74]:
import re

def extract_ftp_links(soft_text):
    ftp_links = re.findall(r'ftp://[^\s]+', soft_text)
    return list(set(ftp_links))


In [75]:
def geo_summary(uid):
    # Step 1: Summary
    handle = Entrez.esummary(db="gds", id=uid)
    summary = Entrez.read(handle)
    handle.close()

    # Step 2: Full SOFT (text, not XML)
    handle = Entrez.efetch(
        db="gds",
        id=uid,
        rettype="full",
        retmode="text"
    )
    soft_text = handle.read()
    handle.close()

    ftp_links = extract_ftp_links(soft_text)

    return {
        "summary": summary,
        "ftp_links": ftp_links
    }


In [76]:
geo_summary(200301650)

{'summary': [{'Item': [], 'Id': '200301650', 'Accession': 'GSE301650', 'GDS': '', 'title': 'Single-cell RNA-seq of isolated non-parenchymal cells from imiquimod-induced psoriasis mouse model.', 'summary': 'We applied 10x Genomics single-cell RNA sequencing to profile non-parenchymal liver cells (NPCs) in a psoriasis-like mouse model. The study focuses on immune and liver sinusoidal endothelial cell (LSEC) alterations along the skin–liver axis to elucidate mechanisms driving comorbid liver disease in psoriasis. Given that 30–50% of psoriasis patients develop liver involvement, this approach aims to identify maladaptive cellular crosstalk and potential therapeutic targets.', 'GPL': '24247', 'GSE': '301650', 'taxon': 'Mus musculus', 'entryType': 'GSE', 'gdsType': 'Expression profiling by high throughput sequencing', 'ptechType': '', 'valType': '', 'SSInfo': '', 'subsetInfo': '', 'PDAT': '2026/01/01', 'suppFile': 'MTX, TSV', 'Samples': [{'Accession': 'GSM9086914', 'Title': 'Liver NPCs, Imi

In [None]:
# new GEO with extracting of the FTP link to be downloaded
import json
from Bio import Entrez
import re

Entrez.email = "abhinavjj@gmail.com"

def geo_summary(uid: str) -> str:
    Entrez.email = "abhinavjj@gmail.com"

    # --- Step 1: Get GSE accession ---
    handle = Entrez.esummary(db="gds", id=uid)
    record = Entrez.read(handle)
    handle.close()

    doc = record[0]
    gse = doc.get("Accession")

    summary = {
        "uid": uid,
        "accession": gse,
        "title": doc.get("title", "unknown"),
        "type": doc.get("gdsType", "unknown"),
        "platform": doc.get("GPL", "unknown"),
        "n_samples": doc.get("n_samples", "unknown"),
        "supplementary_files": []
    }

    if not gse:
        return json.dumps(summary, indent=2)

    # --- Step 2: Fetch GEO XML summary ---
    handle = Entrez.efetch(
        db="gds",
        id=uid,
        # rettype="summary",
        retmode="xml"
    )
    # records = Entrez.read(handle)
    soft_text = handle.read()
    handle.close()

    # --- Step 3: Extract FTP links safely ---
    ftp_links = []

    def walk(obj):
        if isinstance(obj, dict):
            for v in obj.values():
                walk(v)
        elif isinstance(obj, list):
            for v in obj:
                walk(v)
        elif isinstance(obj, str) and obj.startswith("ftp://"):
            ftp_links.append(obj)

    walk(records)

    summary["supplementary_files"] = sorted(set(ftp_links))

    return json.dumps(summary, indent=2)


In [53]:
import requests
from pathlib import Path

def geo_download_from_ftp(ftp_url: str):
    import requests, os

    filename = os.path.basename(ftp_url)
    r = requests.get(ftp_url, stream=True)
    r.raise_for_status()

    with open(filename, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)

    return f"Downloaded {filename}"


In [54]:
{
  "tool_call": {
    "name": "geo_summary",
    "arguments": {
      "uid": "200289404"
    }
  }
}


{'tool_call': {'name': 'geo_summary', 'arguments': {'uid': '200289404'}}}

In [55]:
AGENT_SYSTEM_PROMPT = """
You are a bioinformatics research agent specializing in single-cell RNA-seq data curation.

You have access to TWO tools:
1. geo_search(query: str) → returns GEO Entrez UIDs
2. geo_download_from_ftp(ftp_url: str) → downloads processed GSE* processed file

Rules:
- You may call ONLY ONE tool per response.
- If you need to search GEO, output ONLY a JSON tool call.
- Do NOT hallucinate GEO accessions.
- GEO search results are Entrez UIDs, NOT GSE accessions.
- UID resolution (UID → GSE → metadata) will be provided to you as tool output.
- You may ONLY use GSE accessions explicitly provided in tool results.
- Do not mix explanations with tool calls.
- If a field is missing, say "unknown".
- Ask the user for confirmation BEFORE downloading any dataset.
- NEVER construct or guess GEO FTP URLs.
- FTP links MUST be extracted verbatim from supplementary file fields in geo_summary.
- If no supplementary FTP link exists, report "unknown".
- Download ONLY one ftp link from supplementary files.

After UID resolution, analyze each dataset and determine:
- Is this single-cell RNA-seq?
- What organism and immune context?
- Are RAW or processed files available?
- Does it match the user’s biological question?

Tool call formats (EXACT):

{
  "tool_call": {
    "name": "geo_search",
    "arguments": {
      "query": "..."
    }
  }
}

{
  "tool_call": {
    "name": "geo_download_from_ftp",
    "arguments": {
      "ftp_url": "..."
    }
  }
}
"""


In [6]:
## Loading a Qwen model
# conda activate torch_gpu_dna
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

model_id = "Qwen/Qwen2.5-7B-Instruct"
## KimiK2 thinking cannot be downloaded so we start with Qwen. Also my GPU is Tesla T4 so I will stick to Qwen-7B.

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,
    device_map="cuda"
)

  from .autonotebook import tqdm as notebook_tqdm
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.38s/it]


In [None]:
### This is just generating the like provide you the plan as you instructed the agent
def run_llm(messages, max_new_tokens=512):
    # This line converts structured chat messages into a single text prompt in the 
    # exact format the model was trained on.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # It tokenizes based on the model tokenization was done.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad(): # Since inference not training
        output_ids = model.generate( # autoregressive text generation
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.2, # randomness or creativity is low
            do_sample=False # does not matter about the randomness
        )
    return tokenizer.decode(
        output_ids[0][inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True
    ).strip()


#### Tool-call detector (the agent “router”)

In [56]:
import json

def parse_tool_call(text: str):
    try:
        data = json.loads(text)
        if "tool_call" in data:
            return data["tool_call"]
    except json.JSONDecodeError:
        return None
    return None


In [70]:
import json

def run_agent(prompt, max_steps=5):
    # This message is just the initialization where Agent System Prompt tells the agent what to do and user provided the input
    # Then in the message it keep on getting appended more messages, tools etc.
    messages = [
        {"role": "system", "content": AGENT_SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
    ]
    for step in range(max_steps):
        print(f'step:{step}')
        # This line converts structured chat messages into a single text prompt in the 
        # exact format the model was trained on.
        # every time model see the full history what has happened. 
        # This converts all previous messages into a single prompt
        # The model sees everything:
        # system rules
        # user query
        # previous tool calls\
        # previous tool results
        inputs = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        # It tokenizes based on the model tokenization was done.
        model_inputs = tokenizer(inputs, return_tensors="pt").to(model.device)
        with torch.no_grad(): # Since inference not training
            output_ids = model.generate( # autoregressive text generation
                **model_inputs,
                max_new_tokens=512,
                do_sample=False,
            )
        response = tokenizer.decode(
            output_ids[0][model_inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True
        ).strip()
        print(f"\nLLM OUTPUT:\n{response}") ## This returns only the JSON since it is instructed in the Agent system prompt when geo_search is called
        # while when geo_summary is called it will result the output in the language
        # 2. Try parsing tool call
        try:
            data = json.loads(response) 
            tool_call = data.get("tool_call") # if there is a tool call it is true
        except json.JSONDecodeError:
            # Normal text → done
            return response
        if tool_call:
            tool_name = tool_call["name"] # tool name whether geo_search or geo_summary
            args = tool_call["arguments"]
            # 3. Execute tool
            # if tool_name == "geo_search": # this is taking only one UID
            #     result = geo_search(**args)
            if tool_name == "geo_search": # this can take multiple UID and feed to geo_summary
                uids = json.loads(geo_search(**args))
                summaries = []
                for uid in uids:
                    summaries.append(json.loads(geo_summary(uid=uid)))
                result = json.dumps(summaries, indent=2)
                # print(f'geo_search result: {result}')
            # elif tool_name == "geo_summary":
            #     result = geo_summary(**args)
            elif tool_name == "geo_download_from_ftp":
                result = geo_download_from_ftp(**args)
            else:
                raise ValueError(f"Unknown tool: {tool_name}")
            print(f"\nTOOL RESULT:\n{result}")
            # 4. Feed tool result back to model
            # After every step, you append new entries so the model can reason based on what already happened.
            messages.append({
                "role": "assistant",
                "content": response
            })
            # Models trained for tool calling (Qwen, Llama, GPT-style) expect:
            # | Role        | Meaning                           |
            # | ----------- | --------------------------------- |
            # | `system`    | Rules and behavior                |
            # | `user`      | Human request                     |
            # | `assistant` | Model’s reasoning / tool decision |
            # | `tool`      | External factual input            |
            # when role: tool it knows: “This came from the real world, not my imagination.”
            messages.append({
                "role": "tool",
                "name": tool_name,
                "content": result
            })
            # So you need to append both assistant and the tool
            # Assistant	“I decided to call a tool”
            # Tool	“Here is the result of that tool”
        else:
            return response
    raise RuntimeError("Agent did not finish in time")


In [71]:
query = """
Find single-cell RNA-seq datasets related to immune cell aging.
Prefer processed data formats if possible.
"""

final_output = run_agent(query)
print("\nFINAL OUTPUT:\n", final_output)


step:0

LLM OUTPUT:
{
  "tool_call": {
    "name": "geo_search",
    "arguments": {
      "query": "immune cell aging single-cell RNA-seq processed"
    }
  }
}


StreamModeError: the XML file must be opened in binary mode.

In [None]:
# now searching in the GSE than GDS
query = (
    "single cell[All Fields] AND RNA-seq[All Fields] AND immune[All Fields]"
)

final_output = run_agent(query)
print("\nFINAL OUTPUT:\n", final_output)


step:0

LLM OUTPUT:
{
  "tool_call": {
    "name": "geo_search",
    "arguments": {
      "query": "single cell RNA-seq immune"
    }
  }
}
geo_search result: [
  {
    "uid": "200301650",
    "accession": "GSE301650",
    "title": "Single-cell RNA-seq of isolated non-parenchymal cells from imiquimod-induced psoriasis mouse model.",
    "gse": "301650",
    "type": "Expression profiling by high throughput sequencing",
    "platform": "24247",
    "n_samples": 8
  },
  {
    "uid": "200301873",
    "accession": "GSE301873",
    "title": "TGF-\u03b2RII/IL-15 Immunotherapeutic complex targets exhausted CD8+ T cell subsets in lymph nodes and tumors",
    "gse": "301873",
    "type": "Expression profiling by high throughput sequencing",
    "platform": "21103",
    "n_samples": 4
  },
  {
    "uid": "200293951",
    "accession": "GSE293951",
    "title": "Single cell RNA-seq of myeloid immune cells in melanoma-draining lymph nodes",
    "gse": "293951",
    "type": "Expression profiling by 