In [1]:
import re
import json
import pandas as pd

import ollama
from ollama import chat
# Notebook display config: passing None removes the cap on how many DataFrame
# columns are rendered, so wide manifest frames are shown without elided columns.
pd.set_option('display.max_columns', None)

In [2]:
def _parse_model_json(text):
    """Decode the JSON payload in ``text``, tolerating prose around it.

    Tries a strict ``json.loads`` first. If that fails, retries from the first
    ``[`` or ``{`` in the text and ignores anything after the decoded value.
    Raises ``json.JSONDecodeError`` (from the strict attempt) if no payload
    can be decoded.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError as strict_error:
        opener = re.search(r"[\[{]", text)
        if opener is None:
            raise
        try:
            # raw_decode stops at the end of the first complete JSON value,
            # so trailing chatter after the payload is ignored.
            payload, _ = json.JSONDecoder().raw_decode(text, opener.start())
        except json.JSONDecodeError:
            # Only the first opener is tried: salvaging later fragments could
            # silently return a partial/truncated result.
            raise strict_error from None
        return payload


def ollama_nhgri_anvil_agent(findability_df, model='llama3.2', options=None):
    """Ask an Ollama chat model to annotate files with inferred metadata.

    Builds a prompt that embeds the input records as JSON, sends it to the
    model through ``chat`` and parses the reply as JSON.

    Parameters
    ----------
    findability_df : list of dict, or DataFrame-like
        Records to annotate (the notebook passes dicts with the keys
        ``datasets.title`` and ``files.file_name``). An object exposing
        ``to_dict`` and ``columns`` (e.g. a pandas DataFrame) is converted
        with ``to_dict(orient="records")``; anything else is passed to
        ``json.dumps`` unchanged.
    model : str, optional
        Ollama model name. Defaults to ``'llama3.2'``.
    options : dict, optional
        Overrides merged on top of the default generation options
        (``temperature=0.3``, ``top_p=0.9``, ``num_ctx=4096``).

    Returns
    -------
    list or None
        The parsed JSON list (a single JSON object is wrapped in a
        one-element list). ``None`` if the reply could not be parsed as JSON
        or was not a list/object; the failure and raw reply are printed.

    Side effects
    ------------
    Prints every returned item as indented JSON.
    """
    # Accept a DataFrame directly: json.dumps cannot serialise one.
    if hasattr(findability_df, "to_dict") and hasattr(findability_df, "columns"):
        records = findability_df.to_dict(orient="records")
    else:
        records = findability_df

    # NOTE(review): the task statement asks for the reference genome, while the
    # determination/confidence/justification fields are named *_data_modality.
    # The prompt is left unchanged here because downstream consumers may rely
    # on these exact keys -- confirm the intended schema.
    prompt = f"""
You are an expert NHGRI AnVIL genomics metadata agent.

Your job is to infer the **reference genome** used for each file based on the provided metadata fields:
- files.file_name
- datasets.title

Return a JSON list. Each object must follow this format:

{{
  "files.file_name": <string>,
  "files.data_modality": <string>, //e.g. genomic, transcriptomic 
  "files.reference_genome": <string>,           // e.g. GRCh38, T2T-CHM13, hg19, or "unknown"
  "files.determination_data_modality": "logic" | "llm",       // Use "logic" if obvious from name/title, otherwise "llm"
  "files.confidence_data_modality": <float>,                  // Between 0 and 1
  "files.justification_data_modality": <string>
}}

Here is the metadata input:
{json.dumps(records, indent=2)}

Respond ONLY with the JSON list. Do NOT include explanations, markdown, or extra text.
"""

    # Default generation options; caller-supplied values take precedence.
    chat_options = {
        "temperature": 0.3,
        "top_p": 0.9,
        "num_ctx": 4096
    }
    if options:
        chat_options.update(options)

    # Call Ollama
    response = chat(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        options=chat_options
    )

    # Strip markdown code fences that the model may add despite the instructions
    raw_output = response['message']['content'].strip()
    clean_output = re.sub(r"^```(?:json)?\s*|\s*```$", "", raw_output, flags=re.MULTILINE)

    try:
        metadata = _parse_model_json(clean_output)
    except json.JSONDecodeError as e:
        print("❌ Failed to parse JSON:", e)
        print("🔴 Raw response:\n", repr(clean_output))
        return None

    # A lone object is still a usable answer: normalise it to the list shape
    # callers expect instead of iterating over its keys.
    if isinstance(metadata, dict):
        metadata = [metadata]
    if not isinstance(metadata, list):
        print("❌ Failed to parse JSON:", f"expected a JSON list, got {type(metadata).__name__}")
        print("🔴 Raw response:\n", repr(clean_output))
        return None

    for item in metadata:
        print(json.dumps(item, indent=2))
    return metadata  # returned for downstream use


In [3]:
# Load the AnVIL manifest (tab-separated) into a DataFrame.
findability_subset_df = pd.read_csv(
    'findability-funk/anvil-manifest-3a7b7cb2-10be-5eb2-9c74-28f2662904ee.40bc3110-e5b2-5c64-92d2-99d0f02b28ed.tsv',
    sep='\t',
)

# Demo subset: the two columns the agent prompt uses, first 10 rows only,
# converted to a list of {column: value} records (not a DataFrame, despite the name).
findability_df = (
    findability_subset_df[['datasets.title', 'files.file_name']]
    .head(10)
    .to_dict(orient='records')
)

In [4]:
%%time
# Run the agent on the demo records. The result is the parsed JSON returned by
# the model, or None when the reply could not be parsed; each parsed item is
# also printed by the function itself.
ollama_nhgri_anvil_agent_response = ollama_nhgri_anvil_agent(findability_df)

{
  "files.file_name": "m84091_230905_192642_s1.fail_reads.bc1016.bam",
  "files.data_modality": "genomic",
  "files.reference_genome": "GRCh38",
  "files.determination_data_modality": "logic",
  "files.confidence_data_modality": 0.999,
  "files.justification_data_modality": "logic"
}
{
  "files.file_name": "ZNF800-BGRSDLV04-85_Aligned.sortedByCoord.out.bam",
  "files.data_modality": "genomic",
  "files.reference_genome": "T2T-CHM13",
  "files.determination_data_modality": "logic",
  "files.confidence_data_modality": 0.999,
  "files.justification_data_modality": "logic"
}
{
  "files.file_name": "m84081_231119_015049_s1.fail_reads.bc2083.bam",
  "files.data_modality": "genomic",
  "files.reference_genome": "GRCh38",
  "files.determination_data_modality": "logic",
  "files.confidence_data_modality": 0.999,
  "files.justification_data_modality": "logic"
}
{
  "files.file_name": "NA18906.f1_assembly_v2_genbank.hap1.bam",
  "files.data_modality": "genomic",
  "files.reference_genome": "GRCh

In [10]:
# Persist the agent output (a list of dicts, one per file name) to AnV.json.
# ollama_nhgri_anvil_agent returns None when the model reply could not be
# parsed; fail loudly here instead of silently writing `null` to disk.
if ollama_nhgri_anvil_agent_response is None:
    raise ValueError(
        "ollama_nhgri_anvil_agent returned no metadata; nothing to write to AnV.json"
    )

with open("AnV.json", "w", encoding="utf-8") as f:
    json.dump(ollama_nhgri_anvil_agent_response, f, indent=2)

# Jupyter only echoes a cell's final expression, so the preview goes last.
ollama_nhgri_anvil_agent_response

[{'files.file_name': 'm84091_230905_192642_s1.fail_reads.bc1016.bam',
  'files.data_modality': 'genomic',
  'files.reference_genome': 'GRCh38',
  'files.determination_data_modality': 'logic',
  'files.confidence_data_modality': 0.999,
  'files.justification_data_modality': 'logic'},
 {'files.file_name': 'ZNF800-BGRSDLV04-85_Aligned.sortedByCoord.out.bam',
  'files.data_modality': 'genomic',
  'files.reference_genome': 'T2T-CHM13',
  'files.determination_data_modality': 'logic',
  'files.confidence_data_modality': 0.999,
  'files.justification_data_modality': 'logic'},
 {'files.file_name': 'm84081_231119_015049_s1.fail_reads.bc2083.bam',
  'files.data_modality': 'genomic',
  'files.reference_genome': 'GRCh38',
  'files.determination_data_modality': 'logic',
  'files.confidence_data_modality': 0.999,
  'files.justification_data_modality': 'logic'},
 {'files.file_name': 'NA18906.f1_assembly_v2_genbank.hap1.bam',
  'files.data_modality': 'genomic',
  'files.reference_genome': 'GRCh38',
  '