In [43]:
import json

import requests, pysam
import tempfile
import os

import pandas as pd
import ollama


In [44]:
def fetch_header(url, nbytes=1_000_000):
    # Range request for the first nbytes
    r = requests.get(url, headers={'Range': f'bytes=0-{nbytes}'})
    with tempfile.NamedTemporaryFile(delete=False, suffix='.bam') as tmp:
        tmp.write(r.content)
        tmp_path = tmp.name
    try:
        header = pysam.AlignmentFile(tmp_path, "rb", ignore_truncation=True, check_sq=False).header.to_dict()
    finally:
        os.remove(tmp_path)
    return header

In [45]:
def ollama_parse_header_prompt(header_dict):
    return f"""
You are a genomics metadata assistant.

Below is the SAM/BAM header as a Python dictionary. Your job is to extract and summarize the most relevant metadata:

1. Reference genome (e.g., GRCh38, T2T-CHM13, hg19)
2. Annotation version (e.g., GENCODE v40, RefSeq)
3. Alignment tool (e.g., STAR, BWA, Minimap2) and version
4. Any transcriptome or quantification settings (e.g., --quantMode TranscriptomeSAM)

Header:
{header_dict}

Please return a clean summary like:

Reference genome: ___  
Annotation version: ___  
Aligner: ___  
Quant mode: ___  
Notes: ___
"""


In [46]:
# import data and subset to demo example
findability_subset_df = pd.read_csv('findability-funk/anvil-manifest-3a7b7cb2-10be-5eb2-9c74-28f2662904ee.40bc3110-e5b2-5c64-92d2-99d0f02b28ed.tsv',sep='\t')
demo_df = findability_subset_df[~findability_subset_df['files.reference_assembly'].isna()].head(n=5)
demo_df = demo_df[['files.reference_assembly', 'files.file_name', 'files.azul_file_url']]

In [47]:

for url in demo_df['files.azul_file_url'].tolist():
    print(f"⏳ Fetching: {url}")
    hdr = fetch_header(url)
    asm = hdr.get('HD', {}).get('AS', 'unknown')

    print(f"{url.split('/')[-1]} → assembly tag: {asm}")

    prompt = ollama_parse_header_prompt(hdr)

    response = ollama.chat(model="llama3.2", messages=[
        {"role": "user", "content": prompt}
    ])
    
    print("📋 Parsed Metadata:\n", response['message']['content'], "\n" + "-"*60)


⏳ Fetching: https://service.explore.anvilproject.org/repository/files/001ae10d-fbcd-4083-a332-9a0a706bf8c2?catalog=anvil9&version=2022-06-01T00%3A00%3A00.000000Z


  header = pysam.AlignmentFile(tmp_path, "rb", ignore_truncation=True, check_sq=False).header.to_dict()


001ae10d-fbcd-4083-a332-9a0a706bf8c2?catalog=anvil9&version=2022-06-01T00%3A00%3A00.000000Z → assembly tag: unknown
📋 Parsed Metadata:
 Here is the cleaned-up summary:

**Summary**

* **Reference Genome**: hg38_gencode40
* **Annotation Version**: gencode40
* **Aligner**: STAR
* **Quant Mode**: TranscriptomeSAM
* **Notes**: STAR alignment with filter and quant mode set for transcriptome analysis. 
------------------------------------------------------------
⏳ Fetching: https://service.explore.anvilproject.org/repository/files/005f272a-64f4-46ad-8203-ad2fad4a30ad?catalog=anvil9&version=2022-06-01T00%3A00%3A00.000000Z
005f272a-64f4-46ad-8203-ad2fad4a30ad?catalog=anvil9&version=2022-06-01T00%3A00%3A00.000000Z → assembly tag: unknown
📋 Parsed Metadata:
 Here is a cleaned-up summary of the provided information:

**Summary**

* Reference genome: hg38_gencode40
* Annotation version: gencode40
* Aligner: STAR
* Quant mode: TranscriptomeSAM

Note that I extracted the relevant information from th