In [5]:
from ollama import chat
from ollama import ChatResponse
import json
import pandas as pd

In [22]:
findability_subset_df = pd.read_csv('findability-funk/anvil-manifest-3a7b7cb2-10be-5eb2-9c74-28f2662904ee.40bc3110-e5b2-5c64-92d2-99d0f02b28ed.tsv',sep='\t')

filenames = findability_subset_df[~findability_subset_df['files.reference_assembly'].isna()]['files.file_name'].tolist()[:5]
# Format filenames into a prompt
file_list_prompt = "\n".join(f'- {name}' for name in filenames)

prompt = f"""
You are a metadata inference agent. You are given a list of filenames corresponding to genomic data files.

Your job is to infer and output the following metadata per filename:
1. Data modality (e.g., genomics, transcriptomics)
2. Reference genome (e.g., GRCh38, T2T-CHM13, mm10)
3. Technology tag (e.g., Illumina, PacBio, Nanopore)

Respond ONLY in valid JSON array format. Do not return Python code, do not explain. No text, no markdown, no code blocks.

Filenames:
{file_list_prompt}
"""

# Call Ollama with parameters to match your original model
response = chat(
    model='llama3.2',
    messages=[{"role": "user", "content": prompt}],
    options={
        "temperature": 0.3,
        "top_p": 0.9,
        "num_ctx": 4096
    }
)

try:
    metadata = json.loads(response['message']['content'])
    for item in metadata:
        print(json.dumps(item, indent=2))
except json.JSONDecodeError as e:
    print("Failed to parse JSON:", e)
    print("Raw response:\n", repr(response['message']['content']))

{
  "data_modality": "genomics",
  "reference_genome": "GRCh38",
  "technology_tag": "Illumina"
}
{
  "data_modality": "genomics",
  "reference_genome": "T2T-CHM13",
  "technology_tag": "Illumina"
}
{
  "data_modality": "genomics",
  "reference_genome": "mm10",
  "technology_tag": "PacBio"
}
{
  "data_modality": "genomics",
  "reference_genome": "GRCh38",
  "technology_tag": "Nanopore"
}
{
  "data_modality": "genomics",
  "reference_genome": "T2T-CHM13",
  "technology_tag": "PacBio"
}
