In [6]:
from pydantic import BaseModel, Field
import numpy as np
import pandas as pd

from dotenv import load_dotenv
import os

# Pull variables from the .env file into the process environment, then read
# the Google API key from it (the lookup yields None when the variable is unset).
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [13]:
from typing import List, Optional
from pydantic import BaseModel, Field
import json

from langchain.chat_models import init_chat_model

# Chat model handle: Gemini 2.5 Flash via the google_genai provider,
# with the sampling temperature pinned to 0.0.
llm = init_chat_model(
    "gemini-2.5-flash",
    temperature=0.0,
    model_provider="google_genai",
)

# Example free-text input that the extraction step is run on.
# Adjacent string literals are joined by the parser into one sentence.
paragraph = (
    "I identified these five genes to be significantly more mutated than "
    "expected by chance in my cohort of human brain cancer patients: "
    "TP53, AKT3, EGFR, ATRX and PDX1."
)

# Define Pydantic class for input genes and context
class StudyExtraction(BaseModel):
    """Genes and study context extracted from a short free-text description.

    Attributes:
        genes: Gene symbols mentioned in the text (required).
        organism: Scientific name of the organism, or None if not stated.
        field_of_study: High-level biomedical domain, or None if not stated.
        organ: Primary organ or tissue, or None if not stated.
        analysis_type: Short label for the analysis, or None if not stated.
    """

    # NOTE: pydantic copies this docstring and each Field description into the
    # JSON schema generated for the model, so they double as schema documentation.

    genes: List[str] = Field(
        description="List of gene symbols mentioned in the text, normalized to official HGNC/NCBI-style symbols if possible."
    )

    # In pydantic v2 an Optional[...] annotation without a default is still a
    # *required* key. default=None is therefore set explicitly on every context
    # field so that a response omitting one validates instead of raising.
    organism: Optional[str] = Field(
        default=None,
        description="Scientific name (binomial) of the organism (e.g., 'Homo sapiens', 'Mus musculus')."
    )
    field_of_study: Optional[str] = Field(
        default=None,
        description="High-level biomedical domain, e.g., 'oncology', 'cancer genomics', 'neuroscience', 'immunology', 'microbiology'."
    )
    organ: Optional[str] = Field(
        default=None,
        description="Primary organ or tissue referenced (e.g., 'brain', 'liver', 'blood')."
    )
    analysis_type: Optional[str] = Field(
        default=None,
        description="Concise description of the analysis performed, e.g., 'differential expression', 'mutation enrichment', 'GWAS', 'copy-number analysis', 'metagenomic profiling'."
    )

# Prompt: a fixed instruction followed by the user's paragraph.
# The schema is not embedded in this text -- it is attached to the request by
# with_structured_output -- so the instruction refers to "the provided schema"
# rather than to a schema "below" that never appears in the prompt.
parsing_prompt = (
    "You are an experienced bioinformatician. Your job is to extract structured information from short user input text and return only a JSON object conforming to the provided schema. Do not include any explanations or commentary.\n\n"
    + paragraph
)

# Wrap the chat model so its reply is returned as a StudyExtraction instance,
# then run the extraction on the assembled prompt.
structured_llm = llm.with_structured_output(StudyExtraction)
user_output = structured_llm.invoke(parsing_prompt)

# Serialise the result to a JSON string (2-space indent) and show it.
json_output = user_output.model_dump_json(indent=2)
print(json_output)


{
  "genes": [
    "TP53",
    "AKT3",
    "EGFR",
    "ATRX",
    "PDX1"
  ],
  "organism": "Homo sapiens",
  "field_of_study": "oncology",
  "organ": "brain",
  "analysis_type": "mutation enrichment"
}
