In [1]:
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import os

# Read settings from a local .env file, then look up the Google API key
# (None when the variable is not set).
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [10]:
from typing import List, Optional
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate

from langchain.chat_models import init_chat_model

# Chat model used by the cells below: Gemini 2.5 Flash via the google_genai
# provider, with temperature set to 0.0.
llm = init_chat_model(
    "gemini-2.5-flash",
    model_provider="google_genai",
    temperature=0.0,
)

# Canonical user input. The stated gene count matches the number of symbols
# actually listed (four), so the text handed to the extractor is not
# self-contradictory.
paragraph = (
    "I identified these four genes to be significantly more mutated than expected by chance in my cohort of human brain cancer patients: TLR4, TLR2, IL1B, CCL14."
)

# Pydantic schema for the genes and study context extracted from the user's text.
# Documented with comments rather than a class docstring: Pydantic copies a model
# docstring into the JSON schema description, which would alter the format
# instructions generated from this class.
class StudyExtraction(BaseModel):
    # Required: the gene symbols found in the text.
    genes: List[str] = Field(
        description="List of gene symbols mentioned in the text, normalized to official HGNC/NCBI-style symbols if possible."
    )
    # The context fields below default to None so a reply that omits one still
    # validates. In Pydantic v2 an Optional annotation without a default is
    # nullable but still required.
    organism: Optional[str] = Field(
        default=None,
        description="Scientific name (binomial) of the organism (e.g., 'Homo sapiens', 'Mus musculus')."
    )
    field_of_study: Optional[str] = Field(
        default=None,
        description="High-level biomedical domain, e.g., 'oncology', 'cancer genomics', 'neuroscience', 'immunology', 'microbiology'."
    )
    organ: Optional[str] = Field(
        default=None,
        description="Primary organ or tissue referenced (e.g., 'brain', 'liver', 'blood')."
    )
    analysis_type: Optional[str] = Field(
        default=None,
        description="Concise description of the analysis performed, e.g., 'differential expression', 'mutation enrichment', 'GWAS', 'copy-number analysis', 'metagenomic profiling'."
    )
    # Defaults to False when the text does not ask for a GSEA.
    GSEA: bool = Field(
        default=False,
        description="Whether the user mentions that a GSEA is needed on the gene set. If no mention, keep it False."
    )

# Parse the model's reply into a StudyExtraction instance; the same parser
# supplies the schema instructions placed in the system message.
parser = PydanticOutputParser(pydantic_object=StudyExtraction)
format_instructions = parser.get_format_instructions()

# The schema text is bound up front, leaving {paragraph} as the only runtime input.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Extract per schema:\n{format_instructions}"),
        ("human", "{paragraph}"),
    ]
).partial(format_instructions=format_instructions)

# Chain: prompt -> LLM -> parser.
parsing_llm = prompt | llm | parser

# Run the chain on the raw user text, then print the result as indented JSON.
parsed_input = parsing_llm.invoke({"paragraph": paragraph})
json_output = parsed_input.model_dump_json(indent=2)
print(json_output)

# Next step: pass this JSON to the LLM so it can decide which attributes to fetch from BioMart.


{
  "genes": [
    "TLR4",
    "TLR2",
    "IL1B",
    "CCL14"
  ],
  "organism": "Homo sapiens",
  "field_of_study": "oncology",
  "organ": "brain",
  "analysis_type": "mutation enrichment analysis",
  "GSEA": false
}


In [11]:
from src.querries_script import group_by_gene_dynamic, fill_with_ncbi, call_querry_biomart
import pandas as pd    

# Use only the first 15 entries of the "name" column as query attributes.
attribute_names = pd.read_csv("data/attributes.csv")["name"]
attributes = attribute_names.to_list()[:15]
print(len(attributes))

# Stage 1: query BioMart, filtering on the gene symbols extracted earlier.
gene_filters = {"external_gene_name": parsed_input.genes}
biomart_result = call_querry_biomart(attributes=attributes, filters=gene_filters)
print("biomart done")

# Stage 2: pass the BioMart result through group_by_gene_dynamic, then
# through fill_with_ncbi; `output` holds the final result.
grouped_result = group_by_gene_dynamic(biomart_result)
output = fill_with_ncbi(grouped_result)
print("NCBI done")

print(output)

15
biomart done
NCBI done
[{'ensembl_gene_id': 'ENSG00000125538', 'description': 'interleukin 1 beta [Source:HGNC Symbol;Acc:HGNC:5992]', 'chromosome_name': '2', 'start_position': '112829751', 'end_position': '112836816', 'strand': '-1', 'band': 'q14.1', 'external_gene_name': 'IL1B', 'transcript_count': '8', 'percentage_gene_gc_content': '45.51', 'gene_biotype': 'protein_coding', 'external_synonym': 'IL-1B', 'phenotype_description': 'GASTRIC CANCER GASTRIC CANCER INTESTINAL INCLUDED', 'name_1006': ['cellular response to mechanical stimulus', 'cellular response to xenobiotic stimulus', 'cytokine-mediated signaling pathway', 'defense response to Gram-positive bacterium', 'hyaluronan biosynthetic process'], 'namespace_1003': 'biological_process', 'go_id': ['GO:0071260', 'GO:0071466', 'GO:0019221', 'GO:0050830', 'GO:0030213'], 'go_linkage_type': ['IEP', 'IDA'], 'ncbi': 'Gene: IL1B (Homo sapiens)\n\n--- Summary ---\nThe protein encoded by this gene is a member of the interleukin 1 cytokine 