In [1]:
from pydantic import BaseModel, Field
from dotenv import load_dotenv
import os

# Pull settings from a local .env file into the process environment, then
# read the Google API key from it (None when the variable is not set).
# NOTE(review): GOOGLE_API_KEY is not referenced again in the visible cells;
# presumably the google_genai provider reads the key from the environment
# itself — confirm.
load_dotenv()
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

In [2]:
from typing import List, Optional
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate

from langchain.chat_models import init_chat_model

# Chat model shared by the chains below: Gemini 2.5 Flash through the
# google_genai provider, with temperature 0.0.
llm = init_chat_model(
    "gemini-2.5-flash",
    model_provider="google_genai",
    temperature=0.0,
)

# Canonical user input fed to the extraction chain.
# NOTE(review): the sentence announces "five genes" but lists only four.
paragraph = "I identified these five genes to be significantly more mutated than expected by chance in my cohort of human brain cancer patients: CCLX, TLR4, TLR2, IL1B. Do a GSEA"

# Pydantic schema for the structured facts extracted from the user's free-text
# request (input genes plus study context).
#
# NOTE: this class is documented with comments rather than a docstring on
# purpose. Pydantic copies a model docstring into the JSON schema
# "description", and the JSON schema of this class is what the output parser
# embeds in the LLM prompt, so a docstring would change the prompt.
class StudyExtraction(BaseModel):
    # Required: the extraction is meaningless without at least the gene list.
    genes: List[str] = Field(
        description="List of gene symbols mentioned in the text, normalized to official HGNC/NCBI-style symbols if possible."
    )

    # The context fields below default to None. Under Pydantic v2 an
    # Optional[...] annotation only allows the value None; without an explicit
    # default the key is still *required*, so a model reply that simply omits
    # an unknown field would fail validation.
    organism: Optional[str] = Field(
        default=None,
        description="Scientific name (binomial) of the organism (e.g., 'Homo sapiens', 'Mus musculus')."
    )
    field_of_study: Optional[str] = Field(
        default=None,
        description="High-level biomedical domain, e.g., 'oncology', 'cancer genomics', 'neuroscience', 'immunology', 'microbiology'."
    )
    organ: Optional[str] = Field(
        default=None,
        description="Primary organ or tissue referenced (e.g., 'brain', 'liver', 'blood')."
    )
    analysis_type: Optional[str] = Field(
        default=None,
        description="Concise description of the analysis performed, e.g., 'differential expression', 'mutation enrichment', 'GWAS', 'copy-number analysis', 'metagenomic profiling'."
    )

    # Flag read by later cells to decide whether to run the enrichment query.
    GSEA: bool = Field(default=False, 
                       description="Whether the user mentions that a GSEA is needed on the gene set. If no mention, keep it False.")

# Alternative implementation: have the reply parsed into a StudyExtraction
# instance, which is more robust than handling raw text.
parser = PydanticOutputParser(pydantic_object=StudyExtraction)
format_instructions = parser.get_format_instructions()

# The system message carries the schema instructions; the human message is
# the raw user text.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "Extract per schema:\n{format_instructions}"),
        ("human", "{paragraph}"),
    ]
)
# Bind the instructions up front so that invoking the chain only needs
# `paragraph`.
prompt = prompt.partial(format_instructions=format_instructions)
parsing_llm = prompt | llm | parser

# Run the extraction on the raw user input "paragraph" and show the result
# as an indented JSON string.
parsed_input = parsing_llm.invoke({"paragraph": paragraph})
json_output = parsed_input.model_dump_json(indent=2)
print(json_output)

# Next step: inject the JSON into the LLM to determine which attributes to
# fetch from BioMart.


{
  "genes": [
    "CCLX",
    "TLR4",
    "TLR2",
    "IL1B"
  ],
  "organism": "Homo sapiens",
  "field_of_study": "oncology",
  "organ": "brain",
  "analysis_type": "mutation enrichment",
  "GSEA": true
}


In [3]:
from src.querries_script import group_by_gene_dynamic, fill_with_ncbi, call_querry_biomart
from src.print_gene import format_genes
import pandas as pd    

# Attribute names come from the "name" column of the attributes CSV.
attributes = pd.read_csv("data/attributes.csv")["name"].tolist()

# Query BioMart with the first 15 attributes, filtered on the extracted gene
# symbols, then pass the result through the grouping and NCBI helpers.
# NOTE(review): judging by their names, group_by_gene_dynamic groups the rows
# per gene and fill_with_ncbi adds NCBI data — confirm in src/querries_script.
output = fill_with_ncbi(
    group_by_gene_dynamic(
        call_querry_biomart(
            attributes=attributes[:15],
            filters={"external_gene_name": parsed_input.genes},
        )
    )
)

print(format_genes(output))

### IL1B
```
ensembl_gene_id: ENSG00000125538
description: interleukin 1 beta [Source:HGNC Symbol;Acc:HGNC:5992]
chromosome_name: 2
start_position: 112829751
end_position: 112836816
strand: -1
band: q14.1
external_gene_name: IL1B
transcript_count: 8
percentage_gene_gc_content: 45.51
gene_biotype: protein_coding
external_synonym: IL-1B
phenotype_description: GASTRIC CANCER GASTRIC CANCER INTESTINAL INCLUDED
name_1006: [cellular response to mechanical stimulus, cellular response to xenobiotic stimulus, cytokine-mediated signaling pathway, defense response to Gram-positive bacterium, hyaluronan biosynthetic process]
namespace_1003: biological_process
go_id: [GO:0071260, GO:0071466, GO:0019221, GO:0050830, GO:0030213]
go_linkage_type: [IEP, IDA]
ncbi: Gene: IL1B (Homo sapiens)

--- Summary ---
The protein encoded by this gene is a member of the interleukin 1 cytokine family. This cytokine is produced by activated macrophages as a proprotein, which is proteolytically processed to its active

In [4]:
from src.tools import enrichr_query
# Always bind gsea_string so that later cells can read it even when no
# enrichment text was produced: GSEA not requested, the Enrichr query
# returned nothing, or no term passed the significance cut-off.
gsea_string = ""

if parsed_input.GSEA:
    tool_results = enrichr_query(parsed_input.genes)

    if not tool_results.empty:
        # Keep only the rows where Adjusted P-value < 0.05.
        filtered_df = tool_results[tool_results["Adjusted P-value"] < 0.05]

        # Drop the columns that are not forwarded to the LLM.
        filtered_df = filtered_df.drop(columns=["Gene_set", "Old P-value", "Old Adjusted P-value"])

        # Only build the text when significant rows remain; otherwise
        # to_string() would put an "Empty DataFrame ..." dump in the prompt.
        if not filtered_df.empty:
            # Cap the table at the top 20 rows to keep the prompt readable.
            gsea_string = "the results of gene set enrichment are:\n " + filtered_df.head(20).to_string(index=False)
            # Uncomment to inspect the table text:
            # print(gsea_string)

In [5]:
# Retrieve gene popularity for the extracted gene symbols.
# _format_popularity_block is a private helper (leading underscore) of
# src.gene_lookup; it is called with the path of a TSV file and the gene list.
# NOTE(review): judging by the file name and the prompt wording used later
# ("frequency of citation"), the TSV presumably holds per-gene citation
# counts — confirm against src/gene_lookup.
from src.gene_lookup import _format_popularity_block
file = "data/all_gene_counts.tsv"
popularity_block = _format_popularity_block(file, parsed_input.genes)
# Debug aid: uncomment to inspect the block before it is sent to the LLM.
#print(popularity_block)

In [6]:
from langchain_core.output_parsers import StrOutputParser

# Stringify the gene annotations so they can be interpolated into the prompt.
text = str(output)

# System message: role and task description for the interpretation step.
task ='''
<task>
You are a helpful and biological expert specializing in integrating and interpreting gene-related data from the given information report, combined with the knowledge you have obtained during your training.
When given a report on a set of genes, briefly summarize the key insights for each gene. The final and most important job is to concisely contextualise the user's findings with biological or biomedical background knowledge and find commonalities between the genes given. Stay scientifically accurate. Tailor the response to the context given by the client.
Think hard!
</task>
'''

# User message template; every {placeholder} is filled from prompt_inputs.
user_prompt = "I want to know more about these genes {input_g}, with respective popularities (frequency of citation) {popularity_block}. Report these popularities. They were obtained after {analyses} in {organism}. All information I know about these genes is the following: {text} \n Do you find any commonalities or interesting findings about these genes? I'm mainly interested in {context}"

prompt_inputs = {
    "text": text,
    "input_g": parsed_input.genes,
    "context": parsed_input.field_of_study,
    "analyses": parsed_input.analysis_type,
    "organism": parsed_input.organism,
    "popularity_block": popularity_block,
}

if parsed_input.GSEA:
    # gsea_string may be unset if the enrichment step produced no results,
    # so look it up defensively instead of risking a NameError.
    gsea_results = globals().get("gsea_string", "")
    if gsea_results:
        # Pass the table as its own template variable instead of splicing it
        # into the template text: any "{" or "}" in the data would otherwise
        # be parsed as a placeholder. The blank line keeps it from being
        # glued onto the {context} value.
        user_prompt += "\n\n{gsea_results}"
        prompt_inputs["gsea_results"] = gsea_results

prompt = ChatPromptTemplate.from_messages([
    ("system", task),
    ("user", user_prompt)
])
chain = prompt | llm | StrOutputParser()
response = chain.invoke(prompt_inputs)
print(response)

Here's a summary of your genes, focusing on their biological roles, commonalities, and relevance to oncology, especially in light of your mutation enrichment and gene set enrichment results.

First, regarding **CCLX**, no information was provided in the dataset for this gene. It is not a standard HGNC symbol, and without further data, I cannot offer insights. It's possible it was a placeholder or a typo.

Now, let's delve into the other genes:

### Gene Summaries

1.  **IL1B (Interleukin 1 beta)**
    *   **Popularity:** 0.310 (highest among the provided genes).
    *   **Key Insights:** IL1B is a potent pro-inflammatory cytokine, primarily produced by activated macrophages. It plays a central role in the inflammatory response, influencing cell proliferation, differentiation, and apoptosis. It is processed by caspase 1 and is known to induce cyclooxygenase-2 (COX-2), contributing to inflammatory pain. Its dysregulation is implicated in various inflammatory conditions, including osteoar

Based on the provided information and your interest in oncology, here's a summary of the genes and their commonalities:

First, please note that **information for 'CCLX' was not provided** in the dataset. The analysis below focuses on IL1B, TLR4, and TLR2.

### Gene-Specific Insights:

1.  **IL1B (Interleukin 1 beta)**
    *   **Function:** A key pro-inflammatory cytokine produced by activated macrophages. It mediates inflammatory responses and is involved in cell proliferation, differentiation, and apoptosis. It induces cyclooxygenase-2 (COX2), contributing to inflammatory pain.
    *   **Oncology Relevance:** Directly implicated in **Gastric Cancer**. Its role in promoting inflammation, cell proliferation, and influencing apoptosis makes it a significant player in tumor development and progression. Elevated IL-1B levels are also seen in severe inflammatory conditions like COVID-19, highlighting its potent inflammatory capacity.

2.  **TLR4 (Toll-like Receptor 4)**
    *   **Function:** A crucial component of the innate immune system, recognizing pathogen-associated molecular patterns (PAMPs), particularly lipopolysaccharide (LPS) from Gram-negative bacteria. Its activation leads to the production of cytokines essential for effective immunity.
    *   **Oncology Relevance:** While not explicitly listed with a cancer phenotype in your data, TLR4's role in initiating inflammatory responses and activating downstream signaling pathways (like NF-kappaB) is highly relevant to cancer. Chronic inflammation, often triggered by TLR activation, is a known driver of tumorigenesis and progression.

3.  **TLR2 (Toll-like Receptor 2)**
    *   **Function:** Another vital Toll-like receptor that forms heterodimers with other TLRs to recognize PAMPs, such as bacterial lipopeptides and lipoteichoic acid. Its activation modulates the host's inflammatory response and can promote apoptosis in response to bacterial components.
    *   **Oncology Relevance:** Directly associated with **Colorectal Cancer**. Similar to TLR4, its involvement in innate immunity, inflammation, and apoptosis positions it as a critical factor in the tumor microenvironment, influencing cancer cell survival, proliferation, and immune evasion.

### Commonalities and Biological Context in Oncology:

The gene set enrichment analysis strongly highlights the common biological pathways shared by IL1B, TLR4, and TLR2, all of which are highly pertinent to oncology:

1.  **Central Role in Innate Immunity and Inflammation:**
    *   All three genes are fundamental components of the **innate immune system** and are potent drivers of the **inflammatory response**. IL1B is a pro-inflammatory cytokine, while TLR4 and TLR2 are pattern recognition receptors that initiate inflammatory cascades upon sensing microbial components.
    *   The enrichment terms like "positive regulation of inflammatory response" (GO:0050729, GO:0006954) and "cellular response to molecule of bacterial origin" (GO:0071219) underscore this shared function.

2.  **Convergence on NF-kappaB Signaling:**
    *   A striking commonality is their strong involvement in the **NF-kappaB signaling pathway**. Terms like "positive regulation of NIK/NF-kappaB signaling" (GO:1901224), "regulation of NIK/NF-kappaB signaling" (GO:1901222), "I-kappaB phosphorylation" (GO:0007252), and "positive regulation of NF-kappaB transcription factor activity" (GO:0051092) are highly enriched and involve all three genes (IL1B, TLR4, TLR2).
    *   **Oncology Significance:** NF-kappaB is a master regulator of genes involved in inflammation, cell proliferation, survival, and angiogenesis. Chronic activation of NF-kappaB is a hallmark of many cancers, promoting tumor growth, metastasis, and resistance to therapy. Mutations leading to its aberrant activation in these genes could directly contribute to cancer development.

3.  **Promotion of Pro-Tumorigenic Cytokine Production:**
    *   The genes are commonly involved in the "positive regulation of chemokine production" (GO:0032722), "positive regulation of interleukin-8 production" (GO:0032757), and "positive regulation of interleukin-6 production" (GO:0032755).
    *   **Oncology Significance:** IL-6 and IL-8 are well-known pro-inflammatory and pro-tumorigenic cytokines. They create a favorable microenvironment for cancer cells by promoting proliferation, survival, angiogenesis, and immune evasion. IL-1B itself is a key cytokine in this network.

4.  **Response to Microbial Stimuli and Potential for Chronic Inflammation:**
    *   TLR4 and TLR2 are critical for recognizing bacterial components (LPS, lipopeptides). The enrichment for "lipopolysaccharide-mediated signaling pathway" (GO:0031663) further emphasizes this.
    *   **Oncology Significance:** The tumor microenvironment can be influenced by the microbiome. Chronic exposure to bacterial products, especially in tissues like the gut (relevant for Colorectal Cancer) or stomach (Gastric Cancer), can lead to persistent TLR activation, driving chronic inflammation and NF-kappaB signaling, thereby fueling cancer progression.

### Overall Conclusion for Oncology:

The "mutation enrichment" of IL1B, TLR4, and TLR2, combined with their shared biological functions, strongly suggests that **dysregulation of innate immunity and chronic inflammation is a critical underlying mechanism** in the context of your findings.

Mutations in these genes could lead to:
*   **Aberrant activation of the NF-kappaB pathway**, promoting cancer cell survival, proliferation, and metastasis.
*   **Excessive production of pro-inflammatory cytokines** (e.g., IL-6, IL-8), fostering a pro-tumorigenic microenvironment.
*   **Altered responses to microbial stimuli**, potentially contributing to inflammation-driven cancers.

These genes represent key nodes in the intricate interplay between inflammation and cancer, where their dysregulation can significantly contribute to oncogenesis and tumor progression.