In this advanced tutorial, we show how to orchestrate a multi-agent, multi-step workflow. The workflow does the following:
1. Runs RNA-seq differential expression analysis (DEA) across multiple parallel Finch runs (the number is set by `PARALLEL_DEA`; 5 in the configuration below).
2. Runs a single meta-analysis (consensus) Finch run using all the outputs of step 1.
3. Runs 10 parallel Crow runs, one for each of the top 10 differentially expressed genes from step 2.
4. Uses Finch to create a volcano plot incorporating the results from steps 2 and 3.

In [1]:
import json
import os

import pandas as pd
from futurehouse_client import JobNames

from fhda.tortoise import Tortoise, Step, StepConfig

In [2]:
# Define our parameters
# TREATMENT, MECHANISM and CONTEXT fill the {treatment}, {mechanism} and
# {context} placeholders of the DEA and PQA prompt templates.
TREATMENT = "dexamethasone"
MECHANISM = "airway smooth muscle cells"
CONTEXT = "asthma"
# Number of top consensus genes; also the number of parallel Crow tasks.
N_TOP_GENES = 10
# Number of parallel Finch runs for the differential expression step.
PARALLEL_DEA = 5
# The API key is read from the environment instead of being written into the
# notebook: a key committed to a shared file is visible to every reader and
# must be treated as compromised (revoke and rotate it).
# Set it before starting Jupyter, e.g. `export FH_API_KEY=...`.
FH_API_KEY = os.environ.get("FH_API_KEY", "")

# Define the prompts

# Template for the DEA step. The {treatment}, {mechanism} and {context}
# placeholders are left unfilled here (plain string, not an f-string); the
# values are supplied separately through the step's `prompt_args`.
DEA_PROMPT = """
Determine the effect of {treatment} on {mechanism} in {context}. 

Perform differential expression analysis and pathway analysis on relevant comparison groups. Map all gene IDs to gene symbols using annotation package such as 'org.Hs.eg.db'.

Generate volcano plots and heatmap of differentially expressed genes, and dot plots for enriched pathways, use gene symbols for labels where relevant.

Output a single csv file named "dea_results.csv"  with the results for all tested genes of the most relevant contrast, report both gene ID and gene symbol.

If there is an error, keep trying, do not give up until you reach the end of the analysis. When mapping gene ID to gene symbol, consider all possible forms of gene IDs, keep trying until the gene symbols are obtained.
"""

# Prompt for the consensus step. This is an f-string: N_TOP_GENES is
# interpolated when this cell runs, so the requested file name
# ('top{N_TOP_GENES}_genes.csv') matches the name used in the consensus
# step's `output_files`.
CONSENSUS_PROMPT = f"""
Combine these differential expression analysis results by calculating the mode of log2FC and adjusted p values. Output the results in a file named 'consensus_results.csv', include the columns gene_symbol, log2FC and adjusted P values. In a separate file named 'top{N_TOP_GENES}_genes.csv', output the gene symbols of the consensus most significant genes with the column name "gene_symbol". 

Create a stacked bar plot showing gene regulation consistency across all analyses. Plot regulation direction (up vs down) on x-axis and percentage of genes in each category on y-axis. Color-code by significance category: all analyses, >50% of analyses and  <50% of analyses. Include percentages within each segment and a clear legend. Exclude genes that are non-significant across all analyses.
"""

# Template for the per-gene literature query. Plain string with {gene},
# {treatment}, {mechanism} and {context} placeholders; `pqa_prompt_generator`
# pairs it with one argument dict per gene. The requested JSON keys
# ("gene_symbol", "association_evidence_score", ...) become the columns of
# pqa_results.csv written by `pqa_post_process`.
# NOTE(review): the example JSON is written without curly braces, presumably
# because the template is later filled with str.format-style substitution,
# where literal braces would need escaping -- confirm against Tortoise.
PQA_PROMPT = """
What are the possible mechanisms for {gene} in the effect of {treatment} on {mechanism} in {context}?
From 1 to 5, with 1 being no evidence of association at all and 5 being strong association with supporting evidence, how strong is the evidence supporting this mechanism?
Give a concise summary for the evidence in up to 10 words, and a short summary of mechanisms in up to 20 words. Do not include references or links.
Please share this information in json format in the form of: `"gene_symbol": <gene_symbol>, "association_evidence_score":[1...5], "evidence_summary": <evidence_summary>, "mechanism_summary": <mechanism_summary>`.
Share nothing else but the JSON output.
"""

# Prompt for the final plotting step. f-string: N_TOP_GENES is interpolated
# when this cell runs. It refers to 'pqa_results.csv', the file name under
# which the PQA results are passed to the volcano step.
VOLCANO_PROMPT = f"""
Make an interactive volcano plot. Colour-code by significance categories: top up-regulated genes, up-regulated genes, top down-regulated genes, down-regulated genes, and non-significant genes. Genes considered as top have extra annotation available in 'pqa_results.csv'.

Include hover information according to the categories, for the top genes, on hover, show gene symbol, log2FC, adjusted p value, mechanism, evidence and evidence score. For up and down regulated genes that are not in top {N_TOP_GENES}, show gene symbol, log2FC and adjusted p value. For non-significant genes, do not include hover information.

For the annotations, remove all text in the brackets in the summary columns, and remove the fullstop at the end. For annotations with 6 words or more in a line, use text-wrap. Don't include text on the plot itself. Include a legend explaining the color-codes.

PLEASE USE TEXT WRAP FOR THE HOVER INFORMATION!
"""

# Base directory used below to build the paths of files passed between steps
# and handed to `run_pipeline`.
OUTPUT_DIR = "output"

# Pipeline object that every step below is registered on via `add_step`.
tortoise = Tortoise(api_key=FH_API_KEY)

In [4]:
# Step 1: Differential Expression Analysis (DEA)
# Values for the {treatment}/{mechanism}/{context} placeholders of DEA_PROMPT.
dea_prompt_args = {
    "treatment": TREATMENT,
    "mechanism": MECHANISM,
    "context": CONTEXT,
}
# NOTE(review): presumably maps a local path to the file name the job sees --
# confirm against the Step documentation.
dea_input_files = {
    "datasets/GSE52778_All_Sample_FPKM_Matrix.txt.gz": "GSE52778_series_matrix.txt.gz"
}
# The consensus step later reads the "dea_results" folder named here.
dea_output_files = {"dea_results.csv": "dea_results/dea_results.csv"}
dea_config = StepConfig(language="R", max_steps=30, timeout=15 * 60)

dea_step = Step(
    name=JobNames.FINCH,
    prompt_template=DEA_PROMPT,
    cot_prompt=True,
    prompt_args=dea_prompt_args,
    input_files=dea_input_files,
    output_files=dea_output_files,
    parallel=PARALLEL_DEA,
    config=dea_config,
)
tortoise.add_step(dea_step)






In [5]:
# Step 2: Consensus Analysis
# Folder holding the DEA results, located through the DEA step's id.
dea_results_dir = f"{OUTPUT_DIR}/{dea_step.step_id}/dea_results"
# Same file name the consensus prompt asks the agent to write.
top_genes_csv = f"top{N_TOP_GENES}_genes.csv"

consensus_step = Step(
    name=JobNames.FINCH,
    prompt_template=CONSENSUS_PROMPT,
    cot_prompt=True,
    input_files={dea_results_dir: "dea_results/"},
    output_files={
        "consensus_results.csv": "consensus_results.csv",
        top_genes_csv: top_genes_csv,
    },
    config=StepConfig(language="R", max_steps=30, timeout=15 * 60),
)
tortoise.add_step(consensus_step)

In [5]:
# Step 3: Literature Search with PaperQA
def _parse_pqa_answer(raw_answer):
    """Decode one PQA answer string into a single JSON object.

    Accepts a bare JSON object or a JSON list whose first element is the
    object. Either form may be wrapped in a Markdown code fence
    (``` or ```json), which is stripped before decoding.

    Args:
        raw_answer: The answer text returned by one task.

    Returns:
        The decoded answer as a dict.

    Raises:
        ValueError: If the text is not valid JSON (json.JSONDecodeError is a
            ValueError), the list is empty, or the payload is not an object.
        AttributeError: If raw_answer is not a string (e.g. None).
    """
    text = raw_answer.strip()
    if text.startswith("```"):
        # Drop the opening fence line and, if present, the closing fence.
        head, newline, body = text.partition("\n")
        text = (body if newline else head[3:]).rstrip()
        if text.endswith("```"):
            text = text[:-3]

    answer = json.loads(text)
    if isinstance(answer, list):
        if not answer:
            raise ValueError("answer is an empty JSON list")
        answer = answer[0]
    if not isinstance(answer, dict):
        # A scalar would not map onto the columns of the results table.
        raise ValueError(f"expected a JSON object, got {type(answer).__name__}")
    return answer


def pqa_post_process(results, output_dir):
    """Collect the JSON answers of all PQA tasks into pqa_results.csv.

    Answers that cannot be decoded are reported with print() and skipped, so
    one malformed answer does not discard the others.

    Args:
        results: Mapping with an optional "task_responses" entry; each
            response is expected to expose `.answer` and `.task_id`.
        output_dir: Directory in which pqa_results.csv is written; it is
            created if it does not exist yet.

    Returns:
        A DataFrame with one row per successfully parsed answer.
    """
    answer_list = []
    for task_response in results.get("task_responses", []):
        try:
            answer_list.append(_parse_pqa_answer(task_response.answer))
        except (ValueError, TypeError, AttributeError) as e:
            task_id = getattr(task_response, "task_id", "<unknown>")
            print(f"Error parsing answer for task {task_id}: {e}")

    # Create DataFrame and save
    os.makedirs(output_dir, exist_ok=True)
    pqa_df = pd.DataFrame(answer_list)
    pqa_df.to_csv(os.path.join(output_dir, "pqa_results.csv"), index=False)
    return pqa_df


# Define a function to create multiple PQA prompts for genes
def pqa_prompt_generator():
    """Generate one (prompt template, prompt arguments) pair per top gene.

    The gene list is read from the top-genes CSV named in the consensus
    step's `output_files`, located through OUTPUT_DIR and the consensus
    step's id so the notebook works on any machine and always uses the
    current run. The file must contain a "gene_symbol" column.

    Returns:
        A list of (PQA_PROMPT, args) tuples, where args holds the "gene",
        "treatment", "mechanism" and "context" values for that prompt.
    """
    top_genes_df = pd.read_csv(
        f"{OUTPUT_DIR}/{consensus_step.step_id}/top{N_TOP_GENES}_genes.csv"
    )
    return [
        (
            PQA_PROMPT,
            {
                "gene": gene,
                "treatment": TREATMENT,
                "mechanism": MECHANISM,
                "context": CONTEXT,
            },
        )
        for gene in top_genes_df["gene_symbol"].tolist()
    ]


# Build the Crow step: prompts come from `pqa_prompt_generator` (one per top
# gene) and the answers are gathered by `pqa_post_process`.
pqa_step_kwargs = dict(
    name=JobNames.CROW,
    prompt_template=PQA_PROMPT,
    prompt_generator=pqa_prompt_generator,
    parallel=N_TOP_GENES,  # Will process all top genes in parallel
    post_process=pqa_post_process,
)
pqa_step = Step(**pqa_step_kwargs)
tortoise.add_step(pqa_step)



In [8]:
# Step 4: Visualization with Volcano Plot
# Both inputs are located through OUTPUT_DIR and the id of the step that
# produced them, so the notebook does not depend on one machine's absolute
# paths or on the output of an earlier, unrelated run.
volcano_step = Step(
    name=JobNames.FINCH,
    prompt_template=VOLCANO_PROMPT,
    cot_prompt=True,
    input_files={
        f"{OUTPUT_DIR}/{consensus_step.step_id}/consensus_results.csv": "consensus_results.csv",
        f"{OUTPUT_DIR}/{pqa_step.step_id}/pqa_results.csv": "pqa_results.csv",
    },
    config=StepConfig(language="PYTHON", max_steps=30, timeout=15 * 60),
)
tortoise.add_step(volcano_step)



In [None]:
# Run the pipeline
results = await tortoise.run_pipeline(OUTPUT_DIR)
print("Pipeline execution completed")
print(
    f"View the final volcano plot at: https://platform.futurehouse.org/trajectories/{tortoise.results[volcano_step.step_id]['task_ids'][0]}"
)

In [None]:

# Run in Jupyter or IPython
import asyncio

async def run_single_step():
    results = await tortoise.run_pipeline(output_dir="/Users/jineta/git/gitrepo/data-analysis-crow/tutorial/output/38258650/")
    print("=== Step Results ===")
    print(results)

await run_single_step()