In [None]:
%load_ext autoreload    
%autoreload 2    

In [None]:
import os 
import sys
from pathlib import Path # To set data downloading path

# Append ghostcoder folder to path 
sys.path.append(os.path.abspath('..'))
import ghostcoder

# For visualize the graph
from IPython.display import Image, display

Here we test and illustrate each subgraph in Ghostcoder. As an important component of BIA (bioinformatics agent), Ghostcoder mainly serves to generate and execute bioinformatics analysis code. It contains five subgraphs: filemanager, retriever, coder, webcrawler, and executor.


### Set ups

#### Set up LLMs

In [None]:
# Use Openai API for example, you can use other LLM provider as well
from langchain_openai import ChatOpenAI

# I will recommend use Openai API, I haven't tested any APIs from other suppliers yet
def call_chatllm_openai(key, api_base, model_name):
    """Create a ChatOpenAI client for the given endpoint and model.

    Args:
        key: API key passed to the client as ``api_key``.
        api_base: Base URL of the API, passed as ``base_url``.
        model_name: Name of the model to use.

    Returns:
        A ``ChatOpenAI`` instance configured with temperature 0 and
        ``max_retries`` set to 3.
    """
    return ChatOpenAI(
        api_key=key,
        base_url=api_base,
        model=model_name,
        temperature=0,
        max_retries=3,
    )

# Set up API keys and model names.
# The values are read from environment variables so that no secret is stored in the
# notebook. Export them before starting Jupyter, or assign the four variables below by hand:
#   OPENAI_API_KEY    - API key
#   OPENAI_API_BASE   - API base URL (optional; left as None when unset — presumably the
#                       client then uses its default endpoint, verify for your provider)
#   OPENAI_CHAT_MODEL - model name used as the chat model
#   OPENAI_CODE_MODEL - model name used as the code model
openai_key = os.environ.get("OPENAI_API_KEY", "")
openai_base = os.environ.get("OPENAI_API_BASE") or None
openai_chat_model = os.environ.get("OPENAI_CHAT_MODEL", "")
openai_code_model = os.environ.get("OPENAI_CODE_MODEL", "")

# Fail early with a readable message instead of a NameError or an opaque API error later on
if not (openai_key and openai_chat_model and openai_code_model):
    raise ValueError(
        "Set OPENAI_API_KEY, OPENAI_CHAT_MODEL and OPENAI_CODE_MODEL "
        "(or assign the variables in this cell) before creating the models."
    )

# Here, I recommend using an LLM with stronger coding capabilities as the code model, which is set to perform code-related work in all Ghostcoder graphs. Meanwhile, it is better to use an LLM with stronger reasoning ability as the chat model.
chat_model = call_chatllm_openai(openai_key, openai_base, openai_chat_model)
code_model = call_chatllm_openai(openai_key, openai_base, openai_code_model)

In [None]:
# Smoke-test the chat model: send a single prompt and print the text content of the reply
response = chat_model.invoke("What is bioinformatics")
print(response.content)

#### Set up Tavily search

In [None]:
from ghostcoder.config import tavily_config
# Initialize Tavily
# Currently, all web search functionality is based on Tavily search; it provides 10000 queries/month of free credit, for more see https://app.tavily.com/home
# The key is taken from the TAVILY_API_KEY environment variable so it does not have to be
# pasted into the notebook; when the variable is unset this falls back to "" as before.
tavily_api = os.environ.get("TAVILY_API_KEY", "")

# You can set up Tavily through os.environ
# os.environ["TAVILY_API_KEY"] = tavily_api
# or through the config object, which only works inside Ghostcoder
tavily_config.TAVILY_API_KEY = tavily_api

# Maximum number of results for each Tavily query
tavily_config.MAX_RESULTS = 7

### Omics data preparation

In [3]:
# The working directory must be Ghostcoder/Test.
# Only change into 'Test' when not already there: a relative chdir is not idempotent,
# so running it a second time would look for Test/Test and raise FileNotFoundError.
if Path.cwd().name != 'Test':
    os.chdir('Test')

In [None]:
#──────work_dir 
#  └───data
#    └─Input_data.whatever
# First lets download a scRNAseq data
import scanpy as sc

from ghostcoder.config import file_config
# os.mkdir() does not accept exist_ok (passing it raises TypeError); os.makedirs() does,
# and it keeps this cell re-runnable when data/ already exists
os.makedirs('data', exist_ok=True)
current_dir = Path.cwd()

# Ghostcoder pre-sets WORK_DIR and INPUT_DATA_DIR for consecutive bioinformatics tasks that use one input data set

# Download scRNAseq data
# NOTE(review): the target folder is built from file_config.INPUT_DATA_DIR, while the folder created
# above and the description file below use the literal 'data' — confirm both refer to the same folder
sc.settings.datasetdir = current_dir/ file_config.INPUT_DATA_DIR # Download data into data/ folder in current dir
sc.datasets.pbmc3k()

# Create a data description file to illustrate the scRNAseq data details
# NOTE(review): this text mentions bone marrow cells and the 10X Multiome kit, whereas the data set
# downloaded above is pbmc3k — confirm the description matches the data
data_des = "The data used in this basic preprocessing and clustering tutorial was collected from bone marrow mononuclear cells of healthy human donors. The samples used in this tutorial were measured using the 10X Multiome Gene Expression and Chromatin Accessability kit."

with open('data/data_description.txt','w') as f:
    f.write(data_des)

# Set workdir as current dir
file_config.WORK_DIR = current_dir

### Run GhostCoder

In [None]:
# The working directory must be Ghostcoder/Test.
# Only change into 'Test' when not already there: a relative chdir is not idempotent,
# so running it a second time would look for Test/Test and raise FileNotFoundError.
if Path.cwd().name != 'Test':
    os.chdir('Test')

In [None]:
from ghostcoder.config import file_config, ghostcoder_config
from ghostcoder.graph import create_ghostcoder_agent

# Task description, taken from the scanpy tutorials - basics - preprocessing and clustering - Quality Control
task_description = """
Develop a module to perform quality control (QC) on single-cell RNA-sequencing data using Scanpy. The input is an AnnData object containing gene expression data. Generate visual summaries with violin and scatter plots for metrics such as n_genes_by_counts, total_counts, and pct_counts_mt. Filter out cells with fewer than 100 genes and genes detected in fewer than 3 cells. Ensure visualizations support threshold selection and that the updated AnnData object is ready for downstream analysis with robust quality outcomes.
"""

# Set the task id on the shared Ghostcoder config
ghostcoder_config.TASK_ID = "test_01" # use task id for each task dir

# Use the current directory as the Ghostcoder work dir
current_dir = Path.cwd()
file_config.WORK_DIR = current_dir


# Build the graph input; the commented-out keys are optional inputs that are left unset here
graph_input = {
    #"task_id" : "Test", # 
    "task_description": task_description, 
    "previous_codeblock": "", 
    #"max_iter": 5,
}

# Initialize Ghost Coder with the chat and code models created in the LLM setup cell
# NOTE(review): this rebinds the name `ghostcoder`, which the first code cell imported as a module
ghostcoder = create_ghostcoder_agent(
    chat_model = chat_model, 
    code_model = code_model,
    max_retry = 3,
    )

# Run Ghost Coder asynchronously; the second argument is the run config carrying the recursion limit
fin_states = await ghostcoder.ainvoke(
    graph_input,
    {"recursion_limit": 100},
    )

## Solo Test for sub modules
### File management and data perception by ghostcoder.filemanager

In [None]:
# The working directory must be Ghostcoder/Test.
# Only change into 'Test' when not already there: a relative chdir is not idempotent,
# so running it a second time would look for Test/Test and raise FileNotFoundError.
if Path.cwd().name != 'Test':
    os.chdir('Test')

In [None]:
# The file manager automatically sets up the file system and perceives the input data.
# The initial file system layout for a task with a given task_id should be as follows:
#─┬─────work_dir 
# ├─┬───data
# │ └───Input_data.whatever
# └─┬───task_id // Work dir for every new task
#   ├─┬─data  
#   │ └─Input_data.whatever // A copy from work_dir/data/
#   ├───figures // Where output figures will be saved
#   └───results // Where processed data will be saved
#
# The file manager will automatically detect data files (any format) under the work_dir/task_id/data/ folder
#

from ghostcoder.config import docker_config, file_config
from ghostcoder.graph import create_filemanager_agent

# Use the current directory as the Ghostcoder work dir
current_dir = Path.cwd()
file_config.WORK_DIR = current_dir



# Initialize the file manager graph with the chat and code models from the LLM setup cell
manager = create_filemanager_agent(
        chat_model = chat_model, 
        code_model = code_model,
        max_retry = 3,
        )

# Build the graph input
fm_input = {
    "task_id" : "Test", # 
    "docker_profile_dir": docker_config.DOCKER_PROFILES_DIR, # use pre-set docker profiles, please read those docker images 
    "max_iter": 3,
}

# Run the graph asynchronously and keep the resulting state
fm_state = await manager.ainvoke(fm_input)

# 'data_perc' is the state entry holding the data perception text
print(f"Data perception of given data:\n{fm_state['data_perc']}")

In [None]:
# Visualize the file manager graph [optional]: render it as a Mermaid PNG and show it inline.
# If rendering fails, try running the cell again.
Image(manager.get_graph().draw_mermaid_png())

### Coder, generate bioinformatic analysis code and execution, with self-correction

In [None]:
# The working directory must be Ghostcoder/Test.
# Only change into 'Test' when not already there: a relative chdir is not idempotent,
# so running it a second time would look for Test/Test and raise FileNotFoundError.
if Path.cwd().name != 'Test':
    os.chdir('Test')

In [None]:
# file_config is imported here as well, so this cell does not rely on an earlier cell having bound it
from ghostcoder.config import file_config
from ghostcoder.graph import create_coder_agent
from ghostcoder.docker import get_docker_status
from ghostcoder.utils import get_native_env_perception

# Use the current directory as the Ghostcoder work dir
current_dir = Path.cwd()
file_config.WORK_DIR = current_dir

# Task instructions, parsed from the scanpy tutorial "Preprocessing and clustering", first part (QC): https://scanpy.readthedocs.io/en/stable/tutorials/basics/clustering.html
# In real-world scenarios where the retriever is involved, the instructions are usually not this long: the reference code blocks already provide a more detailed and standardized process.
task_instruction = """
Please implement the quality control for the given scRNAseq data using python with the following instructions:
The quality control phase was designed to rigorously assess and refine the cell expression dataset prior to normalization. Initially, genes were categorized based on specific nucleotide sequence patterns that indicate mitochondrial, ribosomal, or hemoglobin origin. This gene categorization was essential for the subsequent computation of comprehensive quality metrics for each cell, including the total number of genes detected, the sum of transcript counts, and the fraction of transcripts derived from mitochondrial genes. To visualize these metrics, a series of plots were created. Violin plots were employed to illustrate the distribution of gene counts per cell, overall transcript counts, and the mitochondrial transcript percentages across the entire cell population. These plots enabled the identification of cells with anomalous expression profiles that might result from technical confounders or biological stress. A scatter plot was also generated to explore the relationship between the total counts and the number of genes detected per cell, with a color gradient depicting the proportion of mitochondrial counts; this assisted in discerning potential outlier cells. Furthermore, criteria were established to filter out cells demonstrating extremely low gene expression, and an algorithm was applied to flag potential doublets. This methodical approach ensured that only high-quality cells advanced to the normalization stage, thereby preserving the integrity of downstream analyses.
"""

# Environment profiles, written by hand here in place of the file manager's output:
# the task directory names plus the Docker status and native-environment perception
# obtained from the two helper calls below
env_profiles = {
    "task_dirs":{
        "task_dir": "Test",
        "data_dir": "data",
        "figure_dir": "figures",
        "output_dir": "results",
    },
    "docker status": get_docker_status(),
    "native env languages": get_native_env_perception(),
}

# Data perception text, written by hand here in place of the file manager's output;
# it describes the two files used in this test (the .h5ad data file and its description)
data_perception = """
File Name: pbmc3k_raw.h5ad
File Format: .h5ad
Selected Programming Language: Python with anndata/scanpy (suitable for .h5ad format)
Data Structure: Expression matrix shape: (2700, 32738)
Biologically Relevant Fields: 
  - obs keys: []
  - var keys: ['gene_ids']
  - Cell Types: Not available
  - Gene Names: Not available
Metadata: No additional metadata


File Name: data_description.txt
File Format: .txt
Selected Programming Language: Python (suitable for text processing)
Content: **This file is used to provide addtionnal description for given data files**

pbmc3k_raw.h5ad: The data used in this basic preprocessing and clustering tutorial was collected from bone marrow mononuclear cells of healthy human donors. The samples used in this tutorial were measured using the 10X Multiome Gene Expression and Chromatin Accessability kit.

"""

# Reference code blocks can be provided by the retriever; here we test with no reference
ref_CBs = []

# Initialize the coder agent with the chat and code models from the LLM setup cell
coder = create_coder_agent(
        chat_model = chat_model, 
        code_model = code_model,
        max_retry = 3,
        )

# Build the graph input from the pieces defined above
coder_input = {
    "task_instruction": task_instruction,
    "data_perception": data_perception,
    "previous_codeblock": "", # We don't have any prior process steps
    "ref_codeblocks": ref_CBs,
    "env_profiles": env_profiles,
}

# Run the coder graph asynchronously and keep the resulting state
coder_state = await coder.ainvoke(coder_input)



In [None]:
# Report the coder run: iteration count, number of automatic error fixes (only when the
# state contains 'n_error'), the last entry of 'generated_codeblock' and the execution output
print(f"The coder generated the code within {coder_state['n_iter']} iterations.")
if "n_error" in coder_state:
    print(f"Automatically fix error {coder_state['n_error']} times.")
else:
    print("Code executed without error.")
print(f"Generated code:\n--------\n{coder_state['generated_codeblock'][-1]}\n--------\n")
print(f"With execution result as:{coder_state['execution_outstr']}")


### Native env and Docker executor, tested alone

In [None]:
# file_config is imported here as well, so this cell does not rely on an earlier cell having bound it
from ghostcoder.config import file_config
from ghostcoder.graph import create_executor_agent
from ghostcoder.docker import get_docker_status
from ghostcoder.utils import get_native_env_perception

# Use the current directory as the Ghostcoder work dir
current_dir = Path.cwd()
file_config.WORK_DIR = current_dir

# Environment profiles: the task directory names plus the Docker status and
# native-environment perception obtained from the two helper calls below
env_profiles = {
    "task_dirs":{
        "task_dir": "Test",
        "data_dir": "data",
        "figure_dir": "figures",
        "output_dir": "results",
    },
    "docker status": get_docker_status(),
    "native env languages": get_native_env_perception(),
}

# Initialize the executor agent with the chat and code models from the LLM setup cell
agent = create_executor_agent(
        chat_model = chat_model, 
        code_model = code_model,
        max_retry = 3,
    )



# Test executor with bash python and R
test_bash_code ="""
ls -al
"""

test_python_code = """
print("Hello World")
"""

test_r_code ="""
my_str <- "Hello World"
print(my_str)
"""

for codeblock in [
    test_bash_code, 
    test_python_code, 
    test_r_code
    ]:
    print("Test code executor with\n",codeblock)

    exe_states = await agent.ainvoke(
        {
            "generated_codeblock":codeblock,
            "env_profiles": env_profiles,
        }
    )

    print(f"Agent detected coding language as: {exe_states['language']}\nUse docker: {exe_states['use_docker']}")
    if exe_states['use_docker']:
        print(f"With docker image{exe_states['docker_image']}")
    print(f"Code execute output:\n{exe_states['execution_results']}\n--------\n\n")

In [None]:
# Visualize the graph of the agent created in the previous cell [optional]: render it as a
# Mermaid PNG and show it inline. If rendering fails, try running the cell again.
Image(agent.get_graph().draw_mermaid_png())

### Web crawler, tested alone

In [None]:
from ghostcoder.graph import create_crawler_agent

# Initialize the crawler agent with the chat and code models from the LLM setup cell
agent = create_crawler_agent(
        chat_model = chat_model, 
        code_model = code_model,
        max_retry = 3,
    )

# Input: free-text context from which the agent generates its search queries.
# The package name was misspelled as "monocl2", which would degrade the generated web searches.
query_context = "I'm looking for R codes for analysis single cell trajectory using monocle2"

# Invoke the graph synchronously
crawl_state = agent.invoke(
    {
        "query_context": query_context,
    }
    )

# Print the generated queries, the number of useful results and the parsed summary
print("--------\nGenerated query for given context:")
for q in crawl_state['query_list']:
    print(q)
print(f"Get total {len(crawl_state['useful_results'])} useful web search results")
print("Crawled and parsed web information as follow:")
print(crawl_state['summary'])


In [None]:
# Visualize the graph of the agent created in the previous cell [optional]: render it as a
# Mermaid PNG and show it inline. If rendering fails, try running the cell again.
Image(agent.get_graph().draw_mermaid_png())

### Test retriever agent

In [None]:
from ghostcoder.graph import create_retriever_agent

# Initialize the retriever agent with the chat and code models from the LLM setup cell
agent = create_retriever_agent(
        chat_model = chat_model, 
        code_model = code_model,
        max_retry = 3,
    )

# Input: a short description of the task to retrieve reference code for
# NOTE(review): this rebinds `task_description`, which the "Run GhostCoder" cell also assigns
task_description = "Single cell RNAseq quality control"

# Invoke the graph synchronously and keep the resulting state
retriever_state = agent.invoke(
    {
        "task_description": task_description,
    }
    )

# Print the reference code blocks returned by the retriever, numbered from 1.
# The original separator print() was missing its closing parenthesis (a SyntaxError);
# enumerate replaces the hand-maintained counter.
for i, cb in enumerate(retriever_state['ref_codeblocks'], start=1):
    print(f"Reference code block #{i}")
    print(cb)
    print("\n=========================\n\n")