In [None]:
%load_ext autoreload    
%autoreload 2          

import os 
import sys
from pathlib import Path # To set data downloading path

# Append ghostcoder folder to path 
sys.path.append(os.path.abspath('..'))

import ghostcoder

from ghostcoder import GhostCoder
from ghostcoder.utils import *
from ghostcoder.graph import create_ghostcoder_agent, create_coder_agent, create_crawler_agent, create_retriever_agent

Here we test and illustrate each subgraph of Ghostcoder. Ghostcoder is an important component of BIA (bioinformatics agent); its main function is to generate and execute bioinformatics analysis code. It contains five subgraphs: filemanager, retriever, coder, webcrawler, and executor.


### Omics data preparation

In [None]:
#──────work_dir 
#  └───data
#    └─Input_data.whatever
# First lets download a scRNAseq data
import scanpy as sc

from ghostcoder.config import file_config

current_dir = Path.cwd()

# Ghostcoder pre-sets WORK_DIR and INPUT_DATA_DIR so that consecutive
# bioinformatics tasks can share one input dataset.
# Resolve the input-data folder once so the download and the description file
# are guaranteed to land in the same place.
input_data_dir = current_dir / file_config.INPUT_DATA_DIR
input_data_dir.mkdir(parents=True, exist_ok=True)

# Download scRNAseq data into the input-data folder under the current dir
sc.settings.datasetdir = input_data_dir
sc.datasets.pbmc3k()

# Create a data description file that tells the agent what the input data is.
# The text must describe the dataset downloaded above (10x Genomics PBMC 3k),
# because it is used as context for the data perception step.
data_des = "The data is the 10x Genomics PBMC 3k dataset: single-cell RNA-seq (gene expression) raw counts of about 3k peripheral blood mononuclear cells (PBMCs) from a healthy human donor."

with open(input_data_dir / 'data_description.txt', 'w', encoding='utf-8') as f:
    f.write(data_des)

# Set workdir as current dir
file_config.WORK_DIR = current_dir

### Set up LLMs

In [None]:
# Use Openai API for example, you can use other LLM provider as well
from langchain_openai import ChatOpenAI

# I recommend using the OpenAI API; APIs from other providers have not been tested yet
def call_chatllm_openai(api_key, api_base, model_name):
    """Build a ``ChatOpenAI`` client for the given endpoint and model.

    Args:
        api_key: Value passed to ``ChatOpenAI`` as ``openai_api_key``.
        api_base: Value passed to ``ChatOpenAI`` as ``openai_api_base``.
        model_name: Value passed to ``ChatOpenAI`` as ``model``.

    Returns:
        The constructed ``ChatOpenAI`` instance.
    """
    client_kwargs = {
        "openai_api_key": api_key,
        "openai_api_base": api_base,
        "model": model_name,
    }
    return ChatOpenAI(**client_kwargs)

# Set up API credentials.
# The key and base URL are read from the environment so that secrets never
# have to be saved inside the notebook. Each falls back to "" (the previous
# placeholder value) when the variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")
openai_api_base = os.environ.get("OPENAI_API_BASE", "")
openai_chat_model = ""  # name of the model used as the chat model
openai_code_model = ""  # name of the model used as the code model

# It is recommended to use an LLM with stronger coding capabilities as the code
# model, which performs the code-related work in all Ghostcoder graphs, and an
# LLM with stronger reasoning ability as the chat model.
chat_model = call_chatllm_openai(openai_api_key, openai_api_base, openai_chat_model)
code_model = call_chatllm_openai(openai_api_key, openai_api_base, openai_code_model)

### Set up Tavily search

In [None]:
# Initialize Tavily.
# Reuse a TAVILY_API_KEY that is already exported in the environment instead of
# overwriting it with an empty placeholder; fall back to "" when it is not set
# so the variable is always defined afterwards, as before.
tavily_api = os.environ.get("TAVILY_API_KEY", "")
os.environ["TAVILY_API_KEY"] = tavily_api

### File management and data perception by ghostcoder.filemanager

In [None]:
# The file manager automatically sets up the file system and perceives the input data.
# The initial file system layout for a task with a given task_id should be as follows:
#─┬─────work_dir 
# ├─┬───data
# │ └───Input_data.whatever
# └─┬───task_id_1 // Work dir for every new tasks
#   ├─┬─data  
#   │ └─Input_data.whatever // A copy from work_dir/data/
#   ├───figures // Where output figures will be saved
#   └───results // Where processed data will be saved
from ghostcoder.config import docker_config, file_config
from ghostcoder.graph import create_filemanager_agent

#  Set workdir as current dir
current_dir = Path.cwd()
file_config.WORK_DIR = current_dir


# Initial graph
file_management = create_filemanager_agent(
        chat_model = chat_model, 
        code_model = code_model,
        max_retry = 3,
        )

fm_input = {
    "task_id" : "Test", # 
    "docker_profile_dir": docker_config.DOCKER_PROFILES_DIR, # use pre-set docker profiles, please read those docker images 
    "max_iter": 10,
}

fm_state = await file_management.ainvoke(fm_input)

### Native env and Docker executor, tested alone

In [None]:
from ghostcoder.graph import create_executor_agent
from ghostcoder.docker import get_docker_status
from ghostcoder.utils import get_native_env_perception

# Environment profiles handed to the executor: the task folder layout, the
# docker status and the languages found in the native environment
env_profiles = {
    "task_dirs": dict(
        task_dir="Test",
        data_dir="data",
        figure_dir="figures",
        output_dir="results",
    ),
    "docker status": get_docker_status(),
    "native env languages": get_native_env_perception(),
}

# Build the executor graph
agent = create_executor_agent(chat_model=chat_model, code_model=code_model, max_retry=3)



# Test executor with bash python and R
test_bash_code ="""
ls -al
"""

test_python_code = """
print("Hello World")
"""

test_r_code ="""
my_str <- "Hello World"
print(my_str)
"""

for codeblock in [
    test_bash_code, 
    test_python_code, 
    test_r_code
    ]:
    print("Test code executor with\n",codeblock)

    exe_states = await agent.ainvoke(
        {
            "generated_codeblock":codeblock,
            "env_profiles": env_profiles,
        }
    )

    print(f"Agent detected coding language as: {exe_states['language']}\nUse docker: {exe_states['use_docker']}")
    if exe_states['use_docker']:
        print(f"With docker image{exe_states['docker_image']}")
    print(f"Code execute output:\n{exe_states['execution_results']}\n--------\n\n")

### Web crawler, tested alone

In [47]:
from ghostcoder.graph import create_crawler_agent


# Build the web-crawler graph
agent = create_crawler_agent(chat_model=chat_model, code_model=code_model, max_retry=3)

# Free-text description of what to search for
query_context = "I'm looking for R codes for analysis single cell trajectory using monocl3"

# Run the graph synchronously on the query context
crawl_state = agent.invoke({"query_context": query_context})

# Report the generated queries, the number of useful hits and the summary
generated_queries = crawl_state['query_list']
useful_hits = crawl_state['useful_results']

print("--------\nGenerated query for given context:")
for generated_query in generated_queries:
    print(generated_query)
print(f"Get total {len(useful_hits)} useful web search results")
print("Crawled and parsed web information as follow:")
print(crawl_state['summary'])


Attempt 1: No content loaded.
Attempt 2: No content loaded.
Attempt 3: No content loaded.
Attempt 1: No content loaded.
Attempt 2: No content loaded.
Attempt 3: No content loaded.
--------
Generated query for given context:
R code examples for single cell trajectory analysis using Monocle3
Monocle3 tutorial for single cell trajectory inference
Step-by-step guide for analyzing single cell trajectories with Monocle3 in R
Get total 5 useful web search results
Crawled and parsed web information as follow:
The content from Page 2 provides a detailed guide on using Monocle 3 for constructing single-cell trajectory analyses using ATAC-seq data. The process involves loading single-cell ATAC-seq datasets, performing preprocessing with the Seurat and Signac packages, and using conversion functions from SeuratWrappers to integrate with Monocle 3. Here's the key code for building trajectories with Monocle 3:

```r
library(Signac)
library(Seurat)
library(SeuratWrappers)
library(monocle3)
library(Ma