In [None]:
%load_ext autoreload    
%autoreload 2          

import os 
import sys
from pathlib import Path # To set data downloading path

# Make the ghostcoder folder (one directory above this notebook) importable.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
_ghostcoder_root = os.path.abspath('..')
if _ghostcoder_root not in sys.path:
    sys.path.append(_ghostcoder_root)

import ghostcoder

from ghostcoder import GhostCoder
from ghostcoder.utils import *
from ghostcoder.graph import create_ghostcoder_agent, create_coder_agent, create_crawler_agent, create_retriever_agent

Here we test and illustrate each subgraph of Ghostcoder. Ghostcoder is an important component of BIA (bioinformatics agent); its main function is to generate and execute bioinformatics analysis code. It contains five subgraphs: filemanager, retriever, coder, webcrawler and executor.


### Omics data preparation

In [None]:
# Expected layout before any task is run:
#──────work_dir 
#  └───data
#    └─Input_data.whatever
# First, download an scRNA-seq dataset to use as the input data.
import scanpy as sc

from ghostcoder.config import file_config

current_dir = Path.cwd()

# Ghostcoder presets WORK_DIR and INPUT_DATA_DIR so that consecutive
# bioinformatics tasks can share one input dataset.
input_data_dir = current_dir / file_config.INPUT_DATA_DIR
# Make sure the folder exists before anything is written into it.
input_data_dir.mkdir(parents=True, exist_ok=True)

# Download the PBMC3k scRNA-seq data into the input data folder.
sc.settings.datasetdir = input_data_dir
sc.datasets.pbmc3k()

# Create a data description file that documents the downloaded dataset.
# The text describes pbmc3k (the dataset fetched above); the previous text
# described a different (bone marrow, 10X Multiome) dataset.
data_des = (
    "The data used in this basic preprocessing and clustering tutorial consists of "
    "3k peripheral blood mononuclear cells (PBMCs) from a healthy human donor, "
    "measured with 10x Genomics single-cell RNA sequencing (the scanpy pbmc3k dataset)."
)

# Write the description next to the data, i.e. into the same INPUT_DATA_DIR
# the dataset was downloaded to, instead of a hard-coded 'data/' path.
with open(input_data_dir / 'data_description.txt', 'w', encoding='utf-8') as f:
    f.write(data_des)

# Set workdir as current dir
file_config.WORK_DIR = current_dir

### Set up LLMs

In [None]:
# Credentials are read from environment variables so that no secret is stored
# in the notebook. Any key that was previously committed in this cell should be
# treated as leaked and revoked.
import warnings

bgi_api_key = os.environ.get("BGI_API_KEY", "")
bgi_api_base = os.environ.get("BGI_API_BASE", "http://10.224.28.80:3000/v1")

openai_api_key = os.environ.get("OPENAI_API_KEY", "")
openai_api_base = os.environ.get("OPENAI_API_BASE", "https://api3.aitok.ai/openai/v1")

openai_chat_model = 'gpt-4o'
openai_code_model = 'gpt-4o'

# Warn (rather than fail) so the key can still be assigned manually in a later cell.
if not openai_api_key:
    warnings.warn(
        "OPENAI_API_KEY is not set; export it before starting Jupyter "
        "or assign openai_api_key before building the models."
    )

In [None]:
# Use Openai API for example, you can use other LLM provider as well
from langchain_openai import ChatOpenAI

# I will recommend use Openai API, I haven't tested any APIs from other suppliers yet
def call_chatllm_openai(api_key, api_base, model_name):
    """Build a ChatOpenAI client.

    Args:
        api_key: API key passed to ChatOpenAI as ``openai_api_key``.
        api_base: Endpoint base URL passed to ChatOpenAI as ``openai_api_base``.
        model_name: Model identifier passed to ChatOpenAI as ``model``.

    Returns:
        The constructed ChatOpenAI instance.
    """
    return ChatOpenAI(
        model=model_name,
        openai_api_key=api_key,
        openai_api_base=api_base,
    )

# Set up the API keys (uncomment and fill in to override the values defined above)
# openai_api_key = ""
# openai_api_base = ""
# openai_chat_model = "" 
# openai_code_model = ""

# Recommendation: use an LLM with stronger coding capabilities as the code model
# (it performs the code-related work in all Ghostcoder graphs) and an LLM with
# stronger reasoning ability as the chat model.
chat_model, code_model = (
    call_chatllm_openai(openai_api_key, openai_api_base, model_name)
    for model_name in (openai_chat_model, openai_code_model)
)

### Set up Tavily search

In [None]:
# Read the Tavily key from the environment instead of storing the secret in the
# notebook. Any key previously committed here should be treated as leaked and revoked.
tavily_api = os.environ.get("TAVILY_API_KEY", "")

In [None]:
# Initialise Tavily
#tavily_api = ""
# Only export a non-empty key, so that a blank `tavily_api` does not overwrite a
# TAVILY_API_KEY that is already present in the environment.
if tavily_api:
    os.environ["TAVILY_API_KEY"] = tavily_api

### File management and data perception by ghostcoder.filemanager

In [None]:
# The fill manager will automatically set up file system and percept the input data 
# A initial file system status for a task with task_id should as follow: 
#─┬─────work_dir 
# ├─┬───data
# │ └───Input_data.whatever
# └─┬───task_id_1 // Work dir for every new tasks
#   ├─┬─data  
#   │ └─Input_data.whatever // A copy from work_dir/data/
#   ├───figures // Where output figures will be saved
#   └───results // Where processed data will be saved
from ghostcoder.config import docker_config, file_config
from ghostcoder.graph import create_filemanager_agent

#  Set workdir as current dir
current_dir = Path.cwd()
file_config.WORK_DIR = current_dir


# Initial graph
file_management = create_filemanager_agent(
        chat_model = chat_model, 
        code_model = code_model,
        max_retry = 3,
        )

fm_input = {
    "task_id" : "Test", # 
    "docker_profile_dir": docker_config.DOCKER_PROFILES_DIR, # use pre-set docker profiles, please read those docker images 
    "max_iter": 10,
}

fm_state = await file_management.ainvoke(fm_input)

### Native env and Docker executor, tested alone

In [None]:
from ghostcoder.graph import create_executor_agent
from ghostcoder.docker import get_docker_status
from ghostcoder.utils import get_native_env_perception

# Directory names of the task created by the file manager above
task_dirs = {
    "task_dir": "Test",
    "data_dir": "data",
    "figure_dir": "figures",
    "output_dir": "results",
}

# Environment profiles handed to the executor: task folders, docker status
# and the languages perceived in the native environment
env_profiles = {
    "task_dirs": task_dirs,
    "docker status": get_docker_status(),
    "native env languages": get_native_env_perception(),
}

# Build the executor graph
agent = create_executor_agent(chat_model=chat_model, code_model=code_model, max_retry=3)



# Test executor with bash python and R
test_bash_code ="""
ls -al
"""

test_python_code = """
print("Hello World")
"""

test_r_code ="""
my_str <- "Hello World"
print(my_str)
"""

for codeblock in [
    test_bash_code, 
    test_python_code, 
    test_r_code
    ]:
    print("Test code executor with\n",codeblock)

    exe_states = await agent.ainvoke(
        {
            "generated_codeblock":codeblock,
            "env_profiles": env_profiles,
        }
    )

    print(f"Agent detected coding language as: {exe_states['language']}\nUse docker: {exe_states['use_docker']}")
    if exe_states['use_docker']:
        print(f"With docker image{exe_states['docker_image']}")
    print(f"Code execute output:\n{exe_states['execution_results']}\n--------\n\n")

### Web crawler, tested alone

In [None]:
from ghostcoder.graph import create_crawler_agent

# Build the web-crawler graph
agent = create_crawler_agent(chat_model=chat_model, code_model=code_model, max_retry=3)

# Free-text description of what to search for
query_context = "I'm looking for R codes for analysis single cell trajectory using monocl3"

# Run the crawler (synchronous invoke)
crawl_state = agent.invoke({"query_context": query_context})

# Report the generated queries, the number of useful hits and the parsed summary
print("--------\nGenerated query for given context:")
for generated_query in crawl_state['query_list']:
    print(generated_query)
useful_count = len(crawl_state['useful_results'])
print(f"Get total {useful_count} useful web search results")
print("Crawled and parsed web information as follow:")
print(crawl_state['summary'])


Attempt 1: No content loaded.
Attempt 2: No content loaded.
Attempt 3: No content loaded.
Attempt 1: No content loaded.
Attempt 2: No content loaded.
Attempt 3: No content loaded.


In [35]:
# List the fields present in the state returned by the crawler run
crawl_state.keys()

dict_keys(['query_context', 'query_list', 'query_results', 'useful_results', 'crawled_webs', 'summary'])

In [46]:
# Show the 'summary' field of the crawler state
print(crawl_state['summary'])

The tutorial "Inferring single cell trajectories with Monocle3 (R)" provides a comprehensive guide to performing trajectory analysis on single-cell RNA sequencing data using Monocle3 in R. It covers the entire process, including transforming AnnData objects, installing and using necessary R packages, performing batch correction, and identifying differential gene expression. Key components include data preprocessing, dimensionality reduction (using UMAP), clustering, trajectory inference, and cell-type assignment. Integration with tools like JupyterLab for interactive analysis is also discussed, alongside exporting data and visualizations. The tutorial aims to offer flexibility beyond the Monocle tools available in Galaxy and includes code snippets for installation and executing steps in R.


In [44]:
# Show the 'crawled_webs' field of the crawler state
print(crawl_state['crawled_webs'])

## Web-based information on related issues:  
---
### Page 0:  
Inferring single cell trajectories with Monocle3 (R)
Author(s) Julia Jakiela Editor(s) Helena Rasche Pavankumar Videm Wendi Bacon Reviewers
Overview
Creative Commons License: CC-BY
Questions:
How do I perform a trajectory analysis using Monocle3 in R?
What should I do when Galaxy’s Monocle tools are not enough?
How do I assign cell types to the clusters?
How do I infer lineage relationships between clusters, without a time series?
How do I perform batch correction and differential expression analysis in Monocle?
Objectives:
Identify which operations are necessary to transform an AnnData object into the files needed for Monocle
Describe the Monocle3 functions in R
Recognise steps that can be performed in R, but not with current Galaxy tools
Repeat the Monocle3 workflow and choose appropriate parameter values
Compare the outputs from Scanpy, Monocle in Galaxy and Monocle in R
Describe differential expression analysis methods

In [None]:
# Quick check: run extract_code_blocks on a fenced bash snippet and show the first result.
# NOTE(review): extract_code_blocks presumably comes from the wildcard `ghostcoder.utils` import — confirm.
extract_code_blocks("```bash\nRscript test.R\n```")[0]

In [None]:
from autogen_ext.code_executors.docker import DockerCommandLineCodeExecutor
from autogen_core.code_executor import CodeBlock
from autogen_core import CancellationToken

async with DockerCommandLineCodeExecutor(
    image = "r-base:latest", 
    work_dir="Test", # Path in local native env
    #bind_dir="./", # Path inside the docker

    ) as executor:  # type: ignore
    print(
        await executor.execute_code_blocks(
            code_blocks=[
                CodeBlock(
                    language="bash",  # 'bash', 'shell', 'sh', 'pwsh', 'powershell', 'ps1', 'python'
                    code="Rscript script.R"),
            ],
            cancellation_token=CancellationToken(),
        )
    )

In [None]:
from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
from autogen_core import CancellationToken
from autogen_core.code_executor import CodeBlock

executor = LocalCommandLineCodeExecutor(
    work_dir = '.'
)

exeres = await executor.execute_code_blocks(
    code_blocks= [CodeBlock(
        language='bash',
        code='ls'
    )],
    cancellation_token=CancellationToken(),
)

exeres

In [None]:
# NOTE(review): `docker_status_str` is not assigned in any visible cell of this notebook;
# it presumably comes from a removed cell or from the wildcard `ghostcoder.utils` import —
# confirm, otherwise this raises NameError on a fresh kernel.
print(docker_status_str)

### Model preparation 

In [None]:
# Scratch cell: evaluates a literal list and assigns nothing
["123123"]

In [None]:
# os.listdir needs a directory; the original passed the path of BIA_dockers.json itself,
# which raises NotADirectoryError. List the folder that contains the file instead.
os.listdir(os.path.dirname('../ghostcoder/docker/BIA_dockers.json'))

In [None]:
# Initialise Tavily
# Reuse the key already present in the environment instead of overwriting it with an
# empty string, which would discard a key configured earlier.
tavily_api = os.environ.get("TAVILY_API_KEY", "")
os.environ["TAVILY_API_KEY"] = tavily_api

In [None]:
# Data: load the pbmc3k dataset through scanpy and keep the returned object as `adata`
adata = sc.datasets.pbmc3k()

In [None]:
# Build the full GhostCoder agent from the chat and code models
agent = GhostCoder(chat_model=chat_model, code_model=code_model)

# Render the agent's graph as the cell output
agent.draw_graph()

In [None]:
# Task prompt for the agent: a natural-language description of a quality-control workflow
# (gene labelling, QC metrics, visualisation, filtering). The string is passed to the agent
# as-is, so it is kept verbatim.
task = "Quality control of the data. wherein the genes are targeted for labeling, the mitochondrial genes (e.g., beginning with “MT-”), the ribosomal genes (e.g., beginning with “RPS” or “RPL”), and the hemoglobin genes (using regular expression matching, e.g., ^HB[^P]); next, common quality control metrics for each cell, including total counts, number of genes detected, and the proportion of total counts represented by a specific group of genes (e.g., mitochondrial genes), were calculated using Scanpy's calculate_qc_metrics() function, and a The log1p transformation is applied to these metrics to optimize the data distribution. Subsequently, the QC metrics of each cell are visualized by violin plots and scatter plots to assess the overall quality of the data. Finally, a threshold is set based on the visualization results to exclude the cells with fewer than 100 genes and genes occurring in fewer than 3 cells to ensure the quality of the data for downstream analyses."

In [None]:
# Run GhostCoder on the task, handing it the loaded dataset as a wrapped input variable
wrapped_inputs = input_variable_wrapper([adata])
agent.Run(task, input_wrap=wrapped_inputs)