Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
OPENAI_API_KEY=
GOOGLE_API_KEY=
ANTHROPIC_API_KEY=
BYTEPLUS_API_KEY=
REMOTE_MODEL_URL=
OMNIPARSER_BASE_URL=
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,7 @@ node_modules
.env
**/agent_logs.txt
**/memory.txt
**/gui/config
**/gui/config
.claude/
OmniParser_CraftOS/
_launch_agent.cmd
4 changes: 1 addition & 3 deletions config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
{
"conda_environment_name": "white-collar-agent",
"conda_environment_created": true,
"omniparser_repo_path": "/home/ahmad/Work/CraftOS/WhiteCollarAgent/OmniParser_CraftOS"
"omniparser_repo_path": "C:\\Users\\zfoong\\Desktop\\agent\\code\\git\\WhiteCollarAgent\\OmniParser_CraftOS"
}
81 changes: 78 additions & 3 deletions core/action/action_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,86 @@ def _atomic_action_venv_process(
Executes an action inside an ephemeral virtual environment.
Runs in a SEPARATE PROCESS.
"""
# GUI mode - in a Docker container
if mode == "GUI":
return GUIHandler.execute_action(GUIHandler.TARGET_CONTAINER, action_code, input_data, mode)

# Sandboxed mode - NOT in a Docker container
try:
result = GUIHandler.execute_action(GUIHandler.TARGET_CONTAINER, action_code, input_data, mode)
return result
with tempfile.TemporaryDirectory(prefix="action_venv_") as tmpdir:
tmp = Path(tmpdir)

# ─── Create virtual environment ───
venv_dir = tmp / "venv"
venv.EnvBuilder(with_pip=True).create(venv_dir)

python_bin = (
venv_dir / "Scripts" / "python.exe"
if os.name == "nt"
else venv_dir / "bin" / "python"
)

# ─── Write action script ───
# We inject input_data as a global so the action code can access it
action_file = tmp / "action.py"
action_file.write_text(
f"""
import json
import sys

input_data = json.loads({json.dumps(json.dumps(input_data))})

# ─── USER CODE ───
{action_code}

# ─── Find and call the function ───
func = None
local_vars = dict(locals())
for name, obj in local_vars.items():
if callable(obj) and not name.startswith('_') and name not in ('input_data', 'json', 'sys'):
func = obj
break

if func is None:
# Fallback: check if output variable was set (legacy behavior)
if 'output' in local_vars:
print(local_vars['output'])
sys.exit(0)
else:
sys.exit(1)

# Call the function and print result as JSON
try:
result = func(input_data)
if isinstance(result, dict):
print(json.dumps(result, ensure_ascii=False))
else:
print(str(result))
except Exception as e:
import traceback
print("Execution failed: " + str(e) + "\\n" + traceback.format_exc(), file=sys.stderr)
sys.exit(1)
""",
encoding="utf-8",
)

proc = subprocess.run(
[python_bin, str(action_file)],
capture_output=True,
text=True,
timeout=timeout,
)

return {
"stdout": proc.stdout.strip(),
"stderr": proc.stderr.strip(),
"returncode": proc.returncode,
}

except subprocess.TimeoutExpired:
return {"stdout": "", "stderr": "Execution timed out", "returncode": -1}
except Exception as e:
return {"status": "error", "message": str(e)}
return {"stdout": "", "stderr": f"Execution failed: {e}", "returncode": -1}

def _atomic_action_internal(
action_name: str,
Expand Down
4 changes: 2 additions & 2 deletions core/action/action_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ async def execute_action(
self._log_event_stream(
is_gui_task=is_gui_task,
event_type="action_start",
event=f"Running action {action.name} with input: {input_data}. {context if context else ''}",
event=f"Running action {action.name} with input: {input_data}.",
display_message=f"Running {action.name}",
action_name=action.name,
)
Expand Down Expand Up @@ -239,7 +239,7 @@ async def execute_action(
self._log_event_stream(
is_gui_task=is_gui_task,
event_type="action_end",
event=f"Action {action.name} completed with output: {outputs}. {context if context else ''}",
event=f"Action {action.name} completed with output: {outputs}.",
display_message=f"{action.name} → {display_status}",
action_name=action.name,
)
Expand Down
55 changes: 49 additions & 6 deletions core/agent_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ def __init__(
*,
data_dir: str = "core/data",
chroma_path: str = "./chroma_db",
llm_provider: str = "byteplus",
llm_provider: str = "anthropic",
deferred_init: bool = False,
) -> None:
"""
This constructor initializes all agent components.
Expand All @@ -98,16 +99,22 @@ def __init__(
RAG components.
llm_provider: Provider name passed to :class:`LLMInterface` and
:class:`VLMInterface`.
"""

deferred_init: If True, allow LLM/VLM initialization to be deferred
until API key is configured (useful for first-time setup).
"""

# persistence & memory
self.db_interface = self._build_db_interface(
data_dir = data_dir, chroma_path=chroma_path
)

# LLM + prompt plumbing
self.llm = LLMInterface(provider=llm_provider, db_interface=self.db_interface)
self.vlm = VLMInterface(provider=llm_provider)
# LLM + prompt plumbing (may be deferred if API key not yet configured)
self.llm = LLMInterface(
provider=llm_provider,
db_interface=self.db_interface,
deferred=deferred_init,
)
self.vlm = VLMInterface(provider=llm_provider, deferred=deferred_init)

self.event_stream_manager = EventStreamManager(self.llm)

Expand Down Expand Up @@ -744,6 +751,42 @@ def _parse_reasoning_response(self, response: str) -> ReasoningResult:
action_query=action_query,
)

# =====================================
# Initialization
# =====================================

def reinitialize_llm(self, provider: str | None = None) -> bool:
    """Reinitialize LLM and VLM interfaces with updated configuration.

    Call this after updating environment variables with new API keys.

    Args:
        provider: Optional provider to switch to. If None, uses current provider.

    Returns:
        True if both LLM and VLM were initialized successfully.
    """
    # Each reinitialize() call returns a success flag; presumably it re-reads
    # the (now updated) environment/API-key configuration — TODO confirm in
    # LLMInterface/VLMInterface.
    llm_ok = self.llm.reinitialize(provider)
    vlm_ok = self.vlm.reinitialize(provider)

    if llm_ok and vlm_ok:
        logger.info(f"[AGENT] LLM and VLM reinitialized with provider: {self.llm.provider}")
        # Update GUI module provider if needed
        # NOTE(review): GUIHandler.gui_module is a class-level attribute, so this
        # rebuild affects every agent instance sharing GUIHandler — confirm that
        # is intended for multi-agent setups.
        if hasattr(self, 'action_library') and hasattr(GUIHandler, 'gui_module'):
            GUIHandler.gui_module = GUIModule(
                provider=self.llm.provider,
                action_library=self.action_library,
                action_router=self.action_router,
                context_engine=self.context_engine,
                action_manager=self.action_manager,
            )
    # Both flags must hold for the reinitialization to count as successful.
    return llm_ok and vlm_ok

@property
def is_llm_initialized(self) -> bool:
    """Check if the LLM interface is properly initialized.

    Delegates to ``self.llm.is_initialized``; with ``deferred_init`` this is
    presumably False until an API key is configured and
    :meth:`reinitialize_llm` succeeds — confirm against LLMInterface.
    """
    return self.llm.is_initialized

# =====================================
# Lifecycle
# =====================================
Expand Down
73 changes: 73 additions & 0 deletions core/data/action/end task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from core.action.action_framework.registry import action


@action(
name="end task",
description=(
"End the current task for this session with a final status. "
"Use status='complete' when the task is fully done, or 'abort' when it "
"should be cancelled/failed early. Always provide a brief reason."
),
default=True,
mode="CLI",
input_schema={
"status": {
"type": "string",
"enum": ["complete", "abort"],
"example": "complete",
"description": "Final status for the task: 'complete' or 'abort'.",
},
"reason": {
"type": "string",
"example": "All steps executed successfully.",
"description": "Why the task is considered complete or why it should be aborted.",
},
},
output_schema={
"status": {
"type": "string",
"example": "success",
"description": "Result of the operation.",
},
"task_id": {
"type": "string",
"example": "user_request_1_abc123",
"description": "The session/task id affected.",
},
},
test_payload={
"status": "complete",
"reason": "All steps executed successfully.",
"simulated_mode": True,
},
)
def end_task(input_data: dict) -> dict:
    """End the current session task with a terminal status.

    Validates the requested status ('complete' or 'abort'), short-circuits in
    simulated mode for testing, and otherwise delegates to the internal action
    interface, normalizing its 'ok' status to this action's 'success'.
    """
    import asyncio

    raw_status = input_data.get("status") or ""
    final_status = raw_status.strip().lower()
    reason = input_data.get("reason")

    # Guard clause: only the two supported terminal states are accepted.
    if final_status not in ("complete", "abort"):
        return {
            "status": "error",
            "message": "Invalid status for end task. Use 'complete' or 'abort'.",
        }

    # In simulated mode, skip the actual interface call for testing
    if input_data.get("simulated_mode", False):
        return {"status": "success", "task_id": "test_task_id"}

    import core.internal_action_interface as iai

    interface = iai.InternalActionInterface
    if final_status == "complete":
        pending = interface.mark_task_completed(message=reason)
    else:
        # Map 'abort' to a cancellation by default
        pending = interface.mark_task_cancel(reason=reason)
    outcome = asyncio.run(pending)

    # Translate the interface's 'ok' status into this action's 'success'.
    if isinstance(outcome, dict) and outcome.get("status") == "ok":
        outcome["status"] = "success"

    return outcome

48 changes: 0 additions & 48 deletions core/data/action/mark task cancel.py

This file was deleted.

48 changes: 0 additions & 48 deletions core/data/action/mark task completed.py

This file was deleted.

Loading