In [3]:
from langchain.agents import Tool
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.messages import SystemMessage, HumanMessage
import os
import json
import getpass
import tempfile
import subprocess
import re

# Set up Gemini API key.
# If GOOGLE_API_KEY is not already present in the environment, first try the
# Google Colab secret store and, when that is unavailable or fails, prompt for
# the key interactively.
if not os.environ.get("GOOGLE_API_KEY"):
    try:
        from google.colab import userdata
        # Assigning None (secret not returned) raises TypeError, which also
        # routes us to the interactive prompt below.
        os.environ["GOOGLE_API_KEY"] = userdata.get('GEMINI_API_KEY')  # Update secret name
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C / SystemExit
        # still abort instead of silently falling through to the prompt.
        os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter Google API Key: ")

class PlannerAgent:
    """Asks the LLM to break a task into sub-tasks and returns them as a plan dict.

    A plan has the shape ``{"subtasks": [{"id": ..., "desc": ..., ...}, ...]}``.
    Whenever the model's answer cannot be turned into a valid plan, a fixed
    four-step fallback plan is returned instead.
    """

    def __init__(self):
        # Initialize Gemini model
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash-latest",
            temperature=0.3,
            convert_system_message_to_human=True
        )

        self.system_prompt = """
        As an expert solution architect, decompose complex problems into executable sub-tasks.
        Use this JSON structure:
        {
            "subtasks": [
                {
                    "id": <unique integer>,
                    "desc": "<clear description>",
                    "dependencies": [<list of prerequisite task IDs>]
                }
            ]
        }
        """

    def decompose(self, task):
        """Return a validated plan for ``task``.

        Args:
            task: Free-text description of the work to decompose.

        Returns:
            A dict with a ``"subtasks"`` list. Every subtask is a dict that has
            at least ``"id"`` and ``"desc"``. On any failure (LLM error,
            unparseable or structurally invalid answer) the result of
            ``_fallback_plan(task)`` is returned; this method does not raise.
        """
        try:
            # Create message sequence
            messages = [
                SystemMessage(content=self.system_prompt),
                HumanMessage(content=f"TASK: {task}")
            ]
            response_content = self.llm.invoke(messages).content
            return self._parse_plan(response_content)

        except json.JSONDecodeError:
            print("Could not recover valid JSON. Using fallback plan.")
            return self._fallback_plan(task)

        except Exception as e:
            print(f"Decomposition error: {str(e)}")
            return self._fallback_plan(task)

    @staticmethod
    def _parse_plan(response_content):
        """Extract, parse and validate the JSON plan contained in an LLM answer.

        Raises:
            json.JSONDecodeError: no parseable JSON object could be found.
            ValueError: JSON was found but does not have the plan structure.
        """
        # Prefer the contents of a ```json fenced block when one is present.
        fenced = re.search(r'```json(.*?)```', response_content, re.DOTALL)
        if fenced:
            response_content = fenced.group(1).strip()

        try:
            plan = json.loads(response_content)
        except json.JSONDecodeError:
            print("Failed to parse JSON response. Attempting recovery...")
            # Fallback: take everything from the first "{" to the last "}".
            embedded = re.search(r'\{.*\}', response_content, re.DOTALL)
            if embedded is None:
                raise
            plan = json.loads(embedded.group())

        # Validate on BOTH paths so a recovered object is held to the same
        # standard as a cleanly parsed one.
        PlannerAgent._validate_plan(plan)
        return plan

    @staticmethod
    def _validate_plan(plan):
        """Raise ValueError unless ``plan`` has a list of dict subtasks with id/desc."""
        if (
            not isinstance(plan, dict)
            or "subtasks" not in plan
            or not isinstance(plan["subtasks"], list)
        ):
            raise ValueError("Invalid response format: Missing 'subtasks' list")

        for subtask in plan["subtasks"]:
            if not isinstance(subtask, dict) or "id" not in subtask or "desc" not in subtask:
                raise ValueError("Subtask missing required fields: 'id' or 'desc'")

    def _fallback_plan(self, task):
        """Fallback plan for error recovery: a fixed analyze/design/implement/test chain."""
        return {
            "subtasks": [
                {
                    "id": 1,
                    "desc": f"Analyze requirements: {task}",
                    "dependencies": []
                },
                {
                    "id": 2,
                    "desc": f"Design solution architecture for {task}",
                    "dependencies": [1]
                },
                {
                    "id": 3,
                    "desc": f"Implement core functionality for {task}",
                    "dependencies": [2]
                },
                {
                    "id": 4,
                    "desc": f"Test and validate solution for {task}",
                    "dependencies": [3]
                }
            ]
        }

class ExecutorAgent:
    """Generates Python code with the LLM and runs it to see whether it exits cleanly."""

    def __init__(self):
        # Initialize Gemini model
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash-latest",
            temperature=0.2,
            convert_system_message_to_human=True
        )
        # Tool wrappers around the two methods below.
        self.tools = [
            Tool(
                name="CodeWriter",
                func=self.write_code,
                description="Generates code for given requirements"
            ),
            Tool(
                name="TestRunner",
                func=self.run_tests,
                description="Executes test cases and reports results"
            )
        ]

    def write_code(self, requirements: str) -> dict:
        """
        Uses the LLM to generate Python code for the given requirements.

        The code is taken from the first triple-backtick block of the answer
        (or the whole answer when there is none) and written to ``main.py``
        inside a freshly created temporary directory. The directory is not
        removed here because the returned path is meant to be executed later.

        Returns a dict with 'code' (the Python source) and 'files' (list of filenames).
        """
        prompt = (
            f"You are an expert Python developer. "
            f"Write complete, self-contained Python code to {requirements}. "
            f"Return only the code block, wrapped in triple backticks for Python."
        )
        llm_response = self.llm.invoke(prompt).content

        # Extract code from triple backticks
        match = re.search(r"```(?:python)?\n([\s\S]*?)```", llm_response)
        code = match.group(1) if match else llm_response

        # Save code to a temp file. Python reads source files as UTF-8 by
        # default, so write UTF-8 explicitly instead of the locale encoding.
        tmp_dir = tempfile.mkdtemp()
        file_path = os.path.join(tmp_dir, "main.py")
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(code)

        return {"code": code, "files": [file_path]}

    def run_tests(self, code_info: dict, timeout: float = 10) -> dict:
        """
        Executes the first generated Python file in a separate subprocess.

        NOTE(security): the subprocess is NOT sandboxed - LLM-generated code
        runs with the privileges of the current user.

        Args:
            code_info: Dict as returned by ``write_code``; only 'files' is read.
            timeout: Seconds to wait before the run is reported as failed.

        Returns:
            ``{"passed": bool, "errors": list[str]}``. A zero exit status counts
            as passed. Otherwise 'errors' holds the stripped stderr, or - when
            stderr is empty - a message with the exit status (and stdout, if
            any). Exceptions from launching or timing out are reported as a
            failed run with ``str(exception)`` as the error.
        """
        import sys  # local import: `sys` is not among the file-level imports

        files = code_info.get("files", [])
        if not files:
            return {"passed": False, "errors": ["No file to execute."]}

        file_to_run = files[0]

        # Use the interpreter running this process so the child sees the same
        # installed packages; "python" on PATH may be a different installation.
        interpreter = sys.executable or "python"
        try:
            result = subprocess.run(
                [interpreter, file_to_run],
                capture_output=True,
                text=True,
                timeout=timeout
            )
        except Exception as e:
            return {"passed": False, "errors": [str(e)]}

        if result.returncode == 0:
            return {"passed": True, "errors": []}

        message = result.stderr.strip()
        if not message:
            # Never report an empty error string for a failed run.
            message = f"Process exited with status {result.returncode}."
            stdout = result.stdout.strip()
            if stdout:
                message = f"{message}\n{stdout}"
        return {"passed": False, "errors": [message]}

class ReviewerAgent:
    """Asks the LLM to review generated code together with its test errors."""

    def __init__(self):
        # Initialize Gemini model
        self.llm = ChatGoogleGenerativeAI(
            model="gemini-1.5-flash-latest",
            temperature=0.2,
            convert_system_message_to_human=True
        )

    def analyze(self, code_result: dict, test_result: dict) -> dict:
        """
        Analyze the generated code and test results, providing feedback and critical issues.

        Args:
            code_result: Dict whose 'code' entry holds the source to review.
            test_result: Dict whose 'errors' entry lists test error messages.

        Returns a dict with:
          - 'feedback': A summary string of overall quality
          - 'critical_issues': A list of strings for issues that must be addressed

        When the answer contains no parseable JSON object, the stripped raw
        answer becomes 'feedback' and 'critical_issues' is empty.
        """

        code = code_result.get("code", "")
        tests = test_result.get("errors", []) or []
        # str() guards the join against non-string error entries.
        tests_output = "\n".join(str(t) for t in tests) if tests else "All tests passed."

        # Construct LLM prompt requesting JSON output
        prompt = (
            "You are a senior software engineer and code reviewer. "
            "Review the following Python code snippet and its test results. "
            "Provide a JSON object with two keys:\n"
            " - feedback: A concise summary of overall code quality.\n"
            " - critical_issues: A list of strings describing any bugs, logical errors, "
            "or style issues that should be fixed.\n\n"
            f"Code:\n```python\n{code}\n```\n\n"
            f"Test Results:\n{tests_output}\n\n"
            "Output JSON only."
        )

        llm_response = self.llm.invoke(prompt).content

        # Default (fallback) result: the raw response as feedback.
        feedback = llm_response.strip()
        critical_issues = []

        # Sometimes the model wraps JSON in code fences, so grab the span from
        # the first "{" to the last "}" instead of parsing the whole answer.
        json_match = re.search(r"\{[\s\S]*\}", llm_response)
        if json_match:
            try:
                result = json.loads(json_match.group(0))
            except json.JSONDecodeError:
                result = None
            if isinstance(result, dict):
                raw_feedback = result.get("feedback", "")
                if raw_feedback is None:
                    raw_feedback = ""
                feedback = raw_feedback if isinstance(raw_feedback, str) else str(raw_feedback)
                critical_issues = self._as_issue_list(result.get("critical_issues", []))

        return {
            "feedback": feedback,
            "critical_issues": critical_issues
        }

    @staticmethod
    def _as_issue_list(value) -> list:
        """Coerce the model's 'critical_issues' value into a list of strings."""
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value.strip() else []
        if isinstance(value, (list, tuple)):
            return [item if isinstance(item, str) else str(item) for item in value]
        return [str(value)]

class Orchestrator:
    """Runs the plan -> generate -> test -> review workflow across the three agents."""

    def __init__(self):
        self.planner = PlannerAgent()
        self.executor = ExecutorAgent()
        self.reviewer = ReviewerAgent()
        # Code collected for each subtask of the most recent execute_project call
        self._generated = []

    def execute_project(self, user_request):
        """Plan ``user_request``, generate and run code per subtask, and return the code.

        For every valid subtask the executor writes code and runs it; when the
        run fails the reviewer's feedback is printed. Progress is reported via
        ``print``.

        Returns:
            A string: the generated code of all subtasks, each preceded by a
            ``# Subtask <id>: <desc>`` header, or an explanatory message when
            the plan is invalid or no code was generated.
        """
        # Start from a clean slate so a second call on the same instance does
        # not return code generated for an earlier request.
        self._generated = []

        # Agent collaboration workflow
        plan = self.planner.decompose(user_request)

        # Check if the plan is valid before proceeding
        if not plan or "subtasks" not in plan or not isinstance(plan["subtasks"], list):
            print("Planner returned an invalid plan.")
            return "Project execution failed: Invalid plan."

        # Process each subtask
        for position, subtask in enumerate(plan["subtasks"], start=1):
            # Skip invalid subtasks
            if not isinstance(subtask, dict) or "desc" not in subtask:
                print(f"Skipping invalid subtask: {subtask}")
                continue

            # Only "desc" is required above, so fall back to the 1-based
            # position rather than failing with KeyError on a missing "id".
            subtask_id = subtask.get("id", position)
            description = subtask["desc"]

            print(f"\n🔧 Processing subtask {subtask_id}: {description}")

            # Generate code for this subtask
            code_result = self.executor.write_code(description)

            # Check if code_result is valid
            if not code_result or "code" not in code_result:
                print(f"Executor failed to generate code for subtask: {description}")
                continue

            # Store the generated code
            self._generated.append({
                "id": subtask_id,
                "desc": description,
                "code": code_result["code"]
            })

            print("🧪 Running tests...")
            test_result = self.executor.run_tests(code_result)

            # Check if test_result is valid
            if not test_result or "passed" not in test_result:
                print(f"Executor failed to run tests for subtask: {description}")
                continue

            if not test_result["passed"]:
                print("🔍 Reviewing failed tests...")
                review = self.reviewer.analyze(code_result, test_result)
                print(f"📝 Review feedback: {review.get('feedback', 'No feedback')}")
                if review.get("critical_issues"):
                    print(f"❌ Critical issues: {review['critical_issues']}")
            else:
                print("✅ Tests passed")

        # Build the final output after processing all subtasks
        if not self._generated:
            return "No code generated for any subtasks"

        final_output = "\n\n".join(
            f"# Subtask {item['id']}: {item['desc']}\n\n"
            f"{item['code']}\n"
            for item in self._generated
        )

        return final_output

# Usage: run the full workflow for one sample request and print the result.
if __name__ == "__main__":
    system = Orchestrator()
    # The request is one long sentence group; adjacent literals are joined by
    # the parser into a single string.
    response = system.execute_project(
        "Design a production-ready Flask API for a stock market prediction model. "
        "Include endpoints to submit historical data, get predictions, and retrieve model performance metrics. "
        "Use best practices for input validation and error handling"
    )
    print(f"\n✅ {response}")

Enter Google API Key: ··········





🔧 Processing subtask 1: Design the API endpoints:  `/submit_data` (POST), `/predict` (POST), `/metrics` (GET).




🧪 Running tests...
🔍 Reviewing failed tests...




📝 Review feedback: The code has a critical dependency issue and several areas needing improvement.
❌ Critical issues: ["The code fails to run due to a missing dependency ('flask_restful').  The test environment needs to be configured correctly to include this library.", "Hardcoded column names ('x', 'y') make the code inflexible and difficult to maintain.  These should be configurable parameters.", 'Error handling is too generic.  The `except Exception as e` block catches all exceptions, masking potential underlying issues.  More specific exception handling is needed.', 'The model training is embedded in the main application logic. This should be separated into a dedicated training script to improve maintainability and reproducibility.', 'The data loading and saving uses pandas `to_csv`, which can be inefficient for large datasets. Consider using a more efficient database or data storage solution.', "The `Metrics` class currently only calculates the mean of 'x' and 'y'.  More comprehen



🧪 Running tests...
✅ Tests passed

🔧 Processing subtask 3: Implement input validation for all API endpoints using Pydantic or similar.  Return appropriate HTTP error codes (e.g., 400 Bad Request) for invalid input.




🧪 Running tests...
✅ Tests passed

🔧 Processing subtask 4: Develop the `/submit_data` endpoint. This endpoint should accept historical stock data in JSON format, validate the data, and store it in a persistent database (e.g., PostgreSQL, MongoDB). Implement error handling for database operations.




🧪 Running tests...
🔍 Reviewing failed tests...




📝 Review feedback: The code has a good structure and handles errors reasonably well. However, it lacks crucial aspects like database schema creation and robust input validation, and the timeout during testing suggests performance issues or environment problems.
❌ Critical issues: ['The test timed out, indicating a potential problem with database connection, slow query execution, or a missing database setup.  The code should include explicit database schema creation and checks to ensure the database is accessible and the table exists before attempting insertion.', 'The input validation is insufficient.  It only checks for the presence of fields and performs basic type checking.  More rigorous validation is needed to ensure data integrity (e.g., date format validation, range checks for price and volume).', 'Error handling could be improved by providing more specific error messages to the client.  Generic error messages like "Database error" are not helpful for debugging.', 'The use of `d



🧪 Running tests...
✅ Tests passed

🔧 Processing subtask 6: Train the prediction model using the historical data stored in the database. Implement model versioning to track different model iterations.




🧪 Running tests...
✅ Tests passed

🔧 Processing subtask 7: Develop the `/predict` endpoint. This endpoint should accept new data in JSON format, validate it, use the trained model to generate predictions, and return the predictions in JSON format. Implement error handling for model prediction failures.




🧪 Running tests...
🔍 Reviewing failed tests...




📝 Review feedback: The code is well-structured and includes comprehensive error handling.  However, there are areas for improvement in terms of clarity and robustness.
❌ Critical issues: ["The input validation only checks for the presence of keys, not their data types or values.  Add validation to ensure that 'feature1' and 'feature2' are of the correct type (e.g., numeric) and within an acceptable range.  Consider using a schema validation library like Marshmallow for more complex input validation.", 'The error handling is too broad.  The `except Exception as e` blocks catch too many different types of errors, making debugging difficult.  Be more specific in the `except` clauses to catch only the expected errors.  Log the unexpected errors for better debugging.', "The model loading should include a check for the model's version or other metadata to ensure that the correct model is loaded.  This is crucial for preventing deployment issues.", 'The code assumes a single prediction output



🧪 Running tests...
✅ Tests passed

🔧 Processing subtask 9: Develop the `/metrics` endpoint. This endpoint should retrieve and return the model performance metrics in JSON format.




🧪 Running tests...
🔍 Reviewing failed tests...




📝 Review feedback: The code is well-structured and includes good error handling. However, the test failure indicates a deployment issue, not a code problem.
❌ Critical issues: ['The test failure is due to port 5000 being in use, not a code bug.  The code should be run on a different port or the conflicting process should be stopped.']

🔧 Processing subtask 10: Implement comprehensive logging throughout the API to track requests, errors, and performance.




🧪 Running tests...
🔍 Reviewing failed tests...




📝 Review feedback: The code is well-structured and uses appropriate logging practices.  However, there's a critical deployment issue and a minor style concern.
❌ Critical issues: ['The test results indicate a port conflict.  The application should handle this gracefully, perhaps by allowing the port to be specified as a command-line argument or configuration setting.  Running with `debug=True` should not be done in production.', 'The logging statements within the `before_request` and `after_request` functions could potentially log sensitive data (e.g., request headers, request data).  Consider filtering sensitive information before logging, especially in a production environment.']

🔧 Processing subtask 11: Implement robust error handling and exception management across all endpoints. Return informative error messages to the client.




🧪 Running tests...
🔍 Reviewing failed tests...




📝 Review feedback: The code is well-structured and handles exceptions appropriately.  Error handling is good, with specific error messages returned for different exception types. However, the test failure indicates a deployment issue, not a code problem.
❌ Critical issues: ['The test failure is due to port 5000 being in use.  This is not a code bug; the server needs to be started on a different port (e.g., using the `port` argument in `app.run()`), or the conflicting process needs to be stopped.']

🔧 Processing subtask 12: Write unit and integration tests to ensure the correctness and reliability of the API and the prediction model.




🧪 Running tests...
🔍 Reviewing failed tests...




📝 Review feedback: The code has several critical issues preventing successful execution.  The API tests are failing due to incorrect assumptions about the API response (status code, content type, and JSON structure). The integration test, while using mocking, still relies on the actual API endpoint which is problematic. The prediction model tests pass but are insufficiently robust.
❌ Critical issues: ['The API endpoint `http://localhost:5000/predict` is likely not running or not configured correctly.  The tests are failing with 404 (Not Found) and the response is not valid JSON.  The tests should be made more robust to handle potential network errors and unexpected responses.', 'The `test_api_response_content_type` test fails because the API is not returning `application/json` as expected. The API needs to be fixed to return the correct content type.', 'The `test_api_response_data` test fails because the API response is not valid JSON.  The API needs to be fixed to return valid JSON.',



🧪 Running tests...
✅ Tests passed

🔧 Processing subtask 14: Implement security best practices, including input sanitization, authentication, and authorization (if needed).




🧪 Running tests...
🔍 Reviewing failed tests...




📝 Review feedback: The code has a good structure and attempts to address security concerns, but has critical flaws and needs significant improvements.
❌ Critical issues: ['**Hardcoded Credentials:**  The `ALLOWED_USERS` dictionary contains hardcoded credentials. This is a major security vulnerability.  Credentials should be stored securely (e.g., using a database with strong password hashing and salting).', '**Weak Sanitization:** The `sanitize_input` function provides only basic HTML escaping.  This is insufficient to prevent XSS and SQL injection attacks in a real-world application. A robust input validation and sanitization library should be used.', '**Insufficient Authorization:** The `authorize` function is rudimentary and lacks a proper authorization mechanism. A more sophisticated system (e.g., Role-Based Access Control (RBAC)) is needed to manage user permissions effectively.', "**Error Handling:** While the code includes some error handling, it's not comprehensive.  More robus