In [None]:
%%capture
import os

# Install dependencies only in Colab environment
# Colab is detected by looking for the substring "COLAB_" in the concatenated
# names of all environment variables; outside Colab nothing is installed.
if "COLAB_" in "".join(os.environ.keys()):
    # Fine-tuning / inference stack, installed with --no-deps so pip does not
    # pull in (or replace) the dependencies of these packages.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    # !pip uninstall -y transformers
    # transformers is pinned to an exact version.
    !pip install transformers==4.51.3
    !pip install --no-deps unsloth
    !pip install optuna
    # HTTP / image handling and the OpenAI client used by the GPT agents.
    !pip install pillow requests openai
    # CLIP is installed straight from the OpenAI GitHub repository.
    !pip install git+https://github.com/openai/CLIP.git
    # CPU build of FAISS, used by the Image-RAG vector database.
    !pip install faiss-cpu
    !pip install ftfy regex tqdm

In [None]:
from unsloth import FastVisionModel
import torch
from PIL import Image
import requests
from io import BytesIO
import re
import pandas as pd
import string
from google.colab import drive
from google.colab import userdata
from openai import OpenAI
import json
import numpy as np
import clip
from typing import List, Dict, Tuple, Optional, Union
import faiss
import pickle
import os
from dataclasses import dataclass
import logging
from tqdm import tqdm
import time

# Mount Google Drive (adjust or remove if not using Colab)
drive.mount('/content/gdrive')
# Project root on the mounted drive; OrchestrationLayer appends
# "vector_db/plant_disease_vectordb" to it to locate the pre-built vector database.
absolute_path = "/content/gdrive/My Drive/Projects/Agentic-RAG/"

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Mounted at /content/gdrive


## 🚀 GPT Orchestrator

In [None]:
def GPTOrchestrator(orchestrator_prompt, model_id):
    """
    Send a text-only prompt to an OpenAI chat model acting as the orchestrator.

    The API key is read from the OPENAI_API_KEY environment variable first and,
    if that is missing or empty, from the Colab secrets store.

    Args:
        orchestrator_prompt: Text prompt for the model
        model_id: GPT model ID (e.g., "o3-mini-2025-01-31")

    Returns:
        Dict with keys "model_response" (text of the first choice), "time"
        (wall-clock seconds spent in the API call) and "cost" (fixed
        per-request value).

    Raises:
        ValueError: If no API key can be found in either location.
    """
    # Source 1: environment variable
    api_key = os.environ.get("OPENAI_API_KEY")

    # Source 2: Google Colab secrets (best effort; not available outside Colab)
    if not api_key:
        try:
            from google.colab import userdata
            api_key = userdata.get('OPENAI_API_KEY')
        except Exception:
            # Covers a missing google.colab package as well as a missing or
            # inaccessible secret, while still letting KeyboardInterrupt and
            # SystemExit propagate.
            api_key = None

    if not api_key:
        raise ValueError("OpenAI API key not found. Please set OPENAI_API_KEY environment variable or add it to Colab secrets.")

    client = OpenAI(api_key=api_key)

    conversation = [{
        'role': 'user',
        "content": [
            {"type": "text", "text": orchestrator_prompt},
        ]
    }]

    # Only the API round trip is timed.
    start_time = time.time()
    completion = client.chat.completions.create(
        model=model_id,
        messages=conversation
    )
    elapsed_time = time.time() - start_time

    response = completion.choices[0].message.content

    return {"model_response": response, "time": round(elapsed_time, 10), "cost": round(0.00156875, 6)}  # Each request has a fixed cost.

## Qwen Agent

In [None]:
def QwenAgent(agent_prompt, model_id, image_url, model_path):
    """
    Run the Qwen2.5-VL vision model on one image and return its text answer.

    The loaded model and tokenizer are cached as function attributes so that
    repeated calls do not reload the weights. The cache is keyed on the
    requested variant: asking for a different variant (base vs. fine-tuned, or
    a different adapter path) drops the cached model and loads the right one
    instead of silently reusing whatever was loaded first.

    Args:
        agent_prompt: Text prompt for the model
        model_id: "finetuned" to load the adapter at model_path on top of the
            base model; any other value uses the base model only
        image_url: HTTP(S) URL or local file path of the image
        model_path: Adapter location, only used when model_id == "finetuned"

    Returns:
        Dict with keys "model_response" (decoded generated text), "time"
        (seconds spent in generate) and "cost" (derived from the elapsed time).
    """
    import gc

    is_finetuned = model_id == "finetuned"
    # model_path is irrelevant for the base model, so it is left out of that key.
    cache_key = ("finetuned", model_path) if is_finetuned else ("base", None)

    cache_is_valid = (
        getattr(QwenAgent, "cache_key", None) == cache_key
        and hasattr(QwenAgent, "model")
        and hasattr(QwenAgent, "tokenizer")
    )

    if cache_is_valid:
        model = QwenAgent.model
        tokenizer = QwenAgent.tokenizer
    else:
        # Release any previously cached variant before loading another one so
        # two copies of the model are not held on the GPU at the same time.
        for attr in ("model", "tokenizer", "cache_key"):
            if hasattr(QwenAgent, attr):
                delattr(QwenAgent, attr)
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        model, tokenizer = FastVisionModel.from_pretrained(
            "unsloth/Qwen2.5-VL-7B-Instruct-bnb-4bit",
            load_in_4bit=True,
            use_gradient_checkpointing="unsloth",
        )
        if is_finetuned:
            model.load_adapter(model_path)
        FastVisionModel.for_inference(model)

        QwenAgent.model = model
        QwenAgent.tokenizer = tokenizer
        QwenAgent.cache_key = cache_key

    # Load image (local path or URL)
    if image_url.startswith('http'):
        # Fail fast on hung connections and HTTP errors instead of handing an
        # error page to PIL.
        http_response = requests.get(image_url, timeout=60)
        http_response.raise_for_status()
        image = Image.open(BytesIO(http_response.content)).convert("RGB")
    else:
        image = Image.open(image_url).convert("RGB")

    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": agent_prompt}]}]
    input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    inputs = tokenizer(image, input_text, add_special_tokens=False, return_tensors="pt").to("cuda")

    # Only generation is timed; model loading and image download are excluded.
    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50, temperature=1.0)
    elapsed_time = time.time() - start_time

    # Keep only the newly generated tokens (everything after the prompt).
    input_length = inputs['input_ids'].shape[1]
    generated_tokens = outputs[0][input_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return {"model_response": response, "time": round(elapsed_time, 10), "cost": round(((elapsed_time / 3600) * 6.86) * 0.1325, 6)}  # Calculate the cost based on the duration of A100 GPU usage

## GPT Agent

In [None]:
def GPTAgent(agent_prompt, model_id, image_url):
    """
    GPT multimodal agent: send a text prompt plus one image to an OpenAI chat model.

    The API key is read from the OPENAI_API_KEY environment variable first and,
    if that is missing or empty, from the Colab secrets store.

    Args:
        agent_prompt: Text prompt for the model
        model_id: GPT model ID (e.g., "gpt-4o")
        image_url: URL to image

    Returns:
        Dict with keys "model_response" (text of the first choice), "time"
        (wall-clock seconds spent in the API call) and "cost" (fixed
        per-request value; a different constant is used when model_id
        contains "ft").

    Raises:
        ValueError: If no API key can be found in either location.
    """
    # Source 1: environment variable
    api_key = os.environ.get("OPENAI_API_KEY")

    # Source 2: Google Colab secrets (best effort; not available outside Colab)
    if not api_key:
        try:
            from google.colab import userdata
            api_key = userdata.get('OPENAI_API_KEY')
        except Exception:
            # Covers a missing google.colab package as well as a missing or
            # inaccessible secret, while still letting KeyboardInterrupt and
            # SystemExit propagate.
            api_key = None

    if not api_key:
        raise ValueError("OpenAI API key not found. Please set OPENAI_API_KEY environment variable or add it to Colab secrets.")

    client = OpenAI(api_key=api_key)

    conversation = [{
        'role': 'user',
        "content": [
            {"type": "text", "text": agent_prompt},
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                }
            }
        ]
    }]

    # Only the API round trip is timed.
    start_time = time.time()
    completion = client.chat.completions.create(
        model=model_id,
        messages=conversation
    )
    elapsed_time = time.time() - start_time

    response = completion.choices[0].message.content

    # Each request has a fixed cost; model IDs containing "ft" are treated as
    # fine-tuned models and use their own constant.
    if 'ft' in model_id:  # fine-tuned model
        cost = round(0.00199375, 6)
    else:  # base model
        cost = round(0.001356756756757, 6)

    return {"model_response": response, "time": round(elapsed_time, 10), "cost": cost}

## Image-RAG Implementation

In [None]:
# Configure logging
# INFO level is requested for the root logger, so logger.debug(...) calls in the
# Image-RAG classes are not shown unless the level is lowered.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the Image-RAG classes defined in this cell.
logger = logging.getLogger(__name__)

@dataclass
class ImageData:
    """Data structure to hold image information.

    One record per image stored in the vector database; the records are kept
    as metadata alongside the FAISS index and pickled by VectorDatabase.save.
    """
    # URL the image was downloaded from when the database was built.
    image_url: str
    # Class label taken from the training data for this image.
    category: str
    # CLIP embedding of the image; None until one has been computed.
    embedding: Optional[np.ndarray] = None
    # Position in the vector database; assigned by VectorDatabase.add_embeddings.
    index: Optional[int] = None

class CLIPEmbeddingGenerator:
    """Handles CLIP model loading and embedding generation"""

    def __init__(self, model_name: str = "ViT-B/32", device: Optional[str] = None):
        """
        Initialize CLIP model

        Args:
            model_name: CLIP model variant to use
            device: Device to run model on (auto-detected if None)
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Loading CLIP model {model_name} on {self.device}")

        self.model, self.preprocess = clip.load(model_name, device=self.device)
        self.model.eval()

    def _embed_image(self, image) -> np.ndarray:
        """Run an already-loaded RGB PIL image through CLIP and return its unit-length embedding as a flat array."""
        image_tensor = self.preprocess(image).unsqueeze(0).to(self.device)

        with torch.no_grad():
            embedding = self.model.encode_image(image_tensor)
            embedding = embedding / embedding.norm(dim=-1, keepdim=True)  # Normalize

        return embedding.cpu().numpy().flatten()

    def encode_image_from_url(self, image_url: str) -> np.ndarray:
        """
        Download image from URL and generate CLIP embedding

        Args:
            image_url: URL of the image to encode

        Returns:
            Normalized embedding vector

        Raises:
            requests.exceptions.RequestException: On network / HTTP errors
            ValueError: If the response is not an image or has empty dimensions
        """
        try:
            # Download image with more robust settings
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
            # The response is used as a context manager so the streamed
            # connection is released even when a check below raises.
            with requests.get(image_url, timeout=60, headers=headers, stream=True) as response:
                response.raise_for_status()

                # Check content type
                content_type = response.headers.get('content-type', '')
                if not content_type.startswith('image/'):
                    raise ValueError(f"Invalid content type: {content_type}")

                # convert() fully decodes the image, so nothing from the
                # response is needed after this block.
                image = Image.open(BytesIO(response.content)).convert('RGB')

            # Verify image is valid
            if image.size[0] == 0 or image.size[1] == 0:
                raise ValueError("Invalid image dimensions")

            return self._embed_image(image)

        except requests.exceptions.RequestException as e:
            logger.error(f"Network error for {image_url}: {str(e)}")
            raise
        except Exception as e:
            logger.error(f"Error processing image {image_url}: {str(e)}")
            raise

    def encode_image_from_path(self, image_path: str) -> np.ndarray:
        """
        Load local image and generate CLIP embedding

        Args:
            image_path: Path to local image file

        Returns:
            Normalized embedding vector
        """
        try:
            # Close the file handle as soon as the pixels have been decoded.
            with Image.open(image_path) as source_image:
                image = source_image.convert('RGB')

            return self._embed_image(image)

        except Exception as e:
            logger.error(f"Error processing local image {image_path}: {str(e)}")
            raise

class VectorDatabase:
    """FAISS-based vector database for storing and retrieving image embeddings"""

    def __init__(self, embedding_dim: int = 512):
        """
        Initialize vector database

        Args:
            embedding_dim: Dimension of embedding vectors
        """
        self.embedding_dim = embedding_dim
        self.index = faiss.IndexFlatIP(embedding_dim)  # Inner product for normalized vectors
        self.image_data: List["ImageData"] = []

    def add_embeddings(self, embeddings: np.ndarray, image_data_list: List["ImageData"]):
        """
        Add embeddings to the database

        Args:
            embeddings: Array of embeddings to add, one row per image
            image_data_list: Corresponding image data, in the same order

        Raises:
            ValueError: If the number of rows does not match the number of
                metadata records, or the vectors have the wrong dimension.
        """
        embeddings = np.asarray(embeddings, dtype=np.float32)
        if embeddings.size == 0 and len(image_data_list) == 0:
            return  # nothing to add
        embeddings = np.atleast_2d(embeddings)

        # Index rows and metadata are matched purely by position, so a length
        # mismatch would silently attach categories to the wrong vectors.
        if embeddings.shape[0] != len(image_data_list):
            raise ValueError(
                f"Got {embeddings.shape[0]} embeddings but {len(image_data_list)} image records"
            )
        if embeddings.shape[1] != self.embedding_dim:
            raise ValueError(
                f"Expected embeddings of dimension {self.embedding_dim}, got {embeddings.shape[1]}"
            )

        # Ensure embeddings are normalized; all-zero rows are left as zeros
        # instead of becoming NaN through a division by zero.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        embeddings = np.ascontiguousarray(embeddings / norms, dtype=np.float32)

        # Add to FAISS index
        start_idx = len(self.image_data)
        self.index.add(embeddings)

        # Store metadata
        for offset, img_data in enumerate(image_data_list):
            img_data.index = start_idx + offset
            self.image_data.append(img_data)

    def search(self, query_embedding: np.ndarray, k: int = 5) -> List[Tuple["ImageData", float]]:
        """
        Search for similar images

        Args:
            query_embedding: Query embedding vector
            k: Number of results to return

        Returns:
            List of (ImageData, similarity_score) tuples; shorter than k when
            the index holds fewer than k vectors

        Raises:
            ValueError: If the query embedding has zero norm.
        """
        query_embedding = np.asarray(query_embedding, dtype=np.float32).reshape(1, -1)

        # Normalize query embedding
        query_norm = np.linalg.norm(query_embedding)
        if query_norm == 0:
            raise ValueError("Query embedding has zero norm and cannot be normalized")
        query_embedding = query_embedding / query_norm

        # Search
        similarities, indices = self.index.search(query_embedding, k)

        results = []
        for sim, idx in zip(similarities[0], indices[0]):
            # FAISS pads missing results with label -1; without the lower bound
            # that would index image_data[-1] and return a bogus match.
            if 0 <= idx < len(self.image_data):
                results.append((self.image_data[int(idx)], float(sim)))

        return results

    def save(self, filepath: str):
        """Save the vector database to disk as `<filepath>.faiss` (index) and `<filepath>.pkl` (metadata)"""
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

        # Save FAISS index
        faiss.write_index(self.index, f"{filepath}.faiss")

        # Save metadata
        with open(f"{filepath}.pkl", 'wb') as f:
            pickle.dump({
                'image_data': self.image_data,
                'embedding_dim': self.embedding_dim
            }, f)

        logger.info(f"Vector database saved to {filepath}")

    def load(self, filepath: str):
        """Load the vector database from disk (`<filepath>.faiss` and `<filepath>.pkl`)"""
        # Load FAISS index
        self.index = faiss.read_index(f"{filepath}.faiss")

        # Load metadata
        # SECURITY: pickle.load can execute arbitrary code; only load database
        # files that were produced by save() from a trusted source.
        with open(f"{filepath}.pkl", 'rb') as f:
            data = pickle.load(f)
        self.image_data = data['image_data']
        self.embedding_dim = data['embedding_dim']

        logger.info(f"Vector database loaded from {filepath}")

class PlantDiseaseRAG:
    """Main RAG pipeline for plant disease detection

    Combines a CLIPEmbeddingGenerator (image -> embedding) with a
    VectorDatabase (embedding -> nearest labelled images) to suggest disease
    categories for a query image.
    """

    # Labels looked for, in this order, when an assistant reply is not valid JSON.
    _FALLBACK_CATEGORIES = ('healthy', 'black-rot', 'rust', 'scab')

    def __init__(self, clip_model: str = "ViT-B/32", device: Optional[str] = None):
        """
        Initialize the RAG pipeline

        Args:
            clip_model: CLIP model variant to use
            device: Device to run model on
        """
        self.clip_generator = CLIPEmbeddingGenerator(clip_model, device)
        # Size the index from the loaded model so variants other than ViT-B/32
        # also work; fall back to 512 (the ViT-B/32 size) if the model does not
        # expose its output dimension.
        visual = getattr(self.clip_generator.model, "visual", None)
        embedding_dim = int(getattr(visual, "output_dim", 512))
        self.vector_db = VectorDatabase(embedding_dim=embedding_dim)

    @staticmethod
    def _category_from_text(text):
        """Return the category named in an assistant reply (JSON first, keyword match second), or None."""
        try:
            # Parse the JSON response from assistant
            return json.loads(text).get('category')
        except (ValueError, TypeError, AttributeError):
            # Not JSON (or not a JSON object): try to extract category directly
            lowered = text.lower()
            for label in PlantDiseaseRAG._FALLBACK_CATEGORIES:
                if label in lowered:
                    return label
            return None

    @staticmethod
    def _extract_image_category_pairs(raw_data):
        """Turn the list of training conversations into [{'image': url, 'category': label}, ...]."""
        pairs = []
        for conversation in raw_data:
            try:
                # Find user and assistant messages (the last of each role wins)
                user_message = None
                assistant_message = None
                for msg in conversation.get('messages', []):
                    role = msg.get('role')
                    if role == 'user':
                        user_message = msg
                    elif role == 'assistant':
                        assistant_message = msg

                if not user_message or not assistant_message:
                    continue

                # Extract image URL from user message
                image_url = None
                for content in user_message.get('content', []):
                    if content.get('type') == 'image':
                        image_url = content.get('image')
                        break

                # Extract category from the first text part of the assistant message
                category = None
                for content in assistant_message.get('content', []):
                    if content.get('type') == 'text':
                        category = PlantDiseaseRAG._category_from_text(content.get('text', ''))
                        break

                # Keep the pair only if both image and category were found
                if image_url and category:
                    pairs.append({'image': image_url, 'category': category})

            except Exception as e:
                logger.warning(f"Failed to parse conversation: {str(e)}")
                continue

        return pairs

    def build_database_from_json(self, json_filepath: str, batch_size: int = 32):
        """
        Build vector database from training JSON file

        Args:
            json_filepath: Path to the training JSON file
            batch_size: Number of images to process in each batch (only
                controls the granularity of the progress bar)

        Raises:
            ValueError: If batch_size is smaller than 1 or no image could be
                embedded.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {batch_size}")

        logger.info(f"Loading training data from {json_filepath}")

        with open(json_filepath, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)

        # Parse the complex JSON structure to extract image-category pairs
        training_data = self._extract_image_category_pairs(raw_data)
        logger.info(f"Extracted {len(training_data)} image-category pairs from {len(raw_data)} conversations")

        # Test first few URLs to diagnose issues
        logger.info("Testing first 3 URLs for connectivity...")
        for position, item in enumerate(training_data[:3], start=1):
            try:
                image_url = item.get('image', 'No URL found')
                logger.info(f"Testing URL {position}: {image_url}")

                response = requests.head(image_url, timeout=10)
                logger.info(f"  Status: {response.status_code}")
                logger.info(f"  Content-Type: {response.headers.get('content-type', 'Unknown')}")

            except Exception as e:
                logger.error(f"  Failed to connect: {str(e)}")

        # Process images in batches
        all_embeddings = []
        all_image_data = []
        failed_images = []

        for start in tqdm(range(0, len(training_data), batch_size), desc="Processing batches"):
            for item in training_data[start:start + batch_size]:
                image_url = item['image']
                category = item['category']

                try:
                    # Generate embedding
                    embedding = self.clip_generator.encode_image_from_url(image_url)
                except Exception as e:
                    # A single unreachable or broken image must not abort the build.
                    logger.debug(f"Failed to process {image_url}: {str(e)}")
                    failed_images.append(image_url)
                    continue

                # Store data
                all_embeddings.append(embedding)
                all_image_data.append(ImageData(
                    image_url=image_url,
                    category=category,
                    embedding=embedding
                ))

                # Log progress every 50 successful images
                if len(all_embeddings) % 50 == 0:
                    logger.info(f"Successfully processed {len(all_embeddings)} images")

        # Log results
        logger.info("Processing complete:")
        logger.info(f"  Successful: {len(all_embeddings)} images")
        logger.info(f"  Failed: {len(failed_images)} images")

        if failed_images:
            logger.warning(f"Failed to process {len(failed_images)} images")
            # Log first few failed URLs for debugging
            logger.info("First 5 failed URLs:")
            for url in failed_images[:5]:
                logger.info(f"  {url}")

        # Add to vector database
        if not all_embeddings:
            logger.error("No images were successfully processed!")
            logger.error("Common issues:")
            logger.error("1. Check internet connectivity")
            logger.error("2. Verify JSON file format")
            logger.error("3. Check if image URLs are accessible")
            logger.error("4. Verify Google Storage bucket permissions")
            raise ValueError("No images were successfully processed")

        embeddings_array = np.array(all_embeddings)
        self.vector_db.add_embeddings(embeddings_array, all_image_data)
        logger.info(f"Successfully added {len(all_embeddings)} images to vector database")

    def query_similar_images(self, image_input: str, k: int = 5) -> List[Dict]:
        """
        Find similar images for a query image

        Args:
            image_input: Either URL or local file path of query image
            k: Number of similar images to retrieve

        Returns:
            List of similar image information with categories and similarity scores
        """
        try:
            # Generate embedding for query image
            if image_input.startswith(('http://', 'https://')):
                query_embedding = self.clip_generator.encode_image_from_url(image_input)
            else:
                query_embedding = self.clip_generator.encode_image_from_path(image_input)

            # Search for similar images and format the hits
            return [
                {
                    'image_url': img_data.image_url,
                    'category': img_data.category,
                    'similarity_score': similarity,
                    'confidence': float(similarity)  # Normalized similarity as confidence
                }
                for img_data, similarity in self.vector_db.search(query_embedding, k)
            ]

        except Exception as e:
            logger.error(f"Error querying similar images: {str(e)}")
            raise

    def get_category_suggestions(self, image_input: str, k: int = 10) -> Dict[str, Union[List[Dict[str, Union[str, float]]], float]]:
        """
        Get category suggestions based on similar images

        Args:
            image_input: Either URL or local file path of query image
            k: Number of similar images to consider

        Returns:
            Dict with keys "response" (list of {"category", "confidence"}
            dicts sorted by descending confidence), "time" (seconds spent
            retrieving the similar images) and "cost" (derived from that time)
        """
        # Only the retrieval step is timed.
        start_time = time.time()
        similar_images = self.query_similar_images(image_input, k)
        elapsed_time = time.time() - start_time

        # Aggregate categories by weighted similarity
        category_scores = {}
        total_weight = 0

        for result in similar_images:
            weight = result['similarity_score']
            category_scores[result['category']] = category_scores.get(result['category'], 0) + weight
            total_weight += weight

        # Normalize scores so they sum to 1 (skipped when there is no positive weight)
        if total_weight > 0:
            for category in category_scores:
                category_scores[category] /= total_weight

        # Sort by confidence and format as list of dictionaries
        formatted_results = [
            {"category": category, "confidence": confidence}
            for category, confidence in sorted(category_scores.items(), key=lambda pair: pair[1], reverse=True)
        ]

        return {"response": formatted_results, "time": round(elapsed_time, 10), "cost": round(((elapsed_time / 3600) * 6.86) * 0.1325, 6)}

    def save_database(self, filepath: str):
        """Save the vector database"""
        self.vector_db.save(filepath)

    def load_database(self, filepath: str):
        """Load a pre-built vector database"""
        self.vector_db.load(filepath)


# Testing
# # Initialize the RAG system
# rag_system = PlantDiseaseRAG()

# # Build RAG
# # rag_system.build_database_from_json(absolute_path + "Datasets/train_set_256.json")
# # rag_system.save_database(absolute_path + 'vector_db/plant_disease_vectordb')

# # Load the pre-built database
# rag_system.load_database(absolute_path + "vector_db/plant_disease_vectordb")

# # Query image URL
# query_image = "https://storage.googleapis.com/kroumeliotis-image-bucket/plant-disease-detection/Apple/256/healthy/healthy-810.JPG"
# query_image = "https://extension.okstate.edu/programs/digital-diagnostics/plant-diseases/site-files/cedar-apple-rust/car-apple9.jpg"
# query_image = "https://www.ces.ncsu.edu/wp-content/uploads/2021/07/Marssonina-1-768x1024.jpg"

# # Get category suggestions with confidence scores
# category_suggestions = rag_system.get_category_suggestions(query_image, k=10)
# print(category_suggestions)

## Prompt Handler

In [None]:
def PromptHandler(prompt_type, placeholder_dict):  # placeholder_dict can handle and replace multiple placeholders, just include them to the dict
  """
  Build a prompt from one of the predefined templates.

  Args:
      prompt_type: Template name without the "_prompt" suffix: "agent",
          "orchestrator", "orchestrator_case_3" or "agent_re_evaluation".
      placeholder_dict: Mapping of placeholder name -> replacement text for the
          `$name` markers in the template. May be None or empty; placeholders
          without a replacement are left in the text unchanged.

  Returns:
      The prompt text with all supplied placeholders substituted.

  Raises:
      ValueError: If prompt_type does not name a known template.
  """
  prompt_dict = {
    "agent_prompt": """Analyze the provided image of an apple leaf using your computer vision capabilities.
  Classify the leaf into the most appropriate category based on its condition, choosing from the predefined list:
  {
  "categories": [
    "black-rot",
    "healthy",
    "rust",
    "scab"
    ]
  }
  Provide your final classification in the following JSON format without explanations:
  {
    "category": "chosen_category_name",
    "reason": "brief_reason_for_this_classification",
    "confidence": "confidence_score_between_0_and_1"
  }""",
    "orchestrator_prompt": """You are acting as the orchestrator in an agentic AI system. Your role is to make an informed and well-reasoned final decision for a classification task, based on the outputs of multiple AI agents.
  Task:
  Classify an image of an apple plant leaf into the correct disease category.

  Context:
  You have received independent responses from two specialized AI agents. Each agent may use different modalities, models, or reasoning approaches. Your responsibility is to analyze, compare, and synthesize their outputs to make a final, justified classification decision.

  $agents_response

  Your Task as Orchestrator:
  Based on the agents' responses, provide your final classification decision.

  You must return your answer in the following JSON format:
  {
    "category": "chosen_category_name",
    "reason": "detailed explanation of your reasoning process",
    "confidence": "confidence_score_between_0_and_1"
  }

  Be sure your reasoning includes:
  - How you evaluated the agents’ outputs
  - Any conflicts or agreements you observed
  - The rationale behind your chosen decision""",
    "orchestrator_case_3_prompt": """You are the orchestrator in an agentic AI system.
The system's task is to classify an image of an apple plant leaf into the correct disease category.
The AI agents in this system have returned their classifications along with *self-confidence* scores for this task. However, we are unsure whether their classifications and confidence scores are reliable.
To address this, we conducted an independent evaluation of each agent’s calibration metrics, indicating how trustworthy their confidence scores are.

Think carefully and decide whether you trust the agents’ outputs, or whether they need to re-evaluate their responses using our Image-RAG.

- **Agents' responses**:
  $agents_response
- **Agents' independent evaluation**:
  $agents_confidence

If you trust them, return:
{
  "category": "chosen_category_name",
  "reason": "detailed explanation of your reasoning process",
  "confidence": "confidence_score_between_0_and_1"
}

If you do not trust one or both of them, return:
{
  "agent_1": "Please reconsider your selection, taking into account the Image-RAG. You are not required to change your selection if you are confident in it. However, if you do change your answer, please explain in detail why you changed your mind, providing reasoning based on the image.",
  "agent_2": "Please reconsider your selection, taking into account the Image-RAG. You are not required to change your selection if you are confident in it. However, if you do change your answer, please explain in detail why you changed your mind, providing reasoning based on the image."
}
""",
    "agent_re_evaluation_prompt": "Context from the previous conversation:\n$previous_context\n\nYour previous answer:\n$previous_response\n\nPlease reconsider your selection, taking into account the Image-RAG:\n$agentic_rag\n\nYou are not required to change your selection if you are confident in it. However, if you do change your answer, please explain in detail why you changed your mind, providing reasoning based on the image.\n\nCLASSIFICATION REQUIREMENTS:\n- You must select exactly ONE category: black-rot, healthy, rust, or scab\n- Confidence must be a decimal number between 0.0 and 1.0\n- Provide a clear, detailed reason for your classification\n\nYour response should end with this exact JSON format (complete and valid):\n\n{\n  \"category\": \"black-rot\",\n  \"reason\": \"The leaf shows characteristic dark spots with concentric rings typical of black rot disease\",\n  \"confidence\": 0.87\n}\n\nReplace the values above with your actual classification. Ensure the JSON is complete with all closing braces and quotes."
  }

  # Get the template string and wrap it with string.Template
  raw_template = prompt_dict.get(f"{prompt_type}_prompt")
  if not raw_template:
      raise ValueError(f"Invalid prompt type: {prompt_type}")
  template = string.Template(raw_template)
  # safe_substitute leaves unknown $placeholders in place instead of raising;
  # None is treated as "no placeholders supplied".
  final_prompt = template.safe_substitute(placeholder_dict or {})
  return final_prompt

## Orchestration Layer

In [None]:
def OrchestrationLayer(prompt, query_image, model):
  # print("Starting inference...")
  match model:
    case "orchestrator":  # Orchestrator: GPT Reasoning Model
      print("Using GPT Reasoning Model as an Orchestrator...")
      result = GPTOrchestrator(prompt, 'o3-mini-2025-01-31')
    case "gpt_zero_shot":  # Agent 1: GPT OpenAI Base Model | Zero-Shot
      print("Using GPT OpenAI Base Model | Zero-Shot...")
      result = GPTAgent(prompt, 'gpt-4o', query_image)
    case "qwen_zero_shot":  # Agent 2: Qwen Unsloth Base Model | Zero-Shot
      print("Using Qwen Unsloth Base Model | Zero-Shot...")
      result = QwenAgent(prompt, 'base', query_image, '')
    case "gpt_few_shot":  # Agent 3: GPT Fine-Tuned Model | Few-Shot
      print("Using GPT Fine-Tuned Model | Few-Shot...")
      result = GPTAgent(prompt, "ft:gpt-4o-2024-08-06:personal:gpt-resolution-256:BCjDLKKo", query_image)
    case "qwen_few_shot":  # Agent 4: Qwen Fine-Tuned Model | Few-Shot
      print("Using Qwen Fine-Tuned Model | Few-Shot...")
      drive_path = "/content/gdrive/My Drive/Projects/VL-Models/Results/Qwen2.5-VL-7B-Instruct-bnb-4bit-4/"  # ToDO! Change path
      result = QwenAgent(prompt, "finetuned", query_image, drive_path + "best_model")
    case "orchestrator_case3":  # Orchestrator: GPT Reasoning Model
      print("Using GPT Reasoning Model as an Orchestrator...")
      result = GPTOrchestrator(prompt, 'o3-mini-2025-01-31')
    case "agentic_rag": # RAG
      print("Using RAG...")
      rag_system = PlantDiseaseRAG()  # Initialize the RAG system
      rag_system.load_database(absolute_path + "vector_db/plant_disease_vectordb")  # Load the pre-built database
      result = rag_system.get_category_suggestions(query_image, k=10)
    case _:
        print("Missing model")

  return result

### ExecutionHelper

In [None]:
# Ensure that the model_response has the correct format
#   {"model_response": response, "time": round(elapsed_time, 10), "cost": round(((elapsed_time / 3600) * 6.86) * 0.1325, 6)}
# Category labels a parsed response must use; ExecutionHelper treats any other
# "category" value as an invalid response.
ALLOWED_CATEGORIES = {"black-rot", "healthy", "rust", "scab"}

def parse_json_from_response(response):
    """Extract the first JSON object contained in a model response.

    Args:
        response: Either an already-parsed ``dict`` (returned unchanged) or a string
            that may wrap a JSON object in a Markdown code fence and/or free text.

    Returns:
        The first JSON object found, as a ``dict``, or ``None`` when ``response`` is
        neither a dict nor a string, or when no parsable JSON object is present.
    """
    if isinstance(response, dict):
        return response

    if not isinstance(response, str):
        return None

    # Remove a surrounding Markdown code fence (``` or ```json) if present
    cleaned = response.strip()
    if cleaned.startswith("```"):
        cleaned = re.sub(r"^```(?:json)?", "", cleaned, flags=re.IGNORECASE).strip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3].strip()

    # Try to decode an object at every '{'. Unlike a greedy `\{.*\}` regex, this does not
    # fail when extra text containing braces follows (or precedes) the JSON object.
    decoder = json.JSONDecoder()
    start = cleaned.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(cleaned, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = cleaned.find("{", start + 1)

    return None

def ExecutionHelper(prompt, query_image, model, max_retries=10):
    """Call OrchestrationLayer for ``model`` and normalise its answer.

    Args:
        prompt: Prompt text forwarded to OrchestrationLayer.
        query_image: Image reference forwarded to OrchestrationLayer.
        model: Model key understood by OrchestrationLayer.
        max_retries: Maximum number of attempts for the validated (default) path.

    Returns:
        * ``model == 'agentic_rag'``: the OrchestrationLayer result, unchanged.
        * ``model == 'orchestrator_case3'``: the parsed JSON dict of the model response,
          or ``None`` if it cannot be parsed (single attempt, no validation).
        * otherwise: a dict with ``category``, ``reason``, ``confidence`` (float),
          ``time`` and ``cost``. If no attempt yields a valid response, a fallback
          dict with category ``"unknown"`` and zeroed numbers is returned.
    """
    required_keys = ['category', 'reason', 'confidence']

    if model == 'agentic_rag':
        return OrchestrationLayer(prompt, query_image, model)

    if model == 'orchestrator_case3':
        full_response = OrchestrationLayer(prompt, query_image, model)
        return parse_json_from_response(full_response["model_response"])

    for attempt in range(max_retries):
        full_response = OrchestrationLayer(prompt, query_image, model)
        model_raw_response = full_response["model_response"]
        # Named elapsed_time so the imported `time` module is not shadowed
        elapsed_time = full_response["time"]
        cost = full_response["cost"]

        parsed = parse_json_from_response(model_raw_response)

        if isinstance(parsed, dict) and all(k in parsed for k in required_keys):
            category = parsed['category']
            try:
                confidence = float(parsed['confidence'])
            except (TypeError, ValueError):
                # A non-numeric confidence (e.g. "high" or null) is an invalid response:
                # retry instead of crashing the whole run.
                confidence = None

            # isinstance guard: an unhashable category (list/dict) would raise on `in`
            if (
                confidence is not None
                and isinstance(category, str)
                and category in ALLOWED_CATEGORIES
            ):
                return {
                    "category": category,
                    "reason": parsed["reason"],
                    "confidence": confidence,
                    "time": elapsed_time,
                    "cost": cost,
                }

        print(f"[Retry {attempt + 1}] Invalid or incomplete response from {model}: {full_response}")

    return {
        "category": "unknown",
        "reason": "Failed after retries",
        "confidence": 0.0,
        "time": 0.0,
        "cost": 0.0
    }

## **Experiment 1:** Zero-Shot Agentic Classification with Confidence-Aware Orchestration - **Execution Layer**


In [None]:
def InitializeDB(df):
    """Add any missing result columns to ``df`` (in place) and return it.

    Columns whose name ends in ``_time`` or ``_cost`` are created with the float 0.0;
    every other missing column is created with an empty string. Columns that already
    exist are left untouched.
    """
    required_columns = [
        'RAG', 'RAG_response', 'RAG_confidence', 'RAG_time', 'RAG_cost',
        'Qwen', 'Qwen_reason', 'Qwen_confidence', 'Qwen_time', 'Qwen_cost',
        'GPT', 'GPT_reason', 'GPT_confidence', 'GPT_time', 'GPT_cost',
        'Orchestrator', 'Orchestrator_confidence', 'Orchestrator_reason', 'Orchestrator_time', 'Orchestrator_cost'
    ]

    for col in required_columns:
        if col in df.columns:
            continue
        # `'time' or 'cost' in col` is always truthy (it parses as `'time' or (...)`),
        # which made every column numeric; test the column name explicitly instead.
        if col.endswith(('_time', '_cost')):
            df[col] = 0.0  # Float for time / cost measurements
        else:
            df[col] = ''   # Empty string for text fields
    return df

def SaveDB(df, idx, type, response):
    """Write one agent/RAG response into row ``idx`` of ``df`` (in place) and return ``df``.

    Args:
        df: Results DataFrame.
        idx: Row label to write to.
        type: Column prefix. ``'RAG'`` selects the retrieval layout; any other value
            is treated as an agent/orchestrator prefix.
        response: For ``'RAG'`` a dict with ``response`` (list of hit dicts), ``time``
            and ``cost``; otherwise a dict that may hold ``category``, ``reason``,
            ``confidence``, ``time`` and ``cost``.
    """
    if type == 'RAG':
        hits = response['response']
        # An empty hit list would raise IndexError on hits[0]
        top_hit = hits[0] if hits else {}
        df.at[idx, type] = top_hit.get('category', '')
        df.at[idx, type + '_confidence'] = top_hit.get('confidence', '')
        # Store the hit list as JSON text: a raw list is not a scalar cell value and
        # does not round-trip through CSV as valid JSON.
        df.at[idx, type + '_response'] = json.dumps(hits)
        df.at[idx, type + '_time'] = response['time']
        df.at[idx, type + '_cost'] = response['cost']
    else:
        df.at[idx, type] = response.get('category', '')
        df.at[idx, type + '_reason'] = response.get('reason', '')
        df.at[idx, type + '_confidence'] = response.get('confidence', '')
        df.at[idx, type + '_time'] = response.get('time', 0.0)
        df.at[idx, type + '_cost'] = response.get('cost', 0.0)
    return df

def ExecutionLayer():
  """Run Experiment 1: zero-shot Qwen and GPT agents followed by the orchestrator.

  Reads ``Datasets/test_set_256.csv`` under ``absolute_path``, keeps the ``id``,
  ``Image`` and ``Category`` columns and, for every row, queries both zero-shot
  agents, asks the orchestrator for a final decision based on their answers, and
  stores all three responses in the DataFrame. The CSV under ``Results/Case1`` is
  rewritten after each row.
  """
  # Load and filter dataset
  df = pd.read_csv(absolute_path + 'Datasets/test_set_256.csv')
  df = df[['id', 'Image', 'Category']]  # Keep only specific columns

  for idx, row in df.iterrows():
    query_image = row['Image']

    # ====== Run AI Agents ======

    # Call the Qwen zero-shot agent
    agent_1_response = ExecutionHelper(PromptHandler('agent', {}), query_image, 'qwen_zero_shot')
    SaveDB(df, idx,'Qwen', agent_1_response)
    print(agent_1_response)

    # Call the GPT zero-shot agent
    agent_2_response = ExecutionHelper(PromptHandler('agent', {}), query_image, 'gpt_zero_shot')
    SaveDB(df, idx, 'GPT', agent_2_response)
    print(agent_2_response)

    # ====== Orchestration ======

    # Prepare orchestrator prompt with both agent responses.
    # Each field is comma-separated so the embedded blocks are well-formed JSON
    # (the comma after "reason" was previously missing).
    prompt = PromptHandler('orchestrator', {
        "agents_response": f"""AI Agent 1 Response:
    {{
      "category": {json.dumps(agent_1_response['category'])},
      "reason": {json.dumps(agent_1_response['reason'])},
      "confidence": {json.dumps(agent_1_response['confidence'])}
    }}

    AI Agent 2 Response:
    {{
      "category": {json.dumps(agent_2_response['category'])},
      "reason": {json.dumps(agent_2_response['reason'])},
      "confidence": {json.dumps(agent_2_response['confidence'])}
    }}"""
    })

    # Call the orchestrator to make the final decision
    orchestrator_response = ExecutionHelper(prompt, query_image, 'orchestrator')
    SaveDB(df, idx, 'Orchestrator', orchestrator_response)
    print(orchestrator_response)

    # Checkpoint: rewrite the results CSV after every processed row
    df.to_csv(absolute_path + 'Results/Case1/test_set_zero_shot_classification.csv', index=False)

# Execute the Experiment 1 (zero-shot) classification pipeline.
# Runs at cell execution time: calls the agents for every test row and writes the Case1 results CSV.
ExecutionLayer()

Using Qwen Unsloth Base Model | Zero-Shot...
{'category': 'scab', 'reason': 'The presence of brown spots and scarring on the leaf surface is characteristic of scab disease.', 'confidence': 0.95, 'time': 4.0999331474, 'cost': 0.001035}
Using GPT OpenAI Base Model | Zero-Shot...
{'category': 'scab', 'reason': 'presence of dark, scabby spots on the leaf surface', 'confidence': 0.85, 'time': 2.0071856976, 'cost': 0.000507}
Using GPT Reasoning Model as an Orchestrator...
{'category': 'scab', 'reason': 'Both AI Agent 1 and AI Agent 2 identified the disease as scab based on the presence of dark, scabby spots and characteristic brown spots with scarring on the leaf surface. Given the strong agreement between both agents, and their high confidence levels (0.95 and 0.85), the evidence clearly points to scab disease. This consensus, along with the corroborating descriptions, supports a reliable classification.', 'confidence': 0.93, 'time': 3.1174688339, 'cost': 0.000787}
Using Qwen Unsloth Base M

## **Experiment 2:** Fine-Tuned Agentic Models with Confidence-Aware Orchestration - **Execution Layer**


In [None]:
def InitializeDB(df):
    """Add any missing result columns to ``df`` (in place) and return it.

    Columns whose name ends in ``_time`` or ``_cost`` are created with the float 0.0;
    every other missing column is created with an empty string. Columns that already
    exist are left untouched.
    """
    required_columns = [
        'RAG', 'RAG_response', 'RAG_confidence', 'RAG_time', 'RAG_cost',
        'Qwen', 'Qwen_reason', 'Qwen_confidence', 'Qwen_time', 'Qwen_cost',
        'GPT', 'GPT_reason', 'GPT_confidence', 'GPT_time', 'GPT_cost',
        'Orchestrator', 'Orchestrator_confidence', 'Orchestrator_reason', 'Orchestrator_time', 'Orchestrator_cost'
    ]

    for col in required_columns:
        if col in df.columns:
            continue
        # `'time' or 'cost' in col` is always truthy (it parses as `'time' or (...)`),
        # which made every column numeric; test the column name explicitly instead.
        if col.endswith(('_time', '_cost')):
            df[col] = 0.0  # Float for time / cost measurements
        else:
            df[col] = ''   # Empty string for text fields
    return df

def SaveDB(df, idx, type, response):
    """Write one agent/RAG response into row ``idx`` of ``df`` (in place) and return ``df``.

    Args:
        df: Results DataFrame.
        idx: Row label to write to.
        type: Column prefix. ``'RAG'`` selects the retrieval layout; any other value
            is treated as an agent/orchestrator prefix.
        response: For ``'RAG'`` a dict with ``response`` (list of hit dicts), ``time``
            and ``cost``; otherwise a dict that may hold ``category``, ``reason``,
            ``confidence``, ``time`` and ``cost``.
    """
    if type == 'RAG':
        hits = response['response']
        # An empty hit list would raise IndexError on hits[0]
        top_hit = hits[0] if hits else {}
        df.at[idx, type] = top_hit.get('category', '')
        df.at[idx, type + '_confidence'] = top_hit.get('confidence', '')
        # Store the hit list as JSON text: a raw list is not a scalar cell value and
        # does not round-trip through CSV as valid JSON.
        df.at[idx, type + '_response'] = json.dumps(hits)
        df.at[idx, type + '_time'] = response['time']
        df.at[idx, type + '_cost'] = response['cost']
    else:
        df.at[idx, type] = response.get('category', '')
        df.at[idx, type + '_reason'] = response.get('reason', '')
        df.at[idx, type + '_confidence'] = response.get('confidence', '')
        df.at[idx, type + '_time'] = response.get('time', 0.0)
        df.at[idx, type + '_cost'] = response.get('cost', 0.0)
    return df

def ExecutionLayer():
  """Run Experiment 2: fine-tuned Qwen and GPT agents followed by the orchestrator.

  Reads ``Datasets/test_set_256.csv`` under ``absolute_path``, keeps the ``id``,
  ``Image`` and ``Category`` columns and, for every row, queries both fine-tuned
  agents, asks the orchestrator for a final decision based on their answers, and
  stores all three responses in the DataFrame. The CSV under ``Results/Case2`` is
  rewritten after each row.
  """
  # Load and filter dataset
  df = pd.read_csv(absolute_path + 'Datasets/test_set_256.csv')
  df = df[['id', 'Image', 'Category']]  # Keep only specific columns

  for idx, row in df.iterrows():
    query_image = row['Image']

    # ====== Run AI Agents ======

    # Call the Qwen agent fine-tuned for few-shot learning
    agent_1_response = ExecutionHelper(PromptHandler('agent', {}), query_image, 'qwen_few_shot')
    SaveDB(df, idx,'Qwen', agent_1_response)
    print(agent_1_response)

    # Call the GPT agent fine-tuned for few-shot learning
    agent_2_response = ExecutionHelper(PromptHandler('agent', {}), query_image, 'gpt_few_shot')
    SaveDB(df, idx, 'GPT', agent_2_response)
    print(agent_2_response)

    # ====== Orchestration ======

    # Prepare orchestrator prompt with both agent responses.
    # Each field is comma-separated so the embedded blocks are well-formed JSON
    # (the comma after "reason" was previously missing).
    prompt = PromptHandler('orchestrator', {
        "agents_response": f"""AI Agent 1 Response:
    {{
      "category": {json.dumps(agent_1_response['category'])},
      "reason": {json.dumps(agent_1_response['reason'])},
      "confidence": {json.dumps(agent_1_response['confidence'])}
    }}

    AI Agent 2 Response:
    {{
      "category": {json.dumps(agent_2_response['category'])},
      "reason": {json.dumps(agent_2_response['reason'])},
      "confidence": {json.dumps(agent_2_response['confidence'])}
    }}"""
    })

    # Call the orchestrator to make the final decision
    orchestrator_response = ExecutionHelper(prompt, query_image, 'orchestrator')
    SaveDB(df, idx, 'Orchestrator', orchestrator_response)
    print(orchestrator_response)

    # Checkpoint: rewrite the results CSV after every processed row
    df.to_csv(absolute_path + 'Results/Case2/test_set_few_shot_classification.csv', index=False)

# Execute the Experiment 2 (fine-tuned agents) classification pipeline.
# Runs at cell execution time: calls the agents for every test row and writes the Case2 results CSV.
ExecutionLayer()

Using Qwen Fine-Tuned Model | Few-Shot...
==((====))==  Unsloth 2025.6.2: Fast Qwen2_5_Vl patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.97G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/575 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/5.80k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

{'category': 'rust', 'reason': 'characteristic brown spots and discoloration indicate rust', 'confidence': 0.95, 'time': 38.3160688877, 'cost': 0.009674}
Using GPT Fine-Tuned Model | Few-Shot...
{'category': 'rust', 'reason': 'Presence of brownish spots which are typical of rust disease.', 'confidence': 0.95, 'time': 7.7808837891, 'cost': 0.001965}
Using GPT Reasoning Model as an Orchestrator...
{'category': 'rust', 'reason': 'Both AI Agent 1 and AI Agent 2 independently identified the image as representing rust, emphasizing the characteristic brown spots and discoloration typical of this disease. Their high and matching confidence levels (0.95) led to a clear consensus, reinforcing the final decision.', 'confidence': 0.95, 'time': 2.7448401451, 'cost': 0.000693}
Using Qwen Fine-Tuned Model | Few-Shot...
{'category': 'healthy', 'reason': 'The leaf appears to be without any visible signs of disease or damage.', 'confidence': 0.95, 'time': 5.7133493423, 'cost': 0.001443}
Using GPT Fine-T

## **Experiment 3:** Trust-Aware Orchestration with RAG and Re-Evaluation Loops - **Execution Layer**

In [None]:
def InitializeDB(df):
    """Add any missing result columns to ``df`` (in place) and return it.

    Columns whose name ends in ``_time`` or ``_cost`` are created with the float 0.0;
    every other missing column is created with an empty string. Columns that already
    exist are left untouched.
    """
    required_columns = [
        'RAG', 'RAG_response', 'RAG_confidence', 'RAG_time', 'RAG_cost',
        'Qwen', 'Qwen_reason', 'Qwen_confidence', 'Qwen_time', 'Qwen_cost',
        'GPT', 'GPT_reason', 'GPT_confidence', 'GPT_time', 'GPT_cost',
        'Orchestrator', 'Orchestrator_confidence', 'Orchestrator_reason', 'Orchestrator_time', 'Orchestrator_cost'
    ]

    for col in required_columns:
        if col in df.columns:
            continue
        # `'time' or 'cost' in col` is always truthy (it parses as `'time' or (...)`),
        # which made every column numeric; test the column name explicitly instead.
        if col.endswith(('_time', '_cost')):
            df[col] = 0.0  # Float for time / cost measurements
        else:
            df[col] = ''   # Empty string for text fields
    return df

def SaveDB(df, idx, type, response):
    """Write one response into row ``idx`` of ``df`` (in place) and return ``df``.

    Args:
        df: Results DataFrame.
        idx: Row label to write to.
        type: Column prefix selecting the layout:
            ``'RAG'`` stores the top hit's category/confidence, the full hit list
            as JSON text, and time/cost;
            ``'Orchestrator_Re_Evaluation'`` stores the orchestrator's per-agent
            messages in ``<type>_agent_1`` / ``<type>_agent_2``;
            any other value stores category, reason, confidence, time and cost.
        response: Dict holding the fields required by the selected layout.
    """
    if type == 'RAG':
        hits = response['response']
        # An empty hit list would raise IndexError on hits[0]
        top_hit = hits[0] if hits else {}
        df.at[idx, type] = top_hit.get('category', '')
        df.at[idx, type + '_confidence'] = top_hit.get('confidence', '')
        df.at[idx, type + '_response'] = json.dumps(hits)  # Safely serialize
        df.at[idx, type + '_time'] = response['time']
        df.at[idx, type + '_cost'] = response['cost']
    elif type == 'Orchestrator_Re_Evaluation':
        df.at[idx, type + '_agent_1'] = response.get('agent_1', '')
        # Previously read 'agent_1' again, so agent 2's message was never stored
        df.at[idx, type + '_agent_2'] = response.get('agent_2', '')
    else:
        df.at[idx, type] = response.get('category', '')
        df.at[idx, type + '_reason'] = response.get('reason', '')
        df.at[idx, type + '_confidence'] = response.get('confidence', '')
        df.at[idx, type + '_time'] = response.get('time', 0.0)
        df.at[idx, type + '_cost'] = response.get('cost', 0.0)
    return df

def ZeroShotPredictionTrainSet():
  """Collect zero-shot Qwen and GPT predictions for the training images.

  Reads ``Datasets/train_set_predictions.csv`` under ``absolute_path``, keeps the
  ``id``, ``Image`` and ``Category`` columns, builds each image URL from the row's
  category and file name, queries both zero-shot agents and stores their answers.
  The CSV under ``Results/Case3`` is rewritten after each row.
  """
  # Load and filter dataset.
  # NOTE(review): 'Datasets/train_set.csv' used to be read first and was immediately
  # overwritten by this file, so that dead read was dropped. Confirm that
  # train_set_predictions.csv (and not train_set.csv) is the intended input.
  df = pd.read_csv(absolute_path + 'Datasets/train_set_predictions.csv')
  df = df[['id', 'Image', 'Category']]  # Keep only specific columns

  for idx, row in df.iterrows():
    query_image = 'https://storage.googleapis.com/kroumeliotis-image-bucket/plant-disease-detection/Apple/256/' + row['Category'] + '/' + row['Image']

    # ====== Run AI Agents Train Set ======

    # Call the Qwen zero-shot agent
    agent_1_response = ExecutionHelper(PromptHandler('agent', {}), query_image, 'qwen_zero_shot')
    SaveDB(df, idx,'Qwen', agent_1_response)
    print(agent_1_response)

    # Call the GPT zero-shot agent
    agent_2_response = ExecutionHelper(PromptHandler('agent', {}), query_image, 'gpt_zero_shot')
    SaveDB(df, idx, 'GPT', agent_2_response)
    print(agent_2_response)

    # Checkpoint: rewrite the results CSV after every processed row
    df.to_csv(absolute_path + 'Results/Case3/train_set_predictions.csv', index=False)

# ZeroShotPredictionTrainSet()

def ExecutionLayer():
  """Run Experiment 3: RAG, zero-shot agents, trust-aware orchestration and re-evaluation.

  For every row of ``Datasets/test_set_256.csv`` (columns ``id``, ``Image``,
  ``Category``) this:

  1. queries the RAG layer and both zero-shot agents and stores their answers
     (agents under the ``Qwen_1`` / ``GPT_1`` prefixes);
  2. sends both answers plus hard-coded confidence-evaluation metrics to the
     ``orchestrator_case3`` model;
  3. if the orchestrator returns a non-empty ``agent_1`` or ``agent_2`` entry,
     stores those entries, re-runs both agents with the RAG result and their
     previous answer, and asks the plain orchestrator for the final decision;
     otherwise stores the orchestrator answer under the ``Orchestrator_1`` prefix.

  The CSV under ``Results/Case3`` is rewritten after each row.
  """
  # Load and filter dataset
  df = pd.read_csv(absolute_path + 'Datasets/test_set_256.csv')
  df = df[['id', 'Image', 'Category']]  # Keep only specific columns

  for idx, row in df.iterrows():
    query_image = row['Image']

    # Call the RAG
    rag_response = ExecutionHelper('', query_image, 'agentic_rag')
    SaveDB(df, idx, 'RAG', rag_response)
    print(rag_response)

    # ====== Run AI Agents ======

    # Call the Qwen zero-shot agent
    agent_1_response = ExecutionHelper(PromptHandler('agent', {}), query_image, 'qwen_zero_shot')
    SaveDB(df, idx,'Qwen_1', agent_1_response)
    print(agent_1_response)

    # Call the GPT zero-shot agent
    agent_2_response = ExecutionHelper(PromptHandler('agent', {}), query_image, 'gpt_zero_shot')
    SaveDB(df, idx, 'GPT_1', agent_2_response)
    print(agent_2_response)

    # ====== Orchestration ======

    # Prepare the orchestrator prompt using both agent responses and their confidence evaluations.
    # NOTE(review): the confidence-evaluation numbers are hard-coded; presumably they were
    # computed offline from the agents' train-set predictions - confirm their provenance.
    prompt = PromptHandler('orchestrator_case_3', {
        "agents_response": f"""AI Agent 1 Response:
        {{
          "category": {json.dumps(agent_1_response['category'])},
          "reason": {json.dumps(agent_1_response['reason'])},
          "Self-confidence": {json.dumps(agent_1_response['confidence'])}
        }}

        AI Agent 2 Response:
        {{
          "category": {json.dumps(agent_2_response['category'])},
          "reason": {json.dumps(agent_2_response['reason'])},
          "Self-confidence": {json.dumps(agent_2_response['confidence'])}
        }}""",

        "agents_confidence": f"""AI Agent 1 Confidence Evaluation:
        {{
          "Expected_Calibration_Error": 0.45298828125,
          "Confidence_Correctness_Correlation": 0.126206735861975,
          "Overconfidence_Rate": 0.5078125
        }}

        AI Agent 2 Confidence Evaluation:
        {{
          "Expected_Calibration_Error": 0.2933203125,
          "Confidence_Correctness_Correlation": 0.360955939572398,
          "Overconfidence_Rate": 0.416015625
        }}"""
    })

    # Call the orchestrator
    orchestrator_response = ExecutionHelper(prompt, query_image, 'orchestrator_case3')
    print(orchestrator_response)

    if not isinstance(orchestrator_response, dict):
      # ExecutionHelper returns None for 'orchestrator_case3' when the reply holds no
      # parsable JSON; fall back to an empty result instead of raising TypeError.
      print(f"[Warning] Unparsable orchestrator response for row {idx}; skipping re-evaluation.")
      orchestrator_response = {}

    # .get(): the keys may be absent when the orchestrator does not ask for a re-evaluation
    if orchestrator_response.get('agent_1') or orchestrator_response.get('agent_2'):
      SaveDB(df, idx, 'Orchestrator_Re_Evaluation', orchestrator_response)

      context = PromptHandler('agent', {})

      # ====== Re-evaluation Loop ======
      # Call the Qwen zero-shot agent
      agent_1_2_response = ExecutionHelper(PromptHandler('agent_re_evaluation', {"agentic_rag":json.dumps(rag_response['response']), "previous_context":context, "previous_response":json.dumps(agent_1_response)}), query_image, 'qwen_zero_shot')
      SaveDB(df, idx,'Qwen', agent_1_2_response)
      print(agent_1_2_response)

      # Call the GPT zero-shot agent
      agent_2_2_response = ExecutionHelper(PromptHandler('agent_re_evaluation', {"agentic_rag":json.dumps(rag_response['response']), "previous_context":context, "previous_response":json.dumps(agent_2_response)}), query_image, 'gpt_zero_shot')
      SaveDB(df, idx, 'GPT', agent_2_2_response)
      print(agent_2_2_response)

      # ====== Orchestration ======

      # Prepare the orchestrator prompt to handle re-evaluated responses
      # (fields are comma-separated so the embedded blocks are well-formed JSON)
      prompt = PromptHandler('orchestrator', {
          "agents_response": f"""AI Agent 1 Response:
      {{
        "category": {json.dumps(agent_1_2_response['category'])},
        "reason": {json.dumps(agent_1_2_response['reason'])},
        "confidence": {json.dumps(agent_1_2_response['confidence'])}
      }}

      AI Agent 2 Response:
      {{
        "category": {json.dumps(agent_2_2_response['category'])},
        "reason": {json.dumps(agent_2_2_response['reason'])},
        "confidence": {json.dumps(agent_2_2_response['confidence'])}
      }}"""
      })

      # Call the orchestrator to make the final decision
      orchestrator_response = ExecutionHelper(prompt, query_image, 'orchestrator')
      SaveDB(df, idx, 'Orchestrator', orchestrator_response)
      print(orchestrator_response)
    else:
      SaveDB(df, idx, 'Orchestrator_1', orchestrator_response)

    # Checkpoint: rewrite the results CSV after every processed row
    df.to_csv(absolute_path + 'Results/Case3/predictions.csv', index=False)

# Execute the Experiment 3 (RAG + re-evaluation) classification pipeline.
# Runs at cell execution time: processes every test row and writes the Case3 predictions CSV.
ExecutionLayer()

Using RAG...
{'response': [{'category': 'rust', 'confidence': 1.0}], 'time': 1.5415148735, 'cost': 0.000389}
Using Qwen Unsloth Base Model | Zero-Shot...
{'category': 'scab', 'reason': 'The presence of small, circular spots and discoloration is indicative of scab, a common disease affecting apple leaves.', 'confidence': 0.95, 'time': 4.8838040829, 'cost': 0.001233}
Using GPT OpenAI Base Model | Zero-Shot...
{'category': 'scab', 'reason': 'Presence of dark, irregular spots', 'confidence': 0.85, 'time': 3.064637661, 'cost': 0.001357}
Using GPT Reasoning Model as an Orchestrator...
{'agent_1': 'Please reconsider your selection, taking into account the Image-RAG. You are not required to change your selection if you are confident in it. However, if you do change your answer, please explain in detail why you changed your mind, providing reasoning based on the image.', 'agent_2': 'Please reconsider your selection, taking into account the Image-RAG. You are not required to change your selectio