<a href="https://colab.research.google.com/github/Fasal10/Ombrulla/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import sys
import json
import logging
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
import argparse

In [3]:
pip install cohere


Collecting cohere
  Downloading cohere-5.15.0-py3-none-any.whl.metadata (3.4 kB)
Collecting fastavro<2.0.0,>=1.9.4 (from cohere)
  Downloading fastavro-1.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.7 kB)
Collecting httpx-sse==0.4.0 (from cohere)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting types-requests<3.0.0,>=2.0.0 (from cohere)
  Downloading types_requests-2.32.0.20250515-py3-none-any.whl.metadata (2.1 kB)
Downloading cohere-5.15.0-py3-none-any.whl (259 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.5/259.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Downloading fastavro-1.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading types_requests-2.32.0.20250515-py3-none-any.whl (20 kB)
Inst

In [4]:
pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.145-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [5]:
import cv2
import numpy as np
from PIL import Image
import cohere
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [20]:
@dataclass
class DetectedObject:
    """Data class for detected objects"""
    name: str
    confidence: float
    bbox: Tuple[int, int, int, int]  # (x1, y1, x2, y2)


@dataclass
class VisionResult:
    """Data class for vision analysis results"""
    objects: List[DetectedObject]
    image_description: str


class VisionAnalyzer:
    """Computer vision module for object detection"""

    def __init__(self, model_name: str = "yolov8n.pt"):
        """
        Initialize the vision analyzer

        Args:
            model_name: YOLOv8 model name (yolov8n.pt, yolov8s.pt, etc.)
        """
        self.logger = logging.getLogger(__name__)
        try:
            self.model = YOLO(model_name)
            self.logger.info(f"Loaded YOLO model: {model_name}")
        except Exception as e:
            self.logger.error(f"Failed to load YOLO model: {e}")
            raise
    def detect_objects(self, image_path: str, confidence_threshold: float = 0.5) -> List[DetectedObject]:
        try:
            # Validate image file
            if not os.path.exists(image_path):
                raise FileNotFoundError(f"Image file not found: {image_path}")

            # Run inference
            results = self.model(image_path, conf=confidence_threshold)

            detected_objects = []

            for result in results:
                boxes = result.boxes
                if boxes is not None:
                    for box in boxes:
                        # Extract box data
                        x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()
                        confidence = float(box.conf[0])
                        class_id = int(box.cls[0])
                        class_name = self.model.names[class_id]

                        detected_objects.append(DetectedObject(
                            name=class_name,
                            confidence=confidence,
                            bbox=(int(x1), int(y1), int(x2), int(y2))
                        ))

            self.logger.info(f"Detected {len(detected_objects)} objects")
            return detected_objects

        except Exception as e:
            self.logger.error(f"Object detection failed: {e}")
            raise
    def analyze_image(self, image_path: str, confidence_threshold: float = 0.5) -> VisionResult:

        objects = self.detect_objects(image_path, confidence_threshold)

        # Generate image description based on detected objects
        if objects:
            object_names = [obj.name for obj in objects]
            unique_objects = list(set(object_names))
            object_counts = {name: object_names.count(name) for name in unique_objects}

            description_parts = []
            for name, count in object_counts.items():
                if count == 1:
                    description_parts.append(f"a {name}")
                else:
                    description_parts.append(f"{count} {name}s")

            if len(description_parts) == 1:
                description = f"The image contains {description_parts[0]}."
            elif len(description_parts) == 2:
                description = f"The image contains {description_parts[0]} and {description_parts[1]}."
            else:
                description = f"The image contains {', '.join(description_parts[:-1])}, and {description_parts[-1]}."
        else:
            description = "No objects were detected in the image."

        return VisionResult(objects=objects, image_description=description)

In [28]:
class LLMProcessor:
    """LLM module for text generation using Cohere"""

    def __init__(self, api_key: Optional[str] = None, model: str = "command"):

        api_key = "VF6cY1nbS3SWoz5L5SDG8vzlSMMt2hsqR8bE1SmN"

        self.logger = logging.getLogger(__name__)
        self.model = model

        # Set up Cohere client
        if api_key:
            self.client = cohere.Client(api_key)
        else:
            cohere_api_key = os.getenv('COHERE_API_KEY')
            if not cohere_api_key:
                raise ValueError("Cohere API key not provided. Set COHERE_API_KEY environment variable or pass api_key parameter.")
            self.client = cohere.Client(cohere_api_key)

        self.logger.info(f"Initialized LLM processor with Cohere model: {model}")
    def generate_response(self, vision_result: VisionResult, user_prompt: str) -> str:
        """
        Generate a text response combining vision analysis and user prompt

        Args:
            vision_result: Results from vision analysis
            user_prompt: User-provided text prompt

        Returns:
            Generated text response
        """
        try:
            # Prepare context from vision analysis
            objects_info = []
            for obj in vision_result.objects:
                objects_info.append(f"- {obj.name} (confidence: {obj.confidence:.2f})")

            objects_list = "\n".join(objects_info) if objects_info else "No objects detected"

            # Construct the prompt for the LLM
            # Combine the image description and user prompt
            prompt = f"Image Description: {vision_result.image_description}\n\n"
            prompt += f"Detected Objects:\n{objects_list}\n\n"
            prompt += f"User Query: {user_prompt}\n\n"
            prompt += "Based on the image analysis, respond to the user query." # Add an instruction

            # Generate response using Cohere
            response = self.client.generate(
                model=self.model,
                prompt=prompt, # Now 'prompt' is defined
                max_tokens=500,
                temperature=0.7,
                k=0,
                stop_sequences=[],
                return_likelihoods='NONE'
            )

            # Extract the generated text
            generated_text = response.generations[0].text.strip()

            # The prompt includes "Based on the image analysis...", so no need to prepend again
            # final_response = f"Based on the image analysis, I can see{generated_text}"
            final_response = generated_text # Use the generated text directly

            self.logger.info("Successfully generated Cohere response")
            return final_response

        except Exception as e:
            self.logger.error(f"Cohere generation failed: {e}")
            raise

In [34]:


class VisionLLMApp:
    """Main application class integrating vision and LLM capabilities"""

    def __init__(self, yolo_model: str = "yolov8n.pt", llm_model: str = "command"):
        """
        Initialize the integrated application

        Args:
            yolo_model: YOLO model name for object detection
            llm_model: Cohere model name for text generation (command, command-light, command-nightly)
        """
        # Set up logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

        # Initialize components
        try:
            self.vision_analyzer = VisionAnalyzer(yolo_model)
            self.llm_processor = LLMProcessor(model=llm_model)
            self.logger.info("VisionLLM application initialized successfully")
        except Exception as e:
            self.logger.error(f"Failed to initialize application: {e}")
            raise

    def process_request(self, image_path: str, text_prompt: str,
                       confidence_threshold: float = 0.5) -> Dict:
        """
        Process a complete request with image and text prompt

        Args:
            image_path: Path to the image file
            text_prompt: User text prompt
            confidence_threshold: Minimum confidence for object detection

        Returns:
            Dictionary containing results
        """
        try:
            # Validate inputs
            self._validate_inputs(image_path, text_prompt)

            # Perform vision analysis
            self.logger.info(f"Analyzing image: {image_path}")
            vision_result = self.vision_analyzer.analyze_image(image_path, confidence_threshold)

            # Generate LLM response
            self.logger.info("Generating LLM response")
            llm_response = self.llm_processor.generate_response(vision_result, text_prompt)

            # Prepare results
            results = {
                "image_path": image_path,
                "user_prompt": text_prompt,
                "detected_objects": [
                    {
                        "name": obj.name,
                        "confidence": round(obj.confidence, 3),
                        "bbox": obj.bbox
                    }
                    for obj in vision_result.objects
                ],
                "image_description": vision_result.image_description,
                "llm_response": llm_response,
                "status": "success"
            }

            self.logger.info("Request processed successfully")
            return results

        except Exception as e:
            self.logger.error(f"Request processing failed: {e}")
            return {
                "image_path": image_path,
                "user_prompt": text_prompt,
                "error": str(e),
                "status": "error"
            }

    def _validate_inputs(self, image_path: str, text_prompt: str):
        """Validate input parameters"""
        if not image_path or not isinstance(image_path, str):
            raise ValueError("Invalid image path provided")

        if not os.path.exists(image_path):
            raise FileNotFoundError(f"Image file not found: {image_path}")

        # Check if file is a valid image
        valid_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp'}
        if not any(image_path.lower().endswith(ext) for ext in valid_extensions):
            raise ValueError("Invalid image file format. Supported formats: jpg, jpeg, png, bmp, tiff, webp")

        if not text_prompt or not isinstance(text_prompt, str):
            raise ValueError("Invalid text prompt provided")

        if len(text_prompt.strip()) == 0:
            raise ValueError("Text prompt cannot be empty")

def main():
    """Main function for command-line usage"""
    parser = argparse.ArgumentParser(description="Vision-LLM Integration Application")
    parser.add_argument("image_path", help="/content/car and bike.jpg")
    parser.add_argument( "user_prompt",  help="Write a story about the things detected from the image")
    parser.add_argument("--confidence",  type=float,  default=0.5,  help="Confidence threshold for object detection (default: 0.5)" )
    parser.add_argument("--yolo-model",  default="yolov8n.pt",  help="YOLO model name to use for detection (default: yolov8n.pt)" )
    parser.add_argument("--llm-model",   default="command",  help="LLM model name to generate the story (default: command)" )
    parser.add_argument("--output",  help="Output file path for saving results in JSON format" )

    args = parser.parse_args()

    try:
        # Initialize application
        app = VisionLLMApp(yolo_model=args.yolo_model, llm_model=args.llm_model)

        # Process request
        results = app.process_request(args.image_path, args.prompt, args.confidence)

        # Output results
        if results["status"] == "success":
            print("\n" + "="*50)
            print("VISION-LLM ANALYSIS RESULTS")
            print("="*50)
            print(f"\nImage: {results['image_path']}")
            print(f"Prompt: {results['user_prompt']}")
            print(f"\nImage Description: {results['image_description']}")
            print(f"\nDetected Objects ({len(results['detected_objects'])}):")
            for obj in results['detected_objects']:
                print(f"  - {obj['name']}: {obj['confidence']:.3f} confidence")
            print(f"\nLLM Response:\n{results['llm_response']}")
        else:
            print(f"Error: {results['error']}")
            return 1

        # Save to file if requested
        if args.output:
            with open(args.output, 'w') as f:
                json.dump(results, f, indent=2)
            print(f"\nResults saved to: {args.output}")

        return 0

    except Exception as e:
        print(f"Application error: {e}")
        return 1


In [35]:
try:
    # Initialize the application without argparse
    app = VisionLLMApp(yolo_model="yolov8n.pt", llm_model="command")

    # Define your image path and prompt
    image_file_path = "/content/car and bike.jpg" # Replace with your actual image path
    user_text_prompt = "Write a story about the things detected from the image"

    # Process the request
    analysis_results = app.process_request(image_file_path, user_text_prompt, confidence_threshold=0.5)

    # Print the results
    if analysis_results["status"] == "success":
        print("\n" + "="*50)
        print("VISION-LLM ANALYSIS RESULTS")
        print("="*50)
        print(f"\nImage: {analysis_results['image_path']}")
        print(f"Prompt: {analysis_results['user_prompt']}")
        print(f"\nImage Description: {analysis_results['image_description']}")
        print(f"\nDetected Objects ({len(analysis_results['detected_objects'])}):")
        for obj in analysis_results['detected_objects']:
            print(f"  - {obj['name']}: {obj['confidence']:.3f} confidence")
        print(f"\nLLM Response:\n{analysis_results['llm_response']}")
    else:
        print(f"Error: {analysis_results['error']}")

except Exception as e:
    print(f"Application error during execution: {e}")


image 1/1 /content/car and bike.jpg: 416x640 1 car, 1 motorcycle, 188.0ms
Speed: 4.3ms preprocess, 188.0ms inference, 1.8ms postprocess per image at shape (1, 3, 416, 640)

VISION-LLM ANALYSIS RESULTS

Image: /content/car and bike.jpg
Prompt: Write a story about the things detected from the image

Image Description: The image contains a motorcycle and a car.

Detected Objects (2):
  - motorcycle: 0.854 confidence
  - car: 0.780 confidence

LLM Response:
The 85% confidence level detected motorcycle is flaunting its sleek design, proudly presenting itself in front of the 78% confidence level detected car. The motorcycle is a powerful machine, designed for speed and freedom on the roads, embracing the adventure of the open air as it drives by. The car, however, represents a more practical form of transportation, offering safety and comfort for travels long and short. Although these two vehicles differ in their appearances, they both share the same roadways and provide alternatives for ef