In [1]:
# prompt: pip install google cloud vision

!pip install google-cloud-vision


Collecting google-cloud-vision
  Downloading google_cloud_vision-3.9.0-py2.py3-none-any.whl.metadata (5.3 kB)
Downloading google_cloud_vision-3.9.0-py2.py3-none-any.whl (514 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m514.6/514.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-vision
Successfully installed google-cloud-vision-3.9.0


In [None]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq, DetrImageProcessor, DetrForObjectDetection
from PIL import Image, ImageEnhance
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from google.cloud import vision
import os
import cv2

class EnhancedImageAnalyzer:
    def __init__(
        self,
        credentials_path=r"/media/amit/New Volume/Image_processing/ordinal-door-448718-v4-f6be552a9880.json",
    ):
        """Load the captioning and detection models and create the Vision client.

        Args:
            credentials_path: Path to the Google Cloud service-account JSON
                key. When not ``None`` it is exported as
                ``GOOGLE_APPLICATION_CREDENTIALS`` before the Vision client
                is created. The default is the location that used to be
                hard-coded here, kept so existing ``EnhancedImageAnalyzer()``
                calls behave as before. Pass ``None`` to leave the
                environment untouched and rely on credentials that are
                already configured.
        """
        print("Loading models...")

        # Only touch the environment when a path was supplied, so the class
        # can also run on machines that configure credentials externally.
        if credentials_path is not None:
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.fspath(credentials_path)

        # Image captioning model
        self.caption_processor = AutoProcessor.from_pretrained("microsoft/git-base")
        self.caption_model = AutoModelForVision2Seq.from_pretrained("microsoft/git-base")

        # Object detection model
        self.object_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
        self.object_model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

        # Client used by detect_brands() and detect_text().
        self.vision_client = vision.ImageAnnotatorClient()
        print("Google Cloud Vision API client initialized.")

        print("Models loaded.")

    def preprocess_image(self, image_path):
        try:
            image = Image.open(image_path)
            if image.mode != 'RGB':
                image = image.convert('RGB')
            enchancer = ImageEnhance.Brightness(image)
            image = enchancer.enhance(2.0)
            return image
        except Exception as e:
            raise Exception(f"Error preprocessing image: {str(e)}")


    def detect_brands(self, image_path):
        """Detect brands or logos using Google Vision API."""
        try:
            # Load image
            with open(image_path, 'rb') as image_file:
                content = image_file.read()

            # Prepare image for Vision API
            image = vision.Image(content=content)

            # Perform logo detection
            response = self.vision_client.logo_detection(image=image)

            logos = response.logo_annotations

            # web_detection = self.vision_client.web_detection(image=image).web_detection



            # Parse response
            detected_brands = []
            for logo in logos:
                detected_brands.append({
                    'name': logo.description,
                    'confidence': f"{logo.score:.2%}",
                    'bounding_poly': [
                        {'x': vertex.x, 'y': vertex.y} for vertex in logo.bounding_poly.vertices
                    ]
                })
            if not detected_brands:
                response = self.vision_client.web_detection(image=image).web_detection
                if response.web_entities:
                    for entity in response.web_entities:
                        if entity.description and entity.score > 0.5:
                            detected_brands.append({
                                'name': entity.description,
                                'confidence': f"{entity.score:.2%}",
                            })

            # texts = text_response.text_annotations
            # if texts:
            #     for text in texts:
            #         detected_brands.append({
            #             'name': text.description,
            #             'confidence': f"{text.score:.2%}",
            #             'bounding_poly': [
            #                 {'x': vertex.x, 'y': vertex.y} for vertex in text.bounding_poly.vertices
            #             ]
            #         })
            # #  # Add web entities
            # if web_detection.web_entities:
            #     for entity in web_detection.web_entities:
            #         if entity.description and entity.score > 0.5:
            #             detected_brands.append({
            #                 'name': entity.description,
            #                 'confidence': f"{entity.score:.2%}",
            #                 'bounding_poly': [{'x': vertex.x, 'y': vertex.y} for vertex in entity.bounding_poly.vertices]
            #             })
            return detected_brands

        except Exception as e:
            print(f"Error detecting brands: {e}")
        return []

    def detect_text(self, image_path):
        """Detect text in the image using Google Vision API."""
        try:
          with open(image_path, 'rb') as image_file:
              content = image_file.read()
          image = vision.Image(content=content)
          text_response = self.vision_client.text_detection(image=image)
          texts = text_response.text_annotations
          deteced_texts = []
          if texts:
              for text in texts:
                  detected_texts.append({
                      'text': text.description,
                      'confidence': f"{text.score:.2%}",
                      'bounding_poly': [
                          {'x': vertex.x, 'y': vertex.y} for vertex in text.bounding_poly.vertices
                      ]
                  })
          return detected_texts
        except Exception as e:
          print(f"Error detecting text: {e}")
          return []

    def detect_shoe_types(self, detected_objects):
        """Return the labels of detected objects that mention "shoe".

        The match is a case-insensitive substring test on each object's
        'label'; matching labels are returned unchanged, in input order.
        """
        return [
            entry['label']
            for entry in detected_objects
            if 'shoe' in entry['label'].lower()
        ]

    def analyze_image(self, image_path):
        try:
            image = self.preprocess_image(image_path)

            # Generate caption
            inputs = self.caption_processor(images=image, return_tensors="pt")
            outputs = self.caption_model.generate(
                pixel_values=inputs["pixel_values"],
                max_length=50,
                num_beams=4,
            )
            caption = self.caption_processor.decode(outputs[0], skip_special_tokens=True)

            # Detect objects
            inputs = self.object_processor(images=image, return_tensors="pt")
            outputs = self.object_model(**inputs)

            # Process object detection results
            target_sizes = torch.tensor([image.size[::-1]])
            results = self.object_processor.post_process_object_detection(
                outputs, target_sizes=target_sizes, threshold=0.7
            )[0]

            detected_objects = []
            for score, label, box in zip(results['scores'], results['labels'], results['boxes']):
                detected_objects.append({
                    'label': self.object_model.config.id2label[label.item()],
                    'confidence': f"{score.item():.2%}",
                    'box': box.tolist()
                })

            # Detect brands/logos
            detected_brands = self.detect_brands(image_path)

            # Detect shoe types
            shoe_types = self.detect_shoe_types(detected_objects)

            detected_texts = self.detect_text(image_path)

            # Color analysis
            img_array = np.array(image)
            avg_color = img_array.mean(axis=(0, 1))

            analysis = {
                'caption': caption,
                'detected_objects': detected_objects,
                'detected_brands': detected_brands,
                'detected_shoe_types': shoe_types,
                'color_analysis': {
                    'average_color_rgb': tuple(int(x) for x in avg_color),
                    'brightness': f"{np.mean(avg_color)/255:.2%}"
                },
                'technical_details': {
                    'size': image.size,
                    'mode': image.mode,
                    'format': getattr(image, 'format', 'Unknown')
                }
            }

            return analysis

        except Exception as e:
            return f"Error analyzing image: {str(e)}"

    def visualize_detection(self, image, objects, brands, save_path=None):
        """Visualize detections on the image.

        Args:
            image: Image to display, passed to ``plt.imshow``.
            objects: Dicts with 'box' (its first four values are read as
                x_min, y_min, x_max, y_max), 'label' and 'confidence';
                drawn as red rectangles.
            brands: Dicts with 'name', 'confidence' and, optionally,
                'bounding_poly' (a list of ``{'x': ..., 'y': ...}``
                vertices); drawn as green polygons. Entries without a
                polygon are skipped because there is nothing to draw.
            save_path: When given, the figure is saved there and closed;
                otherwise it is shown.
        """
        plt.figure(figsize=(15, 10))
        plt.imshow(image)
        ax = plt.gca()

        # Draw bounding boxes for detected objects
        for obj in objects:
            x_min, y_min, x_max, y_max = obj['box'][:4]
            rect = patches.Rectangle(
                (x_min, y_min), x_max - x_min, y_max - y_min,
                linewidth=2, edgecolor='r', facecolor='none'
            )
            ax.add_patch(rect)
            plt.text(x_min, y_min - 5, f"{obj['label']}: {obj['confidence']}",
                     color='white', bbox=dict(facecolor='red', alpha=0.7))

        # Draw bounding polygons for detected brands
        for brand in brands:
            # Brands found via the web-detection fallback have no
            # 'bounding_poly' key; indexing it directly raised KeyError.
            poly = brand.get('bounding_poly')
            if not poly:
                continue
            poly_points = [(vertex['x'], vertex['y']) for vertex in poly]
            poly_shape = patches.Polygon(poly_points, linewidth=2, edgecolor='g', facecolor='none')
            ax.add_patch(poly_shape)
            plt.text(poly_points[0][0], poly_points[0][1]-5, f"{brand['name']}: {brand['confidence']}",
                     color='white', bbox=dict(facecolor='green', alpha=0.7))

        plt.axis('off')

        if save_path:
            plt.savefig(save_path)
            plt.close()
        else:
            plt.show()
