In [1]:

!pip install transformers torch torchvision easyocr pytesseract opencv-python-headless
!apt-get update
!apt-get install -y tesseract-ocr


Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nv

In [2]:
import torch
from transformers import (
    VisionEncoderDecoderModel,
    ViTImageProcessor,
    pipeline,
    AutoTokenizer
)
from PIL import Image
import torchvision.transforms as transforms
import easyocr
import requests
from io import BytesIO
from google.colab import files  # For file upload in Colab
import numpy as np

class ImageMemeAnalyzer:
    def __init__(self, device=None):
        # Set device (GPU if available)
        self.device = device if device else ('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        # Initialize OCR reader
        self.reader = easyocr.Reader(['en'])

        # Initialize image captioning model and processor
        self.image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
        self.caption_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
        # Set pad_token_id to avoid attention mask warning if it's not set
        if self.caption_model.config.pad_token_id is None:
            self.caption_model.config.pad_token_id = self.caption_model.config.eos_token_id
        self.caption_model.to(self.device)
        # Initialize a dedicated tokenizer for decoding
        self.tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

        # Initialize object detection model
        self.object_detector = pipeline('object-detection',
                                        model='facebook/detr-resnet-50',
                                        device=0 if self.device == 'cuda' else -1)

        # Initialize sentiment analyzer
        self.sentiment_analyzer = pipeline('sentiment-analysis',
                                           device=0 if self.device == 'cuda' else -1)

        # Define image transforms (if needed for other tasks)
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])

    def load_image(self, image_path_or_url):
        """Load image from file path, URL, or Colab uploaded file"""
        try:
            if isinstance(image_path_or_url, str):
                if image_path_or_url.startswith(('http://', 'https://')):
                    response = requests.get(image_path_or_url)
                    image = Image.open(BytesIO(response.content)).convert("RGB")
                else:
                    image = Image.open(image_path_or_url).convert("RGB")
            else:
                # Assuming it's a file-like object from Colab upload
                image = Image.open(image_path_or_url).convert("RGB")
            return image
        except Exception as e:
            raise Exception(f"Error loading image: {str(e)}")

    def extract_text(self, image):
        """Extract text from image using EasyOCR"""
        result = self.reader.readtext(np.array(image))
        extracted_text = ' '.join([text[1] for text in result])
        return extracted_text

    def generate_caption(self, image):
        """Generate a descriptive caption for the image"""
        inputs = self.image_processor(images=image, return_tensors="pt")
        # Move all tensors to the correct device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        outputs = self.caption_model.generate(
            **inputs,
            max_length=30,
            num_beams=4,
            early_stopping=True
        )
        # Use the dedicated tokenizer to decode the outputs
        caption = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return caption

    def detect_objects(self, image):
        """Detect objects in the image"""
        results = self.object_detector(image)
        detected_objects = [
            {
                'label': result['label'],
                'confidence': f"{result['score']:.2f}"
            }
            for result in results
        ]
        return detected_objects

    def analyze_sentiment(self, text):
        """Analyze sentiment of extracted text"""
        if not text.strip():
            return None
        result = self.sentiment_analyzer(text)[0]
        return {
            'sentiment': result['label'],
            'confidence': f"{result['score']:.2f}"
        }

    def analyze_image(self, image_path_or_url):
        """Complete image analysis including meme text extraction and interpretation"""
        try:
            # Load image
            image = self.load_image(image_path_or_url)

            # Generate caption
            caption = self.generate_caption(image)

            # Detect objects
            objects = self.detect_objects(image)

            # Extract text
            text = self.extract_text(image)

            # Analyze sentiment (if text is present)
            sentiment = self.analyze_sentiment(text) if text else None

            # Compile analysis results
            analysis = {
                'caption': caption,
                'detected_objects': objects,
                'extracted_text': text,
                'text_sentiment': sentiment,
                'interpretation': self._generate_interpretation(caption, objects, text, sentiment)
            }

            return analysis

        except Exception as e:
            return {'error': str(e)}

    def _generate_interpretation(self, caption, objects, text, sentiment):
        """Generate a human-readable interpretation of the image analysis"""
        interpretation = []
        # Caption interpretation
        interpretation.append(f"This image appears to show {caption}.")

        # Object detection interpretation
        if objects:
            obj_text = ", ".join([f"{obj['label']} ({obj['confidence']} confidence)" for obj in objects[:3]])
            interpretation.append(f"I can identify: {obj_text}.")

        # Text extraction interpretation
        if text:
            interpretation.append(f"The text in the image reads: '{text}'")
            if sentiment:
                interpretation.append(
                    f"The text appears to be {sentiment['sentiment'].lower()} in tone ({sentiment['confidence']} confidence)."
                )
        return " ".join(interpretation)

# Function for uploading and analyzing an image in Colab
def analyze_uploaded_image():
    analyzer = ImageMemeAnalyzer()
    # Upload image using Colab's file uploader
    uploaded = files.upload()
    # Analyze the first uploaded image
    for filename in uploaded.keys():
        print(f"\nAnalyzing image: {filename}")
        results = analyzer.analyze_image(filename)
        print("\n=== Image Analysis Results ===")
        print(f"\nCaption: {results.get('caption', 'N/A')}")
        print("\nDetected Objects:")
        for obj in results.get('detected_objects', []):
            print(f"- {obj['label']} (Confidence: {obj['confidence']})")
        print(f"\nExtracted Text: {results.get('extracted_text', 'N/A')}")
        if results.get('text_sentiment'):
            print(f"\nText Sentiment: {results['text_sentiment']['sentiment']} (Confidence: {results['text_sentiment']['confidence']})")
        print(f"\nInterpretation:\n{results.get('interpretation', 'N/A')}")
        break  # Analyze only the first uploaded image

# Run the image analysis
analyze_uploaded_image()




Using device: cpu
Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/982M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.48.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.59k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


preprocessor_config.json:   0%|          | 0.00/290 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Saving How Do Cats Show Affection to Their Owners_.jpeg to How Do Cats Show Affection to Their Owners_.jpeg

Analyzing image: How Do Cats Show Affection to Their Owners_.jpeg


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.



=== Image Analysis Results ===

Caption: a cat sitting on the floor next to a wall 

Detected Objects:
- cat (Confidence: 1.00)

Extracted Text: Saveifgodrecabperson

Text Sentiment: NEGATIVE (Confidence: 0.96)

Interpretation:
This image appears to show a cat sitting on the floor next to a wall . I can identify: cat (1.00 confidence). The text in the image reads: 'Saveifgodrecabperson' The text appears to be negative in tone (0.96 confidence).


In [3]:
analyze_uploaded_image()



Using device: cpu


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.48.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

Saving meow.jpeg to meow.jpeg

Analyzing image: meow.jpeg

=== Image Analysis Results ===

Caption: a cat sitting on top of a wooden bench 

Detected Objects:
- cat (Confidence: 1.00)

Extracted Text: I don't even give a meow anymore

Text Sentiment: NEGATIVE (Confidence: 1.00)

Interpretation:
This image appears to show a cat sitting on top of a wooden bench . I can identify: cat (1.00 confidence). The text in the image reads: 'I don't even give a meow anymore' The text appears to be negative in tone (1.00 confidence).
