In [61]:
# Cell 1
%pip install transformers torch pillow reportlab opencv-python datasets accelerate bitsandbytes easyocr pytesseract

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [62]:
# Cell 2
import os
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, DonutProcessor, VisionEncoderDecoderModel, LayoutLMv3Processor, LayoutLMv3ForTokenClassification, Kosmos2ForConditionalGeneration, AutoProcessor
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image as RLImage
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
import json
from datetime import datetime
import warnings
import easyocr
import re
warnings.filterwarnings('ignore')

In [63]:
# Cell 3
# Run the models on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# `ocr_reader` holds an EasyOCR reader when one could be created. It stays
# None otherwise, which makes extract_text_with_ocr take its pytesseract path.
ocr_reader = None
try:
    import easyocr
    ocr_reader = easyocr.Reader(['en'])
    print("EasyOCR loaded successfully")
except Exception as e:
    print(f"EasyOCR failed: {e}")
    try:
        # Only probing for availability here; pytesseract is imported again
        # at the point of use.
        import pytesseract
        print("Using Tesseract as fallback")
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit are not swallowed during kernel start-up.
        print("No OCR available, will use fallback method")

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Using device: cpu
EasyOCR loaded successfully


In [64]:
# Cell 4
# Hugging Face checkpoint identifiers, named once so the processor and the
# model of each pair are guaranteed to come from the same checkpoint.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
LAYOUTLM_CHECKPOINT = "microsoft/layoutlmv3-base"
KOSMOS_CHECKPOINT = "microsoft/kosmos-2-patch14-224"

# BLIP: image captioning, used to derive the initial tags.
print("Loading BLIP model...")
blip_processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
blip_model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
blip_model = blip_model.to(device)

# LayoutLMv3: used by verify_tags_with_layoutlm.
# NOTE(review): the load output of this cell reports that the classifier
# weights are newly initialized for this base checkpoint.
print("Loading LayoutLMv3 model...")
layoutlm_processor = LayoutLMv3Processor.from_pretrained(LAYOUTLM_CHECKPOINT)
layoutlm_model = LayoutLMv3ForTokenClassification.from_pretrained(LAYOUTLM_CHECKPOINT)
layoutlm_model = layoutlm_model.to(device)

# Kosmos-2: used to generate the free-text explanation.
print("Loading Kosmos-2 model...")
kosmos_processor = AutoProcessor.from_pretrained(KOSMOS_CHECKPOINT)
kosmos_model = Kosmos2ForConditionalGeneration.from_pretrained(KOSMOS_CHECKPOINT)
kosmos_model = kosmos_model.to(device)

print("All models loaded successfully!")

Loading BLIP model...
Loading LayoutLMv3 model...


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading Kosmos-2 model...
All models loaded successfully!


In [65]:
# Cell 5
def handle_path(input_path):
    """Clean up a user-supplied path string and make it usable.

    Surrounding whitespace is removed first, then any surrounding double
    quotes, then any surrounding single quotes (as left behind by
    copy-pasting a quoted path).

    Args:
        input_path: Path string, possibly quoted and/or relative.

    Returns:
        The cleaned path unchanged when it already names an existing file or
        is absolute; otherwise the cleaned path resolved against the current
        working directory.
    """
    cleaned = input_path.strip().strip('"').strip("'")
    usable_as_is = os.path.isfile(cleaned) or os.path.isabs(cleaned)
    return cleaned if usable_as_is else os.path.abspath(cleaned)

def ensure_output_dir():
    """Make sure the report directory exists and return its path.

    The directory is ``output_reports``, relative to the current working
    directory.

    Returns:
        The relative directory name ``"output_reports"``.

    Raises:
        FileExistsError: If ``output_reports`` exists but is not a directory.
    """
    output_dir = "output_reports"
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first, and is safe to call repeatedly.
    os.makedirs(output_dir, exist_ok=True)
    return output_dir

def validate_image_path(image_path):
    """Check that ``image_path`` names an existing file with an image extension.

    Args:
        image_path: Path to check.

    Returns:
        True when the path is a regular file whose lower-cased extension is
        one of the supported image extensions.

    Raises:
        FileNotFoundError: The path does not exist, or exists but is neither
            a directory nor a regular file.
        IsADirectoryError: The path is a directory.
        ValueError: The file extension is not a supported image format.
    """
    valid_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']

    if os.path.isfile(image_path):
        # Regular file: only the extension is left to check.
        _, suffix = os.path.splitext(image_path.lower())
        if suffix in valid_extensions:
            return True
        raise ValueError(f"Invalid image format. Supported: {valid_extensions}")

    # Not a regular file: report the most specific reason.
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Path does not exist: {image_path}")
    if os.path.isdir(image_path):
        raise IsADirectoryError(f"Path is a directory, not a file: {image_path}")
    raise FileNotFoundError(f"Path is not a valid file: {image_path}")

In [66]:
# Cell 6
# Words that are never emitted as tags. Built once at definition time instead
# of on every call; the contents are the notebook's original exclusion list.
_TAG_STOPWORDS = frozenset({
    'the', 'and', 'with', 'for', 'are', 'that', 'this', 'has', 'was', 'from', 'they', 'have', 'been',
    'will', 'can', 'said', 'each', 'which', 'more', 'also', 'its', 'would', 'may', 'about', 'out',
    'many', 'time', 'very', 'when', 'much', 'new', 'some', 'could', 'state', 'other', 'after',
    'first', 'well', 'way', 'even', 'years', 'work', 'through', 'over', 'government', 'into', 'than',
    'because', 'most', 'only', 'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how', 'our',
    'work', 'life', 'way', 'even', 'back', 'any', 'good', 'woman', 'through', 'just', 'form',
    'sentence', 'great', 'think', 'say', 'help', 'low', 'line', 'differ', 'turn', 'cause', 'much',
    'mean', 'before', 'move', 'right', 'boy', 'old', 'too', 'same', 'tell', 'does', 'set', 'three',
    'want', 'air', 'well', 'also', 'play', 'small', 'end', 'put', 'home', 'read', 'hand', 'port',
    'large', 'spell', 'add', 'even', 'land', 'here', 'must', 'big', 'high', 'such', 'follow', 'act',
    'why', 'ask', 'men', 'change', 'went', 'light', 'kind', 'off', 'need', 'house', 'picture', 'try',
    'again', 'animal', 'point', 'mother', 'world', 'near', 'build', 'self', 'earth', 'father', 'head',
    'stand', 'own', 'page', 'should', 'country', 'found', 'answer', 'school', 'grow', 'study',
    'still', 'learn', 'plant', 'cover', 'food', 'sun', 'four', 'between', 'state', 'keep', 'eye',
    'never', 'last', 'let', 'thought', 'city', 'tree', 'cross', 'farm', 'hard', 'start', 'might',
    'story', 'saw', 'far', 'sea', 'draw', 'left', 'late', 'run', 'while', 'press', 'close', 'night',
    'real', 'life', 'few', 'north', 'open', 'seem', 'together', 'next', 'white', 'children', 'begin',
    'got', 'walk', 'example', 'ease', 'paper', 'group', 'always', 'music', 'those', 'both', 'mark',
    'often', 'letter', 'until', 'mile', 'river', 'car', 'feet', 'care', 'second', 'book', 'carry',
    'took', 'science', 'eat', 'room', 'friend', 'began', 'idea', 'fish', 'mountain', 'stop', 'once',
    'base', 'hear', 'horse', 'cut', 'sure', 'watch', 'color', 'face',
    'wood', 'main', 'enough', 'plain', 'girl', 'usual', 'young', 'ready', 'above', 'ever', 'red',
    'list', 'though', 'feel', 'talk', 'bird', 'soon', 'body', 'dog', 'family', 'direct', 'leave',
    'song', 'measure', 'door', 'product', 'black', 'short', 'numeral', 'class', 'wind', 'question',
    'happen', 'complete', 'ship', 'area', 'half', 'rock', 'order', 'fire', 'south', 'problem',
    'piece', 'told', 'knew', 'pass', 'since', 'top', 'whole', 'king', 'space', 'heard', 'best',
    'hour', 'better', 'during', 'hundred', 'five', 'remember', 'step', 'early', 'hold', 'west',
    'ground', 'interest', 'reach', 'fast', 'verb', 'sing', 'listen', 'six', 'table', 'travel', 'less',
    'morning', 'ten', 'simple', 'several', 'vowel', 'toward', 'war', 'lay', 'against', 'pattern',
    'slow', 'center', 'love', 'person', 'money', 'serve', 'appear', 'road', 'map', 'rain', 'rule',
    'govern', 'pull', 'cold', 'notice', 'voice', 'unit', 'power', 'town', 'fine', 'certain', 'fly',
    'fall', 'lead', 'cry', 'dark', 'machine', 'note', 'wait', 'plan', 'figure', 'star', 'box', 'noun',
    'field', 'rest', 'correct', 'able', 'pound', 'done', 'beauty', 'drive', 'stood', 'contain',
    'front', 'teach', 'week', 'final', 'gave', 'green', 'quick', 'develop', 'ocean', 'warm', 'free',
    'minute', 'strong', 'special', 'mind', 'behind', 'clear', 'tail', 'produce', 'fact', 'street',
    'inch', 'multiply', 'nothing', 'course', 'stay', 'wheel', 'full', 'force', 'blue', 'object',
    'decide', 'surface', 'deep', 'moon', 'island', 'foot', 'system', 'busy', 'test', 'record', 'boat',
    'common', 'gold', 'possible', 'plane', 'stead', 'dry', 'wonder', 'laugh', 'thousands', 'ago',
    'ran', 'check', 'game', 'shape', 'equate', 'hot', 'miss', 'brought', 'heat', 'snow', 'tire',
    'bring', 'yes', 'distant', 'fill', 'east', 'paint', 'language', 'among',
})


def extract_social_media_tags(caption):
    """Turn a caption into a short list of candidate tags.

    The caption is lower-cased and split into purely alphabetic words of at
    least three letters. Words in the exclusion list are dropped and
    duplicates are removed.

    Args:
        caption: Caption text to mine for tags.

    Returns:
        Up to 15 unique lower-case words, in order of first appearance in the
        caption. The order is deterministic from run to run.
    """
    words = re.findall(r'\b[a-zA-Z]{3,}\b', caption.lower())
    candidates = (word for word in words if word not in _TAG_STOPWORDS)
    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # make both the order and which 15 tags survive the cut arbitrary.
    return list(dict.fromkeys(candidates))[:15]

def generate_tags_with_blip(image_path):
    """Caption an image with BLIP and derive tags from the caption.

    Uses the module-level ``blip_processor``, ``blip_model`` and ``device``.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        A ``(tags, caption)`` tuple: the tag list produced by
        ``extract_social_media_tags`` and the decoded caption string.
    """
    # Open inside a context manager so the underlying file is closed
    # deterministically; convert() returns an independent in-memory copy.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')

    inputs = blip_processor(image, return_tensors="pt").to(device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = blip_model.generate(**inputs, max_length=50, num_beams=4)

    caption = blip_processor.decode(outputs[0], skip_special_tokens=True)
    tags = extract_social_media_tags(caption)

    return tags, caption

In [67]:
# Cell 7
def extract_text_with_ocr(image_path):
    """Extract visible text from an image.

    Uses the module-level ``ocr_reader`` (EasyOCR) when it is not None and
    falls back to pytesseract otherwise. This function never raises: failures
    are reported with ``print`` and mapped to a sentinel string.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The extracted text, or one of these sentinel strings:
        ``"No text detected"``, ``"Text extraction unavailable"`` (the
        pytesseract path failed) or ``"Text extraction failed"`` (any other
        error).
    """
    try:
        if ocr_reader is not None:
            results = ocr_reader.readtext(image_path)
            # Keep detections above 0.3 confidence that are longer than one
            # character after trimming.
            kept = [
                text.strip()
                for _bbox, text, confidence in results
                if confidence > 0.3 and len(text.strip()) > 1
            ]
            return ' '.join(kept) or "No text detected"

        try:
            # Imported lazily: pytesseract is only needed on this path.
            import pytesseract
            from PIL import Image
            with Image.open(image_path) as image:
                text = pytesseract.image_to_string(image).strip()
            return text or "No text detected"
        except Exception as tess_err:
            # `Exception` rather than a bare `except:` so that
            # KeyboardInterrupt still stops a long batch run.
            print(f"Tesseract OCR unavailable for {image_path}: {tess_err}")
            return "Text extraction unavailable"

    except Exception as ocr_err:
        print(f"OCR failed for {image_path}: {ocr_err}")
        return "Text extraction failed"

In [68]:
# Cell 8
def verify_tags_with_layoutlm(image_path, tags, extracted_text):
    """Filter candidate tags and OCR words by LayoutLMv3 prediction confidence.

    Candidate words are the tags plus the whitespace-split OCR text,
    restricted to alphabetic words longer than two characters, lower-cased
    and capped at 100. A word is kept when the highest softmax probability at
    its index exceeds 0.25. Uses the module-level ``layoutlm_processor``,
    ``layoutlm_model`` and ``device``.

    NOTE(review): the notebook's model-loading output reports that the
    LayoutLMv3 classifier weights are newly initialized, so these confidences
    appear to come from an untrained head. Confirm before relying on this as
    a real verification step.
    NOTE(review): predictions are indexed along the model's sequence axis
    while ``words`` is indexed per word. Confirm the two line up.

    Args:
        image_path: Path of the image file.
        tags: List of candidate tag strings.
        extracted_text: OCR text for the same image.

    Returns:
        Up to 12 unique kept words in first-seen order. When no word is kept,
        there are no candidate words, or any step fails, the first 12 of
        ``tags`` are returned instead.
    """
    try:
        all_words = tags + extracted_text.split()
        words = [word.lower() for word in all_words if word.isalpha() and len(word) > 2][:100]

        if not words:
            # Truncate here as well so every return path honours the 12 cap.
            return tags[:12]

        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        inputs = layoutlm_processor(image, words, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = layoutlm_model(**inputs)

        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Never index past the sequence length the model actually returned.
        usable = min(len(words), predictions.shape[1])
        verified_tags = [
            word
            for i, word in enumerate(words[:usable])
            if predictions[0][i].max().item() > 0.25
        ]

        # Order-preserving de-duplication keeps the result deterministic.
        unique_verified = list(dict.fromkeys(verified_tags))
        return unique_verified[:12] if unique_verified else tags[:12]

    except Exception as e:
        # Best-effort step: report the failure instead of hiding it, then fall
        # back to the unverified tags.
        print(f"LayoutLM verification failed for {image_path}: {e}")
        return tags[:12]

In [69]:
# Cell 9
def generate_clean_explanation_with_kosmos(image_path, verified_tags):
    """Produce a one- or two-sentence description of the image with Kosmos-2.

    Uses the module-level ``kosmos_processor``, ``kosmos_model`` and
    ``device``. This function never raises: on any failure a templated
    sentence built from the tags is returned.

    Args:
        image_path: Path of the image file.
        verified_tags: Tag strings; the first four are used in the templated
            fallback text.

    Returns:
        At most two sentences of description. A templated description is used
        when the model output is shorter than 15 characters, contains
        ``grounding``/``bbox``/``coord``, or generation fails.
    """
    try:
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        prompt = "What do you see in this image?"

        inputs = kosmos_processor(text=prompt, images=image, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = kosmos_model.generate(
                **inputs,
                # max_new_tokens budgets the generated text only. max_length
                # also counts the prompt (including the image placeholder
                # tokens the processor adds), which left little or no room
                # for an answer at 80.
                max_new_tokens=80,
                do_sample=False,
                num_beams=1
            )

        full_text = kosmos_processor.decode(outputs[0], skip_special_tokens=True)
        # The decoded sequence echoes the prompt; strip it out.
        clean_text = full_text.replace(prompt, "").strip()

        if len(clean_text) < 15 or any(word in clean_text.lower() for word in ['grounding', 'bbox', 'coord']):
            clean_text = f"This image shows {', '.join(verified_tags[:4])}. The visual elements include various objects and features that define the scene's composition and context."

        # Keep the first two non-trivial sentences.
        sentences = [s.strip() + '.' for s in clean_text.split('.') if len(s.strip()) > 5]
        final_text = ' '.join(sentences[:2]) if sentences else clean_text

        return final_text

    except Exception as e:
        # Best-effort step: report the failure instead of hiding it.
        print(f"Kosmos-2 explanation failed for {image_path}: {e}")
        return f"This image contains {', '.join(verified_tags[:4])}. Visual analysis identifies key elements and features present in the scene."

In [70]:
# Cell 10
def create_pdf_report(image_path, tags, extracted_text, verified_tags, explanation, output_dir):
    """Write a one-page PDF summarising the analysis of one image.

    The report is saved as ``<image name without extension>_analysis.pdf``
    inside ``output_dir``.

    Args:
        image_path: Path of the analysed image; it is embedded in the report.
        tags: Caption-derived tag strings.
        extracted_text: OCR text for the image.
        verified_tags: Tag strings that passed verification.
        explanation: Free-text description of the image.
        output_dir: Existing directory to write the PDF into.

    Returns:
        Path of the generated PDF file.
    """
    # Local import keeps this function self-contained; stdlib only.
    from xml.sax.saxutils import escape

    # splitext removes only the final extension, so "a.b.jpg" becomes "a.b"
    # rather than "a" and multi-dot names no longer collide.
    filename = os.path.splitext(os.path.basename(image_path))[0]
    pdf_path = os.path.join(output_dir, f"{filename}_analysis.pdf")

    doc = SimpleDocTemplate(pdf_path, pagesize=letter, topMargin=36, bottomMargin=36)
    styles = getSampleStyleSheet()

    times_style = ParagraphStyle(
        'TimesRoman',
        parent=styles['Normal'],
        fontName='Times-Roman',
        fontSize=11,
        spaceAfter=4,
        spaceBefore=0
    )

    title_style = ParagraphStyle(
        'TimesTitle',
        parent=styles['Title'],
        fontName='Times-Bold',
        fontSize=14,
        spaceAfter=8,
        spaceBefore=0
    )

    story = []
    story.append(Paragraph("Intelligent Image Analysis Report", title_style))
    story.append(Spacer(1, 8))

    try:
        img = RLImage(image_path, width=280, height=200)
        story.append(img)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # is not swallowed; the report is still produced without the picture.
        story.append(Paragraph("Image embedding failed", times_style))

    # Paragraph text is parsed as XML-like markup, so OCR or model output
    # containing '<', '>' or '&' must be escaped before interpolation.
    tags_text = escape(', '.join(tags))
    ocr_text = escape(extracted_text)
    verified_text = escape(', '.join(verified_tags))
    explanation_text = escape(explanation)

    story.append(Spacer(1, 8))
    story.append(Paragraph(f"<b>AI-Generated Tags:</b> {tags_text}", times_style))
    story.append(Spacer(1, 4))
    story.append(Paragraph(f"<b>Extracted Text:</b> {ocr_text}", times_style))
    story.append(Spacer(1, 4))
    story.append(Paragraph(f"<b>Verified Tags:</b> {verified_text}", times_style))
    story.append(Spacer(1, 4))
    story.append(Paragraph(f"<b>Explanation:</b> {explanation_text}", times_style))

    doc.build(story)
    return pdf_path

In [71]:
# Cell 11
def process_single_image(image_path):
    """Run the full analysis pipeline on one image and write its PDF report.

    Steps, in order: normalise and validate the path, ensure the output
    directory exists, caption/tag, OCR, tag verification, explanation, PDF.

    Args:
        image_path: Path (possibly quoted or relative) of the image file.

    Returns:
        A dict with keys ``filename``, ``tags``, ``caption``,
        ``extracted_text``, ``verified_tags``, ``explanation`` and
        ``pdf_report``; or None when any step raises. The error is printed.
    """
    # Tracks the most recent form of the path so the error message shows the
    # normalised path once normalisation has succeeded.
    current_path = image_path
    try:
        current_path = handle_path(image_path)
        validate_image_path(current_path)
        report_dir = ensure_output_dir()

        display_name = os.path.basename(current_path)
        print(f"Processing: {display_name}")

        blip_tags, caption = generate_tags_with_blip(current_path)
        ocr_text = extract_text_with_ocr(current_path)
        confirmed_tags = verify_tags_with_layoutlm(current_path, blip_tags, ocr_text)
        description = generate_clean_explanation_with_kosmos(current_path, confirmed_tags)

        report_path = create_pdf_report(
            current_path, blip_tags, ocr_text, confirmed_tags, description, report_dir
        )

        summary = {}
        summary['filename'] = display_name
        summary['tags'] = blip_tags
        summary['caption'] = caption
        summary['extracted_text'] = ocr_text
        summary['verified_tags'] = confirmed_tags
        summary['explanation'] = description
        summary['pdf_report'] = report_path
        return summary

    except Exception as err:
        print(f"Error processing {current_path}: {str(err)}")
        return None

In [72]:
# Cell 12
def process_all_images_in_directory(dir_path):
    """Run ``process_single_image`` on every image file in a directory.

    Only direct children of the directory are considered (no recursion).
    Files are selected by case-insensitive extension and processed in sorted
    name order, so runs are reproducible across platforms.

    Args:
        dir_path: Directory to scan (possibly quoted or relative).

    Returns:
        List of result dicts for the images that were processed
        successfully; an empty list when the directory is invalid or contains
        no image files.
    """
    dir_path = handle_path(dir_path)

    if not os.path.isdir(dir_path):
        print("Invalid directory path")
        return []

    # A tuple so it can be passed directly to str.endswith.
    valid_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp')
    # os.listdir returns entries in arbitrary order, so sort for determinism.
    # The isfile check skips sub-directories that merely look like images.
    image_files = sorted(
        name
        for name in os.listdir(dir_path)
        if name.lower().endswith(valid_extensions)
        and os.path.isfile(os.path.join(dir_path, name))
    )

    if not image_files:
        print("No valid image files found")
        return []

    print(f"Found {len(image_files)} images. Processing all...")

    results = []
    for i, filename in enumerate(image_files, 1):
        image_path = os.path.join(dir_path, filename)
        print(f"\n[{i}/{len(image_files)}] Processing: {filename}")

        result = process_single_image(image_path)
        if result:
            results.append(result)
            print(f"✓ Tags: {result['tags'][:6]}")
            print(f"✓ Text: {result['extracted_text'][:60]}...")

    return results

In [73]:
# Cell 13
# Directory of images to analyse. Set the LLM_TAGGER_DATA_DIR environment
# variable to point the notebook at another folder without editing this cell;
# the original location is kept as the default.
data_directory = os.environ.get(
    "LLM_TAGGER_DATA_DIR",
    "C:\\Users\\arnav\\Desktop\\LLM_Tagger\\data",
)
all_results = process_all_images_in_directory(data_directory)

print(f"\n{'='*60}")
print(f"PROCESSING COMPLETE - {len(all_results)} images processed")
print(f"{'='*60}")

# Per-image summary of what each pipeline stage produced.
for i, result in enumerate(all_results, 1):
    print(f"\nImage {i}: {result['filename']}")
    print(f"Tags: {result['tags']}")
    print(f"Text Found: {result['extracted_text']}")
    print(f"Verified Tags: {result['verified_tags']}")
    print(f"PDF: {os.path.basename(result['pdf_report'])}")

Found 5 images. Processing all...

[1/5] Processing: image1.jpeg
Processing: image1.jpeg
✓ Tags: ['indian', 'army', 'flag']
✓ Text: No text detected...

[2/5] Processing: image2.jpg
Processing: image2.jpg
✓ Tags: ['store', 'sign', 'clothing']
✓ Text: Up to 52499 up 5T{ '2499 the markot Mem CLEARANCE...

[3/5] Processing: image3.webp
Processing: image3.webp
✓ Tags: ['pole', 'sign']
✓ Text: Grey Fox Tr Waterfall Dr STOP...

[4/5] Processing: image4.jpg
Processing: image4.jpg
✓ Tags: ['baby', 'elephant', 'walking', 'her']
✓ Text: No text detected...

[5/5] Processing: image5.webp
Processing: image5.webp
✓ Tags: ['store', 'people', 'walking']
✓ Text: pepperfry com IG Store...

PROCESSING COMPLETE - 5 images processed

Image 1: image1.jpeg
Tags: ['indian', 'army', 'flag']
Text Found: No text detected
Verified Tags: ['indian', 'army', 'flag']
PDF: image1_analysis.pdf

Image 2: image2.jpg
Tags: ['store', 'sign', 'clothing']
Text Found: Up to 52499 up 5T{ '2499 the markot Mem CLEARANCE
Verifie

In [74]:
# Cell 14
def simple_text_extraction_fallback(image_path, lang='eng+hin'):
    """Extract text from an image with pytesseract.

    This function never raises: failures are reported with ``print`` and
    mapped to a sentinel string.

    Args:
        image_path: Path of the image file to read.
        lang: Language string handed to ``pytesseract.image_to_string``.
            Defaults to ``'eng+hin'``, the value this notebook used before
            the parameter existed.

    Returns:
        The stripped OCR text, ``"No text detected"`` when it is empty, or
        ``"No text extraction available"`` when OCR could not be run.
    """
    try:
        # Imported lazily: pytesseract is optional in this notebook.
        from PIL import Image
        import pytesseract
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang=lang).strip()
        return text or "No text detected"
    except Exception as err:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # is not swallowed; the reason is printed rather than hidden.
        print(f"Fallback OCR failed for {image_path}: {err}")
        return "No text extraction available"

def extract_text_with_ocr(image_path):
    """Extract text from an image via ``simple_text_extraction_fallback``.

    Running this cell rebinds ``extract_text_with_ocr``: the definition of
    the same name earlier in the notebook is shadowed from this point on.

    Args:
        image_path: Path of the image file to read.

    Returns:
        Whatever ``simple_text_extraction_fallback`` returns for the path.
    """
    fallback_text = simple_text_extraction_fallback(image_path)
    return fallback_text


print("Using fallback OCR method")

Using fallback OCR method
