In [75]:
# Cell 1
%pip install transformers torch pillow reportlab opencv-python datasets accelerate bitsandbytes pytesseract

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [76]:
# Cell 2
import os
import torch
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration, LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image as RLImage, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
import json
from datetime import datetime
import warnings
import re
warnings.filterwarnings('ignore')

In [77]:
# Cell 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [78]:
# Cell 4
print("Loading BLIP model...")
blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

print("Loading LayoutLMv3 model...")
layoutlm_processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
layoutlm_model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base").to(device)

print("All models loaded successfully!")

Loading BLIP model...
Loading LayoutLMv3 model...


Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


All models loaded successfully!


In [79]:
# Cell 5
def handle_path(input_path):
    input_path = input_path.strip().strip('"').strip("'")
    if os.path.isfile(input_path):
        return input_path
    elif os.path.isabs(input_path):
        return input_path
    else:
        return os.path.abspath(input_path)

def ensure_output_dir():
    output_dir = "output_reports"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    return output_dir

def validate_image_path(image_path):
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Path does not exist: {image_path}")
    if os.path.isdir(image_path):
        raise IsADirectoryError(f"Path is a directory, not a file: {image_path}")
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Path is not a valid file: {image_path}")
    valid_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
    file_ext = os.path.splitext(image_path.lower())[1]
    if file_ext not in valid_extensions:
        raise ValueError(f"Invalid image format. Supported: {valid_extensions}")
    return True

In [80]:
# Cell 6
def generate_comprehensive_tags(image_path):
    image = Image.open(image_path).convert('RGB')
    
    prompts = [
        "a photo of",
        "this image shows",
        "this is",
        "the image contains",
        "visible in this image:"
    ]
    
    all_tags = set()
    
    for prompt in prompts:
        inputs = blip_processor(image, text=prompt, return_tensors="pt").to(device)
        
        with torch.no_grad():
            outputs = blip_model.generate(**inputs, max_length=40, num_beams=3, temperature=0.7, do_sample=True)
        
        caption = blip_processor.decode(outputs[0], skip_special_tokens=True)
        caption_clean = caption.replace(prompt, "").strip()
        
        words = re.findall(r'\b[a-zA-Z]{3,}\b', caption_clean.lower())
        exclude_words = {'the', 'and', 'with', 'for', 'are', 'that', 'this', 'has', 'was', 'from', 'they', 'have', 'been', 'will', 'can', 'said', 'each', 'which', 'more', 'also', 'its', 'would', 'may', 'about', 'out', 'many', 'time', 'very', 'when', 'much', 'new', 'some', 'could', 'state', 'other', 'after', 'first', 'well', 'way', 'even', 'years', 'work', 'through', 'over', 'into', 'than', 'because', 'most', 'only', 'think', 'back', 'use', 'two', 'how', 'our', 'life', 'good', 'woman', 'just', 'form', 'great', 'help', 'line', 'turn', 'cause', 'mean', 'before', 'move', 'right', 'old', 'same', 'tell', 'does', 'set', 'three', 'want', 'air', 'play', 'small', 'end', 'put', 'home', 'read', 'hand', 'large', 'add', 'land', 'here', 'must', 'big', 'high', 'such', 'follow', 'act', 'why', 'ask', 'men', 'change', 'went', 'light', 'kind', 'off', 'need', 'house', 'picture', 'try', 'again', 'animal', 'point', 'mother', 'world', 'near', 'build', 'self', 'earth', 'father', 'head', 'stand', 'own', 'page', 'should', 'country', 'found', 'answer', 'school', 'grow', 'study', 'still', 'learn', 'plant', 'cover', 'food', 'sun', 'four', 'between', 'keep', 'eye', 'never', 'last', 'let', 'thought', 'city', 'cross', 'farm', 'hard', 'start', 'might', 'story', 'saw', 'far', 'sea', 'draw', 'left', 'late', 'run', 'while', 'press', 'close', 'night', 'real', 'few', 'north', 'open', 'seem', 'together', 'next', 'white', 'children', 'begin', 'got', 'walk', 'example', 'ease', 'paper', 'group', 'always', 'music', 'those', 'both', 'mark', 'often', 'letter', 'until', 'mile', 'river', 'car', 'feet', 'care', 'second', 'book', 'carry', 'took', 'science', 'eat', 'room', 'friend', 'began', 'idea', 'fish', 'mountain', 'stop', 'once', 'base', 'hear', 'horse', 'cut', 'sure', 'watch', 'color', 'face', 'wood', 'main', 'enough', 'plain', 'girl', 'usual', 'young', 'ready', 'above', 'ever', 'red', 'list', 'though', 'feel', 'talk', 'bird', 'soon', 'body', 'dog', 'family', 'direct', 'leave', 'song', 'measure', 'door', 'product', 'black', 'short', 'numeral', 'class', 'wind', 'question', 'happen', 'complete', 'ship', 'area', 'half', 'rock', 'order', 'fire', 'south', 'problem', 'piece', 'told', 'knew', 'pass', 'since', 'top', 'whole', 'king', 'space', 'heard', 'best', 'hour', 'better', 'during', 'hundred', 'five', 'remember', 'step', 'early', 'hold', 'west', 'ground', 'interest', 'reach', 'fast', 'verb', 'sing', 'listen', 'six', 'table', 'travel', 'less', 'morning', 'ten', 'simple', 'several', 'vowel', 'toward', 'war', 'lay', 'against', 'pattern', 'slow', 'center', 'love', 'person', 'money', 'serve', 'appear', 'road', 'map', 'rain', 'rule', 'govern', 'pull', 'cold', 'notice', 'voice', 'unit', 'power', 'town', 'fine', 'certain', 'fly', 'fall', 'lead', 'cry', 'dark', 'machine', 'note', 'wait', 'plan', 'figure', 'star', 'box', 'noun', 'field', 'rest', 'correct', 'able', 'pound', 'done', 'beauty', 'drive', 'stood', 'contain', 'front', 'teach', 'week', 'final', 'gave', 'green', 'quick', 'develop', 'ocean', 'warm', 'free', 'minute', 'strong', 'special', 'mind', 'behind', 'clear', 'tail', 'produce', 'fact', 'street', 'inch', 'multiply', 'nothing', 'course', 'stay', 'wheel', 'full', 'force', 'blue', 'object', 'decide', 'surface', 'deep', 'moon', 'island', 'foot', 'system', 'busy', 'test', 'record', 'boat', 'common', 'gold', 'possible', 'plane', 'stead', 'dry', 'wonder', 'laugh', 'thousands', 'ago', 'ran', 'check', 'game', 'shape', 'equate', 'hot', 'miss', 'brought', 'heat', 'snow', 'tire', 'bring', 'yes', 'distant', 'fill', 'east', 'paint', 'language', 'among'}
        
        filtered_words = [word for word in words if word not in exclude_words and len(word) > 2]
        all_tags.update(filtered_words)
    
    final_tags = list(all_tags)[:12]
    
    basic_inputs = blip_processor(image, return_tensors="pt").to(device)
    with torch.no_grad():
        basic_outputs = blip_model.generate(**basic_inputs, max_length=30, num_beams=2)
    basic_caption = blip_processor.decode(basic_outputs[0], skip_special_tokens=True)
    
    return final_tags, basic_caption

In [81]:
# Cell 7
def extract_text_with_ocr(image_path):
    try:
        import pytesseract
        from PIL import Image
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image, lang='eng')
        cleaned_text = re.sub(r'[^\w\s]', ' ', text)
        cleaned_text = ' '.join(cleaned_text.split())
        return cleaned_text.strip() if cleaned_text.strip() else "No text detected"
    except Exception as e:
        return "Text extraction failed"

In [82]:
# Cell 8
def verify_tags_with_layoutlm(image_path, tags, extracted_text):
    try:
        image = Image.open(image_path).convert('RGB')
        
        all_words = tags + extracted_text.split()
        words = [word.lower() for word in all_words if word.isalpha() and len(word) > 2][:80]
        
        if not words:
            return tags
        
        inputs = layoutlm_processor(image, words, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            outputs = layoutlm_model(**inputs)
        
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        verified_tags = []
        
        for i, word in enumerate(words[:min(len(words), predictions.shape[1])]):
            if predictions[0][i].max().item() > 0.2:
                verified_tags.append(word)
        
        unique_verified = list(set(verified_tags))
        return unique_verified[:10] if unique_verified else tags[:10]
        
    except Exception as e:
        return tags[:10]

In [83]:
# Cell 9
def generate_simple_explanation(image_path, verified_tags, caption):
    try:
        explanations = [
            f"This image shows {caption}.",
            f"The main elements visible include {', '.join(verified_tags[:5])}.",
            f"Key features identified are {', '.join(verified_tags[:4])} which define the visual content.",
            f"The image contains {', '.join(verified_tags[:6])} as primary visual elements.",
            f"Visual analysis reveals {', '.join(verified_tags[:5])} among other distinguishable features."
        ]
        
        import random
        selected_explanation = random.choice(explanations)
        return selected_explanation
        
    except Exception as e:
        return f"This image contains visual elements including {', '.join(verified_tags[:4])}."

In [84]:
# Cell 10
def create_single_comprehensive_report(all_results, output_dir):
    pdf_path = os.path.join(output_dir, "comprehensive_image_analysis_report.pdf")
    
    doc = SimpleDocTemplate(pdf_path, pagesize=letter, topMargin=36, bottomMargin=36)
    styles = getSampleStyleSheet()
    
    times_style = ParagraphStyle(
        'TimesRoman',
        parent=styles['Normal'],
        fontName='Times-Roman',
        fontSize=10,
        spaceAfter=4,
        spaceBefore=0
    )
    
    title_style = ParagraphStyle(
        'TimesTitle',
        parent=styles['Title'],
        fontName='Times-Bold',
        fontSize=16,
        spaceAfter=12,
        spaceBefore=0
    )
    
    subtitle_style = ParagraphStyle(
        'Subtitle',
        parent=styles['Normal'],
        fontName='Times-Bold',
        fontSize=12,
        spaceAfter=6,
        spaceBefore=8
    )
    
    story = []
    story.append(Paragraph("Comprehensive Image Analysis Report", title_style))
    story.append(Paragraph(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", times_style))
    story.append(Paragraph(f"Total Images Analyzed: {len(all_results)}", times_style))
    story.append(Spacer(1, 20))
    
    for i, result in enumerate(all_results, 1):
        story.append(Paragraph(f"Image {i}: {result['filename']}", subtitle_style))
        
        try:
            img = RLImage(result['image_path'], width=240, height=180)
            story.append(img)
        except:
            story.append(Paragraph("Image display failed", times_style))
        
        story.append(Spacer(1, 8))
        story.append(Paragraph(f"<b>AI-Generated Tags:</b> {', '.join(result['tags'])}", times_style))
        story.append(Paragraph(f"<b>BLIP Caption:</b> {result['caption']}", times_style))
        story.append(Paragraph(f"<b>Extracted Text:</b> {result['extracted_text']}", times_style))
        story.append(Paragraph(f"<b>Verified Tags:</b> {', '.join(result['verified_tags'])}", times_style))
        story.append(Paragraph(f"<b>Description:</b> {result['explanation']}", times_style))
        
        if i < len(all_results):
            story.append(PageBreak())
    
    doc.build(story)
    return pdf_path

In [85]:
# Cell 11
def process_single_image(image_path):
    try:
        image_path = handle_path(image_path)
        validate_image_path(image_path)
        
        print(f"Processing: {os.path.basename(image_path)}")
        
        tags, caption = generate_comprehensive_tags(image_path)
        extracted_text = extract_text_with_ocr(image_path)
        verified_tags = verify_tags_with_layoutlm(image_path, tags, extracted_text)
        explanation = generate_simple_explanation(image_path, verified_tags, caption)
        
        return {
            'filename': os.path.basename(image_path),
            'image_path': image_path,
            'tags': tags,
            'caption': caption,
            'extracted_text': extracted_text,
            'verified_tags': verified_tags,
            'explanation': explanation
        }
    
    except Exception as e:
        print(f"Error processing {image_path}: {str(e)}")
        return None

In [86]:
# Cell 12
def process_all_images_in_directory(dir_path):
    dir_path = handle_path(dir_path)
    
    if not os.path.isdir(dir_path):
        print("Invalid directory path")
        return []
    
    valid_extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.webp']
    image_files = [f for f in os.listdir(dir_path) if any(f.lower().endswith(ext) for ext in valid_extensions)]
    
    if not image_files:
        print("No valid image files found")
        return []
    
    print(f"Found {len(image_files)} images. Processing all...")
    
    results = []
    for i, filename in enumerate(image_files, 1):
        image_path = os.path.join(dir_path, filename)
        print(f"\n[{i}/{len(image_files)}] Processing: {filename}")
        
        result = process_single_image(image_path)
        if result:
            results.append(result)
            print(f"✓ Generated {len(result['tags'])} tags")
            print(f"✓ Tags: {result['tags']}")
    
    return results

In [87]:
# Cell 13
data_directory = "C:\\Users\\arnav\\Desktop\\LLM_Tagger\\data"
output_dir = ensure_output_dir()

print("Starting comprehensive image analysis...")
all_results = process_all_images_in_directory(data_directory)

if all_results:
    print(f"\nCreating single comprehensive report...")
    pdf_path = create_single_comprehensive_report(all_results, output_dir)
    
    print(f"\n{'='*80}")
    print(f"ANALYSIS COMPLETE!")
    print(f"{'='*80}")
    print(f"Total images processed: {len(all_results)}")
    print(f"Comprehensive report saved: {pdf_path}")
    print(f"{'='*80}")
    
    print(f"\nSUMMARY:")
    for i, result in enumerate(all_results, 1):
        print(f"\nImage {i}: {result['filename']}")
        print(f"Tags ({len(result['tags'])}): {', '.join(result['tags'])}")
        print(f"Caption: {result['caption']}")
        print(f"Text: {result['extracted_text'][:50]}{'...' if len(result['extracted_text']) > 50 else ''}")
else:
    print("No images were successfully processed.")

Starting comprehensive image analysis...
Found 5 images. Processing all...

[1/5] Processing: image1.jpeg
Processing: image1.jpeg
✓ Generated 12 tags
✓ Tags: ['india', 'army', 'image', 'bengal', 'png', 'article', 'visible', 'titled', 'news', 'flag', 'assam', 'indian']

[2/5] Processing: image2.jpg
Processing: image2.jpg
✓ Generated 8 tags
✓ Tags: ['says', 'clearance', 'store', 'sign', 'image', 'visible', 'window', 'sale']

[3/5] Processing: image3.webp
Processing: image3.webp
✓ Generated 3 tags
✓ Tags: ['sign', 'visible', 'image']

[4/5] Processing: image4.jpg
Processing: image4.jpg
✓ Generated 5 tags
✓ Tags: ['baby', 'image', 'elephant', 'her', 'visible']

[5/5] Processing: image5.webp
Processing: image5.webp
✓ Generated 12 tags
✓ Tags: ['past', 'mumbai', 'india', 'delhi', 'image', 'store', 'popular', 'place', 'people', 'bengal', 'visible', 'exterior']

Creating single comprehensive report...

ANALYSIS COMPLETE!
Total images processed: 5
Comprehensive report saved: output_reports\comp