In [2]:
!pip install transformers torch torchvision pillow nltk

import os
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from nltk.translate.bleu_score import corpus_bleu

model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.max_length = 16
model.config.num_beams = 4

image_directory = "/content/"

ground_truth_captions = {
    "resize2.jpg": "a man and a woman standing next to a horse in a field",
    "two5.jpg": "two girls standing side by side and smiling",
    "a.jpg": "two women sitting at a table with laptops",
    "sample2.jpg": "people walking on a busy street with bicycles",
    "image.jpg": "a man balancing on a skateboard in a parking lot",
    "download2.jpg": "young men playing soccer on a green field",
    "github.jpg": "a man riding a bike on a busy street",
    "new_vid1.jpg": "a busy city street with cars and people",
    "photo.jpg": "a camera sitting on a wooden table",
    "resize.jpg": "a cowboy riding a horse in a field",
    "segmented_object_1.jpg": "a plastic container filled with items",
    "save_vid3.jpg": "people walking on a crowded city street",
}

generated_captions = []
reference_captions = []

for image_file in os.listdir(image_directory):
    if image_file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
        image_path = os.path.join(image_directory, image_file)

        try:
            image = Image.open(image_path)
            inputs = processor(images=image, return_tensors="pt").pixel_values
            outputs = model.generate(inputs)
            caption = tokenizer.decode(outputs[0], skip_special_tokens=True)

            print(f"Image: {image_file}")
            print(f"Generated Caption: {caption}")
            print("-" * 50)

            generated_captions.append(caption)
            if image_file in ground_truth_captions:
                reference_captions.append([ground_truth_captions[image_file]])

        except Exception as e:
            print(f"Error processing {image_file}: {e}")

if generated_captions and reference_captions:
    bleu_score = corpus_bleu(reference_captions, generated_captions)
    print(f"\nBLEU Score: {bleu_score:.2f}")
else:
    print("No captions generated or ground-truth captions available for evaluation.")

Defaulting to user installation because normal site-packages is not writeable


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

FileNotFoundError: [WinError 3] The system cannot find the path specified: '/content/'

In [2]:
!pip install transformers torch torchvision pillow

# Import necessary libraries
from PIL import Image
import os
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import logging

# Suppress transformers logging
logging.getLogger("transformers").setLevel(logging.ERROR)

# Load the pre-trained model and tokenizer
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Configure the model
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.max_length = 16
model.config.num_beams = 4

# Braille dictionary
braille_dict = {
    'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑',
    'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚',
    'k': '⠅', 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕',
    'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞',
    'u': '⠥', 'v': '⠧', 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵',
    ' ': ' ', '1': '⠼⠁', '2': '⠼⠃', '3': '⠼⠉', '4': '⠼⠙',
    '5': '⠼⠑', '6': '⠼⠋', '7': '⠼⠛', '8': '⠼⠓', '9': '⠼⠊', '0': '⠼⠚'
}

# Function to convert text to Braille
def text_to_braille(text):
    return ''.join(braille_dict.get(char.lower(), '?') for char in text)

# Function to generate caption and Braille from an image
def generate_caption_and_braille(image_path):
    try:
        # Load and process the image
        image = Image.open(image_path)
        inputs = processor(images=image, return_tensors="pt").pixel_values
        
        # Generate caption
        outputs = model.generate(inputs)
        caption = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Convert caption to Braille
        braille_text = text_to_braille(caption)
        
        # Display results
        print(f"Generated Caption for {os.path.basename(image_path)}: {caption}")
        print(f"Braille Output: {braille_text}")
    except Exception as e:
        print(f"Error processing the image {os.path.basename(image_path)}: {e}")

# Request the image file path from the local directory
local_image_path = input("Enter the relative or absolute path to the image (e.g., 'image.jpg'): ")

# Generate caption and Braille output
generate_caption_and_braille(local_image_path)


Enter the relative or absolute path to the image (e.g., 'image.jpg'): imgg2.jpg




Generated Caption for imgg2.jpg: a green book is sitting on top of a wooden table 
Braille Output: ⠁ ⠛⠗⠑⠑⠝ ⠃⠕⠕⠅ ⠊⠎ ⠎⠊⠞⠞⠊⠝⠛ ⠕⠝ ⠞⠕⠏ ⠕⠋ ⠁ ⠺⠕⠕⠙⠑⠝ ⠞⠁⠃⠇⠑ 


In [5]:
!pip install transformers torch torchvision pillow

# Import necessary libraries
from PIL import Image
import os
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import logging

# Suppress transformers logging
logging.getLogger("transformers").setLevel(logging.ERROR)

# Load the pre-trained model and tokenizer
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Configure the model
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.max_length = 16
model.config.num_beams = 4

# Braille dictionary
braille_dict = {
    'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑',
    'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚',
    'k': '⠅', 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕',
    'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞',
    'u': '⠥', 'v': '⠧', 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵',
    ' ': ' ', '1': '⠼⠁', '2': '⠼⠃', '3': '⠼⠉', '4': '⠼⠙',
    '5': '⠼⠑', '6': '⠼⠋', '7': '⠼⠛', '8': '⠼⠓', '9': '⠼⠊', '0': '⠼⠚'
}

# Function to convert text to Braille
def text_to_braille(text):
    return ''.join(braille_dict.get(char.lower(), '?') for char in text)

# Function to generate caption and Braille from an image
def generate_caption_and_braille(image_path):
    try:
        # Load and process the image
        image = Image.open(image_path)
        inputs = processor(images=image, return_tensors="pt").pixel_values
        
        # Generate caption
        outputs = model.generate(inputs)
        caption = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Convert caption to Braille
        braille_text = text_to_braille(caption)
        
        # Display results
        print(f"Generated Caption for {os.path.basename(image_path)}: {caption}")
        print(f"Braille Output: {braille_text}")
    except Exception as e:
        print(f"Error processing the image {os.path.basename(image_path)}: {e}")

# Request the image file path from the local directory
local_image_path = input("Enter the relative or absolute path to the image (e.g., 'image.jpg'): ")

# Generate caption and Braille output
generate_caption_and_braille(local_image_path)


Enter the relative or absolute path to the image (e.g., 'image.jpg'): imgg1.jpg
Generated Caption for imgg1.jpg: a cell phone sitting on top of a wooden desk 
Braille Output: ⠁ ⠉⠑⠇⠇ ⠏⠓⠕⠝⠑ ⠎⠊⠞⠞⠊⠝⠛ ⠕⠝ ⠞⠕⠏ ⠕⠋ ⠁ ⠺⠕⠕⠙⠑⠝ ⠙⠑⠎⠅ 


In [4]:
!pip install transformers torch torchvision pillow

# Import necessary libraries
from PIL import Image
import os
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import logging

# Suppress transformers logging
logging.getLogger("transformers").setLevel(logging.ERROR)

# Load the pre-trained model and tokenizer
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Configure the model
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.max_length = 16
model.config.num_beams = 4

# Braille dictionary
braille_dict = {
    'a': '⠁', 'b': '⠃', 'c': '⠉', 'd': '⠙', 'e': '⠑',
    'f': '⠋', 'g': '⠛', 'h': '⠓', 'i': '⠊', 'j': '⠚',
    'k': '⠅', 'l': '⠇', 'm': '⠍', 'n': '⠝', 'o': '⠕',
    'p': '⠏', 'q': '⠟', 'r': '⠗', 's': '⠎', 't': '⠞',
    'u': '⠥', 'v': '⠧', 'w': '⠺', 'x': '⠭', 'y': '⠽', 'z': '⠵',
    ' ': ' ', '1': '⠼⠁', '2': '⠼⠃', '3': '⠼⠉', '4': '⠼⠙',
    '5': '⠼⠑', '6': '⠼⠋', '7': '⠼⠛', '8': '⠼⠓', '9': '⠼⠊', '0': '⠼⠚'
}

# Function to convert text to Braille
def text_to_braille(text):
    return ''.join(braille_dict.get(char.lower(), '?') for char in text)

# Function to generate caption and Braille from an image
def generate_caption_and_braille(image_path):
    try:
        # Load and process the image
        image = Image.open(image_path)
        inputs = processor(images=image, return_tensors="pt").pixel_values
        image.show()
        
        # Generate caption
        outputs = model.generate(inputs)
        caption = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Convert caption to Braille
        braille_text = text_to_braille(caption)
        
        # Display results
        print(f"Generated Caption for {os.path.basename(image_path)}: {caption}")
        print(f"Braille Output: {braille_text}")
    except Exception as e:
        print(f"Error processing the image {os.path.basename(image_path)}: {e}")

# Request the image file path from the local directory
local_image_path = input("Enter the relative or absolute path to the image (e.g., 'image.jpg'): ")

# Generate caption and Braille output
generate_caption_and_braille(local_image_path)


Enter the relative or absolute path to the image (e.g., 'image.jpg'): imgg3.jpg
Generated Caption for imgg3.jpg: a green plastic bottle sitting on top of a wooden table 
Braille Output: ⠁ ⠛⠗⠑⠑⠝ ⠏⠇⠁⠎⠞⠊⠉ ⠃⠕⠞⠞⠇⠑ ⠎⠊⠞⠞⠊⠝⠛ ⠕⠝ ⠞⠕⠏ ⠕⠋ ⠁ ⠺⠕⠕⠙⠑⠝ ⠞⠁⠃⠇⠑ 


In [1]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

# Initialize model, processor, and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Dataset class
class ImageCaptionDataset(torch.utils.data.Dataset):
    def __init__(self, captions_file, image_dir, processor):
        self.image_dir = image_dir
        self.processor = processor
        self.captions_df = pd.read_csv(captions_file, names=["filename", "caption"])
    
    def __len__(self):
        return len(self.captions_df)
    
    def __getitem__(self, idx):
        row = self.captions_df.iloc[idx]
        image_path = os.path.normpath(os.path.join(self.image_dir, row['filename']))
        
        if not os.path.exists(image_path):  # Check if file exists
            print(f"File does not exist: {image_path}")
            return None, None

        try:
            # Load and preprocess the image
            image = Image.open(image_path).convert("RGB")
            pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
            caption = row['caption']
            return pixel_values.squeeze(0), caption
        except Exception as e:
            print(f"Error loading image {image_path}: {e}")
            return None, None


# Custom collate_fn
def custom_collate_fn(batch):
    return [(img, cap) for img, cap in batch if img is not None and cap is not None]

# Generate captions
def generate_captions(model, dataloader, tokenizer):
    model.eval()
    all_results = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing Batches"):
            valid_batch = [(img, cap) for img, cap in batch if img is not None and cap is not None]
            if not valid_batch:
                continue
            
            images, captions = zip(*valid_batch)
            images = torch.stack(images).to(device)
            outputs = model.generate(images, max_length=64, num_beams=4)
            generated_captions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            
            for orig_cap, gen_cap in zip(captions, generated_captions):
                all_results.append({
                    "generated_caption": gen_cap,
                    "original_caption": orig_cap
                })
    return all_results

# Paths
captions_file = "captions_8k.txt"
image_dir = r"C:\Users\rosyd\Coding Files\Mini Project\Images"

# Dataset and DataLoader
dataset = ImageCaptionDataset(captions_file, image_dir, processor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=custom_collate_fn)

# Generate captions
results = generate_captions(model, dataloader, tokenizer)

# Save results to CSV
output_csv = "generated_captions.csv"
pd.DataFrame(results).to_csv(output_csv, index=False)
print(f"Results saved to {output_csv}")


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

KeyboardInterrupt: 

In [None]:
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

# Initialize model, processor, and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning").to(device)
processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Dataset class
class ImageCaptionDataset(Dataset):
    def __init__(self, captions_file, image_dir, processor):
        self.image_dir = image_dir
        self.processor = processor
        self.captions_df = pd.read_csv(captions_file, names=["filename", "caption"])
    
    def __len__(self):
        return len(self.captions_df)
    
    def __getitem__(self, idx):
        row = self.captions_df.iloc[idx]
        image_path = os.path.normpath(os.path.join(self.image_dir, row['filename']))
        
        if not os.path.exists(image_path):
            # Skip missing files silently
            return None, None

        try:
            # Load and preprocess the image
            image = Image.open(image_path).convert("RGB")
            pixel_values = self.processor(images=image, return_tensors="pt").pixel_values
            caption = row['caption']
            return pixel_values.squeeze(0), caption
        except Exception as e:
            # Skip problematic images
            return None, None

# Custom collate function
def custom_collate_fn(batch):
    return [(img, cap) for img, cap in batch if img is not None and cap is not None]

# Generate captions
def generate_captions(model, dataloader, tokenizer):
    model.eval()
    all_results = []
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing Batches"):
            valid_batch = [(img, cap) for img, cap in batch if img is not None and cap is not None]
            if not valid_batch:
                continue
            
            images, captions = zip(*valid_batch)
            images = torch.stack(images).to(device)
            outputs = model.generate(images, max_length=64, num_beams=4)
            generated_captions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            
            for orig_cap, gen_cap in zip(captions, generated_captions):
                all_results.append({
                    "generated_caption": gen_cap,
                    "original_caption": orig_cap
                })
    return all_results

# Paths
captions_file = r"captions_8k.txt"
image_dir = r"C:\Users\rosyd\Coding Files\Mini Project\Images"

# Dataset and DataLoader
dataset = ImageCaptionDataset(captions_file, image_dir, processor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=custom_collate_fn)

# Generate captions
results = generate_captions(model, dataloader, tokenizer)

# Save results to CSV
output_csv = "generated_captions.csv"
pd.DataFrame(results).to_csv(output_csv, index=False)
print(f"Results saved to {output_csv}")


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang