In [8]:
import os
import torch
from PIL import Image
import matplotlib.pyplot as plt
from torchvision import transforms
from utils import caption_image
from model import CNN_to_LSTM
from preprocess import get_loader

In [11]:
!pip install huggingface_hub

Collecting huggingface_hub
  Using cached huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting pyyaml>=5.1 (from huggingface_hub)
  Using cached PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface_hub)
  Using cached hf_xet-1.1.5-cp37-abi3-macosx_11_0_arm64.whl.metadata (879 bytes)
Using cached huggingface_hub-0.33.2-py3-none-any.whl (515 kB)
Using cached hf_xet-1.1.5-cp37-abi3-macosx_11_0_arm64.whl (2.6 MB)
Using cached PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl (173 kB)
Installing collected packages: pyyaml, hf-xet, huggingface_hub
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [huggingface_hub] [huggingface_hub]
[1A[2KSuccessfully installed hf-xet-1.1.5 huggingface_hub-0.33.2 pyyaml-6.0.2


In [12]:
import torch
from huggingface_hub import hf_hub_download

# Download model checkpoint from Hugging Face
checkpoint_path = hf_hub_download(
    repo_id="sohumgautam/captioning-cnn-lstm",
    filename="pytorch_model.bin"
)

# Initialize the model
model = CNN_to_LSTM(embed_size=256, hidden_size=512, num_layers=2, vocab_size=5240)
model.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
model.eval()


  from .autonotebook import tqdm as notebook_tqdm


CNN_to_LSTM(
  (encoder): CNN_Encoder(
    (resnet): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (4): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace=True)
   

In [13]:

def test_specific_images(image_folder, dataset_path, captions_file, device):
    # Define image preprocessing (same as training)
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    
    # Get the dataset (just to access the vocabulary)
    _, dataset = get_loader(
        root_dir=dataset_path,
        captions_file=captions_file,
        transform=transform,
        batch_size=1  # Doesn't matter here
    )
    
    # Access the vocabulary from the dataset
    vocab = dataset.vocab
    
    # Process each image in the folder
    for filename in ['boy.png', 'boat.png', 'dog.jpg', 'horse.png', 'biker.jpg', 'man_bench.jpg']:
            # Load and process image
            img_path = os.path.join(image_folder, filename)
            image = Image.open(img_path).convert("RGB")
            image_tensor = transform(image).unsqueeze(0).to(device)
            
            # Generate caption
            generated_caption = caption_image(model, image_tensor, vocab)
            caption_text = " ".join(generated_caption)
            
            # Display results
            print(f"Image: {filename}")
            print(f"Caption: {caption_text}")
            print("-" * 50)
            
            # Optionally save the captioned image
            plt.figure(figsize=(8, 8))
            plt.imshow(image)
            plt.title(caption_text)
            plt.axis('off')
            plt.savefig(os.path.join(image_folder, f"captioned_{filename}"))
            plt.close()



In [14]:
# Parameters
test_image_folder = "test_images"  # Folder with your 4 test images
dataset_path = "data/images/"  # Original training images path
captions_file = "data/text.csv"  # Original captions file
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Run test
test_specific_images(test_image_folder, dataset_path, captions_file, device)

Image: boy.png
Caption: a young boy is playing in a yard with a hose .
--------------------------------------------------
Image: boat.png
Caption: a man fishes in the ocean .
--------------------------------------------------
Image: dog.jpg
Caption: a dog is running through a grassy field .
--------------------------------------------------
Image: horse.png
Caption: a group of people are sitting on a rock overlooking a body of water .
--------------------------------------------------
Image: biker.jpg
Caption: a man in a blue helmet is riding a bicycle on the street .
--------------------------------------------------
Image: man_bench.jpg
Caption: a man in a blue shirt and jeans is standing in a park .
--------------------------------------------------
