In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# Build the image encoder: a pre-trained ResNet-50 whose last child module
# (the final fully connected classification layer) is dropped.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm the installed version before switching.
resnet = nn.Sequential(*list(models.resnet50(pretrained=True).children())[:-1])
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module structure.
resnet.eval()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 116MB/s]


Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read images from it.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Function to preprocess the image and extract features using ResNet
def extract_features(image_path):
    """Return the ResNet features for the image stored at ``image_path``.

    The image is converted to 3-channel RGB before preprocessing: the
    Normalize step is configured with three per-channel statistics and the
    first ResNet convolution takes 3 input channels, so grayscale or RGBA
    files would otherwise fail.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        The output of the module-level ``resnet`` for a batch holding this
        single image, with all size-1 dimensions squeezed away.
    """
    transform = transforms.Compose([
        transforms.Resize((224, 224)),      # Resize the image to (224, 224) for ResNet input
        transforms.ToTensor(),              # Convert image to tensor
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))  # Normalize the image
    ])

    # The context manager releases the file handle; convert() returns an
    # independent in-memory copy, so the image stays usable afterwards.
    with Image.open(image_path) as source:
        image = source.convert('RGB')
    image = transform(image).unsqueeze(0)  # Add batch dimension
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        features = resnet(image)
    return features.squeeze()

# Smoke test: run the feature extractor on one image stored on the mounted Drive.
image_path = "/content/drive/MyDrive/water.jpeg"
image_features = extract_features(image_path=image_path)

In [None]:
import torch.nn as nn
import torch.optim as optim

# LSTM caption decoder that receives the image vector as the first sequence step
class CaptioningModel(nn.Module):
    """Embedding -> LSTM -> linear vocabulary scorer.

    The image feature vector is prepended to the embedded caption along
    dimension 0, so its last dimension has to equal ``input_size``.
    """

    def __init__(self, input_size, hidden_size, vocab_size):
        """Create the layers.

        Args:
            input_size: Width of the word embeddings and of the LSTM input.
            hidden_size: Width of the LSTM hidden state.
            vocab_size: Number of tokens in the vocabulary.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(vocab_size, input_size)
        self.rnn = nn.LSTM(input_size, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, image_features, captions):
        """Score every vocabulary entry at each step of image + caption.

        Args:
            image_features: Image vector(s) whose last dimension is ``input_size``.
            captions: Integer token ids; dimension 0 is the sequence axis.

        Returns:
            Vocabulary scores with one more step along dimension 0 than
            ``captions`` (the extra step is the image).
        """
        word_vectors = self.embedding(captions)
        # [None] adds a leading axis of length 1, i.e. one extra sequence step.
        sequence = torch.cat([image_features[None], word_vectors], dim=0)
        hidden_states, _final_state = self.rnn(sequence)
        return self.fc(hidden_states)

# Example usage
# forward() prepends the image vector to the embedded caption as an extra
# sequence step, so input_size must equal the width of the ResNet features.
input_size = 2048  # ResNet-50 feature size once the final fc layer is removed (512 would not match)
hidden_size = 256  # Size of the hidden layer in LSTM
vocab_size = 10000  # The size of the vocabulary (number of unique words)
captioning_model = CaptioningModel(input_size, hidden_size, vocab_size)

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
from google.colab import drive
drive.mount('/content/drive')

# Image encoder: pre-trained ResNet-50 without its last child module (the
# final fully connected classification layer), kept in evaluation mode.
resnet = nn.Sequential(*list(models.resnet50(pretrained=True).children())[:-1])
resnet.eval()

# Preprocessing applied to every image before it is fed to the encoder.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),  # fixed 224x224 input
    transforms.ToTensor(),               # PIL image -> tensor
    transforms.Normalize(                # per-channel normalisation
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])

def extract_features(image_path):
    """Encode one image file with the module-level ``resnet``.

    The file is opened with PIL, converted to RGB, run through the
    module-level ``transform`` and given a leading batch axis of size 1.
    The encoder output is returned with all size-1 dimensions squeezed away.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    batch = transform(rgb_image)[None]  # same as unsqueeze(0): batch of one
    with torch.no_grad():
        return resnet(batch).squeeze()

# Define the RNN-based captioning model
class CaptioningModel(nn.Module):
    """LSTM decoder that is shown the image features at every time step.

    ``forward`` repeats the image feature vector along the sequence axis and
    concatenates it with the word embeddings on the last axis, so the LSTM
    input width is ``input_size + feature_dim``. Sizing the LSTM with
    ``input_size`` alone fails with "input.size(-1) must be equal to
    input_size".
    """

    def __init__(self, input_size, hidden_size, vocab_size, feature_dim=2048):
        """Create the layers.

        Args:
            input_size: Width of the word embeddings.
            hidden_size: Width of the LSTM hidden state.
            vocab_size: Number of tokens in the vocabulary.
            feature_dim: Width of the image feature vectors passed to
                ``forward``. Defaults to 2048, the feature size produced by
                the headless ResNet-50 used in this notebook.
        """
        super(CaptioningModel, self).__init__()
        self.hidden_size = hidden_size
        self.feature_dim = feature_dim
        self.embedding = nn.Embedding(vocab_size, input_size)
        # The LSTM consumes [image features ; word embedding] at each step.
        self.rnn = nn.LSTM(input_size + feature_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, image_features, captions):
        """Return vocabulary scores for every caption position.

        Args:
            image_features: Tensor of shape (batch, feature_dim).
            captions: Integer token ids of shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, seq_len, vocab_size).
        """
        embeds = self.embedding(captions)
        # Repeat the image vector once per caption step: (batch, seq_len, feature_dim).
        image_features = image_features.unsqueeze(1).expand(-1, embeds.size(1), -1)
        inputs = torch.cat((image_features, embeds), dim=2)
        lstm_out, _ = self.rnn(inputs)
        output = self.fc(lstm_out)
        return output

# Toy vocabulary standing in for a real tokenizer: a token's id is its
# position in the list below.
vocab = dict((token, position) for position, token in enumerate(["<start>", "<end>", "a", "cat", "on", "mat"]))
inv_vocab = {token_id: token for token, token_id in vocab.items()}

# Model hyper-parameters
input_size = 512  # Word-embedding width passed to CaptioningModel
hidden_size = 512  # Size of the hidden layer in LSTM
vocab_size = len(vocab)  # One output score per vocabulary entry
captioning_model = CaptioningModel(input_size, hidden_size, vocab_size)

# NOTE: the checkpoint load below is commented out, so the model keeps its
# random initial weights and any caption it produces is not meaningful.
# captioning_model.load_state_dict(torch.load('model.pth'))

# Greedy caption decoding
def generate_caption(image_path, model, max_length=20):
    """Greedily decode a caption for the image at ``image_path``.

    Starting from ``<start>``, the tokens produced so far are fed to
    ``model`` and the highest-scoring token at the last position is appended.
    Decoding stops after ``max_length`` steps or once ``<end>`` is produced.

    Side effect: ``model`` is switched to evaluation mode.

    Returns:
        The decoded tokens (including ``<start>``) joined by single spaces.
    """
    model.eval()
    feats = extract_features(image_path)[None]  # leading batch axis of size 1
    token_ids = [vocab["<start>"]]
    with torch.no_grad():
        for _step in range(max_length):
            # Re-run the model on the whole prefix; shape (1, len(token_ids)).
            scores = model(feats, torch.tensor([token_ids]))
            predicted = scores[0, -1].argmax().item()
            token_ids.append(predicted)
            if predicted == vocab["<end>"]:
                break
    return ' '.join(inv_vocab[token_id] for token_id in token_ids)

# Caption one image from the mounted Drive and show the result.
image_path = "/content/drive/MyDrive/water.jpeg"
caption = generate_caption(image_path, model=captioning_model)
print("Generated Caption:", caption)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




RuntimeError: input.size(-1) must be equal to input_size. Expected 512, got 2560

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from google.colab import drive
drive.mount('/content/drive')
# Pre-trained ResNet-50 used as a fixed image encoder
resnet = models.resnet50(pretrained=True)
# Keep every child module except the last one (the final fully connected layer)
resnet = nn.Sequential(*tuple(resnet.children())[:-1])
# eval() returns the module itself, so rebinding keeps the same object
resnet = resnet.eval()

# Preprocessing pipeline: resize -> tensor -> per-channel normalisation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

def extract_features(image_path):
    """Return the squeezed output of the module-level ``resnet`` for one image.

    The image is loaded as RGB, preprocessed with the module-level
    ``transform`` and wrapped in a batch of size 1 before encoding.
    """
    pixels = transform(Image.open(image_path).convert('RGB'))
    with torch.no_grad():
        pooled = resnet(pixels.unsqueeze(0))  # batch of one image
    return pooled.squeeze()

# LSTM caption decoder that treats the projected image vector as the first token
class CaptioningModel(nn.Module):
    """Project image features to the embedding width and decode with an LSTM."""

    def __init__(self, feature_dim, embed_size, hidden_size, vocab_size):
        """Create the layers.

        Args:
            feature_dim: Width of the incoming image feature vectors.
            embed_size: Width shared by word embeddings and projected features.
            hidden_size: Width of the LSTM hidden state.
            vocab_size: Number of tokens in the vocabulary.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.feature_transform = nn.Linear(feature_dim, embed_size)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, image_features, captions):
        """Return vocabulary scores for the image step plus every caption step.

        The projected image vector becomes one extra step placed in front of
        the embedded caption along dimension 1.
        """
        visual_step = self.feature_transform(image_features)[:, None]  # insert axis 1
        word_steps = self.embedding(captions)
        sequence = torch.cat([visual_step, word_steps], dim=1)
        hidden_states, _final_state = self.rnn(sequence)
        return self.fc(hidden_states)

# Placeholder vocabulary (a real application would use a proper tokenizer);
# token ids are the positions in this list.
vocab = {token: token_id for token_id, token in enumerate(["<start>", "<end>", "a", "cat", "on", "mat","dog"])}
inv_vocab = dict((token_id, token) for token, token_id in vocab.items())

# Model hyper-parameters
feature_dim = 2048  # ResNet output size
embed_size = 512  # Embedding size
hidden_size = 512  # Size of the hidden layer in LSTM
vocab_size = len(vocab)  # The size of the vocabulary
captioning_model = CaptioningModel(
    feature_dim=feature_dim,
    embed_size=embed_size,
    hidden_size=hidden_size,
    vocab_size=vocab_size,
)

# NOTE: loading trained weights is commented out, so the model below runs
# with random initial weights and its captions are not meaningful.
# captioning_model.load_state_dict(torch.load('model.pth'))

# Greedy caption decoding
def generate_caption(image_path, model, max_length=20):
    """Produce a caption for ``image_path`` by greedy decoding.

    At each step the whole token prefix is passed to ``model`` and the
    arg-max token of the last position is appended. The loop ends after
    ``max_length`` steps or when ``<end>`` is produced. ``model`` is put
    into evaluation mode as a side effect.

    Returns:
        Space-joined tokens, including the leading ``<start>``.
    """
    model.eval()
    encoded_image = extract_features(image_path)
    encoded_image = encoded_image.unsqueeze(0)  # batch of one
    caption = [vocab["<start>"]]
    for _ in range(max_length):
        prefix = torch.tensor(caption).unsqueeze(0)
        with torch.no_grad():
            last_step_scores = model(encoded_image, prefix)[0, -1]
        best_id = last_step_scores.argmax().item()
        caption.append(best_id)
        if best_id == vocab["<end>"]:
            break
    words = []
    for token_id in caption:
        words.append(inv_vocab[token_id])
    return ' '.join(words)

# Caption one image from the mounted Drive and print it.
image_path = "/content/drive/MyDrive/cats.jpeg"
caption = generate_caption(image_path=image_path, model=captioning_model)
print("Generated Caption:", caption)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Generated Caption: <start> a <end>


In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from google.colab import drive
drive.mount('/content/drive')

# Headless ResNet-50 encoder: pre-trained weights, last child module (the
# final fully connected layer) removed, evaluation mode enabled in one go.
resnet = nn.Sequential(*list(models.resnet50(pretrained=True).children())[:-1]).eval()

# Image preprocessing: 224x224 resize, tensor conversion, normalisation.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
)

def extract_features(image_path):
    """Load an image as RGB and return its squeezed ``resnet`` output.

    Uses the module-level ``transform`` for preprocessing and runs the
    encoder on a batch containing only this image, without gradients.
    """
    with torch.no_grad():
        source = Image.open(image_path).convert('RGB')
        encoded = resnet(transform(source).unsqueeze(dim=0))
    return encoded.squeeze()

# Image-conditioned LSTM decoder
class CaptioningModel(nn.Module):
    """Caption decoder: linear image projection, word embedding, LSTM, linear scorer.

    The image features are projected to ``embed_size`` and prepended to the
    caption embeddings as the first sequence element.
    """

    def __init__(self, feature_dim, embed_size, hidden_size, vocab_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Maps image features (feature_dim) into the word-embedding space.
        self.feature_transform = nn.Linear(in_features=feature_dim, out_features=embed_size)
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.rnn = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        # Maps each LSTM state to one score per vocabulary entry.
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, image_features, captions):
        """Score the vocabulary at every position of [image, caption tokens]."""
        projected = self.feature_transform(image_features)
        projected = projected.unsqueeze(dim=1)  # a single leading sequence step
        embedded = self.embedding(captions)
        lstm_out = self.rnn(torch.cat((projected, embedded), dim=1))[0]
        return self.fc(lstm_out)

# Stand-in vocabulary: a real application would build this from a tokenizer.
# Ids follow the order of the list.
vocab = dict((word, index) for index, word in enumerate(["<start>", "<end>", "a", "cat", "on", "mat", "dog"]))
inv_vocab = {index: word for word, index in vocab.items()}

# Decoder configuration
feature_dim = 2048  # ResNet output size
embed_size = 512  # Embedding size
hidden_size = 512  # Size of the hidden layer in LSTM
vocab_size = len(vocab)  # The size of the vocabulary
captioning_model = CaptioningModel(feature_dim, embed_size, hidden_size, vocab_size=vocab_size)

# NOTE: no trained weights are loaded (the line below is commented out), so
# the captions come from a randomly initialised model.
# captioning_model.load_state_dict(torch.load('model.pth'))

# Greedy caption decoding (ids missing from inv_vocab are skipped in the output)
def generate_caption(image_path, model, max_length=20):
    """Greedily decode a caption for the image at ``image_path``.

    The token prefix is re-fed to ``model`` at each step and the arg-max of
    the last position is appended, for at most ``max_length`` steps or until
    ``<end>`` appears. ``model`` is set to evaluation mode as a side effect.

    Returns:
        The tokens whose ids exist in ``inv_vocab``, joined by spaces.
    """
    model.eval()
    image_batch = extract_features(image_path).unsqueeze(0)  # batch of one
    generated = [vocab["<start>"]]
    with torch.no_grad():
        for _ in range(max_length):
            logits = model(image_batch, torch.tensor(generated).unsqueeze(0))
            choice = logits[0, -1].argmax().item()
            generated.append(choice)
            if choice == vocab["<end>"]:
                break
    # Keep only ids that can be mapped back to a token.
    known_words = []
    for token_id in generated:
        if token_id in inv_vocab:
            known_words.append(inv_vocab[token_id])
    return ' '.join(known_words)

# Run the decoder on a sample image from Drive.
image_path = "/content/drive/MyDrive/cats.jpeg"
caption = generate_caption(image_path, captioning_model, max_length=20)
print("Generated Caption:", caption)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Generated Caption: <start> dog a cat on on dog on on dog on on dog on on dog on on dog on on


In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from google.colab import drive

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Image encoder: pre-trained ResNet-50 with its last child module (the final
# fully connected layer) stripped, switched to evaluation mode.
resnet = nn.Sequential(*list(models.resnet50(pretrained=True).children())[:-1])
resnet.eval()

# Preprocessing pipeline shared by every call to extract_features
transform = transforms.Compose(transforms=[
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])

def extract_features(image_path):
    """Return the squeezed encoder output for the image at ``image_path``.

    Steps: open with PIL, convert to RGB, apply the module-level
    ``transform``, add a batch axis, run the module-level ``resnet``
    without gradient tracking.
    """
    picture = Image.open(image_path)
    picture = picture.convert('RGB')
    model_input = transform(picture)
    model_input = model_input.unsqueeze(0)  # batch of one
    with torch.no_grad():
        return resnet(model_input).squeeze()

# Caption decoder: projected image vector followed by word embeddings, fed to an LSTM
class CaptioningModel(nn.Module):
    """LSTM captioner whose first input step is the projected image vector."""

    def __init__(self, feature_dim, embed_size, hidden_size, vocab_size):
        """Build projection, embedding, recurrent and output layers.

        Args:
            feature_dim: Width of the image feature vectors.
            embed_size: Common width of projected features and word embeddings.
            hidden_size: LSTM hidden-state width.
            vocab_size: Number of vocabulary entries to score.
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.feature_transform = nn.Linear(feature_dim, embed_size)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def _build_inputs(self, image_features, captions):
        """Return [projected image step, embedded caption steps] joined on dim 1."""
        image_step = self.feature_transform(image_features).unsqueeze(1)
        caption_steps = self.embedding(captions)
        return torch.cat((image_step, caption_steps), dim=1)

    def forward(self, image_features, captions):
        """Return vocabulary scores for the image step and each caption step."""
        states, _ = self.rnn(self._build_inputs(image_features, captions))
        return self.fc(states)

# Placeholder vocabulary: token -> id (list position) and the reverse mapping.
vocab = {entry: number for number, entry in enumerate(["<start>", "<end>", "a", "cat", "on", "mat", "dog"])}
inv_vocab = dict((number, entry) for entry, number in vocab.items())

# Decoder configuration
feature_dim = 2048  # ResNet output size
embed_size = 512  # Embedding size
hidden_size = 512  # Size of the hidden layer in LSTM
vocab_size = len(vocab)  # The size of the vocabulary
captioning_model = CaptioningModel(feature_dim, embed_size, hidden_size=hidden_size, vocab_size=vocab_size)

# Trained weights are NOT loaded here (replace with an actual trained model);
# until then the model runs with random initial weights.
# captioning_model.load_state_dict(torch.load('model.pth'))

# Greedy caption decoding (the <start> token is omitted from the returned text)
def generate_caption(image_path, model, max_length=20):
    """Greedily decode a caption for the image at ``image_path``.

    Each step feeds the tokens produced so far to ``model`` and appends the
    arg-max token of the last position; decoding stops after ``max_length``
    steps or when ``<end>`` is produced. ``model`` is switched to evaluation
    mode as a side effect.

    Returns:
        Space-joined tokens, skipping ids that are not in ``inv_vocab`` and
        every occurrence of the ``<start>`` id.
    """
    model.eval()
    image_batch = extract_features(image_path)[None]  # batch of one
    sequence = [vocab["<start>"]]
    for _ in range(max_length):
        prefix = torch.tensor([sequence])  # shape (1, len(sequence))
        with torch.no_grad():
            step_scores = model(image_batch, prefix)
        winner = step_scores[0, -1].argmax().item()
        sequence.append(winner)
        if winner == vocab["<end>"]:
            break
    visible = []
    for token_id in sequence:
        # Skip unknown ids and the <start> marker.
        if token_id not in inv_vocab or token_id == vocab["<start>"]:
            continue
        visible.append(inv_vocab[token_id])
    return ' '.join(visible)

# Generate and print a caption for a sample image on Drive.
image_path = "/content/drive/MyDrive/cats.jpeg"
caption = generate_caption(image_path, model=captioning_model)
print("Generated Caption:", caption)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Generated Caption: mat <end>
