In [None]:
import os
import torch
import torch.cuda as cuda
from torch.utils.data import Dataset, DataLoader
from transformers import VisionEncoderDecoderModel, AdamW, get_scheduler, AutoTokenizer
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm

In [None]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device('cuda' if cuda.is_available() else 'cpu')

# Report the choice so the notebook output records where the run happened.
if device.type == 'cuda':
    print(f'Using GPU: {device}')
else:
    print('Using CPU')

In [None]:
# Tokenizer used to decode generated caption ids back into text.
decoder_tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    add_special_tokens=True,
)

# Register a padding token only when the loaded tokenizer does not define one.
if decoder_tokenizer.pad_token is None:
    decoder_tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

In [None]:
# Define the dataset class
class ImageCaptionDataset(Dataset):
    """Dataset pairing pre-processed image tensors with pre-tokenized captions.

    Every entry of ``image_dir`` is read with ``torch.load``. Its caption is
    read from ``caption_dir`` under the name ``<image stem>_<index % 4>.txt``
    and must contain whitespace-separated integer token ids.

    Args:
        image_dir: Directory holding one serialized tensor per image.
        caption_dir: Directory holding the caption token files.
        tokenizer: Stored on the instance for callers; item loading does not
            use it because captions are already token ids.
    """

    def __init__(self, image_dir, caption_dir, tokenizer):
        self.image_dir = image_dir
        self.caption_dir = caption_dir
        self.tokenizer = tokenizer
        # os.listdir() returns entries in arbitrary order. Sorting makes the
        # index -> image mapping (and therefore the caption picked through
        # ``index % 4``) reproducible across runs and machines.
        self.image_files = sorted(os.listdir(image_dir))
        self.caption_files = sorted(os.listdir(caption_dir))

    def __len__(self):
        """Return the number of image files found in ``image_dir``."""
        return len(self.image_files)

    def __getitem__(self, index):
        """Load one sample.

        Returns:
            dict with ``pixel_values`` (the stored image tensor, on CPU) and
            ``caption_token`` (1-D ``torch.long`` tensor of token ids).

        Raises:
            FileNotFoundError: If the expected caption file does not exist.
            ValueError: If the caption file contains a non-integer token.
        """
        image_file = self.image_files[index]
        # The stem is everything before the first dot; this naming scheme has
        # to match whatever wrote the caption files.
        caption_file = f"{image_file.split('.')[0]}_{index % 4}.txt"

        # Samples stay on the CPU: the training loop moves whole batches to
        # the target device, and CPU tensors keep the dataset usable with
        # DataLoader worker processes and independent of any global state.
        # NOTE: torch.load unpickles the file - only point it at trusted data.
        image_tensor = torch.load(
            os.path.join(self.image_dir, image_file), map_location="cpu"
        )

        with open(os.path.join(self.caption_dir, caption_file), 'r') as f:
            caption_token = [int(token) for token in f.read().split()]

        # An explicit dtype keeps an empty caption from turning into a float
        # tensor, which could not be used as label ids.
        return {
            "pixel_values": image_tensor,
            "caption_token": torch.tensor(caption_token, dtype=torch.long),
        }

# Define the collate function
def collate_fn(batch):
    """Merge a list of dataset samples into one batch dictionary.

    Images are stacked along a new leading dimension. Captions are padded on
    the right with -100 up to the longest caption in the batch.

    Args:
        batch: List of dicts with ``pixel_values`` and ``caption_token`` tensors.

    Returns:
        dict with the stacked ``pixel_values`` and the padded ``caption_token``.
    """
    images, captions = [], []
    for sample in batch:
        images.append(sample["pixel_values"])
        captions.append(sample["caption_token"])

    return {
        "pixel_values": torch.stack(images),
        "caption_token": pad_sequence(captions, batch_first=True, padding_value=-100),
    }


In [None]:
# Load the dataset
# Raw strings keep the Windows backslashes literal. Sequences such as "\M",
# "\p", "\F", "\e" and "\S" are invalid escapes in a normal string literal and
# trigger a warning on current Python versions (scheduled to become an error).
# The resulting path values are unchanged.
train_dataset = ImageCaptionDataset(r'D:\MS COCO\preprocessed_images', r'D:\MS COCO\preprocessed_captions', decoder_tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Load the pre-trained model (checkpoint directory written by save_pretrained)
model = VisionEncoderDecoderModel.from_pretrained(r"D:\Final\enthavumo entho\Save_at_50_epochs.pt")
model.to(device)

In [None]:
# Training hyper-parameters.
# NOTE: train_dataloader is built with a literal batch_size=32, so this
# batch_size only applies where the name itself is referenced.
num_epochs, batch_size = 10, 84

In [None]:
# Optimizer, plus a linear learning-rate schedule that spends the first fifth
# of all optimisation steps warming up.
optimizer = AdamW(model.parameters(), lr=5e-5)

# One scheduler step is taken per batch, for every epoch.
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=int(num_training_steps / 5),
    num_training_steps=num_training_steps,
)

In [None]:
# Fine-tune the captioning model, reporting the mean loss after every epoch
# and writing a checkpoint after every second epoch.
model.train()
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    completed_epochs = epoch + 1
    losses = []

    for batch in train_dataloader:
        # Move every tensor of the batch to the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Reinterpret the images as (batch, 3, 224, 224) before the forward pass.
        images = batch['pixel_values']
        batch['pixel_values'] = images.view(images.size(0), 3, 224, 224)

        # Passing labels makes the model return the training loss.
        outputs = model(pixel_values=batch['pixel_values'], labels=batch['caption_token'])
        loss = outputs.loss

        # Backward pass, parameter update, schedule update, gradient reset.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        losses.append(loss.item())
        progress_bar.update(1)

    mean_loss = sum(losses) / len(losses)
    print(f'Epoch {completed_epochs}, Loss: {mean_loss}')

    if completed_epochs % 2 == 0:
        model.save_pretrained(f"Save_at_{completed_epochs}_epochs.pt")
        print(f'Saved model at {completed_epochs} epochs')

In [None]:
def generate_caption(image, model, tokenizer, device, max_length=32, show=True):
    """Greedily decode a caption for one image and optionally display it.

    Args:
        image: Channel-first image (array-like or tensor) without a batch
            dimension; a batch dimension of size 1 is added here.
        model: Module exposing ``encoder`` and ``decoder`` sub-modules. The
            encoder output must provide ``last_hidden_state`` and the decoder
            output must provide ``logits``.
        tokenizer: Tokenizer providing ``cls_token_id``, ``sep_token_id`` and
            ``decode``.
        device: Torch device on which inference runs.
        max_length: Upper bound on the number of generated tokens. Defaults to
            32, the value that used to be hard-coded.
        show: When True (default), plot the image with matplotlib and print
            the caption. Pass False to only compute the caption.

    Returns:
        The generated caption with special tokens stripped.
    """
    # Add the batch dimension and move the image to the inference device.
    image_tensor = torch.as_tensor(image).unsqueeze(0).to(device)

    # Integer images (e.g. uint8 arrays from PIL) are cast to float before
    # they reach the encoder.
    # NOTE(review): the scaling/normalisation used to build the training
    # tensors is not visible here - confirm inference inputs match it.
    if image_tensor.is_floating_point():
        pixel_values = image_tensor
    else:
        pixel_values = image_tensor.float()

    # Start decoding from a single [CLS] id. tokenizer.encode("[CLS]") would
    # wrap the text in special tokens again and yield "[CLS] [CLS] [SEP]",
    # i.e. a prompt that already contains the end-of-sequence token.
    start_token_id = tokenizer.cls_token_id
    if start_token_id is None:
        start_token_id = tokenizer.convert_tokens_to_ids("[CLS]")
    decoder_input_ids = torch.tensor([[start_token_id]], device=device)

    generated_ids = []

    # Inference only: run in eval mode with autograd disabled, then restore
    # whichever mode the caller had the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            encoder_output = model.encoder(pixel_values=pixel_values)

            for _ in range(max_length):
                outputs = model.decoder(
                    input_ids=decoder_input_ids,
                    encoder_hidden_states=encoder_output.last_hidden_state,
                )
                # Greedy choice: most likely token at the last position.
                next_token_id = outputs.logits[:, -1, :].argmax(1).unsqueeze(-1)
                token = next_token_id.item()
                generated_ids.append(token)

                # Stop once the end-of-sequence token has been produced.
                if token == tokenizer.sep_token_id:
                    break

                decoder_input_ids = torch.cat([decoder_input_ids, next_token_id], dim=-1)
    finally:
        model.train(was_training)

    generated_caption = tokenizer.decode(generated_ids, skip_special_tokens=True)

    if show:
        # Imported here so the plotting stack is only needed for display.
        import numpy as np
        import matplotlib.pyplot as plt

        # Back to channel-last layout for imshow; the un-cast tensor is used
        # so integer images keep their original value range.
        display_image = np.moveaxis(image_tensor[0].cpu().numpy(), 0, -1)
        plt.figure(figsize=(4, 4))
        plt.imshow(display_image)
        plt.axis('off')
        plt.show()
        print("Generated Caption:", generated_caption)

    return generated_caption


In [None]:
# Loading model
from transformers import VisionEncoderDecoderModel
# Place the model on the device selected at the top of the notebook instead of
# a hard-coded 'cuda', so this cell also runs on CPU-only machines.
model = VisionEncoderDecoderModel.from_pretrained("/kaggle/input/model50-pt").to(device)

In [None]:
# Sanity check: shape of the first sample's image tensor. The original probe
# indexed with an undefined `i`; .cpu() keeps .numpy() valid if the tensor
# lives on the GPU.
train_dataset[0]['pixel_values'].cpu().numpy().shape

In [None]:
# Build a dataset over the sample images and wrap it in a DataLoader.
# NOTE(review): the ImageCaptionDataset defined in this notebook accepts
# (image_dir, caption_dir, tokenizer); the keyword arguments used below
# (image_path, text_df, image_height, image_width) do not match it, and
# `text_data_train` is not defined in any visible cell. This call presumably
# targets a different version of the class - TODO confirm and reconcile.
# NOTE(review): this rebinds `train_dataset` although the loader is named
# `val_dataloader` - verify which split is intended.
train_dataset = ImageCaptionDataset( image_path='/kaggle/input/impoleds/', text_df=text_data_train , tokenizer=decoder_tokenizer, image_height=224, image_width=224 )
val_dataloader = DataLoader(train_dataset, batch_size=batch_size)

In [None]:
import os

# numpy and PIL are used below but were never imported in this notebook.
import numpy as np
from PIL import Image

image_folder = '/kaggle/input/impoleds/'
image_files = ['im1.png','im2.png','im3.png','im4.png','im5.png','im6.png','im7.png','im8.png','im14.png','im9.jpg','im10.jpg','im11.jpg','im12.jpg','im13.jpg','im15.jpg','im16.jpg']

# Move the model once, to the device selected at the top of the notebook,
# instead of calling .to('cuda') on every iteration.
model.to(device)

for image_file in image_files:
    image_path = os.path.join(image_folder, image_file)

    # The context manager closes the file handle. Converting to RGB after the
    # resize drops an alpha channel and expands grayscale/palette images to
    # three channels, so 2-D images no longer fail on shape[2].
    with Image.open(image_path) as source_image:
        image = source_image.resize((224, 224)).convert('RGB')

    # (height, width, channels) -> (channels, height, width)
    image_array = np.transpose(np.array(image), (2, 0, 1))

    pred_caption = generate_caption(
        image=image_array,
        model=model,
        tokenizer=decoder_tokenizer,
        device=device
    )

    