src = https://www.kaggle.com/code/burhanuddinlatsaheb/image-captioning-vit-gpt2

# 1. Imports

In [1]:
import os

import datasets
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from tqdm.auto import tqdm
import multiprocessing as mp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import io, transforms
from torch.utils.data import Dataset, DataLoader, random_split

from transformers import Seq2SeqTrainer ,Seq2SeqTrainingArguments
from transformers import VisionEncoderDecoderModel , ViTFeatureExtractor
from transformers import AutoTokenizer ,  GPT2Config , default_data_collator


if torch.cuda.is_available():    

    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


# 2. Hyperparameters

In [2]:
os.environ["WANDB_DISABLED"] = "true"
class config : 
    ENCODER = "google/vit-base-patch16-224"
    DECODER = "gpt2"
    TRAIN_BATCH_SIZE = 8
    VAL_BATCH_SIZE = 8
    VAL_EPOCHS = 1
    LR = 5e-5
    SEED = 42
    MAX_LEN = 128
    SUMMARY_LEN = 20
    WEIGHT_DECAY = 0.01
    MEAN = (0.485, 0.456, 0.406)
    STD = (0.229, 0.224, 0.225)
    TRAIN_PCT = 0.95
    NUM_WORKERS = mp.cpu_count()
    EPOCHS = 3
    IMG_SIZE = (224,224)
    LABEL_MASK = -100
    TOP_K = 1000
    TOP_P = 0.95

# 3. Helper Functions

In [3]:
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    outputs = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
    return outputs
AutoTokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens

In [4]:
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

  rouge = datasets.load_metric("rouge")


# 4. Dataset

### 4.1 Feature Extractor and Tokenizer

In [5]:
feature_extractor = ViTFeatureExtractor.from_pretrained(config.ENCODER)
tokenizer = AutoTokenizer.from_pretrained(config.DECODER)
tokenizer.pad_token = tokenizer.unk_token



### 4.2 Transforms and dataframe

In [6]:
transforms = transforms.Compose(
    [
        transforms.Resize(config.IMG_SIZE), 
        transforms.ToTensor(),
        transforms.Normalize(
            mean=0.5, 
            std=0.5
        )
   ]
)

df=  pd.read_csv("./Data/captions.txt")
train_df , val_df = train_test_split(df , test_size = 0.2)
df.head()

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


### 4.3 Dataset Class

In [7]:
class ImgDataset(Dataset):
    def __init__(self, df,root_dir,tokenizer,feature_extractor, transform = None):
        self.df = df
        self.transform = transform
        self.root_dir = root_dir
        self.tokenizer= tokenizer
        self.feature_extractor = feature_extractor
        self.max_length = 50
    def __len__(self,):
        return len(self.df)
    def __getitem__(self,idx):
        caption = self.df.caption.iloc[idx]
        image = self.df.image.iloc[idx]
        img_path = os.path.join(self.root_dir , image)
        img = Image.open(img_path).convert("RGB")
        
        if self.transform is not None:
            img= self.transform(img)
        pixel_values = self.feature_extractor(img, return_tensors="pt").pixel_values
        captions = self.tokenizer(caption,
                                 padding='max_length',
                                 max_length=self.max_length).input_ids
        captions = [caption if caption != self.tokenizer.pad_token_id else -100 for caption in captions]
        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(captions)}
        return encoding

### 4.4 Train and validation dataset

#### Image Normalization

In [8]:
# Cleaning images for use by the model
import os
import glob
import numpy as np
from PIL import Image

# Set the path to the directory containing the input images
input_dir = './Data/Images/'

# Set the path to the directory where the output images will be saved
output_dir = './Data/Cleaned Images/'

# Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)


In [9]:
# Iterate through all the image files in the input directory
for img_file in glob.glob(os.path.join(input_dir, '*.jpg')):
    # Load the image array
    img = np.array(Image.open(img_file))

    # Rescale the image values to be within the range [0, 1]
    img_rescaled = (img - img.min()) / (img.max() - img.min())

    # Convert the image array to a PIL image
    img_pil = Image.fromarray((img_rescaled * 255).astype(np.uint8))

    # Set the path and filename for the output image
    output_file = os.path.join(output_dir, os.path.basename(img_file))

    # Save the output image
    img_pil.save(output_file)

In [20]:
import os
import numpy as np
from PIL import Image

# Define the path to the folder containing the images
image_folder = './Data/Images/'

# Define the path to the folder to save the normalized images
normalized_folder = './Data/Normalized_Images/'

# Create the folder to save the normalized images if it does not exist
if not os.path.exists(normalized_folder):
    os.makedirs(normalized_folder)

# Loop over each image in the folder
for image_file in os.listdir(image_folder):
    # Load the image
    image_path = os.path.join(image_folder, image_file)
    img = np.array(Image.open(image_path))

    # Normalize the pixel values of the image
    img_normalized = (img - np.min(img)) / (np.max(img) - np.min(img))

    # Save the normalized image as a new file in the normalized folder
    normalized_image_file = os.path.splitext(image_file)[0] + '.jpg'
    normalized_image_path = os.path.join(normalized_folder, normalized_image_file)
    Image.fromarray(np.uint8(img_normalized * 255)).save(normalized_image_path)

#### Creating datasets

In [21]:
train_dataset = ImgDataset(train_df, root_dir="./Data/Normalized_Images/",
                           tokenizer=tokenizer, feature_extractor=feature_extractor, transform=transforms)
val_dataset = ImgDataset(val_df, root_dir="./Data/Normalized_Images/",
                         tokenizer=tokenizer, feature_extractor=feature_extractor, transform=transforms)

# 5. Model Building

### 5.1 Model Initialization

In [22]:
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(config.ENCODER, config.DECODER)

Some weights of the model checkpoint at google/vit-base-patch16-224 were not used when initializing ViTModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size
# set beam search parameters
model.config.eos_token_id = tokenizer.sep_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.max_length = 128
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

# 6. Training

### 6.1 Training Arguments

In [24]:
training_args = Seq2SeqTrainingArguments(
    output_dir='VIT_large_gpt2',
    per_device_train_batch_size=config.TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=config.VAL_BATCH_SIZE,
    predict_with_generate=True,
    evaluation_strategy="epoch",
    do_train=True,
    do_eval=True,
    logging_steps=1024,  
    save_steps=2048, 
    warmup_steps=1024,  
    learning_rate = 5e-5,
    #max_steps=1500, # delete for full training
    num_train_epochs = config.EPOCHS, #TRAIN_EPOCHS
    overwrite_output_dir=True,
    save_total_limit=1,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


### 6.2 Training using Seq2SeqTrainer

In [25]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    tokenizer=feature_extractor,
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=default_data_collator,
)
trainer.train()



  0%|          | 0/12138 [00:00<?, ?it/s]

ValueError: The image to be converted to a PIL image contains values outside the range [0, 1], got [-1.0, 1.0] which cannot be converted to uint8.

In [None]:
trainer.save_model('VIT_large_gpt2')