<a href="https://colab.research.google.com/github/DhrubaAdhikary/ERA_V2/blob/master/ERA2-Session-30-Finetune-VLM-main/Transform_images_project_instruct150k_qa_embedd_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install glob2 peft wandb datasets trl==0.8.5 transformers accelerate -q
!pip install -U bitsandbytes flash_attn -q

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoProcessor, AutoTokenizer
from PIL import Image
import requests
import numpy as np
import pandas as pd
import json

class llavaDataset(Dataset):
    """
    Custom Dataset class to load and preprocess question-answer dataset with images.
    """

    def __init__(self, qa_dataset, clip_model_name):
        self.processor = AutoProcessor.from_pretrained(clip_model_name)
        self.qa_dataset = qa_dataset

    def __len__(self):
        return len(self.qa_dataset)

    def __getitem__(self, idx):
        """
        Retrieves a single data sample including the image, question, and answer.
        """
        img_url = self.qa_dataset.iloc[idx]['img_url']
        ques = torch.tensor(
            np.array(np.matrix(self.qa_dataset.iloc[idx]['input']))[0]
        )
        ans = torch.tensor(
            np.array(np.matrix(self.qa_dataset.iloc[idx]['label']))[0]
        )

        # Load and process the image
        image_load = Image.open(requests.get(img_url, stream=True).raw)
        image_processed = self.processor(images=image_load, return_tensors="pt")['pixel_values']
        image_processed = image_processed.squeeze(0)

        return image_processed, ques, ans

def collate_fn(batch):
    """
    Custom collate function to batch image, question, and answer tensors.
    """
    images = torch.stack([item[0] for item in batch])
    questions = torch.nn.utils.rnn.pad_sequence(
        [item[1] for item in batch], batch_first=True, padding_value=0
    )
    answers = torch.nn.utils.rnn.pad_sequence(
        [item[2] for item in batch], batch_first=True, padding_value=0
    )

    return {
        'images': images,
        'questions': questions,
        'answers': answers
    }

def get_dataloader(qa_dataset, clip_model_name, batch_size=32, shuffle=True):
    """
    Function to create a DataLoader for llavaDataset.
    """
    dataset = llavaDataset(qa_dataset, clip_model_name)
    return DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=shuffle)

if __name__ == "__main__":
    # Load CSV file
    csv_file = 'train_token.csv'
    qa_dataset = pd.read_csv(csv_file)

    # Define model and tokenizer
    clip_model_name = "openai/clip-vit-base-patch32"

    # Create DataLoader
    dataloader = get_dataloader(qa_dataset, clip_model_name, batch_size=8)

    # Example usage
    for batch in dataloader:
        print(batch)

    # Load JSON file
    with open('llava_instruct_150k.json') as f:
        data = json.load(f)

    # Flatten the data and create a sample
    data_instruct150_sample_val_flatten = []
    r = 0

    for a_idx, d in enumerate(data):
        image = d['image']
        image_url = f'http://images.cocodataset.org/train2017/{image}'
        conv_iter = iter(d['conversations'])
        for i in conv_iter:
            gpt_ans = next(conv_iter)
            if len(gpt_ans['value']) > 200:  # Filter long answers
                continue
            if i['from'] == 'human' and gpt_ans['from'] == 'gpt':
                image_q = i['value'].replace('<image>\n', '').replace('\n<image>', '') + ' [QA]'
                image_a = gpt_ans['value'] + AutoTokenizer.from_pretrained(clip_model_name).eos_token
                data_instruct150_sample_val_flatten.append([image_url, image_q, image_a])

        if a_idx % 10000 == 0:
            print(f"{10000 * r} processed")
            r += 1


ParserError: Error tokenizing data. C error: EOF inside string starting at row 259607

In [13]:
import torch
import wandb
import torch
import torch.nn as nn
from transformers import CLIPVisionModel, AutoModelForCausalLM, AutoTokenizer
from torch.nn import functional as F
import random
import gc
import numpy as np
import os
import glob2
# from dataset import collate_fn, llavadataset
from torch.utils.data import random_split, DataLoader
import pickle

# import wandb
# from google.colab import userdata
# wandb1 = userdata.get('wandb')
# os.environ["WANDB_API_KEY"] = wandb1
os.environ["WANDB_API_KEY"] = "Dhruba Adhikary"

In [14]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/MyDrive/Colab_Notebooks/Session30

In [18]:
# teacher forcing simulated annealing scheduler
# Below code is used simulated annealing scheduler called frange_cycle_linear.
# This function generates a cyclic schedule for a hyperparameter, often used in teacher forcing or other training techniques where a parameter
# (such as the probability of applying teacher forcing) needs to change gradually over time.

# The frange_cycle_linear function is commonly used to schedule the teacher forcing ratio during training, which might start at a low value
# (e.g., almost always letting the model predict on its own) and gradually increase (i.e., using ground truth more often).
# With multiple cycles, this ratio oscillates throughout the training, allowing the model to learn in different regimes over time.

def frange_cycle_linear(n_iter, start=0.0001, stop=0.9999,  n_cycle=1, ratio=0.8):
    # n_iter : total number of iterations for which the schedule will be computed.
    # start : initial value of the schedule. For teacher forcing, this could represent the starting probability of forcing the model to use the ground-truth data during training.
    # stop : maximum or final value of the schedule, typically representing the probability of not using teacher forcing.
    # n_cycle : means how many times the schedule will oscillate from start to stop.
    # ratio :  fraction of each cycle where the parameter linearly increases from start to stop.

    # Create  Schedule Array:
    # This initializes an array L of length n_iter with all elements set to stop. This means that if no further changes are made, the parameter will stay at the stop value throughout the training.
    L = np.ones(n_iter) * stop

    # period: Defines the length of each cycle, i.e., how many iterations each cycle spans. If n_cycle=1, the entire schedule is a single cycle; if n_cycle=2, the period is half the total iterations.
    # step: This defines the amount by which the parameter will increase in each iteration during the linear growth phase of the cycle.
    # The ratio controls how much of the period is used for this linear increase.
    period = n_iter/n_cycle
    step = (stop-start)/(period*ratio) # linear schedule

    # runs for each cycle.
    # This loop ensures that the parameter starts at start, increases linearly over part of the cycle (determined by ratio), and then stays at stop for the remainder of the cycle.
    for c in range(n_cycle):
        # For each cycle, start the parameter value (v) at start and initialize an index (i) to 0.
        v, i = start, 0
        #  For each cycle, increment the value v by the step size and place it into the correct index of the array L.
        while v <= stop and (int(i+c*period) < n_iter):
            # Update the schedule at the correct position in L.
            L[int(i+c*period)] = v
            # The value increases linearly from start towards stop.
            v += step
            i += 1

    # After constructing the schedule in L, the function returns 1 - L.
    # This effectively inverts the values in L, making the schedule start from (1 - stop) and end at (1 - start).
    # This is often done to control the probability of certain actions, such as teacher forcing.
    return (1 - L)

In [19]:
# define models
phi_model_name  = "microsoft/phi-2"
clip_model_name = "openai/clip-vit-base-patch32"
device = 'cuda'
max_steps = 100000

In [20]:
annealing_teacher_forcing_scheduler = frange_cycle_linear(max_steps)

class SimpleResBlock(nn.Module):
    def __init__(self, phi_embed):
        super().__init__()
        # Layer Normalization: Normalizes the input to have zero mean and unit variance across the feature dimension.
        # It helps stabilize the training by ensuring that the values passed through the network remain in a consistent range.
        self.pre_norm = nn.LayerNorm(phi_embed)

        # nn.Sequential defines a sequential container, meaning a series of layers applied one after another.
        # nn.Linear(phi_embed, phi_embed): A fully connected (linear) layer that takes an input of size phi_embed and outputs the same size (phi_embed).
        # GELU (Gaussian Error Linear Unit) is an activation function that smooths out nonlinearities.
        # It's similar to ReLU, but it has a smoother gradient, which can improve performance in some architectures.
        self.proj = nn.Sequential(
            nn.Linear(phi_embed, phi_embed),
            nn.GELU(),
            nn.Linear(phi_embed, phi_embed)
        )
    # This method defines how the input x passes through the block during forward propagation.
    # It takes an input tensor x, normalizes it, processes it through two linear layers with a GELU activation in between, and then adds the original input x back to the output of the transformation.
    # The residual connection is important because it allows the model to retain information from earlier layers and makes it easier to train deep models.
    def forward(self, x):
        x = self.pre_norm(x)
        # key part of a residual block
        return x + self.proj(x)

In [22]:
# This code defines a neural network model called CLIPPhi2Model,
# which combines two pretrained models:
# a CLIP vision model for image embeddings
# and a causal language model (Phi-2)
# for text generation.
class CLIPPhi2Model(torch.nn.Module):
    # size of the image embeddings from the CLIP model (768-dimensional).
    # size of the text embeddings from the Phi-2 model (2560-dimensional).
    def __init__(self, clip_embed=768, phi_embed=2560):
        super().__init__()

        #  End-of-sequence token ID for the language model.
        self.EOS_TOKEN_ID    = 50256
        # token to represent the presence of an image.
        self.IMAGE_TOKEN_ID  = 23893 # token for comment

        # pretrained models
        self.phi_model = AutoModelForCausalLM.from_pretrained(phi_model_name,
                                            torch_dtype=torch.float16,
                                            trust_remote_code=True)
        self.clip_model = CLIPVisionModel.from_pretrained(clip_model_name)

        # projection layers
        #  A linear layer that projects the lower-dimensional image embeddings (768 from CLIP) to match the higher-dimensional text embeddings (2560 from Phi-2).
        self.projection = torch.nn.Linear(clip_embed, phi_embed)
        # A simple residual block (SimpleResBlock) that processes the projected image embeddings.
        self.resblock = SimpleResBlock(phi_embed)

        # Freeze Weights
        # IMPORTANT:
        # The pretrained models (both Phi-2 and CLIP) are frozen so their weights are not updated during training.
        # This is common in transfer learning to use pretrained features while only training new layers.
        for network in [self.phi_model, self.clip_model]:
            for param in network.parameters():
                param.requires_grad_(False)

        # load checkpoint weights
        # If pre-trained projection and residual block weights are available (clipphi_proj.pth and clipphi_resblock.pth), they are loaded.
        if os.path.isfile('model_chkpt/clipphi_proj.pth'):
            self.projection.load_state_dict(torch.load('model_chkpt/clipphi_proj.pth'))
            self.resblock.load_state_dict(torch.load('model_chkpt/clipphi_resblock.pth'))


    # To generate text (e.g., captions) from images using the model.
    # images: The processed input image data. max_length: Maximum length of the generated text.  tokenizer: Tokenizer to convert text into token IDs and vice versa.
    def generate(self,images,max_length,tokenizer):
        # clip model output for image
        # input images are passed through the CLIP model (clip_model) to get their embeddings.
        clip_outputs = self.clip_model(**images)
        # remove cls token
        images = clip_outputs.last_hidden_state[:,1:,:]
        # The CLIP image embeddings are projected to the same dimensionality as the Phi-2 embeddings (2560) using self.projection
        image_embeds = self.projection(images)
        # This is further processed by the residual block (self.resblock).
        image_embeds = self.resblock(image_embeds).to(torch.float16)

        # Batch Size: Extract the number of images (batch_size).
        batch_size = images.size(0)
        # predicted_caption: Initialize a tensor to hold the generated caption tokens. It’s filled with the EOS token (50256), which represents the end-of-sequence.
        predicted_caption = torch.full((batch_size,max_length),50256)
        # Image Token Embedding: A custom token representing the image is embedded using the Phi-2 model's token embeddings.
        img_token_tensor = torch.tensor(self.IMAGE_TOKEN_ID).repeat(batch_size, 1)
        # This acts as the start of the sequence (bos_token) for generation.
        bos_token_embeds = self.phi_model.model.embed_tokens(img_token_tensor.to(image_embeds.device))
        # Concatenate: The image embeddings (image_embeds) are concatenated with the BOS token embeddings (bos_token_embeds) to create the initial input for the text generation process.
        combined_embeds  = torch.cat([image_embeds, bos_token_embeds], dim=1) # 4,9,2560


        # Prepare for the Next Token: The predicted token is embedded using the Phi-2 model's embeddings (embed_tokens) and concatenated with the existing sequence of embeddings (combined_embeds). This updated embedding sequence is then used in the next iteration to predict the next token.

        # Repeat: The process repeats for the entire caption length.
        # Loop through max_length: For each position pos (up to max_length - 1), the model generates one token at a time.
        for pos in range(max_length - 1):
            # pass through the model

            # Model Forward Pass: The Phi-2 language model takes the combined_embeds (which includes both the image and any previously generated tokens) as input and predicts the next token's logits.
            model_output_logits = self.phi_model.forward(inputs_embeds = combined_embeds)['logits'] # 4,49,51200
            predicted_word_token_logits = model_output_logits[:, -1, :].unsqueeze(1) # 4,1,51200
            # Token Prediction: The logits for the last position ([:, -1, :]) are extracted and passed through
            # torch.argmax to get the predicted token with the highest probability.
            # This token is stored in the predicted_caption tensor.
            predicted_word_token = torch.argmax(predicted_word_token_logits, dim = -1) # 4,1
            predicted_caption[:,pos] = predicted_word_token.view(1,-1).to('cpu')
            # The predicted token is embedded using the Phi-2 model's embeddings (embed_tokens).
            next_token_embeds = self.phi_model.model.embed_tokens(predicted_word_token) # 4,1,2560
            # This is concatenated with the existing sequence of embeddings (combined_embeds).
            combined_embeds   = torch.cat([combined_embeds, next_token_embeds], dim=1)
            # This updated embedding sequence is then used in the next iteration to predict the next token.

        # After generating the tokens for all positions, the method returns the complete predicted_caption tensor containing the token IDs for the generated caption.
        return predicted_caption

    def forward(self, images, target_captions,step,max_steps):
        # batch_size: The number of samples in the batch.
        # target_length: The length of the target captions (number of tokens).
        batch_size    = target_captions.size(0)
        target_length = target_captions.shape[1]
         #print(f"GPU memory {torch.cuda.max_memory_allocated()/ (1024 ** 3):.2f} GB")

        # clip model output for image
        # Input: The input images are passed through the CLIP model, which outputs image embeddings.
        clip_outputs = self.clip_model(**images)
        # Remove CLS Token: The CLS token (used for classification) is removed, as it’s not needed for caption generation.
        images = clip_outputs.last_hidden_state[:,1:,:] # remove cls token

        # projection layer
        # The 768-dimensional CLIP image embeddings are projected to the 2560-dimensional space required by the Phi-2 model using a linear layer.
        image_embeds = self.projection(images).to(torch.float16)
        #image_embeds = self.resblock(image_embeds).to(torch.float16)

        # add comment token from phi2
        # A special image token (self.IMAGE_TOKEN_ID) is embedded using the Phi-2 model's embedding layer.
        img_token_tensor = torch.tensor(self.IMAGE_TOKEN_ID).repeat(batch_size, 1)
        # This token is a placeholder to mark where the image information ends and the text generation starts.
        img_token_embeds = self.phi_model.model.embed_tokens(img_token_tensor.to(image_embeds.device))
        # The image embeddings are concatenated with the image token embeddings to form combined_embeds, which is the input to the Phi-2 model for text generation.
        combined_embeds  = torch.cat([image_embeds, img_token_embeds], dim=1) # 4,49,2560
        del clip_outputs
        del image_embeds

        # for loss
        loss = 0
        # In each iteration, the Phi-2 model generates one token at a time based on the current input embeddings (combined_embeds), which include both image and previously generated token embeddings.
        for pos in range(target_length - 1):
            # pass through the model
            # The Phi-2 model processes the current embeddings (combined_embeds) to generate the next token's logits (model_output_logits).
            model_output_logits = self.phi_model.forward(inputs_embeds = combined_embeds)['logits'] # 4,49,51200
            # The logits for the last predicted token in the sequence are extracted ([:, -1, :]), and then reshaped to match the dimensions expected by the loss function.
            predicted_word_token_logits = model_output_logits[:, -1, :].unsqueeze(1) # 4,1,51200
            # The cross-entropy loss is computed between the predicted token logits and the actual target token at the current position (target_captions[:, pos]).
            # The loss is smoothed with label_smoothing to avoid overconfident predictions, and tokens with the EOS_TOKEN_ID are ignored.
            pos_loss = F.cross_entropy(predicted_word_token_logits.view(-1,predicted_word_token_logits.size(-1)), target_captions[:, pos].contiguous().view(-1), ignore_index=self.EOS_TOKEN_ID,label_smoothing=0.1)
            # print(f"pos {pos} loss {pos_loss}")
            # The loss for the current token is added to the total batch loss.
            loss += pos_loss

            # Store Predicted Token: The predicted token is stored in the predicted_caption tensor, which will hold the complete generated sequence.
            predicted_word_token = torch.argmax(predicted_word_token_logits, dim=-1) # 4,1
            #print(f"predicted_word_token {predicted_word_token} and target_captions {target_captions[:,pos]}")
            # Teacher Forcing: For the first few tokens (up to pos <= 5) and early in training (step <= int(0.6 * max_steps)),the model uses teacher forcing.
            # In this case, instead of relying on its own predictions, the model is fed the correct target token from target_captions.
            # do teacher forcing or model output based on annealing scheduler probability
            if pos <= 5 and step <= int(0.6 * max_steps): # teacher forcing
                next_token_embeds = self.phi_model.model.embed_tokens(target_captions[:,pos].unsqueeze(1)) # 4,1,2560
            else:
                next_token_embeds = self.phi_model.model.embed_tokens(predicted_word_token) # 4,1,2560

            # The predicted token is embedded and concatenated to the existing embeddings to generate the next token in the sequence.
            combined_embeds   = torch.cat([combined_embeds, next_token_embeds], dim=1)

        #average_loss
        # The total loss is averaged over all token positions to get the final loss for the batch.
        loss = loss / target_length

        # for efficient memory utilization
        del combined_embeds
        del model_output_logits
        torch.cuda.empty_cache()

        return loss


In [23]:
# This funcation evaluates a trained model on a single batch of data from a validation set.
# It compares the model’s predictions (generated text) with the ground truth (target captions) and prints both the target and predicted captions.
def model_validate_one_batch(model,device,val_dataloader,max_length,tokenizer):
    # This switches the model into evaluation mode, which ensures certain behaviors like dropout and batch normalization are disabled during inference.
    model.eval()
    # This disables gradient tracking, which reduces memory usage and speeds up computations since gradients are not needed during evaluation or inference.
    with torch.no_grad():
        # val_dataloader: The validation data loader provides batches of images and their corresponding target captions.
        # For each batch, images contains the input images, and target_captions contains the ground-truth captions.
        for batch_idx, (images, target_captions) in enumerate(val_dataloader):
            images = {'pixel_values': images.to(device)}
            target_captions = target_captions.to(device)
            # decodes the tokenized target captions back into human-readable text.
            # It uses the tokenizer associated with the model to convert the numerical token IDs into their corresponding text.
            # The 50256 token ID corresponds to a special token (likely the end-of-sequence or padding token), which is ignored during decoding.
            target_captions_decoded = tokenizer.batch_decode(target_captions,ignore_index = 50256)
            # This calls the model’s generate method to create text predictions from the input images.
            predicted_captions = model.generate(images,max_length,tokenizer)
            # The predicted token sequences are also decoded back into human-readable text, just like the target captions.
            predicted_captions_decoded = tokenizer.batch_decode(predicted_captions,ignore_index = 50256)

            # iterates over the decoded predicted captions and prints both the target and predicted captions for comparison.
            # pc_idx: The index of the current caption in the batch.
            # target_captions_decoded[pc_idx]: The ground-truth caption for the corresponding image.
            # predicted_captions_decoded[pc_idx]: The predicted caption generated by the model for the same image.
            for pc_idx,pc in enumerate(predicted_captions_decoded):
                print(f"{pc_idx} - Target captions:\n {target_captions_decoded[pc_idx]}  \n{pc_idx} - predicted_captions:\n {pc} ")
            return # validate only 1 batch

In [24]:
# This function, train_model(), is responsible for training the multimodal model that combines image (CLIP) and text (Phi2) embeddings.
# The function iteratively processes batches from the training data, calculates the loss, and updates the model parameters.
# It also handles periodic model validation, saving checkpoints, and logging progress
def train_model(model, train_loader, val_dataloader,optimizer, device,max_steps,model_save_step,model_val_step,log_step,max_token_filter,tokenizer):
    print(f"Training started.")

    # max_step_reached: A flag to track if the maximum number of training steps has been reached.
    max_step_reached = 0
    # step: Tracks the current step in the training process.
    step = 0
    # max_length: Sets the maximum length for the generated captions (set to 20 tokens here).
    max_length = 20
    # running_loss: Accumulates the loss across multiple steps, used for logging the average loss.
    running_loss = 0.
    # model.train(): Puts the model in training mode (this affects dropout and batch normalization layers, if any).
    model.train()

    #This outer loop iterates over a large number of epochs
    #The inner loop iterates through the train_loader, processing batches of images and their corresponding captions.
    # batch_idx: The index of the current batch.
    for epoch in range(100000):
        for batch_idx, (images, target_captions) in enumerate(train_loader):

            # images: The image data is prepared as a dictionary with the key 'pixel_values' and moved to the appropriate device (GPU).
            # target_captions: The target captions (text) are also moved to the device.
            # manage OOM issue, skip batch for long captions
            if target_captions.shape[1] >= max_token_filter:
                print(f"Batch skipped as captions too long.")
                continue
            images = {'pixel_values': images.to(device)}
            target_captions = target_captions.to(device)

            # Clears the gradients before backpropagation (a standard step to prevent accumulating gradients from previous batches).
            optimizer.zero_grad()
            # The model processes the images and target captions to compute the loss for this batch.
            # The step and max_steps parameters may influence the annealing of teacher forcing or other training aspects.
            loss = model(images, target_captions,step,max_steps)
            #print(f"teacher {teacher_forcing} and loss {loss}")
            running_loss += loss.item()

            # log step
            if (step % log_step == 0):
                if step == 0:
                    print(f"Step {step}/{max_steps}: Avg Running Loss = {running_loss}")
                else:
                    print(f"Step {step}/{max_steps}: Avg Running Loss = {running_loss /log_step}")
                running_loss = 0.
            wandb.log({"step": step, "train_loss": loss.item()})

            # increment step
            step += 1
            teacher_forcing = False

            # loss backprop
            # loss.backward(): Computes the gradients for all trainable parameters using backpropagation.
            # optimizer.step(): Updates the model parameters based on the computed gradients.
            loss.backward()
            optimizer.step()

            # save model
            if step % model_save_step == 0 or (step == max_steps):
                print("Saving Checkpoint for step : ", step)
                torch.save(model.projection.state_dict(),'model_chkpt/clipphi_proj.pth')
                torch.save(model.resblock.state_dict(),'model_chkpt/clipphi_resblock.pth')

            # check random validation of images
            if step % model_val_step == 0 or (step == max_steps):
                model_validate_one_batch(model,device,val_dataloader,max_length,tokenizer)
                model.train()

            # global max steps reached
            if step >= max_steps:
                max_step_reached = 1
                break

        if max_step_reached == 1:
            break
    print(f"Reached the max steps. Training stopped.")

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoProcessor, AutoTokenizer
from PIL import Image
import requests
import numpy as np
import pandas as pd

class llavaDataset(Dataset):
    """
    Custom Dataset class for loading and processing question-answer datasets
    with images, supporting both 'train' and 'validation/test' modes.
    """
    def __init__(self, qa_dataset, phi_model_name, clip_model_name, mode, tokenizer):
        """
        Initializes the dataset with the required parameters.

        Args:
            qa_dataset (DataFrame): Data containing image URLs, questions, and answers.
            phi_model_name (str): Name of the Phi model (not currently used in this code).
            clip_model_name (str): Name of the CLIP model for preprocessing images.
            mode (str): Specifies the dataset mode ('train' or 'validation/test').
            tokenizer (AutoTokenizer): Tokenizer for processing question and answer text.
        """
        self.processor = AutoProcessor.from_pretrained(clip_model_name)
        self.qa_dataset = qa_dataset
        self.mode = mode
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.qa_dataset)

    def __getitem__(self, idx):
        """
        Retrieves a single data sample with image, tokenized question, and tokenized answer.
        """
        img_url = self.qa_dataset.iloc[idx]['img_url']
        question_text = self.qa_dataset.iloc[idx]['input']
        answer_text = self.qa_dataset.iloc[idx]['label']

        # Load and process the image
        image_load = Image.open(requests.get(img_url, stream=True).raw)
        image_processed = self.processor(images=image_load, return_tensors="pt")['pixel_values']
        image_processed = image_processed.squeeze(0)

        # Tokenize the question and answer
        ques = self.tokenizer(
            question_text, padding='max_length', truncation=True, max_length=50, return_tensors="pt"
        )['input_ids'].squeeze(0)

        ans = self.tokenizer(
            answer_text, padding='max_length', truncation=True, max_length=50, return_tensors="pt"
        )['input_ids'].squeeze(0)

        return image_processed, ques, ans

def collate_fn(batch):
    """
    Custom collate function for batching image, question, and answer tensors.
    """
    images = torch.stack([item[0] for item in batch])
    questions = torch.nn.utils.rnn.pad_sequence(
        [item[1] for item in batch], batch_first=True, padding_value=0
    )
    answers = torch.nn.utils.rnn.pad_sequence(
        [item[2] for item in batch], batch_first=True, padding_value=0
    )

    return {
        'images': images,
        'questions': questions,
        'answers': answers
    }

def get_dataloader(qa_dataset, phi_model_name, clip_model_name, mode, tokenizer,
                   batch_size=32, num_workers=10, shuffle=True, pin_memory=True):
    """
    Creates and returns a DataLoader for the llavaDataset.

    Args:
        qa_dataset (DataFrame): Dataset containing image URLs, questions, and answers.
        phi_model_name (str): Name of the Phi model (not used in this version).
        clip_model_name (str): CLIP model name for processing images.
        mode (str): Mode of the dataset ('train' or 'validation/test').
        tokenizer (AutoTokenizer): Tokenizer for text inputs.
        batch_size (int): Batch size for DataLoader.
        num_workers (int): Number of subprocesses to use for data loading.
        shuffle (bool): Whether to shuffle the data (default is True).
        pin_memory (bool): Whether to use pinned memory for faster data transfer to GPU.

    Returns:
        DataLoader: A DataLoader instance with the custom collate function.
    """
    dataset = llavaDataset(qa_dataset, phi_model_name, clip_model_name, mode, tokenizer)
    return DataLoader(
        dataset, batch_size=batch_size, collate_fn=collate_fn,
        num_workers=num_workers, shuffle=shuffle, pin_memory=pin_memory
    )

# Example usage
if __name__ == "__main__":
    # Load the dataset (assuming it is loaded as a DataFrame)
    csv_file = 'train_token.csv'
    qa_dataset = pd.read_csv(csv_file)

    # Initialize tokenizer and model names
    phi_model_name  = "microsoft/phi-2"
    clip_model_name = "openai/clip-vit-base-patch32"
    device = 'cuda'
    max_steps = 100000
    tokenizer = AutoTokenizer.from_pretrained(clip_model_name)

    # Create the DataLoader for training
    train_batch_size = 8
    train_dataloader = get_dataloader(
        qa_dataset, phi_model_name, clip_model_name, 'train', tokenizer,
        batch_size=train_batch_size, num_workers=10, shuffle=True, pin_memory=True
    )

    # Example: Iterate over the DataLoader
    for batch in train_dataloader:
        print(batch)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
           271,   276,   273,   276,   280,   280,   273,   280,   278,   279,
           273,   278,   279,   274,   276,   278,   274,   274,   272,   271,
           275,   271,   280,   275,   273,   280,   280,   278,   274,   274,
           273,   271,   272,   272,   275,   276,   271,   316, 49407, 49407],
        [49406,   314,   277,   278,   277,   277,   275,   272,   272,   273,
           278,   279,   272,   271,   272,   277,   279,   279,   275,   273,
           273,   280,   280,   278,   274,   274,   273,   271,   272,   272,
           274,   272,   280,   272,   279,   271,   271,   273,   274,   272,
           271,   273,   277,   280,   274,   276,   274,   274,   279, 49407],
        [49406,   314,   276,   280,   280,   274,   272,   271,   277,   271,
           278,   276,   273,   277,   274,   279,   271,   275,   278,   276,
           273,   273,   273,   273,   273,   273,   280,   278,

In [25]:
# defines the workflow for training a multimodal GPT model.
# which is likely based on a combination of CLIP and the Phi2 language model.
# The code handles loading the data, setting up the model, and defining the training process.
def main():
    with open("captions.pickle", "rb") as fp:   # Unpickling
        coco_unpickle = pickle.load(fp)

    train_batch_size = 4
    val_batch_size   = 2
    tokenizer  = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)

    # model
    MModalGPT        = CLIPPhi2Model().to(device)
    # The maximum number of training steps (iterations) is set to 20,000.
    max_steps        = 20000
    model_save_step  = 100
    model_val_step   = 100
    log_step         = 100
    # Limits the maximum number of tokens (words, subwords) in the processed inputs, likely to filter out long captions or questions.
    max_token_filter = 35

    # train_dataloader: A PyTorch DataLoader for the training dataset.
    # llavadataset: A custom dataset class that combines the loaded COCO dataset (coco_unpickle), the tokenizer, and other settings. It processes both image and text data.
    # collate_fn: A custom function (collate_fn) to pad or batch the input data correctly, including images and tokenized text.
    # val_dataloader: The DataLoader for the validation dataset, which is similar to the training DataLoader but with a smaller batch size (val_batch_size=2).

    # data loaders
    train_dataloader = DataLoader(llavadataset(coco_unpickle, phi_model_name,clip_model_name,'train',tokenizer),
                      collate_fn=collate_fn, batch_size=train_batch_size, num_workers = 10, shuffle=True, pin_memory=True)

    val_dataloader   = DataLoader(llavadataset(coco_unpickle, phi_model_name,clip_model_name,'val',tokenizer),
                      collate_fn=collate_fn, batch_size=val_batch_size, num_workers = 10, shuffle=True, pin_memory=True)


    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, MModalGPT.parameters()), lr=1e-4)
    train_model(MModalGPT, train_dataloader, val_dataloader, optimizer, device, max_steps,model_save_step,model_val_step,log_step,max_token_filter,tokenizer)

In [27]:
# Initialize Weights & Biases (WandB) for Experiment Tracking:
# wandb.init(project="clip_phi2_project", name="clip_phi3_finetune")
# enables Automatic Mixed Precision (AMP) in PyTorch, specifically for CUDA-enabled GPUs.
# Mixed Precision refers to using both 16-bit and 32-bit floating-point types during training.
torch.amp.autocast('cuda', enabled=True)
# torch.cuda.empty_cache(): This function frees up unused memory held by PyTorch in the CUDA memory cache.
torch.cuda.empty_cache()
# clears up any unreferenced memory
gc.collect()
#  This controls the precision used for 32-bit floating-point matrix multiplication
torch.set_float32_matmul_precision('medium')
main()



FileNotFoundError: [Errno 2] No such file or directory: 'captions.pickle'