In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip -q install -U bitsandbytes peft

In [None]:
!pip -q install --upgrade huggingface_hub

In [None]:
import pickle

# Path to the pickled embeddings file inside the attached Kaggle dataset
file_path = '/kaggle/input/sub-capstone/sub_clip_embeddings_0.pkl'

# Load the embeddings object from the .pkl file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so only
# point this at files you produced yourself or otherwise trust.
with open(file_path, 'rb') as file:
    embeddings = pickle.load(file)


# import pickle

# # Specify the path to your .pkl file
# file_path = '/kaggle/input/llava-processed/final_clip_embeddings_part_1.pkl'

# # Load the embeddings from the .pkl file
# with open(file_path, 'rb') as file:
#     embeddings_1 = pickle.load(file)


In [None]:
len(embeddings)

In [None]:
from sys import getsizeof


getsizeof(embeddings)

In [None]:
select_feature = 'patch'
def feature_select(image_forward_outs):
    """Pick token features from the last hidden state of a vision forward pass.

    The module-level ``select_feature`` switch controls the selection:
    ``'patch'`` drops the first token along dimension 1, ``'cls_patch'``
    returns the last hidden state untouched.

    Args:
        image_forward_outs: object exposing a ``hidden_states`` sequence.

    Returns:
        The selected slice of ``hidden_states[-1]``.

    Raises:
        ValueError: if ``select_feature`` is neither ``'patch'`` nor ``'cls_patch'``.
    """
    last_hidden = image_forward_outs.hidden_states[-1]
    if select_feature == 'cls_patch':
        # Keep every token, including the leading one
        return last_hidden
    if select_feature != 'patch':
        raise ValueError(f'Unexpected select feature: {select_feature}')
    # Skip the first token and keep the rest
    return last_hidden[:, 1:]

In [None]:
from huggingface_hub import login
login()

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
# meta-llama/Llama-3.2-1B-Instruct
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
# Load the causal language model named by `model_name` with 4-bit quantization
# (NF4 quant type, double quantization) for memory-efficient fine-tuning.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
)
# Hugging Face Hub id of the checkpoint; also reused later to load the tokenizer
model_name = "meta-llama/Llama-3.2-1B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,
    torch_dtype = torch.float32,
    trust_remote_code=True
)


In [None]:
model.dtype

In [None]:
import torch.nn as nn
import random
class MLPProjection(nn.Module):
    """Bias-free MLP that projects image embeddings to the text embedding width.

    Layout is ``Linear -> (GELU -> Linear) * (depth - 1)``. The first layer
    reads ``input_dim`` features, every intermediate layer is ``hidden_dim``
    wide, and the final layer always emits ``output_dim`` features. For the
    default ``depth=2`` this is ``Linear(input_dim, hidden_dim) -> GELU ->
    Linear(hidden_dim, output_dim)``.

    Args:
        input_dim: size of the last dimension of the input.
        output_dim: size of the last dimension of the output.
        hidden_dim: width of the intermediate layers (unused when ``depth == 1``).
        depth: number of linear layers; must be at least 1.

    Raises:
        ValueError: if ``depth`` is smaller than 1.
    """
    def __init__(self, input_dim, output_dim, hidden_dim=1024, depth=2):
        super(MLPProjection, self).__init__()
        if depth < 1:
            raise ValueError(f'depth must be >= 1, got {depth}')

        # Consecutive layer widths: input -> hidden (depth - 1 times) -> output.
        # Building from this list keeps every Linear's in/out sizes consistent
        # for any depth, not only depth == 2.
        widths = [input_dim] + [hidden_dim] * (depth - 1) + [output_dim]

        modules = []
        for index, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            if index > 0:
                # Non-linearity between consecutive linear layers only
                modules.append(nn.GELU())
            modules.append(nn.Linear(fan_in, fan_out, bias=False))

        self.mlp = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the projection to ``x`` along its last dimension."""
        return self.mlp(x)

class PHI2WithMLP(nn.Module):
    """Causal language model that accepts projected image embeddings as a prefix.

    Image embeddings are passed through ``mlp_projection`` and concatenated in
    front of the text token embeddings along the sequence dimension. The
    attention mask and labels are extended at the front by the same number of
    positions so that all three stay aligned.

    Args:
        phi2_model: wrapped language model; must expose ``config``,
            ``get_input_embeddings()`` and accept ``inputs_embeds``,
            ``attention_mask`` and ``labels`` keyword arguments.
        mlp_projection: module mapping image embeddings to the language
            model's embedding width.
    """

    # Label value placed on image-prefix positions so they do not contribute to
    # the loss (-100 is the ignore index of the Hugging Face causal-LM loss).
    IGNORE_INDEX = -100

    def __init__(self, phi2_model, mlp_projection):
        super(PHI2WithMLP, self).__init__()
        self.phi2_model = phi2_model
        self.mlp_projection = mlp_projection
        self.config = phi2_model.config

    def _prepend_image_prefix(self, image_embeddings, token_embeddings, attention_mask):
        """Project the image embeddings and put them in front of the text tokens.

        Returns:
            ``(combined_embeddings, combined_mask, prefix_length)`` where the
            mask has ones on the ``prefix_length`` image positions followed by
            the text mask (all ones when ``attention_mask`` is ``None``).
        """
        projected = self.mlp_projection(image_embeddings).to(device=token_embeddings.device)
        batch_size, prefix_length = projected.shape[:2]

        if attention_mask is None:
            # No text mask supplied: attend to every text position
            attention_mask = torch.ones(
                token_embeddings.shape[:2], dtype=torch.long, device=token_embeddings.device
            )

        # Image positions are always attended to. The prefix mask goes FIRST,
        # matching the [image, text] order of the embeddings below.
        prefix_mask = torch.ones(
            (batch_size, prefix_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        combined_mask = torch.cat([prefix_mask, attention_mask], dim=1)
        combined_embeddings = torch.cat([projected, token_embeddings], dim=1)
        return combined_embeddings, combined_mask, prefix_length

    def forward(self, image_embeddings=None,
                inputs_embeds=None,
                input_ids=None,
                attention_mask=None,
                labels=None,
                output_attentions=False,
                output_hidden_states=False,
                **kwargs):
        """Run the language model on ``[image prefix, text]``.

        Args:
            image_embeddings: optional image features; projected and prepended
                when given.
            inputs_embeds: text embeddings, used only when ``input_ids`` is None.
            input_ids: text token ids; embedded with the language model's
                input embedding layer.
            attention_mask: mask for the text positions.
            labels: optional labels for the text positions; image positions
                are filled with ``IGNORE_INDEX``.
            output_attentions, output_hidden_states, **kwargs: forwarded to the
                wrapped language model.

        Returns:
            Whatever the wrapped language model returns.

        Raises:
            ValueError: if neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        if input_ids is not None:
            token_embeddings = self.phi2_model.get_input_embeddings()(input_ids)
        elif inputs_embeds is not None:
            token_embeddings = inputs_embeds
        else:
            raise ValueError("You must provide either input_ids or inputs_embeds.")

        new_labels = labels
        if image_embeddings is not None:
            combined_embeddings, new_attention_mask, prefix_length = self._prepend_image_prefix(
                image_embeddings, token_embeddings, attention_mask
            )
            if labels is not None:
                # Labels only describe text tokens; pad the front so they line
                # up with the combined sequence without adding loss terms.
                ignored = torch.full(
                    (labels.size(0), prefix_length),
                    self.IGNORE_INDEX,
                    dtype=labels.dtype,
                    device=labels.device,
                )
                new_labels = torch.cat([ignored, labels], dim=1)
        else:
            # Text-only input: pass embeddings and mask through unchanged
            combined_embeddings = token_embeddings
            new_attention_mask = attention_mask

        return self.phi2_model(
            inputs_embeds=combined_embeddings,
            attention_mask=new_attention_mask,
            labels=new_labels,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            **kwargs,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, image_embeddings=None, **kwargs):
        """Build ``inputs_embeds`` / ``attention_mask`` for a generation step.

        Mirrors ``forward``: when ``image_embeddings`` is given, the projected
        image prefix is placed before the text embeddings and the mask is
        extended at the front by the same number of positions.
        """
        token_embeddings = self.phi2_model.get_input_embeddings()(input_ids)
        if image_embeddings is not None:
            combined_embeddings, new_attention_mask, _ = self._prepend_image_prefix(
                image_embeddings, token_embeddings, attention_mask
            )
        else:
            combined_embeddings = token_embeddings
            new_attention_mask = attention_mask

        return {
            "inputs_embeds": combined_embeddings,
            "attention_mask": new_attention_mask,
            **kwargs
        }

def create_phi2_model_with_lora(mlp_projection,lan_model):
    """Mark the projection as trainable and wrap it together with the language model.

    Despite the name, no LoRA adapter is attached here; the function only
    enables gradients on the projection and builds the combined wrapper.

    Args:
        mlp_projection: projection module whose parameters should be trained.
        lan_model: language model to wrap.

    Returns:
        A ``PHI2WithMLP`` instance holding both modules.
    """
    for weight in mlp_projection.parameters():
        weight.requires_grad = True

    wrapped_model = PHI2WithMLP(lan_model, mlp_projection)
    return wrapped_model
    
# Width of the language model's token embeddings; the projection must emit this size
model_embedding_dim = model.config.hidden_size  # This might change based on your model architecture

# Projection hyperparameters
input_dim = 768  # Input dimension of image embeddings
output_dim = model_embedding_dim  # Target dimension of text embeddings
hidden_dim = 1024  # Hidden layer dimension of the MLP

# Build the projection on the active device and wrap it with the quantized language model
mlp_projection = MLPProjection(input_dim, output_dim, hidden_dim, depth=2).to(device)  # Customize MLP
combined_model = create_phi2_model_with_lora(mlp_projection, model)


from peft import LoraModel, LoraConfig,get_peft_model

# LoRA configuration targeting the attention projection layers by name
lora_config = LoraConfig(
    r=8,  # Low-rank dimension
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Apply LoRA only to these layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)



# Attach the adapters to the combined (projection + language model) wrapper.
# NOTE(review): get_peft_model presumably modifies `combined_model` in place, so
# calling it again on the same object later may not give an independent model; confirm.
phi_lora_model = get_peft_model(combined_model, lora_config).to(device)

In [None]:
# Re-enable gradients on the image projection so it is trained together with the adapters
# (presumably get_peft_model left these parameters frozen; check the summary printed below).
for name, param in phi_lora_model.named_parameters():
    if 'mlp_projection' in name :
        param.requires_grad = True

# Print trainable vs. total parameter counts as a sanity check
phi_lora_model.print_trainable_parameters()

In [None]:
# from transformers import CLIPModel

# # Load CLIP and PHI2
# clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)

In [None]:

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device

In [None]:
# import pickle

# # Specify the path to your .pkl file
# file_path = '/kaggle/input/sample-cap/new_clip_embeddings_part_0.pkl'

# # Load the embeddings from the .pkl file
# with open(file_path, 'rb') as file:
#     embeddings = pickle.load(file)

# # Now you can use your embeddings
# # print(embeddings)


In [None]:
df = pd.read_csv('/kaggle/input/sub-capstone/turns_60k_sample.csv')

In [None]:
# import torch
# from transformers import CLIPProcessor, CLIPModel
# from transformers import Trainer, TrainingArguments
# from datasets import Dataset
# from torch.utils.data import Dataset as TorchDataset
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from PIL import Image

# # Initialize the tokenizer and image model
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token
# clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
# clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

# class CustomDataset(TorchDataset):
#     def __init__(self, image_paths, text_inputs, text_labels):
#         self.image_paths = image_paths
#         self.text_inputs = text_inputs
#         self.text_labels = text_labels
#         self.max_length = 256

#     def __len__(self):
#         return len(self.text_labels)

#     def __getitem__(self, idx):
#         image = Image.open(self.image_paths[idx])
#         inputs = clip_processor(images=image, return_tensors="pt")
#         image_forward_outs = clip_model.vision_model(**inputs, output_hidden_states=True)
#         image_features = feature_select(image_forward_outs)
#         image_embedding = image_features.squeeze(0).to(device)
        
#         # Tokenize text input
#         input_encoding = tokenizer(
#             self.text_inputs[idx].replace('<image>',' '),
#             return_tensors='pt',
#             padding='max_length',  # Pad to max length
#             truncation=True,  # Truncate if needed
#             max_length=self.max_length
#         )
        
#         # Tokenize text label (similar to inputs)
#         label_encoding = tokenizer(
#             self.text_labels[idx],
#             return_tensors='pt',
#             padding='max_length',
#             truncation=True,
#             max_length=self.max_length
#         )

#         # Extract input_ids and attention_mask for both inputs and labels
#         input_ids = input_encoding['input_ids'].squeeze(0)
#         input_attention_mask = input_encoding['attention_mask'].squeeze(0)
#         label_ids = label_encoding['input_ids'].squeeze(0)
#         label_attention_mask = label_encoding['attention_mask'].squeeze(0)
        
        
#         # Return the image embeddings, tokenized inputs/labels, and attention masks
#         return {
#             'image_embeddings': image_embedding,  # Precomputed image embedding
#             'input_ids': input_ids,  # Tokenized input
#             'attention_mask': input_attention_mask,  # Attention mask for input
#             'labels': label_ids,  # Tokenized label
# #             'label_attention_mask': label_attention_mask  # Attention mask for label (optional)
#         }

# # Create dataset (you will replace this with actual paths and data)
# image_paths = ["/kaggle/input/sample/000000000009.jpg", "/kaggle/input/sample/000000000009.jpg"]
# text_inputs = ["What is the capital of France?", "Describe a sunset."]
# text_labels = ["Paris", "A beautiful view at dusk."]  # Example text labels

# # Instantiate dataset
# dataset = CustomDataset(image_paths, text_inputs, text_labels)



In [None]:
import torch
from transformers import CLIPProcessor, CLIPModel
from transformers import Trainer, TrainingArguments
from datasets import Dataset
from torch.utils.data import Dataset as TorchDataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from PIL import Image

# Initialize the tokenizer and image model
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to(device)
# clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

class CustomDataset(TorchDataset):
    """Dataset pairing a precomputed image embedding with a tokenized prompt+answer.

    Each item is the concatenation of the input text (with the ``<image>``
    placeholder removed) and the label text, tokenized and padded to
    ``max_length``. Labels are a copy of the input ids with padding positions
    set to -100 so that padding does not contribute to the loss. Masking uses
    the attention mask rather than the pad token id because the notebook sets
    the pad token equal to the EOS token.

    Args:
        image_paths: per-sample keys into the embeddings mapping.
        text_inputs: per-sample prompt strings.
        text_labels: per-sample answer strings.
        tokenizer: optional tokenizer callable; defaults to the notebook-level
            ``tokenizer`` looked up at item access time.
        embeddings: optional mapping from image key to embedding; defaults to
            the notebook-level ``embeddings``.
        max_length: padded/truncated token length of every sample.
    """
    def __init__(self, image_paths, text_inputs, text_labels,
                 tokenizer=None, embeddings=None, max_length=256):
        self.image_paths = image_paths
        self.text_inputs = text_inputs
        self.text_labels = text_labels
        self.max_length = max_length
        self._tokenizer = tokenizer
        self._embeddings = embeddings

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.text_labels)

    def __getitem__(self, idx):
        """Return the model inputs for sample ``idx`` as a dict of tensors."""
        # Fall back to the notebook globals when nothing was injected
        embedding_store = self._embeddings if self._embeddings is not None else embeddings
        encode = self._tokenizer if self._tokenizer is not None else tokenizer

        image_embedding = embedding_store[self.image_paths[idx]]

        # Prompt and answer form one training sequence; tokenize it once
        full_text = self.text_inputs[idx].replace('<image>', '') + self.text_labels[idx]
        encoding = encode(
            full_text,
            return_tensors='pt',
            padding='max_length',  # Pad to max length
            truncation=True,  # Truncate if needed
            max_length=self.max_length
        )

        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        # Labels mirror the inputs, except padding is excluded from the loss
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            'image_embeddings': image_embedding,  # Precomputed image embedding
            'input_ids': input_ids,  # Tokenized prompt + answer
            'attention_mask': attention_mask,  # 1 for real tokens, 0 for padding
            'labels': labels,  # Same ids, padding replaced by -100
        }

# Create dataset (you will replace this with actual paths and data)
image_paths = df['image'].tolist()
text_inputs = df['input'].tolist()
text_labels = df['label'].tolist()  # Example text labels

# Instantiate dataset
dataset = CustomDataset(image_paths, text_inputs, text_labels)

In [None]:

import wandb
wandb.init(mode="disabled")

In [None]:
from transformers import DataCollatorWithPadding

class CustomDataCollator:
    """Batch collator for samples that carry image embeddings next to text fields.

    Text fields are padded and batched by ``DataCollatorWithPadding``. The
    ``image_embeddings`` and ``labels`` tensors are stacked directly and never
    handed to the tokenizer-based collator, which only needs the text fields.

    Args:
        tokenizer: tokenizer used by the padding collator.
    """

    # Keys batched by torch.stack instead of the tokenizer's padding logic
    _STACKED_KEYS = ('image_embeddings', 'labels')

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.default_collator = DataCollatorWithPadding(tokenizer)

    def __call__(self, features):
        """Collate a list of per-sample dicts into one batch.

        Args:
            features: list of dicts as produced by the dataset.

        Returns:
            The padded text batch, plus ``image_embeddings`` and ``labels``
            stacked along a new first dimension when present.
        """
        image_embeddings = [f['image_embeddings'] for f in features if 'image_embeddings' in f]
        labels = [f['labels'] for f in features if 'labels' in f]

        # Only the text fields go through the tokenizer's padding collator
        text_features = [
            {key: value for key, value in f.items() if key not in self._STACKED_KEYS}
            for f in features
        ]
        batch = self.default_collator(text_features)

        # Add image embeddings if they exist
        if image_embeddings:
            batch['image_embeddings'] = torch.stack(image_embeddings)

        # Add labels to the batch
        if labels:
            batch['labels'] = torch.stack(labels)

        return batch


In [None]:

from transformers import TrainingArguments

# Training hyperparameters, gathered in one place so they are easy to tune
output_dir = "./results"
per_device_train_batch_size = 4
optim = "paged_adamw_32bit"
save_steps = 1000
logging_steps = 10
learning_rate = 1e-4
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "cosine"

# Every hyperparameter defined above is passed through. Previously output_dir,
# optim, max_grad_norm and warmup_ratio were declared but silently ignored.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=1,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    fp16=False,
    weight_decay=0.01,
    # Keep dataset keys that the Trainer would otherwise drop
    # (presumably needed so 'image_embeddings' reaches the model; confirm)
    remove_unused_columns=False,
)

from transformers import DataCollatorWithPadding

# Collator that pads the text fields and stacks image embeddings / labels
data_collator = CustomDataCollator(tokenizer=tokenizer)

# Create Trainer
trainer = Trainer(
    model=phi_lora_model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,  # Use the collator
)

# Start training
trainer.train()


In [None]:
image_paths = df['image'].tolist()
text_inputs = df['input'].tolist()
text_labels = df['label'].tolist()

In [None]:
# Select the first sample here so this cell also runs on a fresh kernel; the
# three names were previously only assigned by a later cell.
image_path, text_input, text_label = image_paths[0], text_inputs[0], text_labels[0]
image_path, text_input, text_label


In [None]:
import torch

# # Load your model
# eval_model = get_peft_model(combined_model, lora_config).to(device)  # Adjust based on your setup
# eval_model.eval()  # Set the model to evaluation mode

# Example input data: first sample of the dataframe
image_path = image_paths[0]
text_input = text_inputs[0]  # Example text input
text_label = text_labels[0]  # Example text label (if needed for comparison)

# Precomputed image embedding for that sample
image_embedding = embeddings[image_path]

# Tokenize the prompt the same way the training dataset does: the '<image>'
# placeholder is stripped because the image arrives as embeddings instead.
input_encoding = tokenizer(
    text_input.replace('<image>', ''),
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=256  # Same padded length as the training dataset
)

# Drop the batch dimension added by return_tensors='pt' and move to the device
input_ids = input_encoding['input_ids'].squeeze(0).to(device)  # Shape: [seq_len]
attention_mask = input_encoding['attention_mask'].squeeze(0).to(device)  # Shape: [seq_len]
# squeeze(0) only removes a leading dimension of size 1, if the stored embedding has one
image_embedding = image_embedding.squeeze(0).to(device)


In [None]:
image_embedding.shape

In [None]:
attention_mask.shape

In [None]:

# Get token embeddings from PHI2 model
token_embeddings = model.get_input_embeddings()(input_ids)



In [None]:
mlp_projection = mlp_projection.to(device)
mlp_projection

In [None]:
projected_image_embeddings = mlp_projection(image_embedding)

In [None]:
image_embedding.shape,token_embeddings.shape,projected_image_embeddings.shape

In [None]:
torch.cat([projected_image_embeddings, token_embeddings], dim=0).shape

In [None]:
# Evaluate the model that was actually trained. Calling get_peft_model on
# `combined_model` a second time would attach adapters to an already-wrapped
# model instead of reusing the trained ones.
eval_model = phi_lora_model
eval_model.eval()

In [None]:
# Text-only forward pass (no image prefix) as a baseline
with torch.no_grad():  # Disable gradient calculation
    outputs = eval_model(input_ids=input_ids.unsqueeze(0),  # Add batch dimension
                     attention_mask=attention_mask.unsqueeze(0),  # Add batch dimension
                     image_embeddings=None)  # No image for this run

# Logits returned by the forward pass
predictions = outputs.logits  # Or the appropriate output field

# Highest-scoring token id at every sequence position.
# NOTE(review): this is a single forward pass over the padded prompt, not
# autoregressive generation, so the decoded string is presumably a per-position
# next-token guess rather than a generated answer.
predicted_labels = torch.argmax(predictions, dim=-1)
# print(f"Predicted labels: {predicted_labels}")
# Convert predicted token IDs back to text using the tokenizer
predicted_text = tokenizer.decode(predicted_labels[0], skip_special_tokens=True)

print(f"Predicted text: {predicted_text}")

In [None]:
# Forward pass with the image embedding supplied alongside the text
with torch.no_grad():  # Disable gradient calculation
    outputs = phi_lora_model(input_ids=input_ids.unsqueeze(0),  # Add batch dimension
                     attention_mask=attention_mask.unsqueeze(0),  # Add batch dimension
                     image_embeddings=image_embedding.unsqueeze(0))  # Add batch dimension

# Logits returned by the forward pass
predictions = outputs.logits  # Or the appropriate output field

# Highest-scoring token id at every position of the sequence
predicted_labels = torch.argmax(predictions, dim=-1)
print(f"Predicted labels: {predicted_labels}")

In [None]:
# outputs

In [None]:
predictions.shape

In [None]:
# Process predictions as needed (e.g., applying softmax, argmax)
predicted_token_ids = torch.argmax(predictions, dim=-1)

# Convert predicted token IDs back to text using the tokenizer
predicted_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)

print(f"Predicted text: {predicted_text}")

In [None]:
predicted_text

In [None]:
outputs['logits'].shape

In [None]:
if random.random() < 0.5:
    combined_embeddings = torch.cat([projected_image_embeddings, token_embeddings], dim=0)
else:
    combined_embeddings = torch.cat([token_embeddings, projected_image_embeddings], dim=0)


In [None]:
combined_embeddings.shape

In [None]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reuse the trained PEFT model rather than wrapping `combined_model` again
eval_model = phi_lora_model
eval_model.eval()  # Set the model to evaluation mode

# The CLIP loaders earlier in the notebook are commented out, so create them
# here; otherwise this cell fails with a NameError on a fresh kernel.
# NOTE(review): this checkpoint is the one named in the commented-out loaders and
# presumably the one used for the precomputed embeddings; confirm.
clip_checkpoint = 'openai/clip-vit-base-patch32'
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)
clip_model = CLIPModel.from_pretrained(clip_checkpoint).to(device)
clip_model.eval()

# Example input data
image_paths = ["/kaggle/input/sample/000000000009.jpg"]
text_inputs = ["What is the capital of France?"]  # Example text input
text_labels = ["Paris"]  # Example text label (if needed for comparison)

# Encode each image with the CLIP vision tower and keep the selected features
images = []
with torch.no_grad():  # Feature extraction only, no gradients needed
    for path in image_paths:
        image = Image.open(path)
        inputs = clip_processor(images=image, return_tensors="pt").to(device)
        image_forward_outs = clip_model.vision_model(**inputs, output_hidden_states=True)
        image_features = feature_select(image_forward_outs)
        images.append(image_features)

# Tokenize text input
input_encoding = tokenizer(
    text_inputs[0],
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=2048  # Set this to match your model's input size
)

# Drop the batch dimension and move everything to the target device
input_ids = input_encoding['input_ids'].squeeze(0).to(device)  # Shape: [seq_len]
attention_mask = input_encoding['attention_mask'].squeeze(0).to(device)  # Shape: [seq_len]
image_embedding = images[0].squeeze(0).to(device)  # Features of the first image without the batch dimension

# Perform inference
with torch.no_grad():  # Disable gradient calculation
    outputs = eval_model(input_ids=input_ids.unsqueeze(0),  # Add batch dimension
                     attention_mask=attention_mask.unsqueeze(0),  # Add batch dimension
                     image_embeddings=image_embedding.unsqueeze(0))  # Add batch dimension

# Logits returned by the forward pass
predictions = outputs.logits  # Or the appropriate output field

# Highest-scoring token id at every position of the sequence
predicted_labels = torch.argmax(predictions, dim=-1)
print(f"Predicted labels: {predicted_labels}")


In [None]:
!pip install trl

In [None]:
from trl import SFTTrainer

# Maximum sequence length handed to the SFT trainer
max_seq_length = 256

# Second trainer over the SAME model object, dataset, arguments and collator
# used by the Trainer cell above, so running both continues training
# `phi_lora_model` rather than starting over.
# NOTE(review): `max_seq_length` and `tokenizer` as direct keyword arguments
# may not be accepted by every trl release (trl is installed unpinned above);
# confirm against the installed version.
trainer = SFTTrainer(
    model=phi_lora_model,
    train_dataset=dataset,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator
)


In [None]:
trainer.train()

In [None]:
# Perform inference
with torch.no_grad():  # Disable gradient calculation
    outputs = phi_lora_model(input_ids=input_ids.unsqueeze(0),  # Add batch dimension
                     attention_mask=attention_mask.unsqueeze(0),  # Add batch dimension
                     image_embeddings=image_embedding.unsqueeze(0))  # Add batch dimension

# Extract predictions (modify based on your model's output)
predictions = outputs.logits  # Or the appropriate output field

# Process predictions as needed (e.g., applying softmax, argmax)
predicted_labels = torch.argmax(predictions, dim=-1)
print(f"Predicted labels: {predicted_labels}")

In [None]:
# Process predictions as needed (e.g., applying softmax, argmax)
predicted_token_ids = torch.argmax(predictions, dim=-1)

# Convert predicted token IDs back to text using the tokenizer
predicted_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)

print(f"Predicted text: {predicted_text}")