In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sub-capstone/sub_clip_embeddings_0.pkl
/kaggle/input/sub-capstone/turns_60k_sample.csv
/kaggle/input/sample-cap/new_clip_embeddings_part_0.pkl
/kaggle/input/sample-cap/sample.csv
/kaggle/input/sample-cap/turns_50_sample.csv
/kaggle/input/sample/000000000009.jpg
/kaggle/input/sample/000000000025.jpg


In [18]:
import os
for dirname, _, filenames in os.walk('/kaggle/working'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/working/mlp_projection.pth
/kaggle/working/state.db
/kaggle/working/my_phi_lora_model/adapter_model.safetensors
/kaggle/working/my_phi_lora_model/README.md
/kaggle/working/my_phi_lora_model/adapter_config.json


In [None]:
!pip -q install -U bitsandbytes peft

In [None]:
!pip -q install --upgrade huggingface_hub

### Load the preprocessed image embeddings

In [7]:
import pickle

# Path to the pickled, precomputed image embeddings (Kaggle input dataset).
file_path = '/kaggle/input/sub-capstone/sub_clip_embeddings_0.pkl'

# Load the embeddings. Later cells index this object by image file name
# (e.g. embeddings[image_path]) and call tensor methods on the values.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files from a source you trust.
with open(file_path, 'rb') as file:
    embeddings = pickle.load(file)

# Number of entries that were loaded.
len(embeddings)


  return torch.load(io.BytesIO(b))


30946

In [6]:
select_feature = 'patch'
def feature_select(image_forward_outs):
    """Pick the image features to use from a vision-encoder forward output.

    Reads the module-level ``select_feature`` switch:

    * ``'patch'``     -- last hidden state without its first token.
    * ``'cls_patch'`` -- last hidden state unchanged.

    Args:
        image_forward_outs: object exposing ``hidden_states``, a sequence
            whose last element is indexed as ``[:, 1:]``.

    Returns:
        The selected slice of ``hidden_states[-1]``.

    Raises:
        ValueError: if ``select_feature`` has any other value.
    """
    last_hidden = image_forward_outs.hidden_states[-1]
    if select_feature not in ('patch', 'cls_patch'):
        raise ValueError(f'Unexpected select feature: {select_feature}')
    # Only the 'patch' mode drops the leading token.
    return last_hidden[:, 1:] if select_feature == 'patch' else last_hidden

In [None]:
from huggingface_hub import login
login()

In [5]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# pip install -U bitsandbytes

### Load the model (Llama 3.2 1B instead of Phi-2, because of limited resources)

In [1]:
# meta-llama/Llama-3.2-1B-Instruct
from transformers import AutoModelForCausalLM, BitsAndBytesConfig,AutoTokenizer
import torch
# Load the language model with 4-bit quantization for efficient fine-tuning.
# NOTE: despite the "PHI 2" naming used elsewhere in this notebook, the
# checkpoint loaded here is meta-llama/Llama-3.2-1B-Instruct.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,               # store the base weights in 4 bits
    bnb_4bit_use_double_quant=True,  # enable double quantization
    bnb_4bit_quant_type="nf4",       # NF4 quantization data type
    # Compute dtype is left unset; the config printed later in the notebook
    # reports "bnb_4bit_compute_dtype": "float32".
#     bnb_4bit_compute_dtype=torch.float16
)
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Instantiate the quantized causal LM that the multimodal wrapper is built on.
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,
    torch_dtype = torch.float32,
    trust_remote_code=True
)


`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

### check inference

In [3]:
# Example prompt
prompt = "What types of food can be seen in the image?"

# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt",truncation = True)

# Generate response
outputs = model.generate(**inputs, max_new_tokens=128, temperature=0.01, top_p=1)

# Decode output to text
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Model response:", response)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Model response: What types of food can be seen in the image? 
Unfortunately, I am unable to see the image you are referring to.  However, I can tell you that the image is likely a photograph of a restaurant or a food establishment.  Based on the fact that it is a restaurant, I would guess that the types of food that can be seen in the image are likely to be a variety of dishes, such as appetizers, entrees, and desserts.  The image may also show a variety of beverages, such as drinks and cocktails.  It is also possible that the image shows a variety of food presentation, such as a buffet or a food display.


## Define a custom model with an added MLP projection, as suggested in LLaVA 1.5

In [8]:
import torch.nn as nn
import random
class MLPProjection(nn.Module):
    """Bias-free MLP that maps image embeddings into the text embedding space.

    The network has ``depth`` linear layers separated by GELU activations:
    ``input_dim -> hidden_dim -> ... -> hidden_dim -> output_dim``.
    Every intermediate layer is ``hidden_dim`` wide and only the last layer
    produces ``output_dim`` features, so the module is well-formed for any
    ``depth >= 1``. For ``depth=2`` the layout (and therefore the state-dict
    keys ``mlp.0.weight`` / ``mlp.2.weight``) is
    ``Linear(input_dim, hidden_dim) -> GELU -> Linear(hidden_dim, output_dim)``.

    Args:
        input_dim: feature size of the incoming image embeddings.
        output_dim: feature size produced by the last linear layer.
        hidden_dim: width of the intermediate layers (unused when depth == 1).
        depth: number of linear layers; must be at least 1.

    Raises:
        ValueError: if ``depth`` is smaller than 1.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=768, depth=2):
        super(MLPProjection, self).__init__()
        if depth < 1:
            raise ValueError(f"depth must be >= 1, got {depth}")

        # Feature width before/after every linear layer.
        widths = [input_dim] + [hidden_dim] * (depth - 1) + [output_dim]

        modules = []
        for index, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            if index > 0:
                # Non-linearity between consecutive linear layers only.
                modules.append(nn.GELU())
            modules.append(nn.Linear(fan_in, fan_out, bias=False))

        self.mlp = nn.Sequential(*modules)

    def forward(self, x):
        """Project ``x`` (last dimension ``input_dim``) to ``output_dim`` features."""
        return self.mlp(x)

class PHI2WithMLP(nn.Module):
    """Causal language model that takes projected image embeddings as a prefix.

    Image embeddings are passed through ``mlp_projection`` and prepended to the
    text token embeddings; the attention mask (and labels, when given) are
    extended to cover the image positions before the wrapped language model
    is called.

    Args:
        phi2_model: the wrapped language model. It must provide
            ``get_input_embeddings()``, ``generate(**kwargs)``, a ``config``
            attribute, and accept ``inputs_embeds`` / ``attention_mask`` /
            ``labels`` keyword arguments.
        mlp_projection: module mapping image embeddings to the language
            model's embedding width.
    """

    def __init__(self, phi2_model, mlp_projection):
        super(PHI2WithMLP, self).__init__()
        self.phi2_model = phi2_model
        self.mlp_projection = mlp_projection
        # Mirror the language model's config on the wrapper.
        self.config = phi2_model.config

    def _prepend_image_tokens(self, projected, token_embeddings, attention_mask):
        """Concatenate projected image embeddings in front of the text embeddings.

        Args:
            projected: output of ``mlp_projection``; either
                ``(batch, image_tokens, dim)`` or an unbatched
                ``(image_tokens, dim)`` tensor.
            token_embeddings: ``(batch, text_tokens, dim)`` text embeddings.
            attention_mask: ``(batch, text_tokens)`` mask or ``None``; when
                ``None`` every text position is treated as attended.

        Returns:
            ``(combined_embeddings, combined_attention_mask)`` with the image
            positions first and marked as attended.
        """
        batch_size, text_length = token_embeddings.shape[:2]

        if projected.dim() == 2:
            # Single un-batched image: add the batch dimension.
            projected = projected.unsqueeze(0)
        if projected.size(0) == 1 and batch_size > 1:
            # One image shared by every sequence in the batch.
            projected = projected.expand(batch_size, -1, -1)
        projected = projected.to(device=token_embeddings.device)

        if attention_mask is None:
            attention_mask = torch.ones(
                (batch_size, text_length), dtype=torch.long, device=token_embeddings.device
            )

        # Build the image part with the text mask's dtype so the concatenated
        # mask is not silently promoted to float.
        image_mask = torch.ones(
            (batch_size, projected.size(1)),
            dtype=attention_mask.dtype,
            device=attention_mask.device,
        )

        combined_embeddings = torch.cat([projected, token_embeddings], dim=1)
        combined_mask = torch.cat([image_mask, attention_mask], dim=1)
        return combined_embeddings, combined_mask

    def forward(self, image_embeddings=None,
                inputs_embeds=None,
                input_ids=None,
                attention_mask=None,
                labels=None,
                output_attentions=False,
                output_hidden_states=False,
                **kwargs):
        """Run the language model on ``[image tokens] + [text tokens]``.

        Args:
            image_embeddings: optional image embeddings fed to the projection.
            inputs_embeds: text embeddings, used only when ``input_ids`` is None.
            input_ids: text token ids; takes precedence over ``inputs_embeds``.
            attention_mask: mask for the text positions.
            labels: optional labels for the text positions; image positions
                are filled with ``-100`` so they are ignored by the loss.
            output_attentions, output_hidden_states, **kwargs: forwarded to
                the wrapped model unchanged.

        Returns:
            Whatever the wrapped language model returns.

        Raises:
            ValueError: if neither ``input_ids`` nor ``inputs_embeds`` is given.
        """
        if input_ids is not None:
            token_embeddings = self.phi2_model.get_input_embeddings()(input_ids)
        elif inputs_embeds is not None:
            token_embeddings = inputs_embeds
        else:
            raise ValueError("You must provide either input_ids or inputs_embeds.")

        new_labels = labels
        if image_embeddings is not None:
            # Map image embeddings into the text embedding space and prepend them.
            projected = self.mlp_projection(image_embeddings)
            combined_embeddings, new_attention_mask = self._prepend_image_tokens(
                projected, token_embeddings, attention_mask
            )
            if labels is not None:
                # Labels only cover text tokens: pad the image positions with
                # the ignore index so the sequence lengths match.
                image_length = combined_embeddings.size(1) - token_embeddings.size(1)
                label_padding = torch.full(
                    (labels.size(0), image_length), -100,
                    dtype=labels.dtype, device=labels.device,
                )
                new_labels = torch.cat([label_padding, labels], dim=1)
        else:
            # Text only: embeddings and mask are passed through untouched.
            combined_embeddings = token_embeddings
            new_attention_mask = attention_mask

        outputs = self.phi2_model(
            inputs_embeds=combined_embeddings,
            attention_mask=new_attention_mask,
            labels=new_labels,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            **kwargs)

        return outputs

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, image_embeddings=None, **kwargs):
        """Build the keyword arguments for the wrapped model's ``generate``.

        Args:
            input_ids: prompt token ids.
            attention_mask: mask for the prompt tokens (may be ``None``).
            image_embeddings: optional image embeddings, batched or unbatched.
            **kwargs: extra generation arguments, returned unchanged.

        Returns:
            Dict with ``inputs_embeds``, ``attention_mask`` and ``**kwargs``.
        """
        token_embeddings = self.phi2_model.get_input_embeddings()(input_ids)

        if image_embeddings is not None:
            projected = self.mlp_projection(image_embeddings)
            combined_embeddings, new_attention_mask = self._prepend_image_tokens(
                projected, token_embeddings, attention_mask
            )
        else:
            combined_embeddings = token_embeddings
            new_attention_mask = attention_mask

        return {
            "inputs_embeds": combined_embeddings,
            "attention_mask": new_attention_mask,
            **kwargs
        }

    def generate(self, input_ids, attention_mask=None, image_embeddings=None, **kwargs):
        """Generate from a text prompt with an optional image prefix.

        Switches the module to evaluation mode (it is not switched back),
        builds the combined inputs and delegates to the wrapped model's
        ``generate``.
        """
        self.eval()  # Set to evaluation mode
        # The projection is only needed for inference here, so skip autograd.
        with torch.no_grad():
            inputs = self.prepare_inputs_for_generation(input_ids, attention_mask, image_embeddings, **kwargs)
        # Use the wrapped model's built-in generate method
        return self.phi2_model.generate(**inputs)

def create_phi2_model_with_lora(mlp_projection,lan_model):
    """Wrap a language model and an image projection into one module.

    Every parameter of ``mlp_projection`` is flagged as trainable before the
    two modules are combined. No LoRA adapter is attached here.

    Args:
        mlp_projection: module that projects image embeddings.
        lan_model: language model to wrap.

    Returns:
        A ``PHI2WithMLP`` built from ``lan_model`` and ``mlp_projection``.
    """
    for weight in mlp_projection.parameters():
        weight.requires_grad = True

    combined = PHI2WithMLP(lan_model, mlp_projection)
    return combined
    
# Width of the language model's token embeddings (read from its config).
model_embedding_dim = model.config.hidden_size  # Depends on the loaded model architecture

# Dimensions of the image -> text projection
input_dim = 768  # Input dimension of image embeddings
output_dim = model_embedding_dim  # Target dimension of text embeddings
hidden_dim = 1024  # Hidden layer dimension of the MLP

# Build the projection on the active device and wrap it together with the
# quantized language model loaded earlier.
mlp_projection = MLPProjection(input_dim, output_dim, hidden_dim, depth=2).to(device)  # Two linear layers
combined_model = create_phi2_model_with_lora(mlp_projection, model)


from peft import LoraModel, LoraConfig,get_peft_model

# LoRA configuration for the attention projections of the language model
# (the base model is 4-bit quantized, i.e. a QLoRA-style setup).
lora_config = LoraConfig(
    r=8,  # Low-rank dimension
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Adapters are attached only to these layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)


# Attach the LoRA adapters to the combined (language model + projection) module.
# NOTE(review): later cells save the projection separately with torch.save and
# find that a model restored from the pushed adapter has different projection
# weights, so the adapter checkpoint does not appear to include them. Consider
# LoraConfig(modules_to_save=[...]) if they should travel with the adapter -- confirm.
phi_lora_model = get_peft_model(combined_model, lora_config)

### Mark the MLP projection parameters as trainable alongside the LoRA parameters

In [9]:
# Turn gradients on for every parameter whose qualified name mentions the
# image projection, so it is optimized together with the LoRA weights.
for param_name, weight in phi_lora_model.named_parameters():
    if 'mlp_projection' not in param_name:
        continue
    weight.requires_grad = True

# Report trainable vs. total parameter counts.
phi_lora_model.print_trainable_parameters()

trainable params: 4,587,520 || all params: 1,240,401,920 || trainable%: 0.3698


#### check inference on custom model

In [10]:
from transformers import GenerationConfig

# Create a new GenerationConfig with desired settings
generation_config = GenerationConfig(max_new_tokens=128, temperature=0.01, top_p=1)
phi_lora_model.generation_config = generation_config

outputs = phi_lora_model.generate(**inputs,image_embeddings = None, max_new_tokens=128, temperature=0.01, top_p=1)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Model response:", response)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Model response:  
Unfortunately, I am unable to see the image you are referring to.  However, I can tell you that the image is likely a photograph of a restaurant or a food establishment.  Based on the fact that it is a restaurant, I would guess that the types of food that can be seen in the image are likely to be a variety of dishes, such as appetizers, entrees, and desserts.  The image may also show a variety of beverages, such as drinks and cocktails.  It is also possible that the image shows a variety of food presentation, such as a buffet or a food display.


### load processed data from llava_150k (subset)

In [19]:
df = pd.read_csv('/kaggle/input/sub-capstone/turns_60k_sample.csv')

### define dataloader and collator

In [20]:
import torch
from transformers import CLIPProcessor, CLIPModel
from transformers import Trainer, TrainingArguments
from datasets import Dataset
from torch.utils.data import Dataset as TorchDataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from PIL import Image

# Initialize the tokenizer and image model
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to(device)
# clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

class CustomDataset(TorchDataset):
    """Image-conditioned text dataset backed by precomputed image embeddings.

    Each item pairs the embedding of one image with the tokenized
    concatenation ``prompt + answer`` (the ``<image>`` placeholder is removed
    from the prompt). Labels are a copy of the input ids with padded
    positions set to ``-100``.

    Args:
        image_paths: per-sample keys into the embedding mapping.
        text_inputs: per-sample prompt strings.
        text_labels: per-sample answer strings.
        max_length: number of text tokens each sample is padded/truncated to.
            The default ``256 - 49`` presumably reserves 49 positions of a
            256-token budget for image tokens -- TODO confirm.
        tokenizer: tokenizer to use; when ``None`` the module-level
            ``tokenizer`` is looked up at item-access time.
        embeddings: mapping from image key to embedding; when ``None`` the
            module-level ``embeddings`` is looked up at item-access time.
    """

    def __init__(self, image_paths, text_inputs, text_labels,
                 max_length=256 - 49, tokenizer=None, embeddings=None):
        self.image_paths = image_paths
        self.text_inputs = text_inputs
        self.text_labels = text_labels
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.embeddings = embeddings

    def __len__(self):
        """Number of samples (one per answer string)."""
        return len(self.text_labels)

    def __getitem__(self, idx):
        """Return image embedding, input ids, attention mask and labels for ``idx``."""
        # Fall back to the notebook-level globals when nothing was injected.
        text_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        embedding_table = self.embeddings if self.embeddings is not None else embeddings

        image_embedding = embedding_table[self.image_paths[idx]]

        # Tokenize prompt + answer as one sequence, padded to a fixed length.
        input_encoding = text_tokenizer(
            self.text_inputs[idx].replace('<image>','')+self.text_labels[idx],
            return_tensors='pt',
            truncation = True,
            padding='max_length',  # Ensures padding to a consistent length
            max_length=self.max_length
        )

        # Drop the batch dimension added by return_tensors='pt'.
        input_ids = input_encoding['input_ids'].squeeze(0)
        input_attention_mask = input_encoding['attention_mask'].squeeze(0)

        # Ignore padded positions in the loss. The attention mask is used
        # rather than the pad token id because the notebook sets the pad
        # token to the EOS token; masking by id would also hide genuine
        # EOS tokens.
        # NOTE(review): no EOS token is appended to the text here, so the
        # model may never see an end-of-answer target -- confirm intent.
        label_ids = input_ids.clone()
        label_ids[input_attention_mask == 0] = -100

        return {
            'image_embeddings': image_embedding,  # Precomputed image embedding
            'input_ids': input_ids,  # Tokenized prompt + answer
            'attention_mask': input_attention_mask,  # 1 for real tokens, 0 for padding
            'labels': label_ids,  # input_ids with padding replaced by -100
        }

# Create dataset (you will replace this with actual paths and data)
image_paths = df['image'].tolist()
text_inputs = df['input'].tolist()
text_labels = df['label'].tolist()  # Example text labels

# Instantiate dataset
dataset = CustomDataset(image_paths, text_inputs, text_labels)

In [21]:

import wandb
wandb.init(mode="disabled")

In [22]:
from transformers import DataCollatorWithPadding

class CustomDataCollator:
    """Batch text fields with the padding collator and stack tensor extras.

    Text fields (everything except ``image_embeddings`` and ``labels``) are
    handed to ``DataCollatorWithPadding``. Image embeddings and labels are
    stacked directly with ``torch.stack`` and added to the resulting batch,
    so they are never routed through the tokenizer's padding logic.

    Args:
        tokenizer: tokenizer used by the padding collator.
    """

    # Per-sample fields that are stacked as-is instead of being padded.
    _STACKED_FIELDS = ('image_embeddings', 'labels')

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.default_collator = DataCollatorWithPadding(tokenizer)

    def __call__(self, features):
        """Collate a list of per-sample dicts into one batch.

        Args:
            features: list of dicts as produced by the dataset's
                ``__getitem__``; the input dicts are not modified.

        Returns:
            The padding collator's batch, with ``image_embeddings`` and
            ``labels`` added when at least one sample provides them.
        """
        # Only the text fields go through the padding collator.
        text_features = [
            {key: value for key, value in feature.items() if key not in self._STACKED_FIELDS}
            for feature in features
        ]
        batch = self.default_collator(text_features)

        # Add image embeddings if they exist
        image_embeddings = [f['image_embeddings'] for f in features if 'image_embeddings' in f]
        if image_embeddings:
            batch['image_embeddings'] = torch.stack(image_embeddings)

        # Add labels to the batch
        labels = [f['labels'] for f in features if 'labels' in f]
        if labels:
            batch['labels'] = torch.stack(labels)

        return batch


#### Train for 10,000 steps

In [23]:

from transformers import TrainingArguments

# Hyperparameters.
# NOTE(review): output_dir, optim, max_grad_norm and warmup_ratio are assigned
# here but never passed to TrainingArguments below, so they have no effect on
# this run (TrainingArguments is given output_dir="./results1" directly).
output_dir = "./results_coupled"
per_device_train_batch_size = 4
optim = "paged_adamw_32bit"
save_steps = 1000
logging_steps = 100
learning_rate = 2e-4
max_grad_norm = 0.3
warmup_ratio = 0.03
lr_scheduler_type = "cosine"

# Training configuration. max_steps takes precedence over num_train_epochs
# (the Trainer reports this override in the cell output).
training_args = TrainingArguments(
    output_dir="./results1",
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=1,
    max_steps = 10000,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    fp16=False,
    weight_decay=0.01,
    # presumably keeps the extra 'image_embeddings' field available to the
    # model's forward -- TODO confirm
    remove_unused_columns=False
)

from transformers import DataCollatorWithPadding

# Collator that pads the text fields and stacks image embeddings / labels.
data_collator = CustomDataCollator(tokenizer=tokenizer)

# Trainer over the LoRA-wrapped multimodal model and the custom dataset.
trainer = Trainer(
    model=phi_lora_model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    # custom collator defined in the previous cell
)

# Start training
trainer.train()


max_steps is given, it will override any value given in num_train_epochs
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
100,1.9046
200,1.7104
300,1.6699
400,1.6531
500,1.6197
600,1.6155
700,1.6105
800,1.5843
900,1.5978
1000,1.5692


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=10000, training_loss=1.4997929794311524, metrics={'train_runtime': 35509.5138, 'train_samples_per_second': 2.253, 'train_steps_per_second': 0.282, 'total_flos': 0.0, 'train_loss': 1.4997929794311524, 'epoch': 0.5799454851243983})

In [24]:
trainer.save_model()

In [25]:
hf_adapter_repo="Kartheekb7/peft_llava_llama_2"

In [None]:
phi_lora_model

In [26]:
phi_lora_model.push_to_hub(hf_adapter_repo)

adapter_model.safetensors:   0%|          | 0.00/6.83M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Kartheekb7/peft_llava_llama_2/commit/a65f68ce2809da647eff38791ac465e8fa014772', commit_message='Upload model', commit_description='', oid='a65f68ce2809da647eff38791ac465e8fa014772', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Kartheekb7/peft_llava_llama_2', endpoint='https://huggingface.co', repo_type='model', repo_id='Kartheekb7/peft_llava_llama_2'), pr_revision=None, pr_num=None)

In [27]:
# NOTE(review): the positional argument is used as the commit message, not as
# the target repository -- the recorded output shows
# commit_message='Kartheekb7/peft_llava_llama_2' and repo_id='Kartheekb7/results1'
# (derived from output_dir). Later cells load the adapter from "Kartheekb7/results1".
trainer.push_to_hub(hf_adapter_repo)

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/6.83M [00:00<?, ?B/s]

events.out.tfevents.1730223711.4be4fe3b6de8.115.0:   0%|          | 0.00/27.1k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Kartheekb7/results1/commit/8bfeb9e37ebb9b595ba4574b794e973094c06c5a', commit_message='Kartheekb7/peft_llava_llama_2', commit_description='', oid='8bfeb9e37ebb9b595ba4574b794e973094c06c5a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Kartheekb7/results1', endpoint='https://huggingface.co', repo_type='model', repo_id='Kartheekb7/results1'), pr_revision=None, pr_num=None)

In [28]:
# Specify the directory where you want to save the model
model_save_directory = "my_phi_lora_model"

# Save the combined model with LoRA
phi_lora_model.save_pretrained(model_save_directory)



In [57]:
base_config = phi_lora_model.base_model.config

In [56]:
phi_lora_model.config

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8b

In [55]:
base_config

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8b

In [35]:
image_paths = df['image'].tolist()
text_inputs = df['input'].tolist()
text_labels = df['label'].tolist()

In [37]:
text_labels[0]

'In the image, you can see various types of food, including broccoli, bread, meat, vegetables, and fruit. These foods are presented in colorful dishes or containers.'

In [40]:
image_path,text_input,text_label # Example text label (if needed for comparison)


('000000000009.jpg',
 'What types of food can be seen in the image?\n',
 'In the image, you can see various types of food, including broccoli, bread, meat, vegetables, and fruit. These foods are presented in colorful dishes or containers.')

### inference on trained model

In [30]:
import torch

# # Load your model
# eval_model = get_peft_model(combined_model, lora_config).to(device)  # Adjust based on your setup
# eval_model.eval()  # Set the model to evaluation mode

# Example input data
image_path = image_paths[0]
text_input = text_inputs[0]  # Example text input
text_label = text_labels[0]  # Example text label (if needed for comparison)

# Prepare inputs
image_embedding = embeddings[image_path]

# Tokenize text input
input_encoding = tokenizer(
    text_inputs[0],
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=256-49  # Set this to match your model's input size
)

# Combine inputs for inference
input_ids = input_encoding['input_ids'].squeeze(0).to(device)  # Shape: [seq_len]
attention_mask = input_encoding['attention_mask'].squeeze(0).to(device)  # Shape: [seq_len]
image_embedding = image_embedding.squeeze(0).to(device)  # Shape: [embedding_dim]


In [31]:
image_path = image_paths[0]
image_embedding = embeddings[image_path]
image_embedding = image_embedding.squeeze(0).to(device)

In [32]:
from transformers import GenerationConfig

# # Create a new GenerationConfig with desired settings
# generation_config = GenerationConfig(max_new_tokens=128, temperature=0.01, top_p=1)
# phi_lora_model.generation_config = generation_config

outputs = phi_lora_model.generate(**input_encoding,image_embeddings = image_embedding, max_new_tokens=128, temperature=0.01, top_p=1)
# Decode output to text
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Model response:", response)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Model response: 
The image shows a variety of food items, including meat, vegetables, and fruits. The food is contained in a plastic container or a plastic bag, which is open and partially filled. The contents include meat, broccoli, carrots, and other vegetables. There are also fruits, such as apples and oranges, visible in the image. The food is arranged in a colorful and visually appealing way, making it an attractive and appetizing meal. The food is likely to be served together in a meal or a snack. The image also includes a bowl, which could be used to serve the food. The overall presentation suggests a well-organized and


In [38]:
text_label

'In the image, you can see various types of food, including broccoli, bread, meat, vegetables, and fruit. These foods are presented in colorful dishes or containers.'

In [43]:
phi_lora_model.base_model.mlp_projection

MLPProjection(
  (mlp): Sequential(
    (0): Linear(in_features=768, out_features=1024, bias=False)
    (1): GELU(approximate='none')
    (2): Linear(in_features=1024, out_features=2048, bias=False)
  )
)

In [49]:
phi_lora_model.base_model.model.mlp_projection

MLPProjection(
  (mlp): Sequential(
    (0): Linear(in_features=768, out_features=1024, bias=False)
    (1): GELU(approximate='none')
    (2): Linear(in_features=1024, out_features=2048, bias=False)
  )
)

In [51]:
import torch

# Access the MLP layer
mlp_layer = phi_lora_model.base_model.mlp_projection
torch.save(mlp_layer.state_dict(), "mlp_projection_weights.pth")



In [52]:
# Reload the saved projection weights. A state dict only holds tensors, so
# weights_only=True is sufficient; it restricts unpickling to plain data and
# avoids the torch.load FutureWarning.
loaded_mlp_weights = torch.load("mlp_projection_weights.pth", weights_only=True)

# Compare against the in-memory projection. The reference state dict is built
# once, and the key sets must match as well as every tensor.
reference_weights = mlp_layer.state_dict()
is_same = loaded_mlp_weights.keys() == reference_weights.keys() and all(
    torch.equal(tensor, reference_weights[key])
    for key, tensor in loaded_mlp_weights.items()
)

if is_same:
    print("The loaded MLP layer is identical to the original model's MLP layer.")
else:
    print("The loaded MLP layer differs from the original model's MLP layer.")


The loaded MLP layer is identical to the original model's MLP layer.


  loaded_mlp_weights = torch.load("mlp_projection_weights.pth")


In [62]:
phi_lora_model.base_model

LoraModel(
  (model): PHI2WithMLP(
    (phi2_model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
 

In [59]:
# meta-llama/Llama-3.2-1B-Instruct
from transformers import AutoModelForCausalLM, BitsAndBytesConfig,AutoTokenizer
import torch
# Load PHI 2 model with 4-bit quantization for efficient fine-tuning
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
)
model_name = "meta-llama/Llama-3.2-1B-Instruct"

infer_model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    quantization_config=bnb_config,
    torch_dtype = torch.float32,
    trust_remote_code=True
)


`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [60]:
# load saved model
mlp_inf = MLPProjection(input_dim, output_dim, hidden_dim, depth=2).to(device)  # Customize MLP
inference_base = create_phi2_model_with_lora(mlp_inf, infer_model)


In [61]:
# Reload the saved projection weights. A state dict only holds tensors, so
# weights_only=True is sufficient; it restricts unpickling to plain data and
# avoids the torch.load FutureWarning.
loaded_mlp_weights = torch.load("mlp_projection_weights.pth", weights_only=True)

# Compare against the freshly constructed projection (mlp_inf). At this point
# the saved weights have not been loaded into it yet, so a mismatch is the
# expected outcome.
reference_weights = mlp_inf.state_dict()
is_same = loaded_mlp_weights.keys() == reference_weights.keys() and all(
    torch.equal(tensor, reference_weights[key])
    for key, tensor in loaded_mlp_weights.items()
)

if is_same:
    print("The loaded MLP layer is identical to the original model's MLP layer.")
else:
    print("The loaded MLP layer differs from the original model's MLP layer.")


The loaded MLP layer differs from the original model's MLP layer.


  loaded_mlp_weights = torch.load("mlp_projection_weights.pth")


In [64]:
from peft import PeftModel, PeftConfig

In [69]:
peft_model_id = "Kartheekb7/results1"
loaded_model = PeftModel.from_pretrained(inference_base, peft_model_id,is_trainable=True)


In [71]:
loaded_model.print_trainable_parameters()

trainable params: 1,703,936 || all params: 1,240,401,920 || trainable%: 0.1374


In [73]:
for name, param in loaded_model.named_parameters():
    if 'mlp_projection' in name :
        param.requires_grad = True

loaded_model.print_trainable_parameters()

trainable params: 4,587,520 || all params: 1,240,401,920 || trainable%: 0.3698


In [75]:
generation_config = GenerationConfig(max_new_tokens=128, temperature=0.01, top_p=1)
loaded_model.generation_config = generation_config

outputs = loaded_model.generate(**input_encoding,image_embeddings = image_embedding, max_new_tokens=128, temperature=0.01, top_p=1)
# Decode output to text
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Model response:", response)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Model response: 
The image shows a close-up view of the person eating the food. The image is taken from a close-up angle, so we can see the person eating the food. The image is taken from a close-up angle, which gives us a better idea of the person eating the food. The image is taken from a close-up angle, which allows us to see the person eating the food. The image is taken from a close-up angle, which gives us a better idea of the person eating the food. The image is taken from a close-up angle, which allows us to see the person eating the food. The image is taken from


In [76]:
import torch

# Projection module of the model restored from the Hub adapter.
mlp_layer = loaded_model.base_model.mlp_projection
# Reload the saved projection weights (tensors only, hence weights_only=True).
loaded_mlp_weights = torch.load("mlp_projection_weights.pth", weights_only=True)

# Compare against the projection actually used by loaded_model (the module
# fetched above), checking both the key sets and every tensor.
reference_weights = mlp_layer.state_dict()
is_same = loaded_mlp_weights.keys() == reference_weights.keys() and all(
    torch.equal(tensor, reference_weights[key])
    for key, tensor in loaded_mlp_weights.items()
)

if is_same:
    print("The loaded MLP layer is identical to the original model's MLP layer.")
else:
    print("The loaded MLP layer differs from the original model's MLP layer.")


The loaded MLP layer differs from the original model's MLP layer.


  loaded_mlp_weights = torch.load("mlp_projection_weights.pth")


In [77]:
loaded_model.base_model.model.mlp_projection

MLPProjection(
  (mlp): Sequential(
    (0): Linear(in_features=768, out_features=1024, bias=False)
    (1): GELU(approximate='none')
    (2): Linear(in_features=1024, out_features=2048, bias=False)
  )
)

In [78]:
loaded_model.base_model.model.mlp_projection.load_state_dict(loaded_mlp_weights)
print("Projection layer weights loaded successfully.")

Projection layer weights loaded successfully.


In [79]:
import torch

# Projection module of the model restored from the Hub adapter, after the
# saved projection weights were loaded into it with load_state_dict.
mlp_layer = loaded_model.base_model.mlp_projection
# Reload the saved projection weights (tensors only, hence weights_only=True).
loaded_mlp_weights = torch.load("mlp_projection_weights.pth", weights_only=True)

# Compare against the projection actually used by loaded_model (the module
# fetched above), checking both the key sets and every tensor.
reference_weights = mlp_layer.state_dict()
is_same = loaded_mlp_weights.keys() == reference_weights.keys() and all(
    torch.equal(tensor, reference_weights[key])
    for key, tensor in loaded_mlp_weights.items()
)

if is_same:
    print("The loaded MLP layer is identical to the original model's MLP layer.")
else:
    print("The loaded MLP layer differs from the original model's MLP layer.")


The loaded MLP layer is identical to the original model's MLP layer.


  loaded_mlp_weights = torch.load("mlp_projection_weights.pth")


In [80]:
generation_config = GenerationConfig(max_new_tokens=128, temperature=0.01, top_p=1)
loaded_model.generation_config = generation_config

outputs = loaded_model.generate(**input_encoding,image_embeddings = image_embedding, max_new_tokens=128, temperature=0.01, top_p=1)
# Decode output to text
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Model response:", response)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Model response: 
The image shows a variety of food items, including meat, vegetables, and fruits. The food is contained in a plastic container or a plastic bag, which is open and partially filled. The contents include meat, broccoli, carrots, and other vegetables. There are also fruits, such as apples and oranges, visible in the image. The food is arranged in a colorful and visually appealing way, making it an attractive and appetizing meal. The food is likely to be served together in a meal or a snack. The image also includes a bowl, which could be used to serve the food. The overall presentation suggests a well-organized and


In [None]:
loaded_projection_w