In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sample/000000000009.jpg
/kaggle/input/sample/000000000025.jpg


In [8]:
!pip install -U bitsandbytes peft

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes, peft
Successfully installed bitsandbytes-0.44.1 peft-0.13.2


In [9]:
# from huggingface_hub import notebook_login
# notebook_login()

In [10]:
# meta-llama/Llama-3.2-1B-Instruct
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import torch
# Load PHI 2 model with 4-bit quantization for efficient fine-tuning
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16
# )
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
#     quantization_config=bnb_config,
    trust_remote_code=True
)


In [11]:
model

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qw

In [49]:
import torch.nn as nn
import random
class MLPProjection(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim=1024, depth=2):
        super(MLPProjection, self).__init__()
        modules = []
        modules.append(nn.Linear(input_dim, hidden_dim))
        
        for _ in range(1, depth):
            modules.append(nn.GELU())
            modules.append(nn.Linear(hidden_dim, output_dim))
        
        self.mlp = nn.Sequential(*modules)
    
    def forward(self, x):
        return self.mlp(x)

class PHI2WithMLP(nn.Module):
    def __init__(self, phi2_model, mlp_projection):
        super(PHI2WithMLP, self).__init__()
        self.phi2_model = phi2_model
        self.mlp_projection = mlp_projection

    def forward(self, image_embeddings=None, input_ids=None, attention_mask=None, labels=None):
       
        # Get token embeddings from PHI2 model
        token_embeddings = self.phi2_model.get_input_embeddings()(input_ids)
        
        if image_embeddings is not None:
            # Apply MLP to image embeddings to map to text embedding space
            projected_image_embeddings = self.mlp_projection(image_embeddings)

            # Get the sequence length for the image embeddings
            image_embedding_length = projected_image_embeddings.size(1)
            
            batch_size, text_sequence_length = attention_mask.shape
            print(attention_mask.shape)
            print(batch_size, text_sequence_length)
            # Extend attention mask for image embeddings (ones for image embedding positions)
            new_attention_mask = torch.cat(
                [attention_mask, torch.ones((batch_size,image_embedding_length), device=attention_mask.device)], dim=1
            )
            print(new_attention_mask.shape)
            print(projected_image_embeddings.shape,token_embeddings.shape)
            # Combine image and token embeddings
            if random.random() < 0.5:
                combined_embeddings = torch.cat([projected_image_embeddings, token_embeddings], dim=1)  # Concatenating along sequence length
            else:
                combined_embeddings = torch.cat([token_embeddings, projected_image_embeddings], dim=1)
        else:
            # No image embeddings: Use only token embeddings and the original attention mask
            combined_embeddings = token_embeddings
            new_attention_mask = attention_mask

        # Pass the combined embeddings through the PHI2 model with the (updated or original) attention mask
        outputs = self.phi2_model(inputs_embeds=combined_embeddings, attention_mask=new_attention_mask)

        return outputs

def create_phi2_model_with_lora(mlp_projection,lan_model):
    
    for param in mlp_projection.parameters():
        param.requires_grad = True

    # Return PHI2 model with MLP projection
    return PHI2WithMLP(lan_model, mlp_projection)
    
model_embedding_dim = model.config.hidden_size  # This might change based on your model architecture

# Example usage
input_dim = 768  # Input dimension of image embeddings
output_dim = model_embedding_dim  # Target dimension of text embeddings
hidden_dim = 1024  # Hidden layer dimension of the MLP

mlp_projection = MLPProjection(input_dim, output_dim, hidden_dim, depth=2)  # Customize MLP
combined_model = create_phi2_model_with_lora(mlp_projection, model)


In [50]:
from peft import LoraModel, LoraConfig,get_peft_model

# Set up the QLoRA configuration for attention layers in PHI 2
lora_config = LoraConfig(
    r=8,  # Low-rank dimension
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Apply QLoRA only to these layers
    lora_dropout=0.05,
    bias="none"
)

# Wrap PHI 2 with QLoRA
# phi_lora_model = LoraModel(model, lora_config,"default")




In [51]:
phi_lora_model = get_peft_model(combined_model, lora_config)

In [52]:
phi_lora_model

PeftModel(
  (base_model): LoraModel(
    (model): PHI2WithMLP(
      (phi2_model): Qwen2ForCausalLM(
        (model): Qwen2Model(
          (embed_tokens): Embedding(151936, 1536)
          (layers): ModuleList(
            (0-27): 28 x Qwen2DecoderLayer(
              (self_attn): Qwen2SdpaAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1536, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=1536, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): Mod

In [16]:
phi_lora_model.print_trainable_parameters()

trainable params: 2,179,072 || all params: 1,548,255,232 || trainable%: 0.1407


In [17]:
phi_lora_model.print_trainable_parameters()

trainable params: 2,179,072 || all params: 1,548,255,232 || trainable%: 0.1407


trainable params: 2,179,072 || all params: 1,545,893,376 || trainable%: 0.1410

trainable params: 2,179,072 || all params: 1,547,205,888 || trainable%: 0.1408




In [18]:
for name, param in phi_lora_model.named_parameters():
    if 'mlp_projection' in name :
        param.requires_grad = True

In [19]:
phi_lora_model.print_trainable_parameters()

trainable params: 4,540,928 || all params: 1,548,255,232 || trainable%: 0.2933


In [20]:
from transformers import CLIPModel

# Load CLIP and PHI2
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

In [21]:

# Now the model can be trained, and the optimizer only updates LoRA and projection
optimizer = torch.optim.AdamW(
    [p for p in combined_model.parameters() if p.requires_grad], lr=1e-4
)

In [22]:

# # Training loop
# def train_model(combined_model, data_loader, optimizer, num_epochs=1, device="cuda"):
#     combined_model.train()
#     combined_model = combined_model.to(device)
    
#     for epoch in range(num_epochs):
#         total_loss = 0
#         for batch in data_loader:
#             image_embeddings = batch['image_embeddings'].to(device)
#             input_ids = batch['input_ids'].to(device)
#             labels = batch['labels'].to(device)
            
#             # Forward pass
#             optimizer.zero_grad()
#             outputs = combined_model(image_embeddings, input_ids)
            
#             # Assume outputs is a tuple where the first element is logits
#             logits = outputs.logits
            
#             # Flatten the logits and labels for cross-entropy loss
#             logits = logits.view(-1, logits.size(-1))
#             labels = labels.view(-1)
            
#             # Calculate loss (cross-entropy loss for language modeling)
#             loss = F.cross_entropy(logits, labels)
#             total_loss += loss.item()
            
#             # Backward pass and optimization
#             loss.backward()
#             optimizer.step()
        
#         avg_loss = total_loss / len(data_loader)
#         print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}")

# # Usage
# data_loader = get_data_loader(batch_size=16)  # Adjust the batch size as needed
# train_model(combined_model, data_loader, optimizer, num_epochs=1)


In [23]:
# for batch in data_loader:
#     image_embeddings = batch['image_embeddings'].to(device)  # Assuming pre-extracted embeddings
#     input_ids = batch['input_ids'].to(device)  # Tokenized text input
#     labels = batch['labels'].to(device)  # Labels for training
    
#     # Forward pass through the model
#     optimizer.zero_grad()
#     outputs = combined_model(image_embeddings, input_ids)
    
#     # Get logits and calculate loss
#     logits = outputs.logits.view(-1, logits.size(-1))
#     labels = labels.view(-1)
#     loss = F.cross_entropy(logits, labels)
    
#     # Backward pass and optimization
#     loss.backward()
#     optimizer.step()


In [24]:
import torch
from transformers import CLIPProcessor, CLIPModel
from transformers import Trainer, TrainingArguments
from datasets import Dataset
from torch.utils.data import Dataset as TorchDataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from PIL import Image

# Initialize the tokenizer and image model
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
clip_model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

class CustomDataset(TorchDataset):
    def __init__(self, image_paths, text_inputs, text_labels):
        self.image_paths = image_paths
        self.text_inputs = text_inputs
        self.text_labels = text_labels
        self.max_length = 2048

    def __len__(self):
        return len(self.text_labels)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx])
        inputs = clip_processor(images=image, return_tensors="pt")
        image_embedding = clip_model.get_image_features(**inputs)

        # Tokenize text input
        input_encoding = tokenizer(
            self.text_inputs[idx],
            return_tensors='pt',
            padding='max_length',  # Pad to max length
            truncation=True,  # Truncate if needed
            max_length=self.max_length
        )
        
        # Tokenize text label (similar to inputs)
        label_encoding = tokenizer(
            self.text_labels[idx],
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=self.max_length
        )

        # Extract input_ids and attention_mask for both inputs and labels
        input_ids = input_encoding['input_ids'].squeeze(0)
        input_attention_mask = input_encoding['attention_mask'].squeeze(0)
        label_ids = label_encoding['input_ids'].squeeze(0)
        label_attention_mask = label_encoding['attention_mask'].squeeze(0)

        
        # Return the image embeddings, tokenized inputs/labels, and attention masks
        return {
            'image_embeddings': image_embedding,  # Precomputed image embedding
            'input_ids': input_ids,  # Tokenized input
            'attention_mask': input_attention_mask,  # Attention mask for input
            'labels': label_ids,  # Tokenized label
            'label_attention_mask': label_attention_mask  # Attention mask for label (optional)
        }

# Create dataset (you will replace this with actual paths and data)
image_paths = ["/kaggle/input/sample/000000000009.jpg", "/kaggle/input/sample/000000000009.jpg"]
text_inputs = ["What is the capital of France?", "Describe a sunset."]
text_labels = ["Paris", "A beautiful view at dusk."]  # Example text labels

# Instantiate dataset
dataset = CustomDataset(image_paths, text_inputs, text_labels)



tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



In [25]:

import wandb
wandb.init(mode="disabled")

In [190]:
import torch
from transformers import DataCollator

class CustomDataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def __call__(self, batch):
        # Extract image embeddings
        image_embeddings = torch.stack([item['image_embeddings'] for item in batch])
        # Extract input_ids and labels
        input_ids = [item['input_ids'] for item in batch]
        labels = [item['labels'] for item in batch]

        # Pad the input_ids and labels
        padded_input_ids = self.tokenizer.pad({'input_ids': input_ids}, padding=True, return_tensors='pt')['input_ids']
        padded_labels = self.tokenizer.pad({'input_ids': labels}, padding=True, return_tensors='pt')['input_ids']

        # Create attention masks
        input_attention_mask = (padded_input_ids != self.tokenizer.pad_token_id).type(torch.float)
        label_attention_mask = (padded_labels != self.tokenizer.pad_token_id).type(torch.float)
        
        # Prepare collated inputs
        collated_inputs = {
            'input_ids': padded_input_ids,
            'labels': padded_labels,
            'image_embeddings': image_embeddings,
            'attention_mask': input_attention_mask
        }
        
        return collated_inputs


In [None]:

from transformers import TrainingArguments



training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    remove_unused_columns=False,
)
# mlp_projection = MLPProjection(input_dim=512, output_dim=768, hidden_dim=1024, num_layers=2)
# model = PHI2WithMLP(mlp_projection,phi_lora_model = phi_lora_model)
from transformers import DataCollatorWithPadding

# Create a data collator
data_collator = CustomDataCollator(tokenizer=tokenizer)

# Create Trainer
trainer = Trainer(
    model=phi_lora_model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,  # Use the collator
)

# Start training
trainer.train()


In [26]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Load your model
# eval_model = get_peft_model(combined_model, lora_config).to(device)  # Adjust based on your setup
# eval_model.eval()  # Set the model to evaluation mode

# Example input data
image_paths = ["/kaggle/input/sample/000000000009.jpg"]
text_inputs = ["What is the capital of France?"]  # Example text input
text_labels = ["Paris"]  # Example text label (if needed for comparison)

# Prepare inputs
images = []
for path in image_paths:
    image = Image.open(path)
    inputs = clip_processor(images=image, return_tensors="pt")
    image_embedding = clip_model.get_image_features(**inputs)
    images.append(image_embedding)

# Tokenize text input
input_encoding = tokenizer(
    text_inputs[0],
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=2048  # Set this to match your model's input size
)

# Combine inputs for inference
input_ids = input_encoding['input_ids'].squeeze(0).to(device)  # Shape: [seq_len]
attention_mask = input_encoding['attention_mask'].squeeze(0).to(device)  # Shape: [seq_len]
image_embedding = images[0].squeeze(0).to(device)  # Shape: [embedding_dim]


In [27]:
attention_mask.shape

torch.Size([2048])

In [28]:

# Get token embeddings from PHI2 model
token_embeddings = model.get_input_embeddings()(input_ids)



In [29]:
select_feature = 'cls_patch'

In [30]:
def feature_select(image_forward_outs):
    image_features = image_forward_outs.hidden_states[-1]
    if select_feature == 'patch':
        image_features = image_features[:, 1:]  # Skip CLS token if selecting patch
    elif select_feature == 'cls_patch':
        image_features = image_features  # Keep CLS + patch tokens
    else:
        raise ValueError(f'Unexpected select feature: {select_feature}')
    return image_features

In [31]:
image_forward_outs = clip_model.vision_model(**inputs, output_hidden_states=True)
image_features = feature_select(image_forward_outs)


In [34]:
image_features = image_features.squeeze(0).to(device)

In [35]:
mlp_projection = mlp_projection.to(device)
mlp_projection

MLPProjection(
  (mlp): Sequential(
    (0): Linear(in_features=768, out_features=1024, bias=True)
    (1): GELU(approximate='none')
    (2): Linear(in_features=1024, out_features=1536, bias=True)
  )
)

In [36]:
projected_image_embeddings = mlp_projection(image_features)

In [37]:
image_embedding.shape,token_embeddings.shape,projected_image_embeddings.shape,image_features.shape

(torch.Size([512]),
 torch.Size([2048, 1536]),
 torch.Size([50, 1536]),
 torch.Size([50, 768]))

In [38]:
torch.cat([projected_image_embeddings, token_embeddings], dim=0).shape

torch.Size([2098, 1536])

In [53]:
# # Load your model
eval_model = get_peft_model(combined_model, lora_config).to(device)  # Adjust based on your setup
eval_model.eval()

PeftModel(
  (base_model): LoraModel(
    (model): PHI2WithMLP(
      (phi2_model): Qwen2ForCausalLM(
        (model): Qwen2Model(
          (embed_tokens): Embedding(151936, 1536)
          (layers): ModuleList(
            (0-27): 28 x Qwen2DecoderLayer(
              (self_attn): Qwen2SdpaAttention(
                (q_proj): lora.Linear(
                  (base_layer): Linear(in_features=1536, out_features=1536, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1536, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(
                    (default): Linear(in_features=8, out_features=1536, bias=False)
                  )
                  (lora_embedding_A): ParameterDict()
                  (lora_embedding_B): ParameterDict()
                  (lora_magnitude_vector): Mod

In [63]:
# Perform inference
with torch.no_grad():  # Disable gradient calculation
    outputs = eval_model(input_ids=input_ids.unsqueeze(0),  # Add batch dimension
                     attention_mask=attention_mask.unsqueeze(0),  # Add batch dimension
                     image_embeddings=image_features.unsqueeze(0))  # Add batch dimension

# Extract predictions (modify based on your model's output)
predictions = outputs.logits  # Or the appropriate output field

# Process predictions as needed (e.g., applying softmax, argmax)
predicted_labels = torch.argmax(predictions, dim=-1)
print(f"Predicted labels: {predicted_labels}")

torch.Size([1, 2048])
1 2048
torch.Size([1, 2098])
torch.Size([1, 50, 1536]) torch.Size([1, 2048, 1536])
Predicted labels: tensor([[  198, 31914,   198,  ...,   198,   198,   198]])


In [58]:
predicted_token_ids.shape

torch.Size([1, 2098])

In [61]:
predictions.shape

torch.Size([1, 2098])

In [57]:
# Process predictions as needed (e.g., applying softmax, argmax)
predicted_token_ids = torch.argmax(predictions, dim=-1)

# Convert predicted token IDs back to text using the tokenizer
predicted_text = tokenizer.decode(predicted_token_ids[0], skip_special_tokens=True)

print(f"Predicted text: {predicted_text}")

Predicted text:  is the value of the?
 The  is of of.. the city of. is and a of of is is is the is of of is. is the of of of is the the the of of is is is

 of of is is is the is, is is The Paris of of of is is the the of of's? the: of of is is the the of of is is Paris the of of is is is is of of is is is the the the of of is is the capital of the is is is the of of is is
 The of is is is the the of is is is Paris The the,, the is the the of of the is, the??, the is the of of, is is The the?, is is the the of,. is The the the of? is the the the? the the is the the of is?, the
 of? is is?

 the, the is is the, of is is the is the of of of is,
 is of?'s the? of of is is is the the of of, is is is of is, is the the of of is? what is the of,, is,,,, the is is the of of is is is the the,, is is the the of that?? the the of of is?
 the is of? is the the of of is? is the the of of,?,? of,,,
 the of of? is What What the of, the is

 of of, is the the of of? is the the,,, is, the? of,, is is t

In [66]:
outputs['logits'].shape

torch.Size([1, 2098, 151936])

In [164]:
if random.random() < 0.5:
    combined_embeddings = torch.cat([projected_image_embeddings, token_embeddings], dim=0)
else:
    combined_embeddings = torch.cat([token_embeddings, projected_image_embeddings], dim=0)


In [172]:
combined_embeddings.shape

torch.Size([2098, 1536])

In [180]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Load your model
eval_model = get_peft_model(combined_model, lora_config).to(device)  # Adjust based on your setup
eval_model.eval()  # Set the model to evaluation mode

# Example input data
image_paths = ["/kaggle/input/sample/000000000009.jpg"]
text_inputs = ["What is the capital of France?"]  # Example text input
text_labels = ["Paris"]  # Example text label (if needed for comparison)

# Prepare inputs
images = []
for path in image_paths:
    image = Image.open(path)
    inputs = clip_processor(images=image, return_tensors="pt")
    image_forward_outs = clip_model.vision_model(**inputs, output_hidden_states=True)
    image_features = feature_select(image_forward_outs)
    images.append(image_features)

# Tokenize text input
input_encoding = tokenizer(
    text_inputs[0],
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=2048  # Set this to match your model's input size
)

# Combine inputs for inference
input_ids = input_encoding['input_ids'].squeeze(0).to(device)  # Shape: [seq_len]
attention_mask = input_encoding['attention_mask'].squeeze(0).to(device)  # Shape: [seq_len]
image_embedding = images[0].squeeze(0).to(device)  # Shape: [embedding_dim]

# Ensure image_embedding has the right shape for the model
# You may need to reshape or adjust the tensor based on your model's expected input
# image_embedding = image_embedding.view(1, -1)  # Adjust this if needed

# Perform inference
with torch.no_grad():  # Disable gradient calculation
    outputs = eval_model(input_ids=input_ids.unsqueeze(0),  # Add batch dimension
                     attention_mask=attention_mask.unsqueeze(0),  # Add batch dimension
                     image_embeddings=image_embedding.unsqueeze(0))  # Add batch dimension

# Extract predictions (modify based on your model's output)
predictions = outputs.logits  # Or the appropriate output field

# Process predictions as needed (e.g., applying softmax, argmax)
predicted_labels = torch.argmax(predictions, dim=-1)
print(f"Predicted labels: {predicted_labels}")


torch.Size([1, 2048])
1 2048


RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 2048 but got size 1 for tensor number 1 in the list.

In [None]:
!pip install trl

In [40]:
from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
    model=phi_lora_model,
    train_dataset=dataset,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)


NameError: name 'training_arguments' is not defined

In [62]:
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TypeError: Caught TypeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
TypeError: PHI2WithMLP.forward() got an unexpected keyword argument 'attention_mask'


In [201]:
a = torch.randn(7484, 1, 1)

# works as we are expanding singleton dimensions
b = a.expand(-1, 100, 200)
print(b.shape)
# torch.Size([7484, 100, 200])



torch.Size([7484, 100, 200])


In [203]:
# fails
b = a.expand(19, 100, 200)

RuntimeError: The expanded size of the tensor (19) must match the existing size (7484) at non-singleton dimension 0.  Target sizes: [19, 100, 200].  Tensor sizes: [7484, 1, 1]

In [200]:
b.shape

torch.Size([7484, 1, 200])