In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!git clone https://github.com/Anil-Banjade/Nepali_Multi_Modal.git


Cloning into 'Nepali_Multi_Modal'...
remote: Enumerating objects: 456, done.[K
remote: Counting objects: 100% (188/188), done.[K
remote: Compressing objects: 100% (116/116), done.[K
remote: Total 456 (delta 108), reused 121 (delta 52), pack-reused 268 (from 1)[K
Receiving objects: 100% (456/456), 2.04 MiB | 4.96 MiB/s, done.
Resolving deltas: 100% (265/265), done.


In [3]:
!ls '/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/trained_models'

contrastive_model.pt  datasets	fused_embeddings_model.pt  prefix_and_word_model.pt


In [4]:
!git fetch origin
!git reset --hard origin/main

fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


In [5]:
%cd Nepali_Multi_Modal

/content/Nepali_Multi_Modal


In [6]:
%%capture
!pip install -r requirements.txt

In [7]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import dataset,DataLoader

from src.multimodal_text_generation.config import config
from src.multimodal_text_generation.models.transformer import Transformer
from src.multimodal_text_generation.data.dataset import CaptionEmbeddingDataset, collate_fn
from src.multimodal_text_generation.utils.inference import run_inference
from src.multimodal_text_generation.trainer import train_model

from src.multimodal_embedding_fusion.models.model import ContrastiveModel
from src.multimodal_embedding_fusion.models.multimodal_fusion import MultiModalFusion

from torchvision import transforms

Using device: cuda


In [8]:

def Pipeline_test(input_image=None, input_text=None):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained('NepBERTa/NepBERTa')

    contrastive_model = ContrastiveModel().to(device)
    contrastive_model.load_state_dict(torch.load('/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/trained_models/contrastive_model.pt'))
    contrastive_model.eval()

    fusion_model = MultiModalFusion().to(device)
    fusion_model.load_state_dict(torch.load('/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/trained_models/fused_embeddings_model.pt'))
    fusion_model.eval()

    transformer_model = Transformer(tokenizer).to(device)
    transformer_model.load_state_dict(torch.load('/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/autoregressive_model.pt'))
    transformer_model.eval()

    with torch.no_grad():
        if input_image is not None:
            image_features=contrastive_model.image_encoder(input_image)
        if input_text is not None:
            text_features=contrastive_model.text_encoder(
                input_ids=input_text['input_ids'],
                attention_mask=input_text['attention_mask']
            )

        if input_image is not None and input_text is not None:
            fused_embedding=fusion_model(image_features,text_features)
        elif input_image is not None:
            fused_embedding=fusion_model.image_projection(image_features)
        elif input_text is not None:
            fused_embedding=fusion_model.text_projection(text_features)
        else:
            raise ValueError('Must provide at least one input.')

        print(f"Pre-padding shape: {fused_embedding.shape}")  # Debug

        # Ensure we have [batch_size, features]
        if len(fused_embedding.shape) == 3:
            fused_embedding = fused_embedding.squeeze(1)  # Remove sequence dimension

        # Handle dimension mismatch
        if fused_embedding.shape[-1] < 768:
            padding = torch.zeros(
                fused_embedding.size(0),  # batch size
                768 - fused_embedding.shape[-1]
            ).to(device)

            fused_embedding = torch.cat([
                fused_embedding,  # [batch, original_dim]
                padding           # [batch, padding_dim]
            ], dim=-1)  # Result: [batch, 768]

        print(f"Post-padding shape: {fused_embedding.shape}")

        input_ids = torch.tensor([tokenizer.cls_token_id]).unsqueeze(0).to(device)

        for _ in range(config.max_seq_len - 1):
            outputs = transformer_model(fused_embedding, input_ids)
            next_token = outputs.argmax(-1)[:, -1].unsqueeze(-1)
            input_ids = torch.cat([input_ids, next_token], dim=-1)

            if next_token.item() == tokenizer.sep_token_id:
                break

        generated_caption = tokenizer.decode(
            input_ids.squeeze().tolist(),
            skip_special_tokens=True
        )



    return generated_caption






In [9]:
from PIL import Image
image_path = "/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/a.jpg"
raw_image = Image.open(image_path).convert("RGB")

In [10]:
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processed_image = image_transform(raw_image).unsqueeze(0).to(device)

tokenizer = AutoTokenizer.from_pretrained('NepBERTa/NepBERTa')
text_input = tokenizer(
    "बिरालो",
    return_tensors='pt',
    padding='max_length',
    max_length=128,
    truncation=True
).to(device)

# Image only
caption = Pipeline_test(input_image=processed_image)
# Text only
caption1 = Pipeline_test(input_text=text_input)
# Multimodal
caption2 = Pipeline_test(input_image=processed_image, input_text=text_input)
print(caption)
print(caption1)
print(caption2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/547k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/534M [00:00<?, ?B/s]

All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
  contrastive_model.load_state_dict(torch.load('/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/trained_models/contrastive_model.pt'))
  fusion_model.load_state_dict(torch.load('/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/trained_models/fused_embeddings_model.pt'))
  transformer_model.load_state_dict(torch.load('/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/autoregressive_model.pt'))


Pre-padding shape: torch.Size([1, 512])
Post-padding shape: torch.Size([1, 768])


All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


Pre-padding shape: torch.Size([1, 512])
Post-padding shape: torch.Size([1, 768])


All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


Pre-padding shape: torch.Size([1, 1, 512])
Post-padding shape: torch.Size([1, 768])
एउटा ककर रखको टपपोमाथि उफरिरहको छ ।
एक कटा र कटी सरयासत भएको समदर तटमा हिडिरहका छन ।
एउटा ककर रखको टपपोमाथि उफरिद


In [11]:
ls '/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/'

a.jpg                    contrastive_model.pt                prefix_and_word.pt
autoregressive_model.pt  fused_embeddings.pt                 testing_pipeline.ipynb
best.pt                  Nepali_MultiModal_Generation.ipynb  [0m[01;34mtrained_models[0m/
b.jpg                    Nepali_MultiModal.ipynb             translated_nepali_captions.txt


In [12]:
from PIL import Image
image_path = "/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/b.jpg"
raw_image = Image.open(image_path).convert("RGB")

image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processed_image = image_transform(raw_image).unsqueeze(0).to(device)

tokenizer = AutoTokenizer.from_pretrained('NepBERTa/NepBERTa')
text_input = tokenizer(
    "बिरालो",
    return_tensors='pt',
    padding='max_length',
    max_length=128,
    truncation=True
).to(device)

p1 = Pipeline_test(input_image=processed_image)
p2 = Pipeline_test(input_text=text_input)
p3 = Pipeline_test(input_image=processed_image, input_text=text_input)

print(p1)
print(p2)
print(p3)

All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.
  contrastive_model.load_state_dict(torch.load('/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/trained_models/contrastive_model.pt'))
  fusion_model.load_state_dict(torch.load('/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/trained_models/fused_embeddings_model.pt'))
  transformer_model.load_state_dict(torch.load('/content/drive/MyDrive/MinorProject_Nepali_MultiModal_LLM/autoregressive_model.pt'))


Pre-padding shape: torch.Size([1, 512])
Post-padding shape: torch.Size([1, 768])


All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


Pre-padding shape: torch.Size([1, 512])
Post-padding shape: torch.Size([1, 768])


All TF 2.0 model weights were used when initializing BertModel.

All the weights of BertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


Pre-padding shape: torch.Size([1, 1, 512])
Post-padding shape: torch.Size([1, 768])
एउटा ककर बलौट समदर तटमा पानीको खोलामाथि हाम फालद छ ।
एक कटा र कटी सरयासत भएको समदर तटमा हिडिरहका छन ।
एउटा ककर रखको टपपोमाथि उफरिद
