In [None]:
# Copy the dataset archive from Google Drive to the filesystem root ('/').
# NOTE(review): the source path lives under /content/drive, so Drive must already be
# mounted (see the `drive.mount('/content/drive')` cell) before this cell is run.
! cp -r '/content/drive/MyDrive/dataset/merged.zip' '/'

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read by
# the other cells of this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# `torch` is imported here as well so that this cell also works on a fresh kernel,
# where the main import cell further down has not been executed yet.
import torch

# Ask PyTorch to hand cached, currently unused GPU memory back to the driver.
torch.cuda.empty_cache()

In [None]:
# Fetch the current CUDA device through numba and reset it.
# NOTE(review): presumably this tears down the CUDA context of this process, so it
# should only be run when no live GPU tensors are needed afterwards — confirm
# against the numba documentation.
from numba import cuda

device = cuda.get_current_device()
device.reset()

In [None]:
# Print the GPU model, driver / CUDA versions and current GPU memory usage.
!nvidia-smi

Sat Nov 25 10:03:09 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   72C    P8    12W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
! sudo apt-get install unzip
! unzip '/merged.zip' -d '/data'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /data/dataset_image/851580029368647680.jpg  
  inflating: /data/dataset_image/851581265782243328.jpg  
  inflating: /data/dataset_image/851584901593931776.jpg  
  inflating: /data/dataset_image/851589786385223680.jpg  
  inflating: /data/dataset_image/851623486648754176.jpg  
  inflating: /data/dataset_image/851638105492910080.jpg  
  inflating: /data/dataset_image/851668637681422337.jpg  
  inflating: /data/dataset_image/851683467209441282.jpg  
  inflating: /data/dataset_image/851688823708889088.jpg  
  inflating: /data/dataset_image/851704924568268800.jpg  
  inflating: /data/dataset_image/851747075129606145.jpg  
  inflating: /data/dataset_image/851754779046944768.jpg  
  inflating: /data/dataset_image/851756040895049728.jpg  
  inflating: /data/dataset_image/851774723772080128.jpg  
  inflating: /data/dataset_image/851781920442781698.jpg  
  inflating: /data/dataset_image/851797551254601728.jpg  
  infla

In [None]:
!pip install transformers



In [None]:
import torch
import numpy as np
from torch.utils.data import Dataset

In [None]:
from PIL import Image
import json
import os
from transformers import CLIPProcessor, CLIPModel
import torchvision.transforms as transforms
import pandas as pd

In [None]:
class dataset(Dataset):
    """Text / image / label samples read from a data directory.

    Directory layout used by this class (all paths relative to ``data_dir``):

    * ``dataset_image/<image_id>.jpg``                  - the images
    * ``text_json_final/{train,valid,test}.json``       - list of records with the
      keys ``image_id``, ``text`` and ``label``
    * ``Multilingual/gold_multi_ling_{train,valid,test}.csv`` - extra rows that are
      appended when ``language == 'multi'``

    Args:
        mode: ``'train'`` or ``'val'``; any other value selects the test split.
        data_dir: root directory of the dataset.
        language: ``'multi'`` additionally loads the multilingual CSV of the split.
    """

    # split name -> (json file, multilingual csv file); unknown names use the test files
    _SPLIT_FILES = {
        'train': ('train.json', 'gold_multi_ling_train.csv'),
        'val': ('valid.json', 'gold_multi_ling_valid.csv'),
    }
    _DEFAULT_SPLIT_FILES = ('test.json', 'gold_multi_ling_test.csv')

    def __init__(self, mode = 'train',data_dir = 'Data',language='en'):
        self.image_dir = os.path.join(data_dir,'dataset_image')

        json_name, csv_name = self._SPLIT_FILES.get(mode, self._DEFAULT_SPLIT_FILES)
        self.text_dir = os.path.join(data_dir,'text_json_final',json_name)
        if language =='multi':
            self.text_dir_multi = os.path.join(data_dir,'Multilingual',csv_name)

        # loading the text data
        with open(self.text_dir,'r',encoding='utf-8') as f:
            records = json.load(f)
        self.data = [
            {
                'image_id' : str(record['image_id']),
                'text' : record['text'],
                'label': record['label']
            }
            for record in records
        ]

        if language =='multi':
            df = pd.read_csv(self.text_dir_multi)
            for _, row in df.iterrows():
                self.data.append(
                    {
                        # NOTE(review): the column really is read as 'iamge_id';
                        # presumably this matches the (misspelled) CSV header — verify
                        # against the file before "fixing" the key.
                        'image_id' : str(row['iamge_id']),
                        'text' : row['text'],
                        'label': row['label']
                    }
                )

    def __getitem__(self, index):
        """Return ``(text, image, label)`` for the sample at ``index``.

        The image is fully decoded and converted to RGB inside a ``with`` block so
        that the underlying file handle is closed before returning; a bare
        ``Image.open`` keeps the file open until the pixel data is read.
        """
        record = self.data[index]
        image_path = os.path.join(self.image_dir,record['image_id']+'.jpg')
        with Image.open(image_path) as img:
            image = img.convert('RGB')
        return record['text'],image,record['label']

    def __len__(self):
        """Number of samples (JSON records plus, if loaded, multilingual CSV rows)."""
        return len(self.data)

    @staticmethod
    def collate_func(batch_data):
        """Turn a list of ``(text, image, label)`` samples into three parallel lists.

        Declared as a static method so that it can be passed to ``DataLoader`` both as
        ``dataset.collate_func`` and through an instance.

        Returns:
            ``(text_list, image_list, label_list)``; an empty batch returns ``{}``.
        """
        if len(batch_data) == 0:
            return {}

        text_list = [instance[0] for instance in batch_data]
        image_list = [instance[1] for instance in batch_data]
        label_list = [instance[2] for instance in batch_data]
        return text_list, image_list, label_list

In [None]:
# Hyper-parameter collection for the MV_CLIP experiment.
# NOTE(review): none of the cells visible in this notebook read this dict; the model
# and the training loop carry their own literal values — confirm before relying on it.
config = dict(
    # run / model selection
    device='1',
    model='MV_CLIP',
    text_name='text_json_final',
    simple_linear=False,
    # schedule and batch sizes
    num_train_epochs=10,
    train_batch_size=32,
    dev_batch_size=32,
    # model dimensions
    label_number=2,
    text_size=512,
    image_size=768,
    # optimisation
    adam_epsilon=1e-8,
    optimizer_name='adam',
    learning_rate=5e-4,
    clip_learning_rate=1e-6,
    max_len=77,
    layers=3,
    max_grad_norm=5.0,
    weight_decay=0.05,
    warmup_proportion=0.2,
    dropout_rate=0.1,
    # bookkeeping
    output_dir='../output_dir/',
    limit=None,
    seed=42,
)





Use Multilingual CLIP (M-CLIP) as the text encoder instead of the original CLIP text encoder; the image encoder is still CLIP.

In [None]:
!pip install multilingual-clip torch

Collecting multilingual-clip
  Downloading multilingual_clip-1.0.10-py3-none-any.whl (20 kB)
Installing collected packages: multilingual-clip
Successfully installed multilingual-clip-1.0.10


In [None]:
from transformers import CLIPProcessor, CLIPModel, CLIPVisionModel, CLIPImageProcessor,CLIPTextModel
import torch.nn as nn
from transformers.models.bert.modeling_bert import BertLayer,BertConfig
import copy
from multilingual_clip import pt_multilingual_clip
import transformers

class TransformerEncoder(nn.Module):
    """A plain stack of ``nn.TransformerEncoderLayer`` modules applied one after another.

    Args:
        num_layers: how many encoder layers to stack.
        hid_size: ``d_model`` of every layer.
        n_head: number of attention heads of every layer.
    """

    def __init__(self,num_layers,hid_size,n_head):
        super().__init__()
        stack = []
        for _ in range(num_layers):
            stack.append(nn.TransformerEncoderLayer(d_model=hid_size,nhead=n_head))
        self.encoder = nn.ModuleList(stack)

    def forward(self,src,att_mask):
        """Feed ``src`` through every layer, passing ``att_mask`` as ``src_mask`` each time."""
        hidden = src
        for block in self.encoder:
            hidden = block(src=hidden,src_mask=att_mask)
        return hidden

class MultimodalEncoder(nn.Module):
    """Stack of identical ``BertLayer`` modules run over a fused token sequence.

    One ``BertLayer`` is built from ``config`` and deep-copied ``layer_number`` times,
    so all layers start from the same initial weights.
    """

    def __init__(self, config, layer_number):
        super().__init__()
        prototype = BertLayer(config)
        clones = [copy.deepcopy(prototype) for _ in range(layer_number)]
        self.layer = nn.ModuleList(clones)

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
        """Run all layers in order.

        Returns:
            ``(states, attentions)`` where ``attentions`` holds one entry per layer and
            ``states`` holds the output of every layer, or only the final hidden
            states when ``output_all_encoded_layers`` is false.
        """
        states_per_layer = []
        attentions_per_layer = []
        for block in self.layer:
            hidden_states, attn = block(hidden_states, attention_mask, output_attentions=True)
            attentions_per_layer.append(attn)
            states_per_layer.append(hidden_states)
        if not output_all_encoded_layers:
            # keep only the last hidden states (the untouched input if there are no layers)
            states_per_layer = [hidden_states]
        return states_per_layer, attentions_per_layer

class MV_CLIP(nn.Module):
    """Image/text classifier with two unimodal heads and one fused head.

    Images are encoded with the CLIP vision model ("openai/clip-vit-base-patch32"),
    text with the transformer of the Multilingual-CLIP model
    ('M-CLIP/XLM-Roberta-Large-Vit-B-32'). The two token sequences are projected to a
    common width of 512, concatenated (image tokens first) and passed through a
    3-layer ``MultimodalEncoder``. A learned scalar score (``keyless_att``) weights the
    text and image summary vectors of the fused sequence before classification.

    Args:
        device: device the tokenised text and the pre-processed images are moved to in
            ``forward``. The module itself has to be moved there by the caller
            (``MV_CLIP(device).to(device)``).
    """

    def __init__(self,device):
        super(MV_CLIP, self).__init__()
        self.image_size= 768
        self.text_size = 1024
        self.label_number = 2

        # Pre-processors and pretrained towers.
        self.processor_vision = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_model_vision = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        self.processor_text = transformers.AutoTokenizer.from_pretrained('M-CLIP/XLM-Roberta-Large-Vit-B-32')
        self.clip_model_text = pt_multilingual_clip.MultilingualCLIP.from_pretrained('M-CLIP/XLM-Roberta-Large-Vit-B-32')
        # Only the image projection layer of the full CLIP model is kept.
        self.visual_projection = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").visual_projection

        # Per-modality adapters applied to the token sequences before projection.
        self.text_linear = nn.Sequential(
                                        nn.Linear(self.text_size,self.text_size),
                                        nn.Dropout(0.1),
                                        nn.GELU()
                                        )
        self.image_linear = nn.Sequential(
                                        nn.Linear(self.image_size,self.image_size),
                                        nn.Dropout(0.1),
                                        nn.GELU()
                                        )

        # BERT layer configuration for the fusion encoder, narrowed to the shared width.
        self.config = BertConfig.from_pretrained("bert-base-uncased")
        self.config.hidden_size = 512
        self.config.num_attention_heads = 8

        self.transformerEncoder = MultimodalEncoder(self.config, layer_number=3)

        # One scalar score per summary vector; shared between text and image.
        self.keyless_att = nn.Linear(512,1,bias= False)

        self.classifier_fuse = nn.Linear(512 , self.label_number)
        self.classifier_text = nn.Linear(self.text_size, self.label_number)
        self.classifier_image = nn.Linear(self.image_size, self.label_number)

        self.device = device

    def forward(self,text,images):
        """Classify a batch of text/image pairs.

        Args:
            text: list of strings, one per sample.
            images: list of images, one per sample, in a form accepted by the CLIP
                image processor.

        Returns:
            ``(y_o, y_text, y_image, y_fuse)``:

            * ``y_o`` - sum of the softmax distributions of the three heads (every row
              sums to 3); take its argmax for the prediction.
            * ``y_text``, ``y_image``, ``y_fuse`` - raw, un-normalised logits of the
              text, image and fused heads. They are returned as logits because
              ``nn.CrossEntropyLoss`` applies log-softmax itself; handing it
              probabilities would apply softmax twice and flatten the gradients.
        """
        inputs_text = self.processor_text(
            text = text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=77,
        ).to(self.device)
        inputs_image = self.processor_vision(images=images,return_tensors ='pt').to(self.device)

        # The text tower is a registered sub-module, so it already lives on the device
        # the whole model was moved to; no per-call, hard-coded `.to('cuda')` is needed.
        image_outputs = self.clip_model_vision(**inputs_image)
        text_outputs = self.clip_model_text.transformer(**inputs_text)

        text_feat = text_outputs['last_hidden_state']
        vision_feat = image_outputs['last_hidden_state']
        text_cls = text_outputs['pooler_output']
        vision_cls = image_outputs['pooler_output']

        # Unimodal heads work on the pooled outputs of the two towers.
        logits_image = self.classifier_image(vision_cls)
        logits_text = self.classifier_text(text_cls)

        # Token sequences: adapter, then projection to the shared width.
        text_embed = self.clip_model_text.LinearTransformation(self.text_linear(text_feat))
        vision_embed = self.visual_projection(self.image_linear(vision_feat))

        # Fused sequence layout: [image tokens | text tokens].
        fused_feat = torch.cat([vision_embed,text_embed],dim=1)
        num_image_tokens = vision_embed.shape[1]

        # Image tokens are never padded; text tokens follow the tokenizer's mask.
        text_mask = inputs_text['attention_mask']
        image_mask = text_mask.new_ones(text_mask.shape[0], num_image_tokens)
        attention_mask = torch.cat((image_mask, text_mask), dim=-1)
        # Additive mask broadcast over heads and query positions: 0 keeps, -10000 hides.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        hid_layers,_ = self.transformerEncoder(fused_feat, extended_attention_mask, output_all_encoded_layers=False)
        decoded_feat = hid_layers[-1]

        # Summary vectors: first image token and first text token of the fused sequence.
        # The text position is offset by the number of image tokens. Picking it with
        # argmax(input_ids) only finds the end-of-text token for CLIP's own vocabulary;
        # XLM-R style tokenizers put the sequence-level `<s>` token first, which is
        # also the token the text tower's pooler output is presumably built from.
        decoded_image_cls = decoded_feat[:,0,:]
        decoded_text_cls = decoded_feat[:,num_image_tokens,:]

        # Softmax over the two scalar scores gives the text / image mixing weights.
        pt = self.keyless_att(decoded_text_cls)
        pv = self.keyless_att(decoded_image_cls)
        att_score = nn.functional.softmax(torch.cat((pt, pv), dim=-1),dim=-1)
        weight_text, weight_image = att_score[:, :1], att_score[:, 1:]
        fuse_feat = weight_text*decoded_text_cls + weight_image*decoded_image_cls
        logits_fuse = self.classifier_fuse(fuse_feat)

        # Ensemble score: each head contributes exactly one softmax distribution.
        y_o = (
            nn.functional.softmax(logits_text,dim=-1)
            + nn.functional.softmax(logits_image,dim=-1)
            + nn.functional.softmax(logits_fuse,dim=-1)
        )

        return y_o,logits_text,logits_image,logits_fuse

Training loop

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import f1_score
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

In [None]:

def _collect_predictions(model, dataloader):
    """Run ``model`` over ``dataloader`` without gradients.

    The model is switched to eval mode for the pass and put back into whatever mode it
    was in before, even if the pass raises.

    Returns:
        ``(predicted_labels, true_labels)`` as 1-D numpy arrays.
    """
    was_training = model.training
    model.eval()
    scores, targets = [], []
    try:
        with torch.no_grad():
            for texts, images, labels in tqdm(dataloader):
                outputs, _, _, _ = model(texts, images)
                scores.append(outputs.detach().cpu())
                targets.append(torch.tensor(labels))
    finally:
        model.train(was_training)
    predicted = np.argmax(torch.cat(scores, dim=0).numpy(), axis=1)
    return predicted, torch.cat(targets).numpy()


def train(epochs=10, train_batch_size=4, valid_batch_size=32, data_dir='/data', device='cuda'):
    """Fine-tune ``MV_CLIP`` on the multilingual train split and validate every epoch.

    The defaults reproduce the values that used to be hard-coded in this function.

    Args:
        epochs: number of passes over the training data.
        train_batch_size: batch size of the (shuffled) training loader.
        valid_batch_size: batch size of the validation loader.
        data_dir: dataset root passed to ``dataset``.
        device: device the model and the label tensors are placed on.

    Side effects:
        Prints accuracy / F1 per epoch and writes ``best_val_acc_model_mul.pt`` and
        ``best_val_acc_opt_mul.pt`` whenever the validation accuracy is at least as
        good as the best one seen so far.
    """
    train_dataset = dataset(mode='train', data_dir=data_dir, language='multi')
    valid_dataset = dataset(mode='val', data_dir=data_dir, language='multi')

    train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True,
                                  collate_fn=dataset.collate_func)
    valid_dataloader = DataLoader(valid_dataset, batch_size=valid_batch_size, shuffle=False,
                                  collate_fn=dataset.collate_func)

    model = MV_CLIP(device).to(device=device)
    loss_fn = nn.CrossEntropyLoss()
    best_val_acc = -1

    # The two pretrained towers get a much smaller learning rate than the new layers.
    clip_param_ids = {id(p) for p in model.clip_model_text.parameters()}
    clip_param_ids.update(id(p) for p in model.clip_model_vision.parameters())
    base_params = [p for p in model.parameters() if id(p) not in clip_param_ids]
    optimizer = torch.optim.AdamW(
        [
            {"params": base_params},
            {"params": model.clip_model_text.parameters(), "lr": 1e-6},
            {"params": model.clip_model_vision.parameters(), "lr": 1e-6},
        ],
        lr=5e-4,
        weight_decay=0.05,
    )
    # Linear warm-up over the first 20% of all optimisation steps, then linear decay.
    total_steps = int(len(train_dataloader) * epochs)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.2 * total_steps),
                                                num_training_steps=total_steps)

    for epoch in range(epochs):
        # Collected per batch and concatenated once per epoch (no quadratic re-copying).
        train_scores, train_targets = [], []
        for texts, images, labels in tqdm(train_dataloader):
            optimizer.zero_grad()
            outputs, y_t, y_i, y_fuse = model(texts, images)
            labels = torch.tensor(labels).to(device)

            # One loss term per head; the ensemble output is only used for metrics.
            loss = loss_fn(y_t, labels) + loss_fn(y_i, labels) + loss_fn(y_fuse, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            scheduler.step()

            train_scores.append(outputs.detach().cpu())
            train_targets.append(labels.detach().cpu())

        train_pred = np.argmax(torch.cat(train_scores, dim=0).numpy(), axis=-1)
        train_true = torch.cat(train_targets).numpy()
        train_acc = (train_pred == train_true).sum() / train_true.shape[0]
        # Same averaging for train and validation so the two printed F1 values are comparable.
        train_f1 = f1_score(train_true, train_pred, labels=[0, 1], average='macro')

        val_pred, val_true = _collect_predictions(model, valid_dataloader)
        val_acc = (val_pred == val_true).sum() / val_true.shape[0]
        val_f1 = f1_score(val_true, val_pred, labels=[0, 1], average='macro')

        print('epoch: ',epoch)
        print('training accuracy:',train_acc,'val_acc:',val_acc)
        print('training f1:',train_f1,'val_f1:',val_f1)
        if val_acc >= best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(),'best_val_acc_model_mul.pt')
            torch.save(optimizer.state_dict(),'best_val_acc_opt_mul.pt')

train()


KeyboardInterrupt: ignored

Testing loop

In [None]:
# Evaluate the best checkpoint written by the training loop on the multilingual test split.
test_dataset = dataset(mode='test',data_dir='/data',language = 'multi')
test_dataloader = DataLoader(test_dataset,batch_size = 1, shuffle = False,collate_fn=dataset.collate_func)
device = 'cuda'
model = MV_CLIP(device).to(device=device)
# map_location places the stored tensors directly on the evaluation device, whatever
# device they were saved from.
model.load_state_dict(torch.load('best_val_acc_model_mul.pt', map_location=device))
texts_lst = []
images_lst =[]

# Per-batch results are collected in lists and concatenated once at the end instead
# of re-copying a growing tensor for every batch.
test_scores = []
test_targets = []
with torch.no_grad():
    model.eval()
    for texts,images,labels in tqdm(test_dataloader):
        outputs,_,_,_ = model(texts,images)
        test_scores.append(outputs.detach().cpu())
        test_targets.append(torch.tensor(labels))
    # put the module back into training mode after the evaluation pass
    model.train()

predict_lst = np.argmax(torch.cat(test_scores, dim=0).numpy(),axis=1)
label_lst = torch.cat(test_targets).numpy()

test_acc =  (predict_lst == label_lst).sum() / label_lst.shape[0]
# f1_score is called with its default averaging here (not the macro average).
test_f1 = f1_score(label_lst, predict_lst)

print('test accuracy:',test_acc)
print('test f1:',test_f1)

100%|██████████| 7358/7358 [10:26<00:00, 11.75it/s]

test accuracy: 0.5665941832019571
test f1: 0.5031936438697616



