In [1]:
import pandas as pd
import torch
from transformers import BlipProcessor, BlipModel
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch
import torch.nn as nn
import ast
from scipy.spatial.distance import cosine
import numpy as np
import random
import torchvision.transforms as transforms
from torchvision.transforms import Compose, Resize, Normalize, ToTensor
import torch.optim as optim
from sklearn.metrics import f1_score
from torchvision.transforms.functional import InterpolationMode

In [2]:
# Wipe the working directory before cloning (presumably so that a BLIP checkout
# left over from an earlier run does not make `git clone` fail -- confirm).
# NOTE: this deletes every non-hidden file and folder under /kaggle/working.
!rm -rf /kaggle/working/*
# Clone the BLIP repository into ./BLIP relative to the current directory.
# Later cells reference it as /kaggle/working/BLIP, so this assumes the
# notebook's working directory is /kaggle/working -- TODO confirm.
!git clone https://github.com/salesforce/BLIP

Cloning into 'BLIP'...
remote: Enumerating objects: 277, done.[K
remote: Counting objects: 100% (165/165), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 277 (delta 137), reused 136 (delta 135), pack-reused 112[K
Receiving objects: 100% (277/277), 7.03 MiB | 13.96 MiB/s, done.
Resolving deltas: 100% (152/152), done.


In [3]:
# Pinned install into the kernel's environment (`%pip`, not `!pip`).
# NOTE(review): fairscale is presumably a dependency of the cloned BLIP model
# code rather than of this notebook directly -- confirm before removing.
%pip install fairscale==0.4.4

Collecting fairscale==0.4.4
  Downloading fairscale-0.4.4.tar.gz (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.4/235.4 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25l- \ | done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
Building wheels for collected packages: fairscale
  Building wheel for fairscale (pyproject.toml) ... [?25l- \ done
[?25h  Created wheel for fairscale: filename=fairscale-0.4.4-py3-none-any.whl size=292833 sha256=e51f3682821a1bf1f82b29148058fd92597124308607fd562c3258b842838e54
  Stored in directory: /root/.cache/pip/wheels/08/58/6f/56c57fa8315eb0bcf0287b580c850845be5f116359b809e9f1
Successfully built fairscale
Installing collected packages: fairscale
Successfully installed fairscale-0.4.4


In [4]:
import sys

# Location of the BLIP checkout created by the `git clone` cell.
BLIP_REPO_DIR = '/kaggle/working/BLIP'

# Make the checkout importable. The membership guard keeps this cell
# idempotent: re-running it no longer stacks duplicate entries on sys.path.
if BLIP_REPO_DIR not in sys.path:
    sys.path.append(BLIP_REPO_DIR)

In [5]:
# Prefer the GPU, but fall back to CPU so the notebook still runs (slowly)
# in a session without CUDA instead of crashing at `model.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Imported here rather than in the top import cell: the BLIP package only
# exists after the clone cell has run and the checkout is on sys.path.
from BLIP.models.blip import blip_feature_extractor

# Input resolution the feature extractor is built for; images fed to the
# model must be resized to this same square size.
image_size = 512
model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth'
model = blip_feature_extractor(pretrained=model_url, image_size=image_size, vit='base', med_config='/kaggle/working/BLIP/configs/med_config.json')

# Inference only: switch to eval mode, then move the weights to the device.
model.eval()
model = model.to(device)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

100%|██████████| 1.97G/1.97G [00:11<00:00, 179MB/s]


reshape position embedding from 196 to 1024
load checkpoint from https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base.pth


In [6]:
def load_image(image_path, image_size, device):
    """Read an image file and convert it into a normalized, batched tensor.

    Args:
        image_path: Path of the image file to open.
        image_size: Target side length in pixels; the image is resized to
            ``(image_size, image_size)`` with bicubic interpolation (the
            aspect ratio is not preserved).
        device: Torch device (or device string) the tensor is moved to.

    Returns:
        The transformed RGB image as a tensor with a leading batch axis of
        size 1 (added by ``unsqueeze(0)``), placed on ``device``.
    """
    # `convert` returns an independent in-memory RGB copy, so the underlying
    # file can be closed right away instead of leaking one open handle per
    # call until garbage collection.
    with Image.open(image_path) as source_image:
        raw_image = source_image.convert('RGB')

    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        # Per-channel mean / std. NOTE(review): presumably the statistics the
        # pretrained BLIP weights expect -- confirm against the BLIP repo.
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
        ])
    image = transform(raw_image).unsqueeze(0).to(device)
    return image

In [7]:
# Table of training images; the feature-extraction loop below reads its
# `image_path` column to locate each file.
training_csv_path = '/kaggle/input/new-cds-triplets-images/training_images.csv'
df = pd.read_csv(training_csv_path)

In [8]:
# torch.cuda.empty_cache()

In [9]:
base_path = '/kaggle/input/cds-sofas-chairs-training/training/'
# Object-dtype column so that each cell can hold a plain Python list.
df['blip'] = None

# image_path -> feature list; an image that appears in several rows is
# encoded only once.
cache = {}

total_rows_count = df.shape[0]
print_interval = 100
rows_done = 0
for index, row in df.iterrows():
    image_path = row['image_path']

    if image_path not in cache:
        # Decode and preprocess only on a cache miss. Previously the image
        # was loaded, resized and moved to the device before the cache was
        # consulted, so repeated paths still paid the full preprocessing cost.
        # `image_size` is the resolution the model was constructed with.
        image = load_image(base_path + image_path, image_size, device)
        with torch.no_grad():
            # [0, 0] selects the first batch item's first output vector
            # (presumably the [CLS] token embedding -- confirm against BLIP).
            cache[image_path] = model(image, "", mode='image')[0, 0].tolist()

    b = cache[image_path]
    df.at[index, 'blip'] = b

    rows_done += 1

    if rows_done % print_interval == 0:
        percent_done = (rows_done / total_rows_count) * 100
        print(f'blip features done {percent_done:.2f}% ({rows_done}/{total_rows_count})')

blip features done 5.50% (100/1819)
blip features done 11.00% (200/1819)
blip features done 16.49% (300/1819)
blip features done 21.99% (400/1819)
blip features done 27.49% (500/1819)
blip features done 32.99% (600/1819)
blip features done 38.48% (700/1819)
blip features done 43.98% (800/1819)
blip features done 49.48% (900/1819)
blip features done 54.98% (1000/1819)
blip features done 60.47% (1100/1819)
blip features done 65.97% (1200/1819)
blip features done 71.47% (1300/1819)
blip features done 76.97% (1400/1819)
blip features done 82.46% (1500/1819)
blip features done 87.96% (1600/1819)
blip features done 93.46% (1700/1819)
blip features done 98.96% (1800/1819)


In [10]:
# Persist the table together with the new `blip` column. No `index=` argument
# is passed, so the DataFrame index is written as the first CSV column.
output_csv_path = '/kaggle/working/training_with_blip.csv'
df.to_csv(output_csv_path)