# Process visual features to get constant visual features for adaptations

    Make sure that the working directory is the root folder of the repository (the next cell changes into it by moving two levels up).

In [1]:
# Move two levels up so that the relative paths used below (e.g. "adaptations/data/...")
# resolve; presumably the notebook lives two folders below the repository root.
# NOTE(review): this cell is not idempotent - re-running it climbs two more levels.
%cd ../../
import os
os.getcwd()  # shown as the cell output to confirm the resulting working directory

/mnt/c/Users/basem/projects/CSCI 566/Project/adapt-pre-trained-VL-models-to-text


'/mnt/c/Users/basem/projects/CSCI 566/Project/adapt-pre-trained-VL-models-to-text'

Performs all computations necessary to generate the constant visual features used for the following adaptations:
* `avg-visual-features`
* `zero-image-visual-features`
* `zeroed-visual-features`

LXMERT and VisualBERT use the same types of visual features, while CLIP-BERT doesn't. Thus, we generate two separate visual feature versions per adaptation.

In [3]:
import numpy as np
from adaptations.src.utils import load_obj_tsv
import torch
import os
import h5py

## From average over training set

### LXMERT

First, you need to download the image datasets. Do this via the following commands:

**MS COCO**

Train
```bash
wget https://nlp.cs.unc.edu/data/lxmert_data/mscoco_imgfeat/train2014_obj36.zip -P adaptations/data/lxmert/mscoco_imgfeat
unzip adaptations/data/lxmert/mscoco_imgfeat/train2014_obj36.zip -d adaptations/data/lxmert/mscoco_imgfeat && rm adaptations/data/lxmert/mscoco_imgfeat/train2014_obj36.zip
```
* 17 GB zipped
* 31 GB unzipped and downloaded

Validation
```bash
wget https://nlp.cs.unc.edu/data/lxmert_data/mscoco_imgfeat/val2014_obj36.zip -P adaptations/data/lxmert/mscoco_imgfeat
unzip adaptations/data/lxmert/mscoco_imgfeat/val2014_obj36.zip -d adaptations/data/lxmert/mscoco_imgfeat && rm adaptations/data/lxmert/mscoco_imgfeat/val2014_obj36.zip
```
* 8.1 GB zipped
* 15 GB unzipped and downloaded

**Visual Genome**

```bash
wget https://nlp.cs.unc.edu/data/lxmert_data/vg_gqa_imgfeat/vg_gqa_obj36.zip -P adaptations/data/lxmert/vg_gqa_imgfeat
unzip adaptations/data/lxmert/vg_gqa_imgfeat/vg_gqa_obj36.zip -d adaptations/data/lxmert/vg_gqa_imgfeat && rm adaptations/data/lxmert/vg_gqa_imgfeat/vg_gqa_obj36.zip
```

* 30 GB zipped
* 55 GB unzipped and downloaded

In [4]:
import sys
import csv
import base64
import time

import numpy as np
from tqdm import tqdm

# A single "features" field holds a base64-encoded (num_boxes, 2048) float32 array and
# can exceed the csv module's default per-field limit, so raise the limit as far as possible.
# sys.maxsize does not fit into a C long on some platforms (e.g. Windows), where
# csv.field_size_limit raises OverflowError; shrink the value until it is accepted.
_field_size_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_size_limit)
        break
    except OverflowError:
        _field_size_limit //= 10

# Column names, in file order, supplied explicitly to csv.DictReader when reading the TSV files.
FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf",
              "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"]

def get_tsv_data_item(item):
    """Decode one raw TSV row (a dict of strings) into numeric values, in place.

    The integer columns are parsed with ``int`` and every base64-encoded column is
    replaced by a read-only NumPy array with the dtype and shape listed below.
    The same (mutated) dict is returned.
    """
    for int_column in ('img_h', 'img_w', 'num_boxes'):
        item[int_column] = int(item[int_column])

    n_boxes = item['num_boxes']
    # column name -> (target shape, dtype of the encoded buffer)
    array_layout = {
        'objects_id': ((n_boxes, ), np.int64),
        'objects_conf': ((n_boxes, ), np.float32),
        'attrs_id': ((n_boxes, ), np.int64),
        'attrs_conf': ((n_boxes, ), np.float32),
        'boxes': ((n_boxes, 4), np.float32),
        'features': ((n_boxes, -1), np.float32),
    }
    for column, (shape, dtype) in array_layout.items():
        raw_bytes = base64.b64decode(item[column])
        decoded = np.frombuffer(raw_bytes, dtype=dtype).reshape(shape)
        # Guard the decoded data against accidental in-place modification.
        decoded.setflags(write=False)
        item[column] = decoded

    return item

# LXMERT expects normalized boxes (copied from airsplay/lxmert)
def get_normalized_boxes(item):
    """Return a copy of ``item["boxes"]`` scaled by the image size.

    Columns 0 and 2 are divided by ``item["img_w"]`` and columns 1 and 3 by
    ``item["img_h"]``; the array stored in ``item`` is left untouched. An
    AssertionError is raised unless every scaled value lies strictly between
    -1e-5 and 1 + 1e-5.
    """
    height = item['img_h']
    width = item['img_w']
    scaled = item["boxes"].copy()
    scaled[:, [0, 2]] /= width
    scaled[:, [1, 3]] /= height
    # Sanity check: the scaled values must fall into the 0 ~ 1 range (small tolerance).
    np.testing.assert_array_less(scaled, 1+1e-5)
    np.testing.assert_array_less(-scaled, 0+1e-5)

    return scaled
    

def get_avg_visual_properties_from_files(fnames, features_shape=(36, 2048), pos_shape=(36, 4)):
    """Average the features and normalized boxes over all rows of the given TSV files.

    The average is element-wise: every row's ``features`` array is added to an
    accumulator of shape ``features_shape`` and its normalized boxes to one of shape
    ``pos_shape``, so each detection slot gets its own average.

    Args:
        fnames: iterable of paths to tab-separated files with the FIELDNAMES columns.
        features_shape: shape of the feature accumulator and of the returned average.
        pos_shape: shape of the box accumulator and of the returned average.

    Returns:
        Tuple ``(avg_features, avg_boxes)`` of NumPy arrays.

    Raises:
        ValueError: if no rows were read at all (the average would be undefined and
            would otherwise silently come out as NaN).
    """
    feature_sum = np.zeros(features_shape)
    pos_sum = np.zeros(pos_shape)
    num_rows = 0
    for fname in fnames:
        start_time = time.time()
        print("Start to load Faster-RCNN detected objects from %s" % fname)
        # newline="" leaves line-ending handling to the csv module, as its docs recommend.
        with open(fname, newline="") as f:
            reader = csv.DictReader(f, FIELDNAMES, delimiter="\t")
            for row in tqdm(reader):
                row = get_tsv_data_item(row)
                feature_sum += row["features"]
                pos_sum += get_normalized_boxes(row)
                num_rows += 1

        elapsed_time = time.time() - start_time
        print("Loaded file %s in %d seconds." % (fname, elapsed_time))

    if num_rows == 0:
        raise ValueError("No rows were read from the given files; cannot compute an average.")
    return feature_sum/num_rows, pos_sum/num_rows

def get_avg_visual_properties_across_detections_from_files(fnames, features_shape=(1, 2048), pos_shape=(1, 4)):
    """Average the features and normalized boxes over every detection in the given TSV files.

    Unlike the per-slot average, each row's arrays are first summed over their first
    axis (the detections of that row), and the total is divided by the overall number
    of detections, yielding one shared average vector.

    Args:
        fnames: iterable of paths to tab-separated files with the FIELDNAMES columns.
        features_shape: shape of the feature accumulator and of the returned average.
        pos_shape: shape of the box accumulator and of the returned average.

    Returns:
        Tuple ``(avg_features, avg_boxes)`` of NumPy arrays.

    Raises:
        ValueError: if no detections were read at all (the average would be undefined
            and would otherwise silently come out as NaN).
    """
    feature_sum = np.zeros(features_shape)
    pos_sum = np.zeros(pos_shape)
    num_detections = 0
    for fname in fnames:
        start_time = time.time()
        print("Start to load Faster-RCNN detected objects from %s" % fname)
        # newline="" leaves line-ending handling to the csv module, as its docs recommend.
        with open(fname, newline="") as f:
            reader = csv.DictReader(f, FIELDNAMES, delimiter="\t")
            for row in tqdm(reader):
                row = get_tsv_data_item(row)
                feature_sum += np.sum(row["features"], axis=0)
                pos_sum += np.sum(get_normalized_boxes(row), axis=0)
                # Count detections, not rows: the divisor is the number of summed vectors.
                num_detections += row["features"].shape[0]

        elapsed_time = time.time() - start_time
        print("Loaded file %s in %d seconds." % (fname, elapsed_time))

    if num_detections == 0:
        raise ValueError("No detections were read from the given files; cannot compute an average.")
    return feature_sum/num_detections, pos_sum/num_detections

In [5]:
# Locations of the unzipped feature TSV files fetched with the download commands above.
COCO_TRAIN_FEATURES_PATH = "adaptations/data/lxmert/mscoco_imgfeat/train2014_obj36.tsv"
COCO_VAL_FEATURES_PATH = "adaptations/data/lxmert/mscoco_imgfeat/val2014_obj36.tsv"
VG_FEATURES_PATH = "adaptations/data/lxmert/vg_gqa_imgfeat/vg_gqa_obj36.tsv"

# Files are processed in this order when computing the averages.
data_files = [
    COCO_VAL_FEATURES_PATH,
    COCO_TRAIN_FEATURES_PATH,
    VG_FEATURES_PATH,
]

Per detection (a separate average vector for each of the 36 detection slots of an image)

In [6]:
# Create the output folder up front: the pass over the TSV files below is very long,
# and a missing directory would otherwise only surface when saving at the very end.
avg_features_dir = "adaptations/data/avg-visual-features"
os.makedirs(avg_features_dir, exist_ok=True)

avg_feature_vector, avg_pos_vector = get_avg_visual_properties_from_files(data_files, features_shape=(36, 2048), pos_shape=(36, 4))

# torch.Tensor(...) turns the NumPy averages into default-dtype (float32) tensors.
avg_feature_tensor = torch.Tensor(avg_feature_vector)
torch.save(avg_feature_tensor, os.path.join(avg_features_dir, "frcnn_features_per_detection.pt"))

avg_pos_tensor = torch.Tensor(avg_pos_vector)
torch.save(avg_pos_tensor, os.path.join(avg_features_dir, "frcnn_boxes_per_detection.pt"))

Start to load Faster-RCNN detected objects from adaptations/data/lxmert/mscoco_imgfeat/val2014_obj36.tsv


FileNotFoundError: [Errno 2] No such file or directory: 'adaptations/data/lxmert/mscoco_imgfeat/val2014_obj36.tsv'

### CLIP-BERT

In [7]:
# Open the precomputed CLIP features read-only. The handle is intentionally left open:
# `image_features` is a dataset object backed by this file and is read by the cells below.
buffer = h5py.File("models/data/clip-bert/clip_features.hdf5", "r")
image_features = buffer["features"]

OSError: Unable to open file (unable to open file: name = 'models/data/clip-bert/clip_features.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
# Inspect the dataset dimensions; the averaging cell below assumes 512 values per row - TODO confirm.
image_features.shape

In [None]:
num_images = len(image_features)
if num_images == 0:
    # Dividing by zero below would silently store a NaN vector.
    raise ValueError("The CLIP feature dataset is empty; cannot compute an average.")

# Accumulate in float64: adding many vectors into a float32 accumulator loses precision.
# Each row is converted to a tensor explicitly instead of relying on implicit
# tensor/ndarray mixing in the in-place addition.
clip_feature_sum = torch.zeros((512,), dtype=torch.float64)
for feats in image_features:
    clip_feature_sum += torch.from_numpy(np.asarray(feats, dtype=np.float64))

# Store the result as a float32 tensor, matching the dtype of the other saved features.
avg_feature_vector = (clip_feature_sum / num_images).float()
torch.save(avg_feature_vector, "adaptations/data/avg-visual-features/clip_features.pt")

## For black (zeroed) image

### LXMERT and VisualBERT (Faster R-CNN)

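The `output.csv` file is generated from a Faster R-CNN using the code in `adaptations/data/zero-image-visual-features`. Despite its extension, the file is tab-separated and is read with the same column layout as the LXMERT feature files.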

ID translations can be found at:
* https://github.com/peteanderson80/bottom-up-attention/blob/master/data/genome/1600-400-20/objects_vocab.txt
* https://github.com/peteanderson80/bottom-up-attention/blob/master/data/genome/1600-400-20/attributes_vocab.txt

**objects_id**

72: "sky"

956: "background"

**attrs_id**

11: "black"

163: "dark"

The Faster R-CNN results seemingly agree with the results by Iki et al. for their black image (https://github.com/Alab-NII/eval_vl_glue/blob/main/demo/extractor_demo.ipynb).

In [None]:
# The file is expected to hold exactly one row: the detections for the black image.
# newline="" leaves line-ending handling to the csv module, as its docs recommend.
with open("adaptations/data/zero-image-visual-features/output.csv", newline="") as f:
    reader = csv.DictReader(f, FIELDNAMES, delimiter="\t")
    first_row = next(reader, None)
    has_more_rows = next(reader, None) is not None

# Fail loudly instead of leaving `item` undefined (empty file) or ambiguous (several rows).
if first_row is None:
    raise ValueError("output.csv contains no rows; expected exactly one.")
if has_more_rows:
    raise ValueError("output.csv contains more than one row; expected exactly one.")

item = get_tsv_data_item(first_row)

In [None]:
# Display the decoded row (image size, object/attribute ids and confidences, boxes, features).
item

In [None]:
# Shape of the decoded feature array, i.e. (num_boxes, feature width).
item["features"].shape

In [None]:
# Persist the decoded features of the black image as a tensor.
zero_image_features = torch.Tensor(item["features"])
torch.save(zero_image_features, "adaptations/data/zero-image-visual-features/frcnn_features.pt")

In [None]:
# Store the boxes in normalized form (divided by the image width/height).
normalized_boxes = get_normalized_boxes(item)
zero_image_boxes = torch.Tensor(normalized_boxes)
torch.save(zero_image_boxes, "adaptations/data/zero-image-visual-features/frcnn_boxes.pt")

### CLIP

In [None]:
from transformers import CLIPProcessor
from PIL import Image

from models.src.clip_bert.precompute_clip_visual_features import PatchedCLIPFeatureExtractor, VisualOnlyCLIPModel

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
# Replace the stock feature extractor with the project's patched variant.
processor.feature_extractor = PatchedCLIPFeatureExtractor("openai/clip-vit-base-patch32")

# Open the image in a context manager so the file handle is released; convert() loads
# the pixel data and returns a separate RGB image that remains usable afterwards.
with Image.open("adaptations/data/zero-image-visual-features/filled_with_0.png") as image_file:
    pil_image = image_file.convert("RGB")

# Inference only: no gradients are needed for feature extraction.
# NOTE(review): the model is used as constructed; confirm whether it needs .eval().
with torch.no_grad():
    example = processor(images=pil_image, return_tensors="pt")

    model = VisualOnlyCLIPModel().to(device)
    clip_features = model(**example.to(device))
    # Drop the leading batch dimension of the single-image batch.
    clip_features = clip_features.squeeze(0)

In [None]:
# Move the tensor to the CPU before saving: a tensor saved from a CUDA device cannot be
# loaded on a machine without CUDA unless the loader passes map_location.
torch.save(clip_features.cpu(), "adaptations/data/zero-image-visual-features/clip_features.pt")

## Zero vector as visual features filler

### LXMERT and VisualBERT

In [None]:
# All-zero stand-in for the visual features: 36 rows of width 2048.
visual_features = torch.zeros(36, 2048)
torch.save(visual_features, "adaptations/data/zeroed-visual-features/frcnn_features.pt")

# Matching all-zero boxes: 36 rows of 4 coordinates.
visual_boxes = torch.zeros(36, 4)
torch.save(visual_boxes, "adaptations/data/zeroed-visual-features/frcnn_boxes.pt")

### CLIP-BERT

In [None]:
# All-zero stand-in for the CLIP feature vector (512 values).
visual_features = torch.zeros(512)
torch.save(visual_features, "adaptations/data/zeroed-visual-features/clip_features.pt")