In [None]:
# Mount Google Drive at /content/drive; the dataset paths used further down
# in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
%%capture
!pip install wandb ml_collections transformers
!wandb login 8c920c1bbdcc7ee5353aaa8becd54a4942f0fc06

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import os
import numpy as np
import json
import tqdm
from sklearn.model_selection import train_test_split
import random
import wandb
from ml_collections import ConfigDict
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [None]:
# Experiment settings, collected in one ConfigDict.
# Of these, only `test_split` is read by the code visible in this notebook
# (in get_train_val_splits); the rest are kept as configured.
cfg = ConfigDict(
    {
        "test_split": 0.2,
        "batch_size": 256,
        "task_0_lr": 1e-4,
        "task_1_lr": 5e-6,
        "task_2_lr": 5e-6,
        "residual_mlp_dropout_rate": 0.2,
        "task_0_weight_decay": 1e-5,
        "task_1_weight_decay": 2e-5,
        "task_2_weight_decay": 4e-5,
        "samples_per_class": 25,
        "hidden_dims": 1024,
        "op": "mul",
        "epochs": 100,
        "task_order": "count yes_no condition",  # space separated tasknames
    }
)

# Derived setting: half of `batch_size` (integer division).
cfg.half_batch_size = cfg.batch_size // 2

In [None]:
# Canonical question strings. Every phrasing found in the question file is
# mapped onto exactly one of these by DEDUP_Q_DICT.
_Q_OVERALL = "What is the overall condition of the given image?"
_Q_COUNT_NON_FLOODED = "How many non flooded buildings can be seen in this image?"
_Q_COUNT_ALL = "How many buildings can be seen in the image?"
_Q_ROAD_NON_FLOODED = "Is the entire road non flooded?"
_Q_ROAD_CONDITION = "What is the condition of the road in this image?"
_Q_ROAD_FLOODED = "Is the entire road flooded?"
_Q_COUNT_FLOODED = "How many buildings are flooded in this image?"

# Raw question text -> canonical question text.
DEDUP_Q_DICT = {
    _Q_OVERALL: _Q_OVERALL,
    _Q_COUNT_NON_FLOODED: _Q_COUNT_NON_FLOODED,
    _Q_COUNT_ALL: _Q_COUNT_ALL,
    "How many buildings can be seen in this image?": _Q_COUNT_ALL,
    _Q_ROAD_NON_FLOODED: _Q_ROAD_NON_FLOODED,
    _Q_ROAD_CONDITION: _Q_ROAD_CONDITION,
    "How many buildings are non flooded?": _Q_COUNT_NON_FLOODED,
    _Q_ROAD_FLOODED: _Q_ROAD_FLOODED,
    "How many buildings are in this image?": _Q_COUNT_ALL,
    "What is the condition of road?": _Q_ROAD_CONDITION,
    "How many buildings are non flooded in this image?": _Q_COUNT_NON_FLOODED,
    "How many buildings are in the image?": _Q_COUNT_ALL,
    _Q_COUNT_FLOODED: _Q_COUNT_FLOODED,
    "How many buildings are flooded?": _Q_COUNT_FLOODED,
    "How many flooded buildings can be seen in this image?": _Q_COUNT_FLOODED,
}

# Canonical question -> answer sentence template; "[label]" is the slot that
# gets replaced by each candidate answer word.
Q_TO_PROMPT_DICT = {
    _Q_OVERALL: "The overall condition of the given image is [label].",
    _Q_COUNT_NON_FLOODED: "[label] non flooded buildings can be seen in this image.",
    _Q_COUNT_ALL: "[label] buildings can be seen in the image.",
    _Q_ROAD_NON_FLOODED: "[label], the entire road is non flooded. ",
    _Q_ROAD_CONDITION: "The condition of the road is [label].",
    _Q_ROAD_FLOODED: "[label], the entire road is flooded.",
    _Q_COUNT_FLOODED: "[label] buildings are flooded in this image.",
}

# Canonical question -> candidate answer words, in label-index order.
# Counting questions accept the strings "0" .. "50".
# NOTE(review): the combined label is spelled "flooded, non flooded" for the
# overall-condition question but "flooded,non flooded" (no space) for the road
# question -- confirm both spellings against the ground-truth file.
Q_TO_POSSIBLE_ANSWERS_DICT = {
    _Q_OVERALL: ["flooded", "non flooded", "flooded, non flooded"],
    _Q_COUNT_NON_FLOODED: [str(count) for count in range(51)],
    _Q_COUNT_ALL: [str(count) for count in range(51)],
    _Q_ROAD_NON_FLOODED: ["Yes", "No"],
    _Q_ROAD_CONDITION: ["flooded", "non flooded", "flooded,non flooded"],
    _Q_ROAD_FLOODED: ["Yes", "No"],
    _Q_COUNT_FLOODED: [str(count) for count in range(51)],
}


In [None]:


def set_seed(seed):
    """Seed Python's `random`, NumPy's global RNG and PyTorch with `seed`."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # Additionally seed all CUDA devices when a GPU is available.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# Fix the seed once for the whole notebook.
set_seed(42)

def accuracy(pred, true):
    """
    Percentage of rows in `pred` whose arg-max over the last axis equals `true`.

    NOTE(review): a second `accuracy` with the same signature is defined at the
    end of this cell and replaces this one at runtime; only one should be kept.
    """
    hits = (true == pred.argmax(-1)).astype(np.float32)
    return float(100 * np.sum(hits) / len(true))

def unbatch(half_batch):
    """
    Split a batch into a list of per-example triples.

    Args:
        half_batch: indexable holding three aligned sequences, e.g.
            [torch.Tensor, torch.Tensor, torch.Tensor]. The number of
            examples is taken from len(half_batch[0]).

    Returns:
        A list with one three-element list per example, where entry i is
        [half_batch[0][i], half_batch[1][i], half_batch[2][i]].
    """
    return [
        [half_batch[field][row] for field in range(3)]
        for row in range(len(half_batch[0]))
    ]


def batch(list_of_examples):
    """
    Merge a list of three-element examples into one batch.

    Args:
        list_of_examples: iterable of examples, each indexable as
            [torch.Tensor, torch.Tensor, torch.Tensor].

    Returns:
        A 3-tuple of tensors; element k is torch.concat over the k-th entry
        of every example, in input order.

    NOTE(review): torch.concat joins along an existing dim 0, so each entry
    needs a leading dimension already. Entries produced by unbatch() have had
    that dimension indexed away -- confirm whether torch.stack was intended.
    """
    columns = ([], [], [])
    for example in list_of_examples:
        for field, column in enumerate(columns):
            column.append(example[field])

    return tuple(torch.concat(column) for column in columns)

def hello():
    """Print the string "hi" (smoke-test helper; not called in this notebook)."""
    greeting = "hi"
    print(greeting)

def combine_batch_and_list(half_batch, list_of_examples):
    """
    Append un-batched examples to the end of an existing batch, in place.

    Args:
        half_batch: mutable sequence of three tensors (item assignment is
            used, so a list works and a tuple does not).
        list_of_examples: iterable of examples, each indexable as three
            tensors without the leading batch dimension; every entry is
            unsqueezed at dim 0 before being appended.

    Returns:
        The same `half_batch` object, whose three tensors now end with the
        given examples in input order. Unchanged when there are no examples.
    """
    examples = list(list_of_examples)
    if not examples:
        return half_batch

    # One concat per field instead of one per example: re-concatenating the
    # growing batch for every example copies the data quadratically.
    for field in range(3):
        new_rows = [example[field].unsqueeze(0) for example in examples]
        half_batch[field] = torch.concat([half_batch[field], *new_rows], dim=0)
    return half_batch


def get_train_val_splits(jsondict):
    """
    Randomly split the entries of `jsondict` into train and validation dicts.

    The keys are split with train_test_split using cfg.test_split as the
    validation fraction, and the two split sizes are printed.

    Returns:
        (train_dict, val_dict), each mapping str(key) -> jsondict[str(key)].
    """
    train_keys, val_keys = train_test_split(list(jsondict.keys()), test_size=cfg.test_split)

    print(len(train_keys), len(val_keys))

    train_dict = {str(key): jsondict[str(key)] for key in train_keys}
    val_dict = {str(key): jsondict[str(key)] for key in val_keys}
    return train_dict, val_dict

def get_uniq_image_ids(jsondict):
    """
    Return the distinct "Image_ID" values of `jsondict`, in first-seen order.

    Args:
        jsondict: mapping whose values are dicts with an "Image_ID" entry.
            The IDs must be hashable (the example record quoted in
            ZeroShotVQADataset.__getitem__ uses a file-name string).

    Returns:
        list of unique image ids, ordered by first appearance.
    """
    # dict.fromkeys de-duplicates while keeping insertion order, and avoids
    # the quadratic `x not in list` scan.
    return list(dict.fromkeys(example["Image_ID"] for example in jsondict.values()))

def get_questions_for_imageid(jsondict, imageid):
    """
    Select every entry of `jsondict` whose "Image_ID" equals `imageid`.

    Returns:
        dict with the matching (key, example) pairs, in their original order.
    """
    return {
        key: example
        for key, example in jsondict.items()
        if example["Image_ID"] == imageid
    }

def get_train_val_splits_imagewise(jsondict):
    """
    Split questions into train/validation so that no image appears in both.

    The distinct image ids (in first-seen order) are split with
    train_test_split using cfg.test_split as the validation fraction, the same
    setting get_train_val_splits uses. Every question then follows its image.

    Args:
        jsondict: mapping key -> example dict with an "Image_ID" entry.

    Returns:
        (train_dict, val_dict): both map the original keys to their examples.
    """
    # Group the questions by image in a single pass instead of rescanning the
    # whole dict once per image.
    questions_by_image = dict()
    for key, example in jsondict.items():
        questions_by_image.setdefault(example["Image_ID"], dict())[key] = example

    # Dict keys keep first-seen order, so this is the same list the
    # per-image scan used to produce.
    uniq_images = list(questions_by_image)

    train, val = train_test_split(uniq_images, test_size=cfg.test_split)

    train_dict = dict()
    for imageid in train:
        train_dict.update(questions_by_image[imageid])

    val_dict = dict()
    for imageid in val:
        val_dict.update(questions_by_image[imageid])

    return train_dict, val_dict

def _split_by_question_type(examples):
    """Bucket examples into [image_condition, road_condition, yes_no] dicts."""
    image_condition = dict()
    road_condition = dict()
    yes_no = dict()

    # One running counter is shared by the three buckets, so keys are unique
    # across them. Examples matching no rule are skipped without consuming
    # a counter value.
    counter = 0
    for example in examples.values():
        if example["Question_Type"] == "Yes_No":
            bucket = yes_no
        elif "road" in example["Question"]:
            bucket = road_condition
        elif "overall" in example["Question"]:
            bucket = image_condition
        else:
            continue
        bucket[str(counter)] = example
        counter += 1

    return [image_condition, road_condition, yes_no]


def get_typewise_train_val_splits(train_dict, val_dict):
    """
    Split the train and validation questions into three task-specific dicts.

    Rules, checked in this order for every example:
      1. "Question_Type" == "Yes_No"           -> yes/no task
      2. "road" occurs in the "Question" text   -> road-condition task
      3. "overall" occurs in the "Question"     -> image-condition task
    Anything else is dropped.

    Args:
        train_dict: mapping key -> example dict for the training split.
        val_dict: mapping key -> example dict for the validation split.

    Returns:
        (train_tasks, val_tasks), each a list
        [image_condition, road_condition, yes_no] of dicts keyed by the string
        form of a counter that runs over the kept examples of that split.
    """
    return _split_by_question_type(train_dict), _split_by_question_type(val_dict)

def accuracy(pred, true):
    """
    Top-1 accuracy in percent.

    Args:
        pred: array-like of scores; the predicted class of each row is the
            arg-max over the last axis.
        true: array-like of integer class labels, one per row of `pred`.
            Any shape is accepted and flattened, so an (N, 1) column of
            labels is handled like an (N,) vector instead of silently
            broadcasting against the predictions.

    Returns:
        float in [0, 100]; NaN when there are no labels.

    Raises:
        ValueError: if the number of predictions and labels differ.
    """
    targets = np.asarray(true).reshape(-1)
    if targets.size == 0:
        return float("nan")

    predicted = np.asarray(pred).argmax(-1).reshape(-1)
    if predicted.shape != targets.shape:
        raise ValueError(
            f"got {predicted.size} predictions for {targets.size} labels"
        )

    # Count matches as an exact integer; the division is done in Python floats.
    return float(100.0 * np.count_nonzero(predicted == targets) / targets.size)

In [None]:
# Root folder of the FloodNet data on the mounted Drive. Written without a
# trailing slash; ZeroShotVQADataset._get_imgpath extends it via os.path.join.
ORIGINAL_DATA_PATH = "/content/drive/MyDrive/floodnet_data"

In [None]:
# Load the training questions. The path is built from ORIGINAL_DATA_PATH so the
# data location is configured in a single place, and the file is opened in a
# `with` block so the handle is closed once parsing is done.
with open(os.path.join(ORIGINAL_DATA_PATH, "Questions", "Training Question.json"), "r") as question_file:
    qdict = json.load(question_file)

# Image-wise split first (no image shared between train and validation),
# then one dict per task type for each split.
train_dict, val_dict = get_train_val_splits_imagewise(qdict)
train_tasks, val_tasks = get_typewise_train_val_splits(train_dict, val_dict)

In [None]:
from torchvision import transforms as T
from torchvision import transforms

In [None]:
class ZeroShotVQADataset(Dataset):
    """
    Turns FloodNet question records into zero-shot classification items.

    Each item is (image tensor, list of candidate answer sentences, index of
    the correct sentence in that list).

    Args:
        qdict: mapping key -> record with "Image_ID", "Question",
            "Ground_Truth" and "Question_Type" entries. It is re-keyed to
            0..N-1 on construction.
        q_to_prompt_dict: canonical question -> sentence template containing
            the "[label]" placeholder.
        dedup_dict: raw question text -> canonical question text.
        possible_answers_dict: canonical question -> candidate answer words.
    """

    def __init__(self, qdict, q_to_prompt_dict=Q_TO_PROMPT_DICT, dedup_dict=DEDUP_Q_DICT, possible_answers_dict=Q_TO_POSSIBLE_ANSWERS_DICT):
        super().__init__()
        self.q_to_prompt_dict = q_to_prompt_dict
        self.dedup_dict = dedup_dict
        self.qdict = qdict
        self.possible_answers_dict = possible_answers_dict
        # NOTE(review): ToTensor yields float images scaled to [0, 1]; confirm
        # that the installed CLIPProcessor does not rescale them a second time.
        self.tr = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize([224,224]),
        ])

        self.reset_index()

    def reset_index(self):
        """Re-key the records with consecutive integers 0..N-1."""
        self.qdict = dict(enumerate(self.qdict.values()))

    def _get_imgpath(self, imageid):
        """Path of a training image below ORIGINAL_DATA_PATH."""
        return os.path.join(ORIGINAL_DATA_PATH, "Images", "Train_Image", imageid)

    def __len__(self):
        return len(self.qdict)

    def __getitem__(self, idx):
        """
        Build the item for record `idx`. Example record:

         '3878': {'Image_ID': '8928.JPG',
            'Question': 'What is the condition of road?',
            'Ground_Truth': 'non flooded',
            'Question_Type': 'Condition_Recognition'},

        Returns:
            (img, possible_answers, answer_idx): the transformed image, the
            candidate sentences and the position of the correct one.

        Raises:
            KeyError: if the question text is unknown to the lookup tables.
            ValueError: if the ground truth of a non yes/no question is not
                among its candidate answer words.
        """
        example = self.qdict[idx]

        # Close the file handle as soon as the pixels are decoded, and force
        # three channels, which the 3-channel CLIP input expects.
        with Image.open(self._get_imgpath(example["Image_ID"])) as image_file:
            img = self.tr(image_file.convert("RGB"))

        question = self.dedup_dict[example["Question"]]
        answer_word = str(example['Ground_Truth'])

        if question.startswith("Is"):
            # Yes/no questions use hand-written sentences, index 0 = "No" and
            # index 1 = "Yes", rather than the generic "[label]" template.
            if "non" in question:
                possible_answers = ["No, the entire road is not non flooded.", "Yes, the entire road is non flooded."]
            else:
                possible_answers = ["No, the entire road is not flooded.", "Yes, the entire road is flooded."]
            # Compare case-insensitively: the answer table spells these labels
            # "Yes"/"No", and an exact match against "no" would label every
            # capitalised "No" as a "Yes".
            answer_idx = 0 if answer_word.strip().lower() == "no" else 1
        else:
            prompt = self.q_to_prompt_dict[question]
            candidate_words = self.possible_answers_dict[question]
            possible_answers = [prompt.replace("[label]", candidate_word) for candidate_word in candidate_words]
            answer_idx = candidate_words.index(answer_word)

        return img, possible_answers, answer_idx

In [None]:
# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class ZeroShotVQAModel(nn.Module):
    """
    Thin wrapper around a pretrained CLIP model for zero-shot answer ranking.

    Holds the "openai/clip-vit-large-patch14" model and its processor, and
    scores one batch of images against one list of candidate sentences.
    """

    def __init__(self):
        super().__init__()
        self.model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
        self.preprocessor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

    def forward(self, batch):
        """
        Score the images of `batch` against its candidate sentences.

        Args:
            batch: indexable (images, prompts, labels). `prompts` holds one
                entry per candidate sentence; an entry is either the sentence
                itself or a sequence whose first element is the sentence.
                Only that first element is used, so presumably this expects
                batches collated with batch_size=1 -- TODO confirm before
                using larger batches.

        Returns:
            (outputs, labels): the CLIP model output and batch[2] unchanged.
        """
        images, prompts, labels = batch[0], batch[1], batch[2]

        # Build the sentence list locally instead of overwriting batch[1]:
        # mutating the caller's batch made a second call on the same batch
        # index into the sentences and pass single characters to CLIP.
        texts = [prompt if isinstance(prompt, str) else prompt[0] for prompt in prompts]

        inputs = self.preprocessor(
            text=texts, images=list(images), return_tensors="pt", padding=True
        )

        # Follow the device the wrapped model lives on rather than relying on
        # a notebook-level global.
        target_device = next(self.model.parameters()).device
        inputs = {k: v.to(target_device) for k, v in inputs.items()}

        outputs = self.model(**inputs)

        return outputs, labels

In [None]:
%%capture
model = ZeroShotVQAModel()
model.eval()
model.to(device)

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Dataset evaluated by the next cell: the full validation split.
ds = ZeroShotVQADataset(val_dict) # Validation
# Per-task alternatives (val_tasks is ordered image condition, road condition, yes/no):
# ds = ZeroShotVQADataset(val_tasks[0]) # image
# ds = ZeroShotVQADataset(val_tasks[1]) # road
# ds = ZeroShotVQADataset(val_tasks[2]) # yesno

In [None]:
# Zero-shot evaluation of `ds`, one example per step. The number of candidate
# sentences differs between question types, so the arg-max is taken per
# example before the predictions are joined.
dl = DataLoader(ds, shuffle=False, batch_size=1)
preds, labels = [], []

with torch.no_grad():
    for batch in tqdm(dl, total=len(ds)):
        outputs, labels_ = model(batch)
        labels.append(labels_)
        preds.append(outputs.logits_per_image.cpu().detach())

all_labels = torch.cat(labels, dim=0)
all_preds = torch.cat([logits.argmax(dim=-1) for logits in preds], dim=0)

# Fraction of examples whose top-scoring sentence is the correct one.
print((all_preds == all_labels).float().sum() / len(all_labels))

  0%|          | 0/883 [00:00<?, ?it/s]

tensor(0.3556)


In [None]:
# Same evaluation, repeated for each validation task. The accuracies are
# printed in the order of val_tasks: image condition, road condition, yes/no.
for valtaskdict in val_tasks:
    ds = ZeroShotVQADataset(valtaskdict)
    dl = DataLoader(ds, shuffle=False, batch_size=1)

    preds, labels = [], []

    with torch.no_grad():
        for batch in tqdm(dl, total=len(ds)):
            outputs, labels_ = model(batch)
            labels.append(labels_)
            preds.append(outputs.logits_per_image.cpu().detach())

    all_labels = torch.cat(labels, dim=0)
    all_preds = torch.cat([logits.argmax(dim=-1) for logits in preds], dim=0)

    print((all_preds == all_labels).float().sum() / len(all_labels))

  0%|          | 0/290 [00:00<?, ?it/s]

tensor(0.4172)


  0%|          | 0/172 [00:00<?, ?it/s]

tensor(0.8314)


  0%|          | 0/172 [00:00<?, ?it/s]

tensor(0.1512)


tensor(334.)

In [None]:
# Scratch check with hand-typed numbers: 334 divided by 891.
# NOTE(review): presumably correct predictions / total examples from an
# earlier run -- these values are not computed anywhere in this notebook.
334 / 891

0.37485970819304154

In [None]:
# Scratch cell: re-runs the model on `batch`, i.e. whatever the last
# evaluation loop left bound to that name. This also overwrites the `labels`
# list that the evaluation loop filled.
outputs, labels = model(batch)

In [None]:
# Scratch cell: display the leftover `batch` for inspection.
batch

In [None]:
# Scratch cell: 1.0 where the top-scoring sentence matches the label of the
# re-run batch, 0.0 otherwise.
(outputs.logits_per_image.argmax(dim=-1) == labels.to(device)).float()

tensor([0.], device='cuda:0')

In [None]:
# from PIL import Image
# import requests
# from transformers import CLIPProcessor, CLIPModel

# model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)

# inputs = processor(
#     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
# )

# outputs = model(**inputs)
# logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
# probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities