# import

In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'
from transformers import AutoModel, CLIPProcessor
from models import WhereIsFeatures
from dataset import FolderData
import tensorflow as tf
from torch.utils.data import DataLoader
from timm.scheduler.cosine_lr import CosineLRScheduler
from torch import nn
import torch
from matplotlib import pyplot as plt
from torchvision import transforms
from PIL import Image
import numpy as np
from torchvision import datasets

2023-05-10 11:18:52.829888: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-10 11:18:52.853949: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Helper functions

In [2]:
def accuracy(text_embeds, image_embeds, labels):
    """Return the fraction of images whose best-scoring text embedding is the labelled one.

    Every image embedding is scored against every text embedding with a dot
    product. The scores of each image are turned into a softmax distribution
    and the index of its largest entry is compared with ``labels``.

    Args:
        text_embeds: 2-D tensor with one row per text prompt.
        image_embeds: 2-D tensor with one row per image.
        labels: tensor of expected prompt indices, one entry per image.

    Returns:
        0-dim float tensor: the mean of the per-image hit indicators.
    """
    text_vs_image = torch.matmul(text_embeds, image_embeds.t())
    # Transpose so that each row holds the scores of one image.
    per_image_probs = torch.softmax(text_vs_image.t(), dim=1)
    predicted = torch.argmax(per_image_probs, dim=1)
    hits = torch.eq(predicted, labels)
    return hits.float().mean()

# params

In [3]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# cells that honour `device` can still run.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_epochs = 5  # number of passes made by the multi-epoch training cells
warmup = 4  # NOTE(review): not referenced by any visible cell — confirm it is still needed
num_workers = 4  # worker processes used by each DataLoader
batch_size = 16  # samples per batch for both loaders

# data

In [4]:
# Dataset roots: train on the multi-class animals folder, evaluate on the
# dogs-vs-cats folder.
# NOTE(review): these are absolute, machine-specific paths — consider deriving
# them from a configurable data directory.
train_src = '/home/palm/data/animals/animals/animals'
test_src = '/home/palm/data/dogs-vs-cats/train'

In [5]:
# Both splits are read from their folder at 224 px and served through
# identically configured, shuffled loaders.
loader_kwargs = dict(batch_size=batch_size, shuffle=True, num_workers=num_workers)
train_dataset = FolderData(train_src, size=224, mul=1)
val_dataset = FolderData(test_src, size=224, mul=1)
train_loader = DataLoader(train_dataset, **loader_kwargs)
test_loader = DataLoader(val_dataset, **loader_kwargs)

In [6]:
def _folder_prompts(root):
    """Return one text prompt per entry of ``root``, in sorted name order.

    An entry whose name contains 'otter' gets the placeholder prompt
    'boooooo'; any other entry ``name`` becomes 'a photo of a {name}'.
    """
    return [
        'boooooo' if 'otter' in name else f'a photo of a {name}'
        for name in sorted(os.listdir(root))
    ]


processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Build and tokenise the prompts of each split (padded, returned as PyTorch tensors).
test_texts = _folder_prompts(test_src)
test_inputs = processor(text=test_texts, return_tensors="pt", padding=True)
train_texts = _folder_prompts(train_src)
train_inputs = processor(text=train_texts, return_tensors="pt", padding=True)

In [7]:
# Show the prompts generated for the evaluation folders.
test_texts

['a photo of a cat', 'a photo of a dog']

In [8]:
# Show the prompts generated for the training folders.
train_texts

['a photo of a antelope',
 'a photo of a badger',
 'a photo of a bat',
 'a photo of a bear',
 'a photo of a bee',
 'a photo of a beetle',
 'a photo of a bison',
 'a photo of a boar',
 'a photo of a butterfly',
 'a photo of a caterpillar',
 'a photo of a chimpanzee',
 'a photo of a cockroach',
 'a photo of a coyote',
 'a photo of a crab',
 'a photo of a crow',
 'a photo of a deer',
 'a photo of a dolphin',
 'a photo of a donkey',
 'a photo of a dragonfly',
 'a photo of a eagle',
 'a photo of a elephant',
 'a photo of a flamingo',
 'a photo of a fly',
 'a photo of a fox',
 'a photo of a goat',
 'a photo of a goldfish',
 'a photo of a goose',
 'a photo of a gorilla',
 'a photo of a grasshopper',
 'a photo of a hamster',
 'a photo of a hare',
 'a photo of a hedgehog',
 'a photo of a hippopotamus',
 'a photo of a hornbill',
 'a photo of a hummingbird',
 'a photo of a hyena',
 'a photo of a jellyfish',
 'a photo of a kangaroo',
 'a photo of a koala',
 'a photo of a ladybugs',
 'a photo of a 

# modules

In [9]:
# Loss and squashing function used by the training cells.
mse = nn.MSELoss()
sigmoid = nn.Sigmoid()

# Load CLIP on `device` and freeze every one of its weights.
clip = AutoModel.from_pretrained('openai/clip-vit-base-patch32').to(device)
for param in clip.parameters():
    param.requires_grad = False
vision_model = clip.vision_model
visual_projection = clip.visual_projection
text_projection = clip.text_projection

# Embed the class prompts once. The tokenised inputs are moved to `device`,
# the same place CLIP was moved to, instead of a hard-coded 'cuda'.
# Element 1 of the text-model output is what gets projected
# (presumably the pooled output — confirm against the transformers docs).
train_prompts = clip.text_model(**train_inputs.to(device))
train_prompts = sigmoid(text_projection(train_prompts[1]))
test_prompts = clip.text_model(**test_inputs.to(device))
test_prompts = sigmoid(text_projection(test_prompts[1]))

# The trainable model, placed on the same device.
model = WhereIsFeatures()
model = model.to(device)

# autoencoder: encoder/decoder

In [10]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# This cell makes a single pass over the training data, so the cosine cycle is
# measured in optimiser updates (t_in_epochs=False) and sized to span that
# pass. The schedule is advanced after every update below; a scheduler that is
# only constructed never changes the learning rate.
schedule = CosineLRScheduler(optimizer,
                             t_initial=max(1, len(train_loader)),
                             t_mul=1,
                             lr_min=5e-5,
                             decay_rate=0.1,
                             cycle_limit=1,
                             t_in_epochs=False,
                             noise_range_t=None,
                             )

# ---- training pass: reconstruct the CLIP image features ----
model.train()
progbar = tf.keras.utils.Progbar(len(train_loader))
for idx, (image, _, cls) in enumerate(train_loader):
    image = image.to(device)
    cls = cls.to(device)
    with torch.no_grad():
        # CLIP image features, projected and passed through the sigmoid.
        features = vision_model(image)['pooler_output']
        features = visual_projection(features)
        features = sigmoid(features)
        std_acc = accuracy(train_prompts, features, cls)

    x = model.encode(features)
    recon = model.decode(x)
    with torch.no_grad():
        # Metric only; no gradient is needed for it.
        recon_acc = accuracy(train_prompts, recon, cls)
    loss = mse(recon, features)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    schedule.step_update(idx + 1)
    printlog = [('loss', loss.item()),
                ('std_acc', std_acc.item()),
                ('recon_acc', recon_acc.item()),
                ]
    progbar.update(idx + 1, printlog)

# ---- evaluation pass ----
model.eval()
progbar = tf.keras.utils.Progbar(len(test_loader))
with torch.no_grad():
    for idx, (image, _, cls) in enumerate(test_loader):
        image = image.to(device)
        cls = cls.to(device)
        features = vision_model(image)['pooler_output']
        features = visual_projection(features)
        features = sigmoid(features)
        std_acc = accuracy(test_prompts, features, cls)
        x = model.encode(features)
        recon = model.decode(x)
        recon_acc = accuracy(test_prompts, recon, cls)
        loss = mse(recon, features)
        printlog = [('loss', loss.item()),
                    ('std_acc', std_acc.item()),
                    ('recon_acc', recon_acc.item()),
                    ]
        progbar.update(idx + 1, printlog)




In [13]:

# Re-derive the per-image class probabilities for whatever batch is still held
# in `features`, following the same steps as `accuracy`.
prompt_vs_image = torch.matmul(test_prompts, features.t())
logits_per_image = prompt_vs_image.t()
probs = torch.softmax(logits_per_image, dim=1)


In [14]:
# Index of the highest-probability prompt for each row of `probs`.
probs.argmax(1)

tensor([1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0], device='cuda:0')

In [15]:
# Labels of the most recently loaded batch, to compare with the predictions.
cls

tensor([1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0], device='cuda:0')

# autoencoder: buffer nowhere

In [11]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): `model` is not re-created here, so this run continues from the
# weights left by whichever cells ran before it.
# warmup_t and t_initial are epoch counts, so the schedule runs in epoch units
# and is stepped once at the end of every epoch. Constructing the scheduler
# with a warm-up already sets the optimiser LR to warmup_lr_init; without that
# step the whole run would stay at 1e-5.
schedule = CosineLRScheduler(optimizer,
                             warmup_t=1,
                             warmup_lr_init=1e-5,
                             t_initial=n_epochs,
                             t_mul=1,
                             lr_min=5e-5,
                             decay_rate=0.1,
                             cycle_limit=1,
                             t_in_epochs=True,
                             noise_range_t=None,
                             )
for epoch in range(n_epochs):
    print('Epoch:', epoch + 1)

    # ---- training ----
    model.train()
    progbar = tf.keras.utils.Progbar(len(train_loader))
    for idx, (image, _, cls) in enumerate(train_loader):
        image = image.to(device)
        cls = cls.to(device)
        with torch.no_grad():
            # CLIP image features, projected and passed through the sigmoid.
            features = vision_model(image)['pooler_output']
            features = visual_projection(features)
            features = sigmoid(features)
            std_acc = accuracy(train_prompts, features, cls)
            # Prompts are pushed through the same encode -> where path as the images.
            prompts_ecd = model.encode(train_prompts)
            _, prompts_ecd, _ = model.where(prompts_ecd, False)
            # The encoder runs without gradients, so the loss below can only
            # reach parameters used inside `model.where`.
            x = model.encode(features)
        x, ecd, gt = model.where(x, False)
        buffer_acc = accuracy(prompts_ecd[:, 0], ecd[:, 0], cls)
        with torch.no_grad():
            recon = model.decode(x)
            recon_acc = accuracy(train_prompts, recon, cls)
        loss = mse(x, gt)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        printlog = [('loss', loss.item()),
                    ('std_acc', std_acc.item()),
                    ('recon_acc', recon_acc.item()),
                    ('buffer_acc', buffer_acc.item()),
                    ]
        progbar.update(idx + 1, printlog)

    # ---- evaluation ----
    model.eval()
    progbar = tf.keras.utils.Progbar(len(test_loader))
    with torch.no_grad():
        for idx, (image, _, cls) in enumerate(test_loader):
            image = image.to(device)
            cls = cls.to(device)
            features = vision_model(image)['pooler_output']
            features = visual_projection(features)
            features = sigmoid(features)
            std_acc = accuracy(test_prompts, features, cls)
            prompts_ecd = model.encode(test_prompts)
            _, prompts_ecd, _ = model.where(prompts_ecd, False)
            x = model.encode(features)
            x, ecd, gt = model.where(x, False)
            buffer_acc = accuracy(prompts_ecd[:, 0], ecd[:, 0], cls)
            recon = model.decode(x)
            recon_acc = accuracy(test_prompts, recon, cls)
            loss = mse(x, gt)
            printlog = [('loss', loss.item()),
                        ('std_acc', std_acc.item()),
                        ('recon_acc', recon_acc.item()),
                        ('buffer_acc', buffer_acc.item()),
                        ]
            # Update per batch so the bar averages over every batch rather
            # than reporting only the final one.
            progbar.update(idx + 1, printlog)

    # Advance the epoch-based schedule for the next epoch.
    schedule.step(epoch + 1)


Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5


# autoencoder: buffer where

In [12]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): `model` is not re-created here, so this run continues from the
# weights left by whichever cells ran before it.
# warmup_t and t_initial are epoch counts, so the schedule runs in epoch units
# and is stepped once at the end of every epoch. Constructing the scheduler
# with a warm-up already sets the optimiser LR to warmup_lr_init; without that
# step the whole run would stay at 1e-5.
schedule = CosineLRScheduler(optimizer,
                             warmup_t=1,
                             warmup_lr_init=1e-5,
                             t_initial=n_epochs,
                             t_mul=1,
                             lr_min=5e-5,
                             decay_rate=0.1,
                             cycle_limit=1,
                             t_in_epochs=True,
                             noise_range_t=None,
                             )
for epoch in range(n_epochs):
    print('Epoch:', epoch + 1)

    # ---- training ----
    model.train()
    progbar = tf.keras.utils.Progbar(len(train_loader))
    for idx, (image, _, cls) in enumerate(train_loader):
        image = image.to(device)
        cls = cls.to(device)
        with torch.no_grad():
            # CLIP image features, projected and passed through the sigmoid.
            features = vision_model(image)['pooler_output']
            features = visual_projection(features)
            features = sigmoid(features)
            std_acc = accuracy(train_prompts, features, cls)
            # Prompts are pushed through the same encode -> where path as the images.
            prompts_ecd = model.encode(train_prompts)
            _, prompts_ecd, _ = model.where(prompts_ecd, True)
            # The encoder runs without gradients, so the loss below can only
            # reach parameters used inside `model.where`.
            x = model.encode(features)
        x, ecd, gt = model.where(x, True)
        buffer_acc = accuracy(prompts_ecd[:, 0], ecd[:, 0], cls)
        with torch.no_grad():
            recon = model.decode(x)
            recon_acc = accuracy(train_prompts, recon, cls)
        loss = mse(x, gt)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        printlog = [('loss', loss.item()),
                    ('std_acc', std_acc.item()),
                    ('recon_acc', recon_acc.item()),
                    ('buffer_acc', buffer_acc.item()),
                    ]
        progbar.update(idx + 1, printlog)

    # ---- evaluation ----
    model.eval()
    progbar = tf.keras.utils.Progbar(len(test_loader))
    with torch.no_grad():
        for idx, (image, _, cls) in enumerate(test_loader):
            image = image.to(device)
            cls = cls.to(device)
            features = vision_model(image)['pooler_output']
            features = visual_projection(features)
            features = sigmoid(features)
            std_acc = accuracy(test_prompts, features, cls)
            prompts_ecd = model.encode(test_prompts)
            _, prompts_ecd, _ = model.where(prompts_ecd, True)
            x = model.encode(features)
            x, ecd, gt = model.where(x, True)
            buffer_acc = accuracy(prompts_ecd[:, 0], ecd[:, 0], cls)
            recon = model.decode(x)
            recon_acc = accuracy(test_prompts, recon, cls)
            loss = mse(x, gt)
            printlog = [('loss', loss.item()),
                        ('std_acc', std_acc.item()),
                        ('recon_acc', recon_acc.item()),
                        ('buffer_acc', buffer_acc.item()),
                        ]
            # Update per batch so the bar averages over every batch rather
            # than reporting only the final one.
            progbar.update(idx + 1, printlog)

    # Advance the epoch-based schedule for the next epoch.
    schedule.step(epoch + 1)


Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5


In [13]:
from datetime import datetime

In [14]:
# First timestamp of an ad-hoc wall-clock measurement.
n1 = datetime.now() 

In [15]:
# Second timestamp of the same measurement.
n2 = datetime.now()

In [16]:
# Time elapsed between the two timestamps, as a timedelta.
n2 - n1

datetime.timedelta(seconds=6, microseconds=652361)