In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from transformers import AutoModel, AutoTokenizer, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
import pandas as pd
import dataset as wsd
import numpy as np
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

BATCH_SIZE = 32
EMBED_DIM = 512
TRANSFORMER_EMBED = 768
IMAGE_SIZE = 255

class Projection(nn.Module):
    def __init__(self, d_in: int, d_out: int, p: float=0.5) -> None:
        super().__init__()
        self.linear1 = nn.Linear(d_in, d_out, bias=False)
        self.linear2 = nn.Linear(d_out, d_out, bias=False)
        self.layer_norm = nn.LayerNorm(d_out)
        self.drop = nn.Dropout(p)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        embed1 = self.linear1(x)
        embed2 = self.drop(self.linear2(F.gelu(embed1)))
        embeds = self.layer_norm(embed1 + embed2)
        return embeds

In [2]:
class VisionEncoder(nn.Module):
    def __init__(self, d_out: int) -> None:
        super().__init__()
        base = models.resnet34(pretrained=True)
        d_in = base.fc.in_features
        base.fc = nn.Identity()
        self.base = base
        self.projection = Projection(d_in, d_out)
        for p in self.base.parameters():
            p.requires_grad = False

    def forward(self, x):
        projected_vec = self.projection(self.base(x))
        projection_len = torch.norm(projected_vec, dim=-1, keepdim=True)
        return projected_vec / projection_len

In [3]:
class TextEncoder(nn.Module):
    def __init__(self, d_out: int) -> None:
        super().__init__()
        self.base = AutoModel.from_pretrained("distilbert-base-multilingual-cased")
        self.projection = Projection(TRANSFORMER_EMBED, d_out)
        for p in self.base.parameters():
            p.requires_grad = False

    def forward(self, x):
        out = self.base(x)[0]
        out = out[:, 0, :]  # get CLS token output
        projected_vec = self.projection(out)
        projection_len = torch.norm(projected_vec, dim=-1, keepdim=True)
        return projected_vec / projection_len

In [4]:
class Tokenizer:
    def __init__(self, tokenizer: BertTokenizer) -> None:
        self.tokenizer = tokenizer

    def __call__(self, x: str) -> AutoTokenizer:
        return self.tokenizer(
            x, truncation=True, padding=True, return_tensors="pt"
        )

In [5]:
def metrics(similarity: torch.Tensor):
    y = torch.arange(len(similarity)).to(similarity.device)
    img2cap_match_idx = similarity.argmax(dim=1)
    cap2img_match_idx = similarity.argmax(dim=0)

    img_acc = (img2cap_match_idx == y).float().mean()
    cap_acc = (cap2img_match_idx == y).float().mean()

    return img_acc, cap_acc

In [6]:
class CustomModel(nn.Module):
    def __init__(self, lr: float = 1e-3) -> None:
        super().__init__()
        self.vision_encoder = VisionEncoder(EMBED_DIM)
        self.caption_encoder = TextEncoder(EMBED_DIM)
        self.tokenizer = Tokenizer(AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased"))
        self.lr = lr
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def forward(self, images, text):
        text = self.tokenizer(text).to(self.device)

        image_embed = self.vision_encoder(images)
        caption_embed = self.caption_encoder(text["input_ids"])

        similarity = caption_embed @ image_embed.T

        loss = self.CLIP_loss(similarity)
        img_acc, cap_acc = metrics(similarity)

        return loss, img_acc, cap_acc
    
    def CLIP_loss(self, logits: torch.Tensor) -> torch.Tensor:
        n = logits.shape[1]      # number of samples
        labels = torch.arange(n).to(self.device) # Create labels tensor
        # Calculate cross entropy losses along axis 0 and 1
        loss_i = F.cross_entropy(logits.transpose(0, 1), labels, reduction="mean")
        loss_t = F.cross_entropy(logits, labels, reduction="mean")
        # Calculate the final loss
        loss = (loss_i + loss_t) / 2

        return loss
    
    def top_image(self, images, text):
        text = self.tokenizer(text).to(self.device)
        caption_embed = self.caption_encoder(text["input_ids"])

        similarities = []

        for image in images:
            image_embed = self.vision_encoder(image)
            similarities.append(F.cosine_similarity(image_embed, caption_embed, dim=1).item())

        top_image = np.argsort(similarities)[-1:][::-1]

        return top_image

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CustomModel().to(device)



In [8]:
model.device

'cuda'

In [9]:
optimizer = torch.optim.Adam([
    {'params': model.vision_encoder.parameters()},
    {'params': model.caption_encoder.parameters()}
], lr=model.lr)

In [10]:
from torchvision import transforms as tt

scale = tt.Resize((IMAGE_SIZE, IMAGE_SIZE))
tensor = tt.ToTensor()
image_composed = tt.transforms.Compose([scale, tensor])

In [11]:
train_set = wsd.VisualWSDDataset(mode="train", image_transform=image_composed)
test_set = wsd.VisualWSDDataset(mode="test", image_transform=image_composed, test_lang='en')

train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=True)

In [49]:
start_epoch = 0
num_epochs = 1

batch_zero = True
for epoch in range(start_epoch, num_epochs):
    model.train()
    for batch in test_loader:
        image = batch["correct_img"].to(device)
        text = batch["label_context"]
        # images, text = batch
        loss, img_acc, cap_acc = model(image, text)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_zero:
          print(f"Epoch [{0}/{num_epochs}], Batch Loss: {loss.item()}")
          batch_zero = False


    # Print training statistics
    print(f"Epoch [{epoch+1}/{num_epochs}], Batch Loss: {loss.item()}")

print("Training complete.")

  return F.conv2d(input, weight, bias, self.stride,


Epoch [0/1], Batch Loss: 3.466111898422241
Epoch [1/1], Batch Loss: 2.694244384765625
Training complete.


In [12]:
test_loader = DataLoader(test_set, batch_size=1, shuffle=True)

for batch in test_loader:
  images = batch["imgs"]
  text = batch["label_context"]
  idx = model.top_image(images, text)
  print(idx[0], batch["correct_idx"].item())

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

In [12]:
from transformers import DistilBertTokenizerFast

bert_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokens(texts):
    return bert_tokenizer(texts, truncation=True, padding=True)


In [13]:
train_set_token = wsd.VisualWSDDataset(mode="train", image_transform=image_composed, tokenizer=tokens)
trial_set_token = wsd.VisualWSDDataset(mode="val", image_transform=image_composed, tokenizer=tokens)

In [14]:
trial_set_token[0]

  item['images'] = torch.tensor(correct_image).detach()


{'input_ids': tensor([  101,  1998, 21716, 11960,  3392,   102,     0,     0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 0, 0]),
 'images': tensor([[[0.0510, 0.0118, 0.0510,  ..., 0.3059, 0.3059, 0.3216],
          [0.1059, 0.0431, 0.0588,  ..., 0.2510, 0.2471, 0.2667],
          [0.0667, 0.0510, 0.0784,  ..., 0.1490, 0.1294, 0.1804],
          ...,
          [0.6471, 0.5412, 0.4784,  ..., 0.1255, 0.1333, 0.1882],
          [0.5922, 0.4941, 0.4706,  ..., 0.1569, 0.1412, 0.2275],
          [0.5725, 0.5059, 0.4941,  ..., 0.1608, 0.1725, 0.3255]],
 
         [[0.1059, 0.0863, 0.0706,  ..., 0.3176, 0.3294, 0.3451],
          [0.1294, 0.0980, 0.0588,  ..., 0.2510, 0.2627, 0.2941],
          [0.1333, 0.0980, 0.0431,  ..., 0.1451, 0.1490, 0.2039],
          ...,
          [0.6627, 0.5529, 0.4667,  ..., 0.2706, 0.3059, 0.3020],
          [0.6078, 0.4980, 0.4706,  ..., 0.3137, 0.2863, 0.2510],
          [0.5843, 0.5059, 0.4980,  ..., 0.3137, 0.2627, 0.2235]],
 
         [[0.0235, 0.0039, 0.

In [15]:
class CustomModelHugging(nn.Module):
    def __init__(self, lr: float = 1e-3) -> None:
        super().__init__()
        self.vision_encoder = VisionEncoder(EMBED_DIM)
        self.caption_encoder = TextEncoder(EMBED_DIM)
        #self.tokenizer = Tokenizer(AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased"))
        self.lr = lr
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def forward(self, images, input_ids, attention_mask):
        image_embed = self.vision_encoder(images)
        caption_embed = self.caption_encoder(input_ids)

        similarity = caption_embed @ image_embed.T

        loss = self.CLIP_loss(similarity)
        img_acc, cap_acc = metrics(similarity)

        return loss, img_acc, cap_acc
    
    def CLIP_loss(self, logits: torch.Tensor) -> torch.Tensor:
        n = logits.shape[1]      # number of samples
        labels = torch.arange(n).to(self.device) # Create labels tensor
        # Calculate cross entropy losses along axis 0 and 1
        loss_i = F.cross_entropy(logits.transpose(0, 1), labels, reduction="mean")
        loss_t = F.cross_entropy(logits, labels, reduction="mean")
        # Calculate the final loss
        loss = (loss_i + loss_t) / 2

        return loss

In [16]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=300,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    dataloader_num_workers=3,
)

model = CustomModelHugging()

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_set_token,         # training dataset
    eval_dataset=trial_set_token,             # evaluation dataset
    tokenizer=bert_tokenizer,
    optimizers=(optimizer, None)        # no scheduler
)

trainer.train()

  item['images'] = torch.tensor(correct_image).detach()
  item['images'] = torch.tensor(correct_image).detach()
  item['images'] = torch.tensor(correct_image).detach()
  return F.conv2d(input, weight, bias, self.stride,
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
10,4.159
20,4.1589
30,4.1594
40,4.1591
50,4.1591
60,4.1591
70,4.1593
80,4.1581
90,4.1596
100,4.1588


  item['images'] = torch.tensor(correct_image).detach()
  item['images'] = torch.tensor(correct_image).detach()
  item['images'] = torch.tensor(correct_image).detach()
  item['images'] = torch.tensor(correct_image).detach()
  item['images'] = torch.tensor(correct_image).detach()
  item['images'] = torch.tensor(correct_image).detach()


TrainOutput(global_step=606, training_loss=4.146454820538511, metrics={'train_runtime': 6095.2045, 'train_samples_per_second': 6.334, 'train_steps_per_second': 0.099, 'total_flos': 0.0, 'train_loss': 4.146454820538511, 'epoch': 3.0})

In [None]:
# TODO
# Trainer ausprobieren
# DataLoader anpassen mit Validation Dataset