<a href="https://colab.research.google.com/github/G0nkly/Azure-in-bullet-points/blob/master/vits/vlms/nanoVLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
## Imports

In [10]:
import math, random
import numpy as np
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt

In [2]:
## Variables

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

# Image / model dimensions.
IMG_SIZE = 32         # default canvas side length (pixels) used by draw_sample
EMBED_DIM = 64        # NOTE(review): consumed outside this section; presumably the embedding width
ATTENTION_HEADS = 4   # NOTE(review): consumed outside this section; presumably heads per attention layer

# Training hyper-parameters (used by cells outside this section; confirm meanings there).
BATCH_SIZE = 12
EPOCHS = 10
LR = 3e-4
TEMPERATURE = 0.07

In [4]:
## Synthetic Dataset

In [5]:
# Attribute vocabularies: every (color, shape, position) combination becomes one sample.
colors = [
  "red", "green", "blue",
  "yellow", "purple", "orange",
  "pink", "brown", "gray",
]

shapes = [
  "square",
  "circle",
  "triangle",
]

# Compound labels such as "top-left" combine one horizontal and one vertical keyword.
positions = [
  "left", "center", "right",
  "top", "bottom",
  "top-left", "top-right",
  "bottom-left", "bottom-right",
]

In [6]:
### Drawing image shapes

In [7]:
def _axis_span(position, low_word, high_word, margin, extent, img_size):
  """Return the (start, end) pixel span along one axis for a position label.

  The label is matched by substring, so a compound label such as "top-left"
  selects a half on each axis independently. A label containing neither
  keyword is centred on this axis (the middle half of the drawable extent).
  """
  half = margin + extent // 2
  if low_word in position:
    return margin, half
  if high_word in position:
    return half, img_size - margin
  return margin + extent // 4, margin + 3 * extent // 4


def draw_sample(color, shape, position, img_size=IMG_SIZE):
  """Draw one filled, black-outlined shape on a white square canvas.

  Args:
    color: fill colour, passed unchanged to the PIL drawing call.
    shape: "square" or "circle"; any other value draws a triangle.
    position: placement label. The substrings "left"/"right" choose the
      horizontal half and "top"/"bottom" the vertical half; an axis with
      neither keyword (e.g. "center") is centred.
    img_size: side length of the canvas in pixels.

  Returns:
    A PIL RGB image of size (img_size, img_size).
  """
  img = Image.new("RGB", (img_size, img_size), "white")
  draw = ImageDraw.Draw(img)
  margin = 6
  extent = img_size - 2 * margin  # drawable width == height inside the margins

  # Both axes use the same rule. The original centred x-branch used
  # `margin + h // 2` as the right edge, which made centred shapes half as
  # wide as they were tall; the shared helper keeps the two axes symmetric.
  x0, x1 = _axis_span(position, "left", "right", margin, extent, img_size)
  y0, y1 = _axis_span(position, "top", "bottom", margin, extent, img_size)

  if shape == "square":
    draw.rectangle([x0, y0, x1, y1], fill=color, outline="black")
  elif shape == "circle":
    draw.ellipse([x0, y0, x1, y1], fill=color, outline="black")
  else:
    # Triangle: apex at the top-centre of the box, base along its bottom edge.
    draw.polygon([((x1 + x0) // 2, y0), (x0, y1), (x1, y1)], fill=color, outline="black")

  return img



In [9]:
## Class for building the dataset

In [19]:
class ShapesDataset(Dataset):
  """Map-style dataset pairing each synthetic shape image with its caption.

  One sample is generated for every combination of the module-level
  `colors`, `shapes` and `positions` lists. Images are stored as float
  tensors in channel-first layout scaled to [0, 1]; captions are stored as
  strings of the form "<color> <shape> <position>" and tokenised on access.
  """

  # Marker token placed at index 0 of the vocabulary and at the start of
  # every encoded caption.
  CLS_TOKEN = "[CLS]"

  def __init__(self):
    """Render every color/shape/position combination and build the vocabulary."""
    self.images = []
    self.captions = []

    for c in colors:
      for s in shapes:
        for p in positions:
          img = draw_sample(c, s, p)
          # np.array makes a writable copy of the PIL pixel data (H, W, C),
          # which avoids handing torch.from_numpy a read-only buffer.
          pixels = torch.from_numpy(np.array(img))
          self.images.append(pixels.permute(2, 0, 1).float() / 255.0)
          self.captions.append(f"{c} {s} {p}")

    self.vocab, self.word2idx = self.build_vocab(self.captions)

  def build_vocab(self, texts):
    """Build the vocabulary from whitespace-separated words in `texts`.

    Returns:
      (vocab, w2i): `vocab` is the list [CLS_TOKEN] + sorted unique words;
      `w2i` maps each vocabulary entry to its index in that list.
    """
    words = sorted({w for t in texts for w in t.split()})
    # The marker must be spelled exactly as encode_text looks it up.
    vocab = [self.CLS_TOKEN] + words
    w2i = {w: i for i, w in enumerate(vocab)}
    return vocab, w2i

  def encode_text(self, text):
    """Encode a caption as a 1-D int64 tensor of token ids, CLS id first.

    Raises:
      KeyError: if `text` contains a word that is not in the vocabulary.
    """
    toks = [self.word2idx[self.CLS_TOKEN]] + [self.word2idx[w] for w in text.split()]
    return torch.tensor(toks, dtype=torch.long)

  def __len__(self):
    """Number of samples; needed for DataLoader's default samplers."""
    return len(self.images)

  def __getitem__(self, idx):
    """Return (image tensor, encoded caption tensor) for sample `idx`."""
    return self.images[idx], self.encode_text(self.captions[idx])

In [20]:
## Create Dataset

In [23]:
# Build the dataset once; its constructor draws every color/shape/position combination.
full_ds = ShapesDataset()
# Vocabulary size = distinct caption words plus the one marker token prepended by build_vocab.
VOCAB_SIZE = len(full_ds.vocab)
print(VOCAB_SIZE)

22
