In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
import timm
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def load_image(image_path, size=224):
    """Read an image from disk and turn it into a normalised batch of one.

    Args:
        image_path: Path of the image file to open.
        size: Edge length the image is resized to (both height and width).
            ViT expects 224x224 by default.

    Returns:
        A tensor with a leading batch dimension of 1, moved to `device`.
    """
    preprocess = transforms.Compose([
        transforms.Resize((size, size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    rgb = Image.open(image_path).convert('RGB')
    batch = preprocess(rgb).unsqueeze(0)
    return batch.to(device)

content_img = load_image('/content/content.jpg')
style_img = load_image('/content/style.jpg')
# The generated image starts as a copy of the content image and is the only
# tensor that is optimised, so it must track gradients.
output_img = content_img.clone()
output_img.requires_grad_(True)

In [None]:
# Pretrained ViT used purely as a fixed feature extractor.
model = timm.create_model('vit_base_patch16_224', pretrained=True).to(device)
# Freeze the weights: only the generated image is optimised, so computing and
# accumulating gradients for every backbone parameter on each backward pass
# would be wasted time and memory.
model.requires_grad_(False)
model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

In [None]:
# 3. Feature Extractor
class ViTFeatureExtractor(nn.Module):
    """Run a ViT-style model block by block and record selected activations.

    Args:
        model: Object exposing `patch_embed`, `cls_token`, `pos_embed`,
            `pos_drop` and an iterable `blocks`, as used in `forward`.
        selected_layers: Collection of names of the form ``'block_<i>'``
            identifying which block outputs to record.

    Attributes:
        features: Dict mapping each selected layer name to the patch tokens
            (class token removed) produced by that block during the most
            recent forward pass. A fresh dict is bound on every call, so a
            dict obtained from an earlier call is never modified afterwards.
    """

    def __init__(self, model, selected_layers):
        super().__init__()
        self.model = model
        self.selected_layers = selected_layers
        self.features = {}

    def forward(self, x):
        """Push `x` through all blocks, recording the selected block outputs.

        Args:
            x: Input batch accepted by `model.patch_embed`.

        Returns:
            The token tensor after the last block (class token included).
            The recorded activations are available in `self.features`.
        """
        # Bind a new dict instead of clearing the old one in place: callers
        # that kept a reference to the previous `features` (e.g. the targets
        # of an earlier image) must not see them wiped or overwritten.
        self.features = {}
        x = self.model.patch_embed(x)
        # Prepend one class token per batch element.
        cls_token = self.model.cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_token, x), dim=1)
        x = x + self.model.pos_embed
        x = self.model.pos_drop(x)

        for i, block in enumerate(self.model.blocks):
            x = block(x)
            layer_name = f'block_{i}'
            if layer_name in self.selected_layers:
                # Drop the class token at index 0 and keep only patch tokens.
                patches = x[:, 1:, :]  # Shape: [batch, patches, channels]
                self.features[layer_name] = patches
        return x

# A deeper block drives the content term; shallower blocks drive the style term.
content_layers = ['block_9']
style_layers = ['block_0', 'block_3', 'block_6']
# The extractor records every layer that either loss term needs.
all_layers = [*content_layers, *style_layers]
extractor = ViTFeatureExtractor(model, all_layers)

In [None]:
def content_loss(target, content):
    """Return the mean squared difference between two feature tensors.

    Args:
        target: Features of the image being optimised.
        content: Reference features; must broadcast against `target`.

    Returns:
        A scalar tensor holding the mean of the element-wise squared error.
    """
    residual = target - content
    return residual.pow(2).mean()

def gram_matrix(features):
    """Compute the normalised channel-by-channel Gram matrix of patch features.

    The Gram matrix used for style transfer correlates feature channels with
    each other, summing over all spatial positions (patches). That discards
    where a pattern occurs and keeps only which patterns co-occur. A
    patches-by-patches product would instead encode the spatial layout of the
    image.

    Args:
        features: Tensor of shape [batch, patches, channels].

    Returns:
        Tensor of shape [batch, channels, channels], divided by
        ``patches * channels``.
    """
    batch, patches, channels = features.shape
    # [batch, channels, patches] @ [batch, patches, channels]
    gram = features.transpose(1, 2) @ features
    return gram / (patches * channels)

def style_loss(target, style):
    """Return the mean squared difference between two Gram matrices.

    Args:
        target: Features of the image being optimised.
        style: Reference style features.

    Returns:
        A scalar tensor: the mean squared error between
        `gram_matrix(target)` and `gram_matrix(style)`.
    """
    gram_gap = gram_matrix(target) - gram_matrix(style)
    return gram_gap.pow(2).mean()

In [None]:
with torch.no_grad():
    # The extractor returns the output of the last block; the per-layer
    # activations are stored in `extractor.features`, which is refilled by
    # every forward pass. Copy that dict right after each pass so the content
    # activations are not lost when the style image is processed.
    content_features = extractor(content_img)
    content_targets = dict(extractor.features)
    style_features = extractor(style_img)
    style_targets = dict(extractor.features)

In [None]:
# 6. Optimization
optimizer = torch.optim.LBFGS([output_img])
alpha = 1.0  # weight of the content term
beta = 1e6   # weight of the style term

# Capture the fixed reference activations once, inside this cell.
# `extractor.features` only ever describes the most recent forward pass, so
# the targets have to be copied out before the generated image is pushed
# through the extractor; otherwise the loss compares a tensor with itself and
# is identically zero.
with torch.no_grad():
    extractor(content_img)
    content_targets = {l: extractor.features[l] for l in content_layers}
    extractor(style_img)
    style_targets = {l: extractor.features[l] for l in style_layers}


def closure():
    """Evaluate the style-transfer objective and its gradient for LBFGS.

    Runs the current `output_img` through the extractor, compares its
    activations with the fixed content and style targets, and backpropagates
    the weighted sum.

    Returns:
        The scalar total loss tensor ``alpha * content + beta * style``.
    """
    optimizer.zero_grad()
    extractor(output_img)  # Populate extractor.features for the generated image

    # Content term: generated activations vs. content-image activations.
    c_loss = sum(
        content_loss(extractor.features[l], content_targets[l])
        for l in content_layers
    )

    # Style term: generated Gram statistics vs. style-image Gram statistics.
    s_loss = sum(
        style_loss(extractor.features[l], style_targets[l])
        for l in style_layers
    )

    total_loss = alpha * c_loss + beta * s_loss
    total_loss.backward()
    return total_loss

steps = 300
for step in range(steps):
    # `step` returns the loss from its first closure evaluation, so logging
    # does not need an extra forward/backward pass.
    loss = optimizer.step(closure)
    if step % 50 == 0:
        print(f"Step {step}, Loss: {loss.item():.4f}")

Step 0, Loss: 0.0000
Step 50, Loss: 0.0000
Step 100, Loss: 0.0000
Step 150, Loss: 0.0000
Step 200, Loss: 0.0000
Step 250, Loss: 0.0000
