In [1]:
# Colab-only step: expose Google Drive inside the VM so files stored under
# MyDrive can be read through the local path /content/drive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# %pip installs into the environment of the running kernel (a bare `!pip` runs
# in a subshell and may target a different interpreter); -q keeps the log short.
%pip install -q torch torchvision transformers matplotlib


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [4]:
import os
import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader


In [18]:
class FloorPlanDataset(Dataset):
    """Floor-plan images stored in one sub-folder per room-count class.

    Expected layout::

        root_dir/
            3 rooms/<image files>
            4 rooms/<image files>
            5 rooms/<image files>

    Each item is ``(image, label)``: ``label`` is the integer assigned by
    ``label_map`` and ``image`` is the file read as grayscale and expanded to
    three identical channels, optionally passed through ``transform``.

    Args:
        root_dir: Directory that contains the class sub-folders.
        transform: Optional callable applied to the 3-channel image array.

    Raises:
        OSError: From ``__getitem__`` when an indexed file cannot be decoded.
    """

    # Only files with these (case-insensitive) suffixes are indexed.
    IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff", ".webp")

    def __init__(self, root_dir, transform=None):
        import warnings

        self.root_dir = root_dir
        self.transform = transform
        self.image_paths = []
        self.labels = []

        # Mapping of labels
        self.label_map = {"3 rooms": 0, "4 rooms": 1, "5 rooms": 2}

        # sorted() makes the sample order reproducible; os.listdir order is arbitrary.
        for label in sorted(os.listdir(root_dir)):
            folder_path = os.path.join(root_dir, label)
            if not os.path.isdir(folder_path):
                continue

            if label not in self.label_map:
                # Extra directories (e.g. ".ipynb_checkpoints") must not abort
                # loading with a KeyError. Hidden ones are skipped silently,
                # anything else is reported so a misnamed class is noticed.
                if not label.startswith("."):
                    warnings.warn(
                        f"Skipping folder with no label mapping: {folder_path!r}"
                    )
                continue

            class_index = self.label_map[label]
            for img_name in sorted(os.listdir(folder_path)):
                img_path = os.path.join(folder_path, img_name)
                # Ignore sub-directories and non-image files (notes, .DS_Store, ...).
                if not os.path.isfile(img_path):
                    continue
                if not img_name.lower().endswith(self.IMAGE_EXTENSIONS):
                    continue
                self.image_paths.append(img_path)
                self.labels.append(class_index)

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load the image at ``idx`` and return ``(image, label)``."""
        img_path = self.image_paths[idx]
        label = self.labels[idx]

        image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising;
            # fail here with the path rather than inside cvtColor.
            raise OSError(f"Could not read image: {img_path!r}")
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)  # Convert to 3-channel RGB

        if self.transform is not None:
            image = self.transform(image)

        return image, label

# Preprocessing: ndarray -> PIL image -> 224x224 (ViT input size) -> tensor,
# then shift/scale each of the 3 channels with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

# Build the dataset from the Drive folder and serve it in shuffled batches of 16.
dataset_path = "/content/drive/MyDrive/datafloor"
dataset = FloorPlanDataset(root_dir=dataset_path, transform=transform)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [11]:
# transformers is already covered by the first install cell; kept so this cell
# still works when run on its own. %pip targets the running kernel's environment.
%pip install -q transformers




In [26]:
import torch
import torch.nn as nn
from transformers import ViTModel

class ViTVAE(nn.Module):
    """Variational autoencoder: pre-trained ViT encoder + transposed-conv decoder.

    The encoder output is mean-pooled over the token axis and projected to the
    mean and log-variance of a diagonal Gaussian. A latent sample is expanded to
    a 256 x 7 x 7 feature map and upsampled by five stride-2 transposed
    convolutions (7 -> 14 -> 28 -> 56 -> 112 -> 224) to a 3-channel image whose
    values are squashed to [-1, 1] by ``Tanh``.

    Args:
        latent_dim: Size of the latent vector ``z``.
        pretrained_name: Checkpoint identifier handed to
            ``ViTModel.from_pretrained``. The decoder always emits 224 x 224
            images, so the checkpoint should expect that input size.
    """

    def __init__(self, latent_dim=128,
                 pretrained_name="google/vit-base-patch16-224-in21k"):
        super().__init__()

        # Load pre-trained ViT
        self.encoder = ViTModel.from_pretrained(pretrained_name)

        # Read the feature width from the checkpoint instead of hard-coding 768,
        # so other ViT sizes work without editing the class.
        hidden_size = self.encoder.config.hidden_size

        # Latent space mapping
        self.fc_mu = nn.Linear(hidden_size, latent_dim)
        self.fc_var = nn.Linear(hidden_size, latent_dim)  # predicts log-variance

        # Decoder: expand latent vector so it can be reshaped to (batch, 256, 7, 7)
        self.decoder_input = nn.Linear(latent_dim, 256 * 7 * 7)

        # Each ConvTranspose2d(k=3, s=2, p=1, output_padding=1) doubles H and W.
        self.decoder = nn.Sequential(
            nn.ReLU(),
            nn.Unflatten(1, (256, 7, 7)),  # Reshape latent vector to feature map
            nn.ConvTranspose2d(256, 128, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 3, kernel_size=3, stride=2, padding=1, output_padding=1),  # Final layer to match 224x224
            nn.Tanh()
        )

    def encode(self, x):
        """Return ``(mu, log_var)`` of the approximate posterior for images ``x``."""
        # last_hidden_state is (batch, tokens, hidden); the token axis holds the
        # [CLS] token in addition to the patch tokens.
        x = self.encoder(x).last_hidden_state
        x = x.mean(dim=1)  # Mean-pool over tokens -> (batch, hidden)
        mu, log_var = self.fc_mu(x), self.fc_var(x)
        return mu, log_var

    def reparameterize(self, mu, log_var):
        """Sample ``z = mu + eps * exp(0.5 * log_var)`` with ``eps ~ N(0, I)``."""
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent vectors ``z`` of shape (batch, latent_dim) to images."""
        z = self.decoder_input(z)  # Convert latent space to feature map
        return self.decoder(z)  # Pass through decoder layers

    def forward(self, x):
        """Encode, sample and decode; returns ``(reconstruction, mu, log_var)``."""
        mu, log_var = self.encode(x)
        z = self.reparameterize(mu, log_var)
        return self.decode(z), mu, log_var


In [27]:
# Prefer the GPU when one is available, otherwise fall back to the CPU, and
# place a freshly built ViTVAE on that device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = ViTVAE()
model = model.to(device)

In [28]:
def loss_function(recon_x, x, mu, log_var, kl_weight=1.0):
    """VAE objective: mean-squared reconstruction error plus weighted KL term.

    Args:
        recon_x: Reconstructed batch, same shape as ``x``.
        x: Target batch.
        mu: Posterior means.
        log_var: Posterior log-variances, same shape as ``mu``.
        kl_weight: Multiplier for the KL term. The default of 1.0 reproduces
            the unweighted sum of the two terms.

    Returns:
        Scalar tensor ``mse(recon_x, x) + kl_weight * kl``.

    Note:
        Both terms are means: the reconstruction error is averaged over every
        element of ``x`` and the KL term over every element of ``mu``. Relative
        to a summed ELBO this gives the KL term much more influence when ``x``
        has far more elements than ``mu``; lowering ``kl_weight`` may help if
        reconstructions collapse to a blur.
    """
    # Functional form avoids constructing a new nn.MSELoss module on every call.
    recon_loss = nn.functional.mse_loss(recon_x, x)
    # KL(N(mu, sigma^2) || N(0, 1)) per element, averaged.
    kl_loss = -0.5 * torch.mean(1 + log_var - mu.pow(2) - log_var.exp())
    return recon_loss + kl_weight * kl_loss

# Adam over every parameter of `model` (the ViT encoder is a sub-module, so its
# weights are updated too) with a learning rate of 1e-4.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)


In [29]:
# Sanity check: one forward pass must give back a tensor shaped like the input.
#
# The existing `model` is reused on purpose. The optimizer was built from this
# model's parameters; creating a new ViTVAE here would rebind `model` to fresh
# weights the optimizer knows nothing about, and training would then step the
# discarded instance while the visible model never changes.
dummy_input = torch.randn(1, 3, 224, 224).to(device)

with torch.no_grad():
    recon_x, mu, log_var = model(dummy_input)
    print("Input shape:", dummy_input.shape)    # Expected: (1, 3, 224, 224)
    print("Output shape:", recon_x.shape)       # Expected: (1, 3, 224, 224)

assert recon_x.shape == dummy_input.shape, "decoder output does not match input shape"


Input shape: torch.Size([1, 3, 224, 224])
Output shape: torch.Size([1, 3, 224, 224])


In [None]:
# Train the VAE: one optimizer step per batch, mean batch loss reported per epoch.
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    # Labels are not needed for reconstruction, so they are discarded.
    for images, _ in dataloader:
        images = images.to(device)

        # Forward pass and objective.
        recon_x, mu, log_var = model(images)
        loss = loss_function(recon_x, images, mu, log_var)

        # Clear stale gradients, back-propagate, update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}")
