In [3]:
from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
from PIL import Image
import requests

In [11]:
root_dir = "C:\\Users\\bhatt\\Machine Learning\\vision-transformer\\Swin-Transformer\\Dataset"


In [35]:
import os
import torch
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as transforms

class ForgeryDataset(Dataset):
    def __init__(self, image_dir, mask_dir, transform=None, mask_transform=None, limit=None):
        self.image_dir = image_dir
        self.mask_dir = mask_dir
        self.transform = transform
        self.mask_transform = mask_transform
        self.image_files = sorted(os.listdir(image_dir))[:limit]
        self.mask_files = sorted(os.listdir(mask_dir))[:limit]

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        mask_path = os.path.join(self.mask_dir, self.mask_files[idx])

        image = Image.open(img_path).convert("RGB")
        mask = Image.open(mask_path).convert("L")

        if self.transform:
            image = self.transform(image)
        if self.mask_transform:
            mask = self.mask_transform(mask)

        return image, mask


In [36]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

mask_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])


In [53]:
dataset = ForgeryDataset(
    image_dir='C:\\Users\\bhatt\\Machine Learning\\vision-transformer\\Swin-Transformer\\Dataset\\Test\\Original',
    mask_dir='C:\\Users\\bhatt\\Machine Learning\\vision-transformer\\Swin-Transformer\\Dataset\\Test\\Faked',
    transform=transform,
    mask_transform=mask_transform,
    limit=16  # Limit the dataset to 100 samples for testing
)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)  # Reduce batch size to 4


In [54]:
import torch
import torch.nn as nn
from transformers import ViTModel

class ForgeryDetectionModel(nn.Module):
    def __init__(self):
        super(ForgeryDetectionModel, self).__init__()
        self.backbone = ViTModel.from_pretrained('google/vit-base-patch16-224')
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(768, 512, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2),
            nn.ReLU(),
            nn.Conv2d(128, 1, kernel_size=1)
        )

    def forward(self, x):
        batch_size = x.shape[0]
        features = self.backbone(x).last_hidden_state  # Shape: [batch_size, num_patches, hidden_size]
        
        # Calculate the grid size
        grid_size = int(features.shape[1]**0.5)  # num_patches should be a perfect square
        
        # Reshape features to [batch_size, hidden_size, grid_size, grid_size]
        features = features.permute(0, 2, 1).view(batch_size, 768, grid_size, grid_size)
        
        # Decode to segmentation mask
        output = self.decoder(features)
        return output

model = ForgeryDetectionModel()


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
import torch.optim as optim
import torch.nn.functional as F

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 1  # Adjust as needed
dataloader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)


In [57]:
import torch.optim as optim
import torch.nn.functional as F

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 5  # Adjust as needed
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, masks in dataloader:
        inputs, masks = inputs.to(device), masks.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = F.interpolate(outputs, size=masks.shape[2:], mode='bilinear', align_corners=False)  # Upsample to match mask size
        loss = criterion(outputs, masks)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(dataloader):.4f}')


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 

In [51]:
from sklearn.metrics import precision_score, recall_score, f1_score

def evaluate_model(model, dataloader):
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for inputs, masks in dataloader:
            inputs, masks = inputs.to(device), masks.to(device)
            outputs = model(inputs)
            outputs = F.interpolate(outputs, size=masks.shape[2:], mode='bilinear', align_corners=False)
            preds = torch.sigmoid(outputs) > 0.5
            all_preds.extend(preds.cpu().numpy().flatten())
            all_labels.extend(masks.cpu().numpy().flatten())

    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    return precision, recall, f1

precision, recall, f1 = evaluate_model(model, dataloader)
print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same