# Using PyTorch to fine-tune models for computer vision tasks

## Dataset

### Custom dataset

In [None]:
import os
from PIL import Image
from torch.utils.data import Dataset

class CustomImageDataset(Dataset):
    """Image dataset built from every entry found directly inside a folder.

    Each item is an ``(image, label)`` pair. The image is a PIL image
    converted to RGB. The label is ``0`` when the substring ``"class0"``
    occurs anywhere in the joined path (directory part included) and ``1``
    otherwise.

    Args:
        image_dir: Folder whose entries are treated as image files.
    """

    def __init__(self, image_dir):
        self.image_dir = image_dir
        # os.listdir() returns entries in arbitrary order; sorting pins the
        # index -> file mapping so it is identical on every run and machine.
        self.image_paths = [
            os.path.join(image_dir, fname) for fname in sorted(os.listdir(image_dir))
        ]

    def __len__(self):
        """Return the number of entries discovered in ``image_dir``."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load the image stored at position ``idx`` and derive its label."""
        image_path = self.image_paths[idx]
        # The context manager closes the underlying file once convert() has
        # produced its own copy, instead of leaving the handle to the GC.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        label = 0 if "class0" in image_path else 1

        return image, label

In [None]:
# Build the dataset from the entries of the local 'data' folder.
custom_dataset = CustomImageDataset(image_dir='data')

### Augmentation

In [None]:
from torchvision import transforms
# Preprocessing applied to every image before it reaches the network.
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # one fixed spatial size so images can be batched
    transforms.ToTensor(),  # PIL image -> float tensor
    # Normalise with the ImageNet channel statistics: the pretrained backbone
    # fine-tuned in this notebook was trained on inputs normalised this way.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [None]:
import os
from PIL import Image
from torch.utils.data import Dataset

class CustomImageDataset(Dataset):
    """Folder-backed image dataset with an optional per-image transform.

    Each item is an ``(image, label)`` pair. The image is opened, converted
    to RGB and, when a transform was supplied, passed through it. The label
    is ``0`` when the substring ``"class0"`` occurs anywhere in the joined
    path (directory part included) and ``1`` otherwise.

    Args:
        image_dir: Folder whose entries are treated as image files.
        transform: Optional callable applied to the RGB image; its return
            value replaces the image in the returned pair.
    """

    def __init__(self, image_dir, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        # os.listdir() returns entries in arbitrary order; sorting pins the
        # index -> file mapping so it is identical on every run and machine.
        self.image_paths = [
            os.path.join(image_dir, fname) for fname in sorted(os.listdir(image_dir))
        ]

    def __len__(self):
        """Return the number of entries discovered in ``image_dir``."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load, optionally transform, and label the image at ``idx``."""
        image_path = self.image_paths[idx]
        # The context manager closes the underlying file once convert() has
        # produced its own copy, instead of leaving the handle to the GC.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Explicit None check: a transform object that happens to be falsy
        # (e.g. an empty container of steps) must still be honoured.
        if self.transform is not None:
            image = self.transform(image)

        label = 0 if "class0" in image_path else 1

        return image, label

In [None]:
# Build the dataset from ./data/custom_images, routing every image through `transform`.
custom_dataset = CustomImageDataset(image_dir='./data/custom_images', transform=transform)

### ImageFolder

```text
data/
├── class_1/
│   ├── img1.jpg
│   ├── img2.jpg
│   └── ...
├── class_2/
│   ├── img1.jpg
│   ├── img2.jpg
│   └── ...
└── class_n/
    ├── img1.jpg
    ├── img2.jpg
    └── ...
```

In [None]:
from torchvision import datasets
# Read images from the sub-folders of 'data' (one sub-folder per class) and
# apply `transform` to each image as it is loaded.
dataset = datasets.ImageFolder(root='data', transform=transform)

In [None]:
import torch
# Hold out 20% of the samples for validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size  # remainder, so both sizes always sum to len(dataset)
# A seeded generator makes the partition reproducible: re-running the cell
# (or resuming training later) yields the same train/validation membership,
# so validation samples never leak into a later training run.
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

## DataLoader

In [None]:
from torch.utils.data import DataLoader

# Training batches: 32 samples each, shuffling enabled, 4 worker processes.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    num_workers=4,
    shuffle=True,
)
# Validation batches: same batch size and workers, shuffling disabled.
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    num_workers=4,
    shuffle=False,
)

## Pretrained model

In [None]:
from torchvision import models

# Instantiate ResNet-18 and load its pretrained weights.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of `weights=` — confirm against the installed torchvision version.
model = models.resnet18(pretrained=True)  # Pretrained weights on ImageNet

### Freeze model

In [None]:
# Freeze the whole network: switch gradient tracking off for every parameter.
for param in model.parameters():
    param.requires_grad_(False)

In [None]:
# Re-enable gradients only for parameters whose name contains "layer4"
# (the last residual stage of ResNet); everything else is left untouched.
for name, param in model.named_parameters():
    if "layer4" not in name:
        continue
    param.requires_grad_(True)

## Custom layers

In [None]:
import torch.nn as nn

# Number of target categories for the new task.
num_classes = 10
# Start again from a freshly loaded pretrained backbone.
model = models.resnet18(pretrained=True)

# Swap the classification head for one sized for `num_classes`; the input
# width is read from the layer being replaced.
model.fc = nn.Linear(in_features=model.fc.in_features, out_features=num_classes)

## Training

In [None]:
import torch.optim as optim

# Cross-entropy loss for the classification task.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

for epoch in range(5):
    model.train()  # put the model in training mode
    running_loss = 0.0  # sum of per-batch losses for this epoch

    for images, labels in train_loader:
        # Move the batch to the same device as the model.
        images = images.to(device)
        labels = labels.to(device)

        # Standard step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")


# Special: Image + metadata model

In [None]:
class MultiInputModel(nn.Module):
    """Classifier that fuses ResNet-18 image features with a metadata vector.

    The image goes through a pretrained ResNet-18 whose final fully connected
    layer is replaced by an identity. The metadata goes through a small MLP.
    The two feature vectors are concatenated and classified by a two-layer head.

    Args:
        num_classes: Number of output logits.
        metadata_input_size: Length of the metadata vector of each sample.
    """

    def __init__(self, num_classes, metadata_input_size):
        super().__init__()

        # Image branch: Use pre-trained ResNet18
        self.image_branch = models.resnet18(pretrained=True)
        # Read the feature width from the head before discarding it, rather
        # than hard-coding it, so the fusion layer always matches the backbone.
        image_feature_size = self.image_branch.fc.in_features
        self.image_branch.fc = nn.Identity()  # Remove the final fully connected layer

        # Width of the metadata embedding, shared by the two layers below.
        metadata_feature_size = 32

        # Metadata branch: Fully connected layers
        self.metadata_branch = nn.Sequential(
            nn.Linear(metadata_input_size, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, metadata_feature_size),
            nn.ReLU(),
        )

        # Combined branch: classifies the concatenated image + metadata features
        self.combined_fc = nn.Sequential(
            nn.Linear(image_feature_size + metadata_feature_size, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes),
        )

    def forward(self, image, metadata):
        """Return class logits for a batch of images and their metadata.

        Args:
            image: Batch of image tensors fed to the ResNet-18 branch
                (presumably shaped (N, 3, H, W) — confirm against the loader).
            metadata: Batch of metadata vectors; the last dimension must equal
                ``metadata_input_size``.

        Returns:
            Tensor of logits whose last dimension is ``num_classes``.
        """
        # Forward pass through the image branch
        image_features = self.image_branch(image)

        # Forward pass through the metadata branch
        metadata_features = self.metadata_branch(metadata)

        # Concatenate features from both branches along the feature dimension
        combined_features = torch.cat((image_features, metadata_features), dim=1)

        # Forward pass through the combined branch
        return self.combined_fc(combined_features)

In [None]:
class MultiInputDataset(Dataset):
    """Dataset yielding ``(image, metadata, label)`` triples.

    Args:
        image_paths: Sequence of image file paths.
        metadata: Sequence whose ``idx``-th element holds the metadata values
            of the ``idx``-th image; it is converted to a float32 tensor.
        labels: Sequence whose ``idx``-th element is the label of that image.
        transform: Optional callable applied to the RGB image.

    Raises:
        ValueError: If the three sequences do not have the same length.
    """

    def __init__(self, image_paths, metadata, labels, transform=None):
        # Fail fast on misaligned inputs: otherwise the mismatch only shows
        # up as an IndexError mid-epoch, or silently drops trailing entries.
        if not (len(image_paths) == len(metadata) == len(labels)):
            raise ValueError(
                "image_paths, metadata and labels must have the same length, got "
                f"{len(image_paths)}, {len(metadata)} and {len(labels)}"
            )
        self.image_paths = image_paths
        self.metadata = metadata
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load the image, metadata tensor and label stored at ``idx``."""
        # Load image; the context manager closes the file once convert() has
        # produced its own copy, instead of leaving the handle to the GC.
        with Image.open(self.image_paths[idx]) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)

        # Load metadata
        metadata = self.metadata[idx]

        # Load label
        label = self.labels[idx]

        return image, torch.tensor(metadata, dtype=torch.float32), label

In [None]:
# NOTE(review): this loop needs `model` to accept (images, metadata) and
# `train_loader` to yield triples; no cell visible here rebuilds `model`,
# `optimizer` or `train_loader` for that setup — confirm they are defined first.
for epoch in range(10):
    model.train()  # put the model in training mode
    running_loss = 0.0  # sum of per-batch losses for this epoch

    for images, metadata, labels in train_loader:
        # Move every part of the batch to the same device as the model.
        images = images.to(device)
        metadata = metadata.to(device)
        labels = labels.to(device)

        # Standard step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(images, metadata)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader):.4f}")