In [207]:
import os
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms
import torch.nn.functional as F
import torch.optim as optim

# Step 1: Define the Custom Dataset Class
class CustomImageDataset(Dataset):
    def __init__(self, csv_file, img_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        img_id = self.annotations.iloc[idx, 0]
        img_path = os.path.join(self.img_dir, img_id + '.jpg')
        image = Image.open(img_path).convert("RGB")
        
        if self.transform:
            image = self.transform(image)
        
        labels = self.annotations.iloc[idx, 1:].astype('float32').values  # Load all labels
        return image, torch.tensor(labels)
    
# Step 2: Define Transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to a standard size
    transforms.ToTensor(),  # Convert to tensor
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize with mean and std
])

# Step 3: Initialize the Dataset and DataLoader
csv_file = '../data/train/train.csv'  # Path to the CSV file
img_dir = '../data/train/train_images'  # Path to the image directory

dataset = CustomImageDataset(csv_file, img_dir, transform)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset, batch_size=16, shuffle=False)

# Limit the Dataset size
subset_indices = list(range(500))
subset_dataset = Subset(dataset, subset_indices)

# Define dataloaders
train_loader = DataLoader(subset_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(subset_dataset, batch_size=16, shuffle=False)

In [208]:
# Custom CNN Feature Extractor
class CustomFeatureExtractor(nn.Module):
    def __init__(self):
        super(CustomFeatureExtractor, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 28 * 28, 512)  # Assuming input images are 224x224
        
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = F.relu(self.fc1(x))
        return x

In [209]:
# Step 2: Define a  Classifier
class MultiLabelClassifier(nn.Module):
    def __init__(self, input_dim, num_labels):
        super(MultiLabelClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.dropout = nn.Dropout(0.5)  # Dropout layer to prevent overfitting
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_labels)  # Modify the number of output labels
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

In [210]:
# Initialize models
num_labels = 13  # Number of labels to predict
input_dim = 512
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
feature_extractor = CustomFeatureExtractor().to(device)
classifier = MultiLabelClassifier(input_dim, num_labels).to(device)

# Loss and optimizer
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(classifier.parameters(), lr=0.00001)

In [211]:
# Function to calculate accuracy
def calculate_accuracy(predictions, labels, threshold=0.5):
    preds = (predictions > threshold).float()
    correct = (preds == labels).float().sum()
    accuracy = correct / (labels.size(0) * labels.size(1))
    return accuracy

# Training loop with accuracy calculation
num_epochs = 25
for epoch in range(num_epochs):
    classifier.train()
    train_loss = 0.0
    train_accuracy = 0.0
    
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        features = feature_extractor(images)
        outputs = classifier(features)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * images.size(0)
        train_accuracy += calculate_accuracy(outputs, labels).item() * images.size(0)

    train_loss /= len(train_loader.dataset)
    train_accuracy /= len(train_loader.dataset)

    # Validation
    classifier.eval()
    val_loss = 0.0
    val_accuracy = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)

            features = feature_extractor(images)
            outputs = classifier(features)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * images.size(0)
            val_accuracy += calculate_accuracy(outputs, labels).item() * images.size(0)

    val_loss /= len(val_loader.dataset)
    val_accuracy /= len(val_loader.dataset)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

Epoch [1/25], Train Loss: 0.6295, Train Acc: 0.3509, Val Loss: 0.6229, Val Acc: 0.3509
Epoch [2/25], Train Loss: 0.6171, Train Acc: 0.3509, Val Loss: 0.6104, Val Acc: 0.3509
Epoch [3/25], Train Loss: 0.6045, Train Acc: 0.3509, Val Loss: 0.5976, Val Acc: 0.3509
Epoch [4/25], Train Loss: 0.5914, Train Acc: 0.3509, Val Loss: 0.5845, Val Acc: 0.3509
Epoch [5/25], Train Loss: 0.5780, Train Acc: 0.3509, Val Loss: 0.5706, Val Acc: 0.3509
Epoch [6/25], Train Loss: 0.5637, Train Acc: 0.3509, Val Loss: 0.5557, Val Acc: 0.3509
Epoch [7/25], Train Loss: 0.5485, Train Acc: 0.3509, Val Loss: 0.5402, Val Acc: 0.3509
Epoch [8/25], Train Loss: 0.5323, Train Acc: 0.3509, Val Loss: 0.5233, Val Acc: 0.3509
Epoch [9/25], Train Loss: 0.5148, Train Acc: 0.3514, Val Loss: 0.5050, Val Acc: 0.3528
Epoch [10/25], Train Loss: 0.4953, Train Acc: 0.3566, Val Loss: 0.4841, Val Acc: 0.3631
Epoch [11/25], Train Loss: 0.4739, Train Acc: 0.3765, Val Loss: 0.4621, Val Acc: 0.3952
Epoch [12/25], Train Loss: 0.4515, Train 

In [212]:
# Prediction on a single image
def load_image(image_path, transform):
    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0)  # Add batch dimension
    return image

# Single image prediction
image_path = '../data/train/train_images/02581632f146cccfcfc93005ef5f907e.jpg'  # Replace with the path to your image
image = load_image(image_path, transform).to(device)

with torch.no_grad():
    features = feature_extractor(image)
    output = classifier(features).squeeze()
    predictions = (output > 0.5).float()

labels = ["Subject Focus", "Eyes", "Face", "Near", "Action", "Accessory", "Group", "Collage", "Human", "Occlusion", "Info", "Blur"]
predictions_dict = {label: predictions[i].item() for i, label in enumerate(labels)}

print("Predictions:", predictions_dict)


Predictions: {'Subject Focus': 0.0, 'Eyes': 0.0, 'Face': 1.0, 'Near': 0.0, 'Action': 0.0, 'Accessory': 1.0, 'Group': 1.0, 'Collage': 0.0, 'Human': 0.0, 'Occlusion': 0.0, 'Info': 0.0, 'Blur': 1.0}
