In [49]:
import os
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms, models

# Step 1: Define the Custom Dataset Class
class CustomImageDataset(Dataset):
    def __init__(self, csv_file, img_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        img_id = self.annotations.iloc[idx, 0]
        img_path = os.path.join(self.img_dir, img_id + '.jpg')  # Assuming images are in .jpg format
        image = Image.open(img_path).convert("RGB")
        
        if self.transform:
            image = self.transform(image)
        
        labels = self.annotations.iloc[idx, 1:-1].astype('float32').values  # Load all labels
        return image, torch.tensor(labels)
    
# Step 2: Define Transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to a standard size
    transforms.ToTensor(),  # Convert to tensor
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize with mean and std
])

# Step 3: Initialize the Dataset and DataLoader
csv_file = '../data/train/train.csv'  # Path to the CSV file
img_dir = '../data/train/train_images'  # Path to the image directory

# Define relevant variables for the ML task (Hyperparameters)
batch_size = 32

dataset = CustomImageDataset(csv_file=csv_file, img_dir=img_dir, transform=transform)

# Limit the Dataset to 100 Images
subset_indices = list(range(100))
subset_dataset = Subset(dataset, subset_indices)

# Split dataset into training, validation, and test sets
train_size = int(0.8 * len(subset_dataset))
val_size = int(0.1 * len(subset_dataset))
test_size = len(subset_dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(subset_dataset, [train_size, val_size, test_size])

# Define dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [50]:
# Step 4: Define a Neural Network for Feature Extraction

# Custom CNN Feature Extractor
class CustomFeatureExtractor(nn.Module):
    def __init__(self):
        super(CustomFeatureExtractor, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = nn.Linear(64 * 28 * 28, 512)
        
    def forward(self, x):
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        x = x.view(x.size(0), -1)  # Flatten the tensor
        x = torch.relu(self.fc1(x))
        return x

# WINDOWS: Device will determine whether to run the training on GPU or CPU.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# MAC: Device will determine whether to run the training on GPU or CPU.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

# Instantiate the model
model = CustomFeatureExtractor().to(device)
model.eval()  # Set the model to evaluation mode

CustomFeatureExtractor(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=50176, out_features=512, bias=True)
)

In [51]:
# Extract Features
def extract_features(train_loader, model, device):
    all_features = []
    all_labels = []
    with torch.no_grad():
        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)
            features = model(images)
            all_features.append(features.cpu())
            all_labels.append(labels.cpu())
    all_features = torch.cat(all_features)
    all_labels = torch.cat(all_labels)
    return all_features, all_labels

# Extract features
x_features, x_labels = extract_features(train_loader, model, device)

# Example output
print("Extracted features shape:", x_features.shape)
print("Labels shape:", x_labels.shape)

Extracted features shape: torch.Size([80, 512])
Labels shape: torch.Size([80, 12])


In [52]:
# Step 1: Prepare the Data
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim

features = x_features
labels = x_labels 

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42)

# Create TensorDatasets and DataLoaders
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [53]:
# Step 2: Define a  Classifier
class MultiLabelClassifier(nn.Module):
    def __init__(self, input_dim, num_labels):
        super(MultiLabelClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Initialize the classifier
num_labels = 12  # Number of labels to predict
input_dim = 512

device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
feature_extractor = CustomFeatureExtractor().to(device)
classifier = MultiLabelClassifier(input_dim, num_labels).to(device)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(classifier.parameters(), lr=0.001)

In [59]:
# Step 3: Train the Classifier
num_epochs = 10

for epoch in range(num_epochs):
    classifier.train()
    train_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        features = feature_extractor(images)
        outputs = classifier(features)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * images.size(0)

    train_loss /= len(train_loader.dataset)

    # Validation
    classifier.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)

            features = feature_extractor(images)
            outputs = classifier(features)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * images.size(0)

    val_loss /= len(val_loader.dataset)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [16, 512]

In [34]:
# Step 4: Use the classifier for predictions
classifier.eval()
with torch.no_grad():
    # Replace X_val with new data if needed
    test_outputs = classifier(X_val.to(device)).squeeze()
    test_predictions = (test_outputs > 0.5).float()
    print("Predictions:", test_predictions.cpu().numpy())

Predictions: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [35]:
# After training the models
torch.save(model.state_dict(), '../data/train/feature_extractor.pth')
torch.save(classifier.state_dict(), '../data/train/classifier.pth')

In [36]:
# Load and preprocess the image
def load_image(image_path, transform):
    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0)  # Add batch dimension
    return image

# Transformations used for the image
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to a standard size
    transforms.ToTensor(),  # Convert to tensor
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize with mean and std
])

image_path = '../data/train/cnn_train_images/0a12576d99ec278f415c51f47279e89a.jpg'  # Replace with the path to your image

# Load and preprocess the image
image = load_image(image_path, transform).to(device)

# Extract features from the image
with torch.no_grad():
    features = feature_extractor(image)

# Make a prediction using the classifier
with torch.no_grad():
    output = classifier(features).squeeze()
    predictions = (output > 0.5).float()

# Print the predictions
labels = ["Subject Focus", "Eyes", "Face", "Near", "Action", "Accessory", "Group", "Collage", "Human", "Occlusion", "Info", "Blur"]
predictions_dict = {label: predictions[i].item() for i, label in enumerate(labels)}

print("Predictions:", predictions_dict)

Prediction (1 for is_human, 0 for not human): 1.0
