In [1]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import os, random
import pandas as pd

## Imports for plotting
import matplotlib.pyplot as plt

from matplotlib.colors import to_rgba
import seaborn as sns
sns.set()

import torch

# Seed every random number generator this notebook draws from so that
# "Restart Kernel -> Run All" reproduces the same results: torch drives the
# train/validation split and the shuffling, while the stdlib `random` module
# picks the sample image shown after training.
SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
print("Using torch", torch.__version__)

Using torch 2.5.0+cpu


## Exercise 1 

Modify the code to use different pre-trained architectures. [See here for ideas.](https://pytorch.org/vision/stable/models.html)

https://pytorch.org/vision/stable/models/generated/torchvision.models.inception_v3.html#torchvision.models.Inception_V3_Weights

In [2]:
class CustomImageDataset(Dataset):
    """Image dataset described by a CSV file.

    The CSV is read with pandas. Its first column is used as the image file
    name (relative to ``img_dir``) and its ``label`` column, expected to be
    the second column, as the class label.

    Attributes set by the constructor:
        annotations: the CSV as a DataFrame, with ``label`` replaced by integer ids.
        label_to_index / index_to_label: mappings between raw labels and ids,
            numbered in order of first appearance in the CSV.
        class_weights: float tensor with one weight per class id, equal to
            ``n_samples / (n_classes * class_count)``.
    """

    def __init__(self, csv_file, img_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform

        # Number the distinct labels in the order in which they first occur
        self.label_to_index = {}
        for position, raw_label in enumerate(self.annotations['label'].unique()):
            self.label_to_index[raw_label] = position
        self.index_to_label = dict(
            (position, raw_label) for raw_label, position in self.label_to_index.items()
        )

        # Store integer ids instead of the raw labels
        self.annotations['label'] = self.annotations['label'].map(self.label_to_index)

        # Inverse-frequency class weights, ordered by class id
        per_class = self.annotations['label'].value_counts().sort_index()
        n_samples = len(self.annotations)
        n_classes = len(per_class)
        weights = []
        for count in per_class:
            weights.append(n_samples / (n_classes * count))
        self.class_weights = torch.tensor(weights, dtype=torch.float)

    def __len__(self):
        """Number of rows in the annotation table."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(image, label)`` for the row at position ``index``."""
        file_name = self.annotations.iloc[index, 0]
        label = self.annotations.iloc[index, 1]
        image = Image.open(os.path.join(self.img_dir, file_name)).convert("RGB")

        if not self.transform:
            return image, label
        return self.transform(image), label

# Location of the annotation CSV and of the image folder it refers to
csv_file = './data/newspaper_images/ads_data/ads_upsampled_no_index.csv'  # Path to your CSV file
img_dir = './data/newspaper_images/ads_data/images'       # Directory with all the images

# Preprocessing applied to every image: pad, resize to 299x299, convert to a
# tensor and normalise with the ImageNet channel statistics
transform = transforms.Compose(
    [
        transforms.Pad(20),
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Build the full dataset
dataset = CustomImageDataset(csv_file=csv_file, img_dir=img_dir, transform=transform)

# 80% of the samples go to training, whatever is left to validation
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, lengths=[train_size, val_size])

# Batch both subsets; only the training data is shuffled
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Sanity check: print the shapes of the first batch of each loader
for _split_name, _loader in (("Train", train_loader), ("Validation", val_loader)):
    for images, labels in _loader:
        print(f"{_split_name} - images shape: {images.shape}, labels shape: {labels.shape}")
        break

# ================= MODEL, TRAINING, EVALUATION ===================== #

# Load Inception v3 with its ImageNet weights. The `models.inception_v3`
# builder is what loads the checkpoint; the `Inception3` constructor's
# `init_weights` argument is only a boolean toggle for random initialisation
# and never loads pre-trained parameters.
model = models.inception_v3(weights=models.Inception_V3_Weights.IMAGENET1K_V1, aux_logits=True)

# Freeze the backbone; only the last Inception stage (Mixed_7*) stays trainable
for name, param in model.named_parameters():
    param.requires_grad = "Mixed_7" in name

# Replace the last layer with a new one sized to the number of classes in the
# dataset (a freshly created layer is trainable by default)
num_classes = len(dataset.label_to_index)
num_features = model.fc.in_features  # Get input features of the final layer
model.fc = nn.Linear(num_features, num_classes)

# Move model to the best available device (CUDA GPU, Apple MPS, otherwise CPU)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model = model.to(device)

# Define loss function and optimizer.
# The loss is weighted by the inverse class frequencies computed by the dataset.
criterion = nn.CrossEntropyLoss(weight=dataset.class_weights.to(device))
# Optimise every parameter left trainable above (Mixed_7 blocks + new fc);
# passing only `model.fc.parameters()` would leave the Mixed_7 blocks untouched.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.Adam(trainable_params, lr=0.001)  # instead of SGD

# Multiply the learning rate by 0.1 every 7 scheduler steps (one step per epoch)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs, device):
    """Train ``model`` and report train/validation metrics after every epoch.

    Args:
        model: network to train; its forward pass may return either a logits
            tensor or a tuple whose first element is the logits tensor.
        train_loader: iterable of ``(images, labels)`` training batches.
        val_loader: iterable of ``(images, labels)`` validation batches.
        criterion: loss function called as ``criterion(logits, labels)``.
        optimizer: optimizer updating the model parameters.
        scheduler: learning-rate scheduler, stepped exactly once per epoch.
        num_epochs: number of passes over ``train_loader``.
        device: device the model and the batches are moved to.

    Both reported losses are averages over the batches of their loader.
    """
    model.to(device)  # Move model to the desired device
    for epoch in range(num_epochs):
        # ---- Training phase ----
        model.train()
        train_loss = 0.0
        train_batches = 0
        correct = 0
        total = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            # Only the primary output is used for the loss and the predictions
            outputs = _primary_logits(model(images))

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_batches += 1
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Average per batch so the value is comparable with the validation loss;
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError
        train_loss /= max(train_batches, 1)
        train_accuracy = 100 * correct / max(total, 1)

        # ---- Validation phase ----
        model.eval()
        val_loss = 0.0
        val_batches = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                # In eval mode the model may return a bare tensor, so the
                # output must not be tuple-unpacked unconditionally
                outputs = _primary_logits(model(images))
                loss = criterion(outputs, labels)

                val_loss += loss.item()
                val_batches += 1
                predicted = outputs.argmax(dim=1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_loss /= max(val_batches, 1)
        val_accuracy = 100 * val_correct / max(val_total, 1)

        # Advance the learning-rate schedule once per epoch
        scheduler.step()

        print(f"Epoch [{epoch+1}/{num_epochs}], "
              f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")


def _primary_logits(outputs):
    """Return the main logits tensor, whether ``outputs`` is a tuple or a tensor."""
    if isinstance(outputs, tuple):
        return outputs[0]
    return outputs

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=10, device=device)

# Save the trained model
torch.save(model.state_dict(), 'inceptionv3_model.tar')

# Validate the model
# Get a single batch from the validation loader
model.eval()  # Set the model to evaluation mode
data_iter = iter(val_loader)
images, true_labels = next(data_iter)

# Move data to the device
images = images.to(device)
true_labels = true_labels.to(device)

# Perform a forward pass to get predictions
with torch.no_grad():
    outputs = model(images)
    # The model may return a bare logits tensor in eval mode, so only take the
    # first element when the output really is a tuple
    if isinstance(outputs, tuple):
        outputs = outputs[0]
    probabilities = F.softmax(outputs, dim=1)  # probabilities from the logits

# Highest probability per sample (confidence) and the class it belongs to
confidence, predicted_labels = torch.max(probabilities, 1)

# Randomly select an index for the image to display
index = random.randint(0, images.size(0) - 1)

# Undo the normalisation applied by `transform` so the picture is shown with
# its real colours; imshow expects float values in [0, 1]
norm_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
norm_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)
image = (images[index].cpu() * norm_std + norm_mean).clamp(0, 1)
image = image.permute(1, 2, 0).numpy()  # Convert to HxWxC format for plotting

true_label = true_labels[index].item()
predicted_label = predicted_labels[index].item()
confidence_score = confidence[index].item()

# Convert the label indexes back to string labels using the dataset's index_to_label dictionary
true_label_str = val_dataset.dataset.index_to_label[true_label]
predicted_label_str = val_dataset.dataset.index_to_label[predicted_label]

# Print the true label, predicted label, and confidence
print(f"True Label: {true_label_str}")
print(f"Predicted Label: {predicted_label_str}")
print(f"Model Confidence: {confidence_score:.4f}")

# Optionally, display the image
plt.imshow(image)
plt.title(f"True: {true_label_str}, Predicted: {predicted_label_str}, Confidence: {confidence_score:.4f}")
plt.axis('off')
plt.show()

Train - images shape: torch.Size([32, 3, 299, 299]), labels shape: torch.Size([32])
Validation - images shape: torch.Size([32, 3, 299, 299]), labels shape: torch.Size([32])


KeyboardInterrupt: 

## Exercise 4 - WikiArt Dataset

Link to Dataset [here](https://github.com/cs-chan/ArtGAN/blob/master/WikiArt%20Dataset/README.md).

The model implemented is for classifying images from the WikiArt dataset. It uses a transfer learning approach with MobileNetV2, a lightweight convolutional neural network (CNN) architecture that is particularly suited for mobile and embedded vision applications. This model is efficient in terms of both memory and computation, making it a good choice for environments with limited resources.

Inverted Residuals: MobileNetV2 is built from inverted residual blocks with linear bottlenecks. Each block first expands the number of channels with a 1×1 convolution, filters the expanded features with a lightweight depthwise convolution, and then projects them back to a narrow representation with a linear (activation-free) 1×1 convolution, preserving important information while keeping computational costs low.

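Depthwise Separable Convolutions: These convolutions reduce the number of parameters and the computational cost by factorizing a standard convolution into a per-channel spatial (depthwise) convolution followed by a 1×1 (pointwise) convolution that mixes the channels, which leads to a more efficient network.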

### Structure

Input Layer: The model expects input images resized to the standard size of 224×224 pixels. 

Base Layers: The bulk of the MobileNetV2 architecture consists of several layers of depthwise separable convolutions, which gradually extract features from the input images. The architecture is deep enough to learn complex representations while remaining lightweight.

Output Layer: The final classification layer is replaced so that it matches the number of output classes (here, the genre folders found in the dataset directory). For instance, with the 27 genres listed in the cell output below, the final layer outputs 27 logits (raw prediction scores, one per class).

### Training Logic

Data Preparation: The images are loaded and augmented (e.g., resized, flipped) to improve generalization. Augmentations are particularly useful to prevent overfitting by artificially enlarging the training dataset.

Loss Function: The model uses Cross Entropy Loss, which is commonly used for multi-class classification problems. It measures the difference between the predicted probability distribution and the true distribution.

Optimizer: Adam is used as the optimizer. It's an adaptive learning rate optimization algorithm that is computationally efficient and well-suited for problems with a large number of parameters.

Training Loop:
The model is set to training mode and iterates through batches of training data.
For each batch, it performs a forward pass, computes the loss, and updates the model weights through backpropagation.
After each pass over the training set (one epoch), the model's performance is measured on the validation set to check for overfitting.

###  Evaluation and Saving the Model

During training, the model's loss and accuracy on the validation set are reported at the end of every epoch.
Finally, the model's learned weights are saved to a file (mobilenetv2_wikiart.pth), so that the model can be reloaded later for inference or further training.

In [None]:
import os
import pandas as pd
import random
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torchvision import models
from sklearn.model_selection import train_test_split

# Paths
# Root folder holding one sub-folder per genre. Set the WIKIART_DIR environment
# variable to point somewhere else; otherwise the original location is used.
image_dir = os.environ.get(
    "WIKIART_DIR",
    r"C:\Users\Lucrezia\OneDrive - Alma Mater Studiorum Università di Bologna\Machine Learning\Repository\data\WikiData\wikiart_imgs",
)

# Get all subfolders (genres/classes).
# Sorted so that the genre -> index mapping does not depend on the order in
# which the operating system lists the directory; stray files are skipped.
classes = sorted(
    entry for entry in os.listdir(image_dir)
    if os.path.isdir(os.path.join(image_dir, entry))
)
print(f"Classes (genres): {classes}")

# Collect image paths and their labels
image_paths = []
labels = []
valid_extensions = ('.jpg', '.jpeg', '.png')  # Adjust for valid image extensions

for idx, genre in enumerate(classes):
    genre_folder = os.path.join(image_dir, genre)
    # Sorted listing keeps the seeded split below identical across machines
    for img_file in sorted(os.listdir(genre_folder)):
        # Case-insensitive match so files such as "IMG.JPG" are not dropped
        if img_file.lower().endswith(valid_extensions):
            image_paths.append(os.path.join(genre_folder, img_file))
            labels.append(idx)  # Label is the index of the genre

# Split into training and validation sets (80% train, 20% val)
train_paths, val_paths, train_labels, val_labels = train_test_split(image_paths, labels, test_size=0.2, random_state=42)

# Custom Dataset to load images from file paths and labels
class WikiArtDataset(Dataset):
    """Dataset backed by parallel sequences of image paths and labels.

    Args:
        img_paths: paths of the image files.
        labels: label of each image, aligned index-by-index with ``img_paths``.
        transform: optional callable applied to the loaded RGB image.
    """

    def __init__(self, img_paths, labels, transform=None):
        self.img_paths, self.labels, self.transform = img_paths, labels, transform

    def __len__(self):
        """Number of image paths held by the dataset."""
        return len(self.img_paths)

    def __getitem__(self, index):
        """Open the image at ``index`` as RGB and return ``(image, label)``."""
        image = Image.open(self.img_paths[index]).convert("RGB")
        label = self.labels[index]

        if not self.transform:
            return image, label
        return self.transform(image), label

# Define transformations for training and validation datasets.
# Both resize to 224x224 and normalise with the ImageNet channel statistics;
# only the training pipeline adds a random horizontal flip as augmentation.
data_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
}

# Load datasets
train_dataset = WikiArtDataset(img_paths=train_paths, labels=train_labels, transform=data_transforms['train'])
val_dataset = WikiArtDataset(img_paths=val_paths, labels=val_labels, transform=data_transforms['val'])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Load the MobileNetV2 model with ImageNet weights.
# `pretrained=True` is deprecated in torchvision; the explicit weights enum
# below selects the same checkpoint.
model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)

# Modify the final layer to match the number of classes (in this case, the number of genres)
num_ftrs = model.classifier[1].in_features
model.classifier[1] = nn.Linear(num_ftrs, len(classes))  # len(classes) is the number of genres

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define loss function and optimizer (all model parameters are fine-tuned)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train_model(model, criterion, optimizer, num_epochs=10):
    """Run ``num_epochs`` epochs, each with a training and a validation phase.

    The batches come from the module-level ``train_loader`` / ``val_loader``,
    the sample counts from ``train_dataset`` / ``val_dataset``, and tensors are
    moved to the module-level ``device``. After each phase the per-sample
    average loss and the accuracy are printed.

    Args:
        model: network to optimise.
        criterion: loss function called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepped after every training batch.
        num_epochs: number of epochs to run.

    Returns:
        The same ``model`` object that was passed in.
    """
    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        # Training phase first, then validation
        for phase in ('train', 'val'):
            is_training = phase == 'train'
            # model.train(False) is what model.eval() does
            model.train(is_training)

            loss_sum = 0.0
            n_correct = 0

            # Pick the batches and the sample count for this phase
            if is_training:
                batches, n_examples = train_loader, len(train_dataset)
            else:
                batches, n_examples = val_loader, len(val_dataset)

            for inputs, labels in batches:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # Clear gradients left over from the previous batch
                optimizer.zero_grad()

                # Gradients are only tracked while training
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    preds = outputs.argmax(dim=1)

                    if is_training:
                        loss.backward()
                        optimizer.step()

                # Weight the batch loss by the batch size to get a per-sample sum
                loss_sum += loss.item() * inputs.size(0)
                n_correct += (preds == labels.data).sum()

            epoch_loss = loss_sum / n_examples
            epoch_acc = n_correct.double() / n_examples

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

    return model

# Train the model
model_trained = train_model(model, criterion, optimizer, num_epochs=10)

# Save the trained model weights
torch.save(model_trained.state_dict(), 'mobilenetv2_wikiart.pth')

Classes (genres): ['Abstract_Expressionism', 'Action_painting', 'Analytical_Cubism', 'Art_Nouveau_Modern', 'Baroque', 'Color_Field_Painting', 'Contemporary_Realism', 'Cubism', 'Early_Renaissance', 'Expressionism', 'Fauvism', 'High_Renaissance', 'Impressionism', 'Mannerism_Late_Renaissance', 'Minimalism', 'Naive_Art_Primitivism', 'New_Realism', 'Northern_Renaissance', 'Pointillism', 'Pop_Art', 'Post_Impressionism', 'Realism', 'Rococo', 'Romanticism', 'Symbolism', 'Synthetic_Cubism', 'Ukiyo_e']


Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to C:\Users\Lucrezia/.cache\torch\hub\checkpoints\mobilenet_v2-b0353104.pth
100%|██████████| 13.6M/13.6M [00:04<00:00, 3.17MB/s]


Epoch 0/9
----------
