In [20]:
import torch
from torchvision.models import convnext_base
import torch.nn as nn

In [None]:
# Build ConvNeXt-Base and let torchvision download the pretrained checkpoint.
# `pretrained=True` is the legacy flag (deprecated in favour of `weights=`);
# "IMAGENET1K_V1" is the checkpoint that the legacy flag resolves to, so the
# loaded parameters are unchanged while the deprecation warning goes away and
# the exact checkpoint is pinned for reproducibility.
model = convnext_base(weights="IMAGENET1K_V1")

In [26]:
# Swap the pretrained image-classification head for a regression head that
# emits a single value per image (the aesthetic score).
# The layers stay positional (indices 0, 1, 2) so the state_dict keys of the
# head are the same as before.
head_in_features = 1024  # width of the features handed to the classifier
model.classifier = nn.Sequential(
    nn.Flatten(1),                            # collapse everything after the batch dim
    nn.LayerNorm(head_in_features, eps=1e-6),  # affine LayerNorm over the feature vector
    nn.Linear(head_in_features, 1),           # single raw score; no output activation
)

In [None]:
# Switch the module to evaluation mode before running the inference checks.
model.eval()

In [None]:
# Smoke test: run a single batch of random noise (1 image, 3 channels, 224x224)
# through the network to confirm the forward pass works end to end.
dummy_input = torch.randn(1, 3, 224, 224)

# If a GPU is available, move both the model and the input onto it.
# Note that this rebinds the notebook-level `model`, so it stays on the GPU afterwards.
if torch.cuda.is_available():
    model = model.cuda()
    dummy_input = dummy_input.cuda()

# Perform inference without tracking gradients
with torch.no_grad():
    output = model(dummy_input)

# Interpret the output
if output.shape[1] == 1:  # Regression task
    # Single output column: use the raw value as the predicted score
    predicted_score = output.item()
    print(f"Predicted Score: {predicted_score}")
else:  # Classification task
    # Convert logits to probabilities and find the predicted class.
    # torch.softmax is used because torch.nn.functional is not imported as `F`
    # in this notebook; the previous `F.softmax` call would raise NameError.
    probabilities = torch.softmax(output, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()
    print(f"Predicted Class: {predicted_class}")

In [54]:
from PIL import Image
from torchvision import transforms

# Test a random image
image_path = 'testing/109882.png'
# Open inside a context manager so the file handle is released promptly;
# convert() returns a separate, fully loaded RGB image that outlives the `with`.
with Image.open(image_path) as source_image:
    image = source_image.convert('RGB')

# Define the transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize the image to 224x224 pixels
    transforms.ToTensor(),  # Convert the image to a tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Per-channel normalization
])

# Apply the transforms to the image
image_tensor = transform(image)

# Add a batch dimension and place the input on whatever device the model's
# parameters currently live on, instead of assuming a CUDA device exists.
model_device = next(model.parameters()).device
image_tensor = image_tensor.unsqueeze(0).to(model_device)

# Make sure the model is in evaluation mode
model.eval()

# Perform inference without tracking gradients
with torch.no_grad():
    output = model(image_tensor)

# The output here will be totally arbitrary, the only point to this test is to ensure the model is not producing errors and can be trained.
predicted_score = output.item()
print(f'Predicted score: {predicted_score}')

Predicted score: 8.01877737045288


In [56]:
# Persist the freshly assembled (still untrained) network as the starting
# point for training. Two files are written, in this order:
#   1. only the parameters/buffers (state_dict)
#   2. the whole module object, serialized via pickle
# NOTE(review): the pickled full-model file should only ever be loaded from a
# trusted source, and it needs the same model class definitions importable.
state_dict_path = 'untrained_aesthetic_scorer.pth'
full_model_path = 'full_model.pth'
torch.save(model.state_dict(), state_dict_path)
torch.save(model, full_model_path)