In [2]:
import os
import torch
import torchvision.transforms as transforms
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.preprocessing import MultiLabelBinarizer
from torchvision.models import resnet50

# Custom dataset class
class ArtDataset(Dataset):
    """Multi-label image dataset indexed by a CSV file.

    The CSV is loaded with ``pandas.read_csv``. Its first column is used as
    the image file name (joined onto ``root_dir``) and its ``labels`` column
    holds a comma-separated list of label names for that image.

    Attributes:
        labels: DataFrame loaded from ``csv_file``.
        root_dir: Directory the image file names are relative to.
        transform: Optional callable applied to the loaded PIL image.
        mlb: ``MultiLabelBinarizer`` fitted on all label lists in the CSV;
            ``mlb.classes_`` fixes the order of the target vector.
    """

    # Name of the CSV column that stores the comma-separated labels.
    LABEL_COLUMN = 'labels'

    def __init__(self, csv_file, root_dir, transform=None):
        """Read the CSV index and fit the label binarizer on all rows.

        Args:
            csv_file: Path to the CSV index file.
            root_dir: Directory containing the image files.
            transform: Optional callable applied to each image in
                ``__getitem__``.
        """
        self.labels = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.mlb = MultiLabelBinarizer()
        self.mlb.fit(self.labels[self.LABEL_COLUMN].apply(self._split_labels))

    @staticmethod
    def _split_labels(raw):
        """Turn one raw ``labels`` cell into a clean list of label names.

        Splits on commas and strips surrounding whitespace, so ``"a, b"``,
        ``"a,b"`` and ``"a ,  b"`` all yield ``['a', 'b']``. Empty fragments
        are dropped, and a missing cell (NaN/None) yields an empty list, i.e.
        an all-zero target vector.
        """
        if pd.isna(raw):
            return []
        return [name for name in (part.strip() for part in str(raw).split(',')) if name]

    def __len__(self):
        """Return the number of rows in the CSV index."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Integer row position in the CSV index.

        Returns:
            Tuple ``(image, target)`` where ``image`` is the RGB image (after
            ``transform`` if one was given) and ``target`` is a float32 tensor
            holding the multi-hot label vector in ``mlb.classes_`` order.
        """
        img_name = os.path.join(self.root_dir, self.labels.iloc[idx, 0])
        # Close the file handle deterministically; convert() returns a fully
        # loaded image that no longer needs the open file.
        with Image.open(img_name) as handle:
            image = handle.convert('RGB')

        # Parse with the same helper and the same column used for fitting, so
        # every label seen here is guaranteed to be a known class.
        names = self._split_labels(self.labels[self.LABEL_COLUMN].iloc[idx])
        target = self.mlb.transform([names])[0]

        if self.transform:
            image = self.transform(image)
        return image, torch.tensor(target, dtype=torch.float32)

# Transforms: resize every image to 224x224 and convert it to a float tensor.
# NOTE(review): the inference cell builds its own copy of this pipeline; the
# two must stay identical, so change them together. No mean/std normalisation
# is applied here -- presumably worth adding for the pretrained backbone, but
# only if the inference transform is updated at the same time (TODO confirm).
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Dataset and DataLoader
train_dataset = ArtDataset(csv_file='dataset/labels.csv', root_dir='dataset/train', transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Model: pretrained ResNet-50 whose final layer is replaced by a linear head
# with one output logit per label class found in the CSV.
model = resnet50(pretrained=True)
model.fc = torch.nn.Linear(model.fc.in_features, len(train_dataset.mlb.classes_))

# Move the model to its final device *before* constructing the optimizer, so
# the optimizer is built over the parameters in the location they are trained.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Training: BCEWithLogitsLoss takes raw logits and multi-hot float targets,
# which matches the float32 target vectors produced by ArtDataset.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}')

# Save the model (weights only; the label order lives in train_dataset.mlb and
# is not persisted by this call).
torch.save(model.state_dict(), 'art_model.pth')



Epoch 1, Loss: 0.7427393198013306
Epoch 2, Loss: 0.36886802315711975
Epoch 3, Loss: 0.18792179226875305
Epoch 4, Loss: 0.09805647283792496
Epoch 5, Loss: 0.04915500804781914
Epoch 6, Loss: 0.030429232865571976
Epoch 7, Loss: 0.02050023339688778
Epoch 8, Loss: 0.0149346012622118
Epoch 9, Loss: 0.011400505900382996
Epoch 10, Loss: 0.008951690047979355


In [4]:
import clip
import torch
from PIL import Image
import torchvision.transforms as transforms

# Pick the device first so the checkpoint and both models end up in one place.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the fine-tuned saved model.
# `model` is not constructed in this cell: it must already exist (from the
# training cell) with the same architecture the checkpoint was saved from.
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
model.load_state_dict(torch.load('art_model.pth', map_location=device))
model.to(device)  # inference inputs are sent to `device`, so the model must be there too
model.eval()

# Load CLIP model and preprocess
clip_model, clip_preprocess = clip.load("ViT-B/32", device)

# Define the same transform used for training
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Function to predict and verify descriptions
def predict_and_verify(image_path, model, clip_model, clip_processor, mlb, top_k=10, refine_k=5):
    """Predict keywords for an image and re-rank them with CLIP.

    The classifier proposes its ``top_k`` highest-probability labels; CLIP
    then scores each proposed label against the image and the ``refine_k``
    best-matching labels are returned.

    Relies on the module-level ``transform`` (classifier preprocessing),
    ``device`` and the ``clip`` module being defined.

    Args:
        image_path: Path of the image file to analyse.
        model: Classifier returning one logit per entry of ``mlb.classes_``.
        clip_model: CLIP model providing ``encode_image`` / ``encode_text``.
        clip_processor: Callable that preprocesses a PIL image for
            ``clip_model``.
        mlb: Fitted label binarizer; ``mlb.classes_[i]`` names class ``i``.
        top_k: Number of classifier candidates handed to CLIP (default 10).
        refine_k: Number of keywords returned after re-ranking (default 5).

    Returns:
        List of at most ``refine_k`` keywords, best CLIP match first.
    """
    # Ensure the image is in RGB format and release the file handle promptly.
    with Image.open(image_path) as handle:
        image = handle.convert('RGB')

    # Preprocess the image for the classification model
    input_image = transform(image).unsqueeze(0).to(device)

    # Predict keywords: independent per-class probabilities via sigmoid.
    with torch.no_grad():
        output = model(input_image)
        probs = torch.sigmoid(output).cpu().numpy()[0]
    top_indices = probs.argsort()[::-1][:top_k]  # highest probability first
    keywords = [mlb.classes_[idx] for idx in top_indices]
    if not keywords:
        # Nothing to re-rank (no classes, or top_k == 0).
        return []

    # Preprocess the image for CLIP using the processor passed by the caller
    # (previously this argument was ignored in favour of a global).
    clip_image_input = clip_processor(image).unsqueeze(0).to(device)

    # Tokenize all candidate keywords in a single batched call.
    text_inputs = clip.tokenize(keywords).to(device)

    with torch.no_grad():
        image_features = clip_model.encode_image(clip_image_input)
        text_features = clip_model.encode_text(text_inputs)

        # Unit-normalise so the dot product below is a cosine similarity.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        similarities = (image_features @ text_features.T).softmax(dim=-1).cpu().numpy()[0]

    # Refine and return top keywords
    ranked = sorted(zip(keywords, similarities), key=lambda pair: pair[1], reverse=True)
    return [keyword for keyword, _ in ranked[:refine_k]]

# Example usage: run the classifier + CLIP pipeline on a single image file,
# using the models and the fitted label binarizer created earlier.
image_path = "image32.png"
keywords = predict_and_verify(
    image_path=image_path,
    model=model,
    clip_model=clip_model,
    clip_processor=clip_preprocess,
    mlb=train_dataset.mlb,
)

print("Predicted Keywords:", keywords)


Predicted Keywords: ['mandelbrot', 'fractal', 'particle', 'voronoi', 'pattern']
