In [None]:
import torch
from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import os

# 1. Load the CSV containing filenames and labels
df = pd.read_csv(r"Thales Labelled.csv")  # Adjust CSV path

# 2. Define the image folder path
image_folder = r"Thales Images"  # Provide path to your image folder

# 3. Load Pretrained CLIP Model and Processor
model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

# 4. Define the label mapping (offensive -> 1, non-offensive -> 0)
label_mapping = {'NON-OFFENSIVE': 0, 'OFFENSIVE': 1}

# 5. Custom Dataset for your images
class DoodleDataset(Dataset):
    def __init__(self, dataframe, processor, image_folder, transform=None):
        self.dataframe = dataframe
        self.processor = processor
        self.image_folder = image_folder
        self.transform = transform
        
    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        # Load the image
        filename = self.dataframe.iloc[idx]['filename']  # Get filename from CSV
        image_path = os.path.join(self.image_folder, filename)  # Construct full image path
        image = Image.open(image_path).convert("RGB")
        
        # Text label for reasoning
        label = self.dataframe.iloc[idx]['label']  # 'offensive' or 'non-offensive'
        reasoning = "This doodle is offensive because it depicts harmful content." if label == 'offensive' else "This doodle is non-offensive because it depicts harmless content."
        
        # Preprocess the image and text separately
        inputs = self.processor(text=reasoning, images=image, return_tensors="pt", padding=True, truncation=True)
        
        # Extract the image tensor and text input ids separately
        pixel_values = inputs["pixel_values"].squeeze(0)  # Image tensor (remove batch dimension)
        input_ids = inputs["input_ids"].squeeze(0)  # Text input IDs
        
        # Map the label to an integer (0 or 1)
        label = label_mapping[label]
        
        return pixel_values, input_ids, label

# 6. Custom collate function to handle different image sizes
def collate_fn(batch):
    pixel_values = [item[0] for item in batch]
    input_ids = [item[1] for item in batch]
    labels = [item[2] for item in batch]
    
    # Stack the pixel values and input ids to create a batch of consistent size
    pixel_values = torch.stack(pixel_values, dim=0)
    input_ids = torch.stack(input_ids, dim=0)
    labels = torch.tensor(labels, dtype=torch.long)
    
    return pixel_values, input_ids, labels

# 7. Prepare DataLoader with custom collate_fn
train_dataset = DoodleDataset(df, processor, image_folder)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# 8. Set optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# 9. Fine-tune CLIP
model.train()
for epoch in range(5):  # You can adjust the number of epochs
    for batch in train_dataloader:
        pixel_values, input_ids, labels = batch
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=input_ids, pixel_values=pixel_values)
        logits_per_image = outputs.logits_per_image  # Image-text similarity logits
        
        # Define loss (cross-entropy for classification)
        loss = torch.nn.CrossEntropyLoss()(logits_per_image, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# Ensure all model parameters are contiguous
for param in model.parameters():
    param.data = param.data.contiguous()


# 10. Save the fine-tuned model
model.save_pretrained('fine_tuned_clip_model')




Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss: 2.7725892066955566
Epoch 0, Loss:

In [17]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

# 1. Load the fine-tuned CLIP model and processor
model = CLIPModel.from_pretrained('fine_tuned_clip_model')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

# 2. Define label mapping (same as during training)
label_mapping = {0: 'NON-OFFENSIVE', 1: 'OFFENSIVE'}

# 3. Load and preprocess your test image
def test_image(image_path, reasoning_text):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")
    
    # Process the image and reasoning text
    inputs = processor(text=reasoning_text, images=image, return_tensors="pt", padding=True, truncation=True)
    pixel_values = inputs["pixel_values"]  # Image tensor
    input_ids = inputs["input_ids"]  # Text input IDs
    
    return pixel_values, input_ids

# 4. Function to predict
def predict(image_path):
    # Define reasoning text
    reasoning_text = "This doodle is offensive because it depicts harmful content or symbols." \
                     " Otherwise, it is considered non-offensive because it depicts benign content."
    
    # Preprocess the image and text
    pixel_values, input_ids = test_image(image_path, reasoning_text)
    
    # Ensure model is in evaluation mode
    model.eval()
    with torch.no_grad():
        # Forward pass
        outputs = model(input_ids=input_ids, pixel_values=pixel_values)
        logits_per_image = outputs.logits_per_image  # Image-text similarity logits
        
        # Get the predicted class
        prediction = torch.argmax(logits_per_image, dim=1).item()
        label = label_mapping[prediction]
    
    return label

# 5. Test the model with your image
image_path = r"C:\Users\Nitya\Downloads\WhatsApp Image 2024-12-06 at 23.34.50_b2745a14.jpg"  # Replace with your image's path
predicted_label = predict(image_path)
print(f"Predicted label for the test image: {predicted_label}")


Predicted label for the test image: NON-OFFENSIVE
