In [34]:
import os
import json
import requests
from io import BytesIO
from PIL import Image
from torchvision import transforms
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# Load your dataset
def load_dataset(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)
    return data

class PoemDataset(Dataset):
    def __init__(self, data, processor):
        self.data = data
        self.processor = processor
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        # Fetch image
        try:
            response = requests.get(item["image_url"], timeout=5)
            response.raise_for_status()  # Raise HTTPError for bad responses
            image = Image.open(BytesIO(response.content)).convert("RGB")
        except Exception as e:
            raise RuntimeError(f"Failed to fetch image at {item['image_url']}: {e}")
        
        # Prepare processor inputs
        inputs = self.processor(images=image, text=item["poem"], return_tensors="pt", padding=True)
        
        return {
            "pixel_values": inputs["pixel_values"].squeeze(0),  # Remove batch dimension
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
        }

# Load your data
data_path = "multim_poem.json"
data = load_dataset(data_path)

# Initialize BLIP processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

# Create the dataset
dataset = PoemDataset(data, processor)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=4, shuffle=True)

In [35]:
from transformers import AdamW

model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

def train(model, data_loader, optimizer, num_epochs=3):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0
        for batch in data_loader:
            print(batch)
            pixel_values = batch["pixel_values"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            
            outputs = model(pixel_values=pixel_values, labels=input_ids, attention_mask=attention_mask)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
            epoch_loss += loss.item()
        
        print(f"Epoch {epoch + 1} / {num_epochs}, Loss: {epoch_loss / len(data_loader)}")

train(model, data_loader, optimizer, num_epochs=3)

model.save_pretrained("./fine_tuned_blip")
processor.save_pretrained("./fine_tuned_blip")

RuntimeError: stack expects each tensor to be equal size, but got [31] at entry 0 and [77] at entry 1

### Preprocessing Data

In [None]:

# Load the tsv and tsv.meta files into pandas DataFrames
df_tsv = pd.read_csv('cvpr2019.tsv', sep='\t')  # file with scenes
df_meta = pd.read_csv('cvpr2019.tsv.meta', sep='\t')  # file with image_key and url

# Pivot the df_tsv to create a column for each scene
df_tsv_pivoted = df_tsv.groupby('IMAGE_KEY')['CAPTION'].apply(list).apply(lambda x: pd.Series(x)).reset_index()

# Rename columns for clarity, assuming there are exactly 5 scenes
df_tsv_pivoted.columns = ['IMAGE_KEY', 'scene1', 'scene2', 'scene3', 'scene4', 'scene5']

# Merge the two DataFrames on the 'image_key' column
merged_df = pd.merge(df_tsv_pivoted, df_meta, on='IMAGE_KEY', how='left')

# Save the merged DataFrame back to a TSV file
merged_df.to_csv('merged_file.tsv', sep='\t', index=False)

### Fine tuning CLIP for scene recognition

In [None]:


df = pd.read_csv('merged_file.tsv', sep='\t')
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

print("Training set:")
print(train_df.head())
print("\nValidation set:")
print(val_df.head())

Training set:
            IMAGE_KEY                                  scene1  \
29   00210e646b9a01f4          digital art selected for the #   
535  03484078db61e940  a view of the interior of the building   
695  04651a6159d60055          view of the pond in the garden   
557  0371cd0bb621e734         snowboarder jumping in the snow   
836  0579eb8fae88f403                     view of the kitchen   

                                                scene2  \
29                      natural history of the insects   
535  a view of the inside of the new terminal building   
695                  the pond in front of the building   
557        a snowboarder catches a big jump in the air   
836                  person in the kitchen of his home   

                                         scene3  \
29                an illustration from the book   
535                interior of a subway station   
695                   view from across the pond   
557  a snowboarder performs a jump in the 

In [None]:
import requests
from io import BytesIO
from PIL import Image

def download_image(image_url):
    response = requests.get(image_url)
    
    if response.status_code == 200:  # Ensure the request was successful
        try:
            # Try to open the image from the response content
            image = Image.open(BytesIO(response.content))
            image.verify()  # Verify the image integrity (optional)
            return image
        except (IOError, SyntaxError) as e:
            print(f"Error with image: {image_url}, Error: {e}")
            return None
    else:
        print(f"Failed to retrieve image from {image_url}, Status Code: {response.status_code}")
        return None

In [None]:
import requests
from io import BytesIO
from PIL import Image

class SceneDataset(Dataset):
    def __init__(self, dataframe, processor):
        self.dataframe = dataframe
        self.processor = processor

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        image_url = row['OriginalURL']  # Adjust based on your column name
        scene = row['scene1']  # Adjust based on your column name

        image = download_image(image_url)
        print(f"Downloaded image type: {type(image)}")

        # If you need to check its size
        if isinstance(image, Image.Image):  # Check if it's an instance of PIL.Image
            print(f"Image size: {image.size}")
        else:
            print("Invalid image")
        
        if image is None:  # Skip invalid images
            return None

        # Access the image dimensions
        width, height = image.size

        # Example: Perform a check on image size (optional)
        if width < 100 or height < 100:
            print(f"Skipping small image: {image_url}")
            return None

        # Process the image and scene text
        inputs = self.processor(images=image, text=scene, return_tensors="pt", padding=True)
        
        return inputs

In [None]:

# Initialize the processor (CLIP)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Create DataLoader for training and validation datasets
train_dataset = SceneDataset(train_df, processor)
val_dataset = SceneDataset(val_df, processor)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)

In [None]:
from torch.optim import AdamW
from tqdm import tqdm

# Initialize the CLIP model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-6)

# Training loop
model.train()
for epoch in range(3):  # Adjust the number of epochs
    loop = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        inputs = {key: value.to(model.device) for key, value in batch.items()}
        optimizer.zero_grad()

        # Forward pass
        outputs = model(**inputs)

        # Calculate loss (contrastive loss is used by default in CLIP)
        loss = outputs.loss
        loss.backward()

        optimizer.step()

        loop.set_postfix(loss=loss.item())

    # Validation step
    model.eval()
    total_loss = 0
    for batch in val_dataloader:
        with torch.no_grad():
            inputs = {key: value.to(model.device) for key, value in batch.items()}
            outputs = model(**inputs)
            total_loss += outputs.loss.item()

    print(f"Validation loss after epoch {epoch+1}: {total_loss / len(val_dataloader)}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_clip_model")
processor.save_pretrained("fine_tuned_clip_processor")

Epoch 1:   0%|          | 0/50 [00:00<?, ?it/s]

Failed to retrieve image from https://c1.staticflickr.com/9/8124/29750795646_11c16a54f1_o.jpg, Status Code: 403
Downloaded image type: <class 'NoneType'>
Invalid image


Epoch 1:   0%|          | 0/50 [00:01<?, ?it/s]

Downloaded image type: <class 'PIL.JpegImagePlugin.JpegImageFile'>
Image size: (6125, 2454)





TypeError: '>=' not supported between instances of 'JpegImageFile' and 'int'

In [None]:
from PIL import Image

# Load the fine-tuned model and processor
model = CLIPModel.from_pretrained("fine_tuned_clip_model")
processor = CLIPProcessor.from_pretrained("fine_tuned_clip_processor")

# Example image for inference
image = Image.open("path_to_image.jpg")

# Generate scene description
inputs = processor(images=image, return_tensors="pt")
outputs = model.get_text_features(**inputs)

# Your method to generate or select a scene description from the model's outputs
scene_description = generate_scene_description(outputs)  # Define your logic here
print(scene_description)