In [None]:
import json
import requests
from PIL import Image
from io import BytesIO
import os


# Load the dataset records from disk.
# The encoding is given explicitly: JSON text is UTF-8, while open() would otherwise
# fall back to the platform's default encoding, which is not UTF-8 everywhere and can
# garble or reject non-ASCII characters in the file.
with open("datasets/multim_poem.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Ensure the image output folder is present; an already existing folder is fine.
os.makedirs("processed_images", exist_ok=True)

# Two independent accumulators filled by the download loop:
#   valid_data   -> one dict per image that was saved successfully
#   invalid_urls -> every URL whose download or decoding failed
valid_data, invalid_urls = [], []

def download_image(url, save_path, timeout=5):
    """Download the image at ``url`` and write it to ``save_path``.

    The response body is decoded with Pillow and re-encoded on save; Pillow
    picks the output format from the extension of ``save_path``.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path for the saved image.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        True if the image was fetched, decoded and saved; False otherwise.
        Failures do not raise: the error is printed and False is returned.
    """
    # Pixel modes Pillow's JPEG writer accepts. Other modes (RGBA, palette "P",
    # "LA", ... as commonly produced by PNG/GIF sources) make save() raise, which
    # would wrongly classify a perfectly reachable image as a failed URL.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Context manager so the decoded image is closed once it has been written.
        with Image.open(BytesIO(response.content)) as image:
            targets_jpeg = str(save_path).lower().endswith((".jpg", ".jpeg"))
            if targets_jpeg and image.mode not in jpeg_modes:
                # Drop alpha / palette information so the JPEG writer accepts it.
                image = image.convert("RGB")
            image.save(save_path)
        return True
    except Exception as e:
        # Deliberately broad: network errors, HTTP error statuses, undecodable
        # payloads and write failures are all reported the same way so that one
        # bad entry cannot abort a long batch of downloads.
        print(f"Failed to process URL {url}: {e}")
        return False

# Fetch the image of every record. Records whose image was saved are kept together
# with their poem; for the rest only the URL is remembered.
for idx, item in enumerate(data):
    url = item["image_url"]
    caption = item["poem"]
    # The record's position in the dataset gives each image a unique file name.
    filename = "processed_images/image_%d.jpg" % idx

    if not download_image(url, filename):
        invalid_urls.append(url)
        continue

    valid_data.append(dict(image_path=filename, caption=caption))


In [None]:
# Save the successfully downloaded entries (image path + caption) as indented JSON.
# Encoding is explicit so the result does not depend on the platform default.
with open("processed_data.json", "w", encoding="utf-8") as f:
    json.dump(valid_data, f, indent=4)

# Log the failed URLs, one per line.
with open("invalid_urls.txt", "w", encoding="utf-8") as f:
    for url in invalid_urls:
        # A failed entry is not guaranteed to be a string (e.g. a JSON null would
        # fail to download and end up here); formatting it instead of concatenating
        # avoids a TypeError after the JSON file above has already been written.
        f.write(f"{url}\n")

print(f"Processed {len(valid_data)} valid entries. {len(invalid_urls)} invalid URLs logged.")