In [4]:
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Function to download full-size image from image page
def download_full_image(image_page_url, output_path):
    print(f"Accessing image page: {image_page_url}")
    response = requests.get(image_page_url, headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        full_image_tag = soup.find("a", {"class": "internal"})
        if full_image_tag:
            full_image_url = full_image_tag["href"]
            full_image_url = urljoin(base_url, full_image_url)
            print(f"Downloading image from: {full_image_url}")
            img_response = requests.get(full_image_url, headers=headers, stream=True)
            if img_response.status_code == 200:
                with open(output_path, "wb") as handler:
                    for chunk in img_response.iter_content(1024):
                        handler.write(chunk)
                print(f"Downloaded {full_image_url} to {output_path}")
            else:
                print(f"Failed to download image from {full_image_url} with status code {img_response.status_code}")
        else:
            print(f"No full-size image found for {image_page_url}")
    else:
        print(f"Failed to access {image_page_url} with status code {response.status_code}")

# Function to process a single category URL
def process_category_url(category_url):
    response = requests.get(category_url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all span elements that contain image links
    spans = soup.find_all("span", {"typeof": "mw:File"})

    # Extract the URLs of the images and ensure they are full URLs
    image_page_urls = [urljoin(base_url, span.find("a")["href"]) for span in spans if span.find("a")]

    print(f"Found {len(image_page_urls)} image pages in {category_url}.")

    # Download each image and save it
    for idx, img_page_url in enumerate(image_page_urls):
        output_path = f"{output_dir}/image_{len(saved_images) + idx + 1}.jpg"
        download_full_image(img_page_url, output_path)
        time.sleep(1)  # Add delay to avoid rate limiting

# List of Wikimedia Commons category URLs
category_urls = [



    "https://commons.wikimedia.org/wiki/Category:Stoa_of_Attalus",
    "https://commons.wikimedia.org/w/index.php?title=Category:Stoa_of_Attalus&filefrom=The+Stoa+of+Attalus+at+night+on+May+21%2C+2024.jpg#mw-category-media"

]




# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# Base URL for Wikimedia Commons
base_url = "https://commons.wikimedia.org"

# Create output directory
output_dir = "./data/Stoa_of_Attalus"
os.makedirs(output_dir, exist_ok=True)

# Track downloaded images
saved_images = []

# Process each category URL
for category_url in category_urls:
    process_category_url(category_url)
    saved_images = os.listdir(output_dir)

# List the saved images
print(f"Saved images: {saved_images}")


Found 203 image pages in https://commons.wikimedia.org/wiki/Category:Stoa_of_Attalus.
Accessing image page: https://commons.wikimedia.org/wiki/File:Stoa_of_Attalos_at_the_Ancient_Agora_of_Athens_2.jpg
Downloading image from: https://upload.wikimedia.org/wikipedia/commons/d/d5/Stoa_of_Attalos_at_the_Ancient_Agora_of_Athens_2.jpg
Downloaded https://upload.wikimedia.org/wikipedia/commons/d/d5/Stoa_of_Attalos_at_the_Ancient_Agora_of_Athens_2.jpg to ./data/Stoa_of_Attalus/image_1.jpg
Accessing image page: https://www.wikidata.org/wiki/Q1263335
No full-size image found for https://www.wikidata.org/wiki/Q1263335
Accessing image page: https://www.wikidata.org/wiki/Q1263335
No full-size image found for https://www.wikidata.org/wiki/Q1263335
Accessing image page: https://commons.wikimedia.org/wiki/File:Stoa_of_Attalus_Ath.9.JPG
Downloading image from: https://upload.wikimedia.org/wikipedia/commons/0/0e/Stoa_of_Attalus_Ath.9.JPG
Downloaded https://upload.wikimedia.org/wikipedia/commons/0/0e/Stoa_