In [1]:

!pip install ddgs fastai tqdm --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
from pathlib import Path
from ddgs import DDGS
from tqdm.notebook import tqdm
from fastai.vision.utils import download_images, verify_images

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
%cd /content/drive/MyDrive/MasterCourse/Bangladeshi_Landmarks

/content/drive/MyDrive/MasterCourse/Bangladeshi_Landmarks


In [5]:

DEST_ROOT = Path("dataset_places")
DEST_ROOT.mkdir(parents=True, exist_ok=True)

TARGET_PER_CLASS = 500
OVERFETCH_FACTOR = 2
MAX_RETRIES =10

ARCH_HINT = (
    "exterior architecture building monument facade heritage "
    "-interior -floorplan -selfie -people -crowd -tourist -group -human -face -portrait -wedding"
)
NATURE_HINT = (
    "landscape scenery nature hills forest river mountain valley beach waterfall "
    "-selfie -people -tourist -crowd -group -human -face -portrait"
)

places = [
    "Jatiya Sangsad Bhaban (Dhaka) Architecture only",
    "Ahsan Manzil (Dhaka) Architecture only",
    "Lalbagh Fort (Dhaka) Architecture only",
    "Dhaka Shaheed Minar (Bangladesh) Architecture only",
    "Sixty Dome Mosque (Bagerhat) Architecture only",
    "Somapura Mahavihara (Paharpur, Naogaon) Architecture only",
    "Kantajew Temple (Dinajpur) Architecture only",
    "Panam Nagar (Sonargaon, Narayanganj) Architecture only",
    "Tajhat Palace (Rangpur) Architecture only",
    "Cox’s Bazar Sea Beach (Cox’s Bazar) Nature only",
    "St. Martin's Island (Teknaf, Chhera Dwip) Nature only",
    "Sundarbans Mangrove Forest (Khulna) Nature onlye",
    "Jaflong (Sylhet) Nature only",
    "Ratargul Swamp Forest (Sylhet) Nature only",
    "Sajek Valley (Rangamati) Nature only",
    "Nafakhum Waterfall (Bandarban) Nature only"
]


In [6]:

def split_name_and_type(item: str):
    if "Architecture" in item:
        return item.replace(" Architecture only", "").strip(), "Architecture"
    elif "Nature" in item:
        return item.replace(" Nature only", "").strip(), "Nature"
    return item, "Unknown"

def build_query(place_name: str, kind: str, attempt: int) -> str:
    if kind == "Architecture":
        base = f"{place_name} {ARCH_HINT}"
    elif kind == "Nature":
        base = f"{place_name} {NATURE_HINT}"
    else:
        base = place_name
    if attempt > 0:
        base += f" photo view picture {attempt}"
    return base

def duckduckgo_search(query: str, needed: int):
    urls, seen = [], set()
    try:
        with DDGS() as ddgs:
            for r in ddgs.images(query, safesearch="on", type_image="photo", max_results=needed*2):
                url = r.get("image")
                if not url or url in seen:
                    continue
                if any(bad in url.lower() for bad in ["svg", "base64,", "data:image"]):
                    continue
                seen.add(url)
                urls.append(url)
                if len(urls) >= needed:
                    break
    except Exception as e:
        print(f"[!] DuckDuckGo search failed: {e}")
    return urls

def sanitize_dirname(name: str) -> str:
    return "".join(c for c in name if c.isalnum() or c in " -_()&',.").strip()

def ensure_target_count(folder: Path) -> int:
    exts = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".webp"}
    return sum(1 for p in folder.glob("*") if p.suffix.lower() in exts)




In [7]:

summary = []

for item in places:
    place_name, kind = split_name_and_type(item)
    class_dir = DEST_ROOT / sanitize_dirname(place_name)
    class_dir.mkdir(parents=True, exist_ok=True)

    for attempt in range(MAX_RETRIES):
        have = ensure_target_count(class_dir)
        if have >= TARGET_PER_CLASS:
            print(f"[✓] {place_name}: reached {have}/{TARGET_PER_CLASS} images.")
            break

        need = TARGET_PER_CLASS - have
        urls_needed = int(need * OVERFETCH_FACTOR)
        query = build_query(place_name, kind, attempt)

        print(f"\n=== {place_name} ({kind}) | Attempt {attempt+1} ===")
        print(f"Query: {query}")
        print(f"Need {need} more images (fetching {urls_needed} URLs).")

        urls = duckduckgo_search(query, urls_needed)
        if not urls:
            print(f"[!] No URLs found this attempt.")
            continue

        urls_file = class_dir / f"_urls_attempt{attempt+1}.txt"
        urls_file.write_text("\n".join(urls), encoding="utf-8")
        urls_list = urls_file.read_text().splitlines()
        download_images(str(class_dir), urls=urls_list, preserve_filename=False, timeout=30, n_workers=8)
        failed = verify_images(str(class_dir))
        if failed:
            print(f"Removed {len(failed)} corrupt images.")

        final_count = ensure_target_count(class_dir)
        print(f"[i] {place_name}: now {final_count}/{TARGET_PER_CLASS} images.")

    final_count = ensure_target_count(class_dir)
    status = "OK" if final_count >= TARGET_PER_CLASS else "SHORT"
    summary.append((place_name, kind, final_count, status))



=== Jatiya Sangsad Bhaban (Dhaka) (Architecture) | Attempt 1 ===
Query: Jatiya Sangsad Bhaban (Dhaka) exterior architecture building monument facade heritage -interior -floorplan -selfie -people -crowd -tourist -group -human -face -portrait -wedding
Need 500 more images (fetching 1000 URLs).
Removed 44 corrupt images.
[i] Jatiya Sangsad Bhaban (Dhaka): now 84/500 images.

=== Jatiya Sangsad Bhaban (Dhaka) (Architecture) | Attempt 2 ===
Query: Jatiya Sangsad Bhaban (Dhaka) exterior architecture building monument facade heritage -interior -floorplan -selfie -people -crowd -tourist -group -human -face -portrait -wedding photo view picture 1
Need 416 more images (fetching 832 URLs).
Removed 44 corrupt images.
[i] Jatiya Sangsad Bhaban (Dhaka): now 171/500 images.

=== Jatiya Sangsad Bhaban (Dhaka) (Architecture) | Attempt 3 ===
Query: Jatiya Sangsad Bhaban (Dhaka) exterior architecture building monument facade heritage -interior -floorplan -selfie -people -crowd -tourist -group -human -fa

In [8]:
from PIL import Image, UnidentifiedImageError
for img_path in DEST_ROOT.rglob("*"):
    if img_path.is_file() and not img_path.name.startswith("."):
        try:
            img = Image.open(img_path)
            img.verify()
        except (IOError, SyntaxError, UnidentifiedImageError):
            print(f"Removing corrupted image: {img_path}")
            img_path.unlink()  # delete corrupted image


Removing corrupted image: dataset_places/Jatiya Sangsad Bhaban (Dhaka)/_urls_attempt1.txt
Removing corrupted image: dataset_places/Jatiya Sangsad Bhaban (Dhaka)/_urls_attempt2.txt
Removing corrupted image: dataset_places/Jatiya Sangsad Bhaban (Dhaka)/_urls_attempt3.txt
Removing corrupted image: dataset_places/Jatiya Sangsad Bhaban (Dhaka)/_urls_attempt4.txt
Removing corrupted image: dataset_places/Jatiya Sangsad Bhaban (Dhaka)/_urls_attempt5.txt
Removing corrupted image: dataset_places/Jatiya Sangsad Bhaban (Dhaka)/_urls_attempt6.txt
Removing corrupted image: dataset_places/Ahsan Manzil (Dhaka)/_urls_attempt1.txt
Removing corrupted image: dataset_places/Ahsan Manzil (Dhaka)/_urls_attempt2.txt
Removing corrupted image: dataset_places/Ahsan Manzil (Dhaka)/_urls_attempt3.txt
Removing corrupted image: dataset_places/Ahsan Manzil (Dhaka)/8d9629eb-93a7-41a8-97be-156585da6137.jpg
Removing corrupted image: dataset_places/Ahsan Manzil (Dhaka)/_urls_attempt4.txt
Removing corrupted image: dataset