In [None]:
# run only once
# !pip install ddgs

## How to search for images

In [None]:
from ddgs import DDGS

def search_images(keyword, max_results=100):
    with DDGS() as ddgs:
        images = ddgs.images(
            keyword,
            max_results=max_results
        )
        return [img['image'] for img in images]

**Example:**

Note we got less images than we asked for.

In [20]:
keyword = "banana"
image_urls = search_images(keyword, max_results=500)
len(image_urls)

100

Let us take a random image and look at it:

In [26]:
image_urls[90]

'https://blwstore.com/wp-content/uploads/2022/12/How-to-offer-Banana-in-BLW.jpg'

### Bulk Image Search Utility

In [41]:
import itertools
import time
from typing import List, Iterable, Optional, Set

def _query_variations(base: str,
                      extra_mods: Optional[Iterable[str]] = None,
                      include_defaults: bool = True,
                      max_variations: int = 200) -> List[str]:
    """
    Build a list of query variations to bypass the ~100-results cap per query.
    You can pass your own `extra_mods`; sensible defaults are included.
    """
    defaults1 = [
        "", "close-up", "outdoor", "indoor", "portrait", "landscape", "macro",
        "high resolution", "aesthetic", "minimal", "pattern", "texture",
        "vintage", "modern", "studio", "creative", "art", "drawing", "illustration",
    ]
    defaults2 = ["hd", "4k", "vertical", "horizontal", "background", "wallpaper", "stock"]

    mods1 = (defaults1 if include_defaults else []) + list(extra_mods or [])
    mods2 = defaults2 if include_defaults else [""]

    # build combinations but keep it compact
    variations = []
    for m1, m2 in itertools.product(mods1, mods2):
        q = " ".join([base, m1, m2]).strip()
        if q not in variations:
            variations.append(q)
        if len(variations) >= max_variations:
            break
    return variations

def search_images_bulk(keyword: str,
                       total: int = 500,
                       per_query_cap: int = 100,
                       extra_modifiers: Optional[Iterable[str]] = None,
                       sleep_sec: float = 0.4,
                       safe: str = "moderate") -> List[str]:
    """
    Collect up to `total` image URLs by running multiple DDG queries.
    - `safe`: 'on' | 'moderate' | 'off' (ddgs supports these)
    - `extra_modifiers`: iterable of strings appended to form variations
    """
    seen: Set[str] = set()
    out: List[str] = []

    variations = _query_variations(keyword, extra_modifiers, include_defaults=True)

    with DDGS() as ddgs:
        for q in variations:
            # ddgs.images supports max_results; other filters differ by version.
            # Keep it minimal & robust.
            try:
                results = ddgs.images(q, max_results=min(per_query_cap, 100), safesearch=safe)
            except TypeError:
                # Older versions may not accept 'safesearch' kwarg
                results = ddgs.images(q, max_results=min(per_query_cap, 100))

            added_this_round = 0
            for r in results:
                url = r.get("image") or r.get("thumbnail") or r.get("url")
                if not url or url in seen:
                    continue
                seen.add(url)
                out.append(url)
                added_this_round += 1
                if len(out) >= total:
                    return out

            # polite pacing (also helps avoid throttling)
            time.sleep(sleep_sec)

    return out

# --- example usage ---
# urls = search_images_bulk("golden retriever puppies", total=500,
#                           extra_modifiers=["running", "playing", "sleeping", "portrait"])

pears = search_images_bulk("pear", total=400)
blueberries = search_images_bulk("blueberry", total=400)
bananas = search_images_bulk("banana", total=400)


print(
    f"Number of pears: {len(pears)}\n"
    f"{pears[0]}\n"
    f"{pears[100]}\n"
    f"{pears[200]}\n"
    f"{pears[300]}\n"
    f"{pears[399]}\n"
)

print(
    f"Number of blueberries: {len(blueberries)}\n"
    f"{blueberries[0]}\n"
    f"{blueberries[100]}\n"
    f"{blueberries[200]}\n"
    f"{blueberries[300]}\n"
    f"{blueberries[399]}\n"
)

print(
    f"Number of bananas: {len(bananas)}\n"
    f"{bananas[0]}\n"
    f"{bananas[100]}\n"
    f"{bananas[200]}\n"
    f"{bananas[300]}\n"
    f"{bananas[399]}\n"
)

Number of pears: 400
https://images6.alphacoders.com/677/thumb-1920-677397.jpg
https://wallpapershome.com/images/wallpapers/pear-2160x3840-rain-25695.jpg
https://gardenerspath.com/wp-content/uploads/2023/08/Asian-Pears-Growing-in-the-Garden.jpg
https://as1.ftcdn.net/v2/jpg/01/39/83/96/1000_F_139839656_gJAU0DX9s9t9DIdQ1YWBzUpkv8zcXZS4.jpg
https://thumbs.dreamstime.com/z/seamless-pears-background-fully-editable-files-included-51789313.jpg

Number of blueberries: 400
https://wallpaperaccess.com/full/1466309.jpg
https://coolwallpapers.me/picsup/2710639-blueberry-4k-images-background.jpg
https://thumbs.dreamstime.com/b/vertical-picture-organic-blueberries-growing-garden-beautiful-summer-scenery-latvia-northern-europe-vertical-picture-340781775.jpg
https://thumbs.dreamstime.com/z/blueberry-green-leafs-white-background-blueberries-food-texture-photography-horizontal-format-advertising-photo-ai-296506903.jpg
https://thumbs.dreamstime.com/z/illustrated-blueberry-background-tile-seamless-repeati

## How to download images

In [43]:
import os
import requests
from urllib.parse import urlparse
import warnings

def download_image(url, folder, custom_name=None, verbose=True):
    # Create the folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    # Get the filename from the URL or use the custom name
    if custom_name:
        filename = custom_name
    else:
        filename = os.path.basename(urlparse(url).path)
        if not filename:
            filename = 'image.jpg'  # Default filename if none is found in the URL

    # Ensure the filename has an extension
    if not os.path.splitext(filename)[1]:
        filename += '.jpg'

    filepath = os.path.join(folder, filename)

    # If the file already exists, append a number to make it unique
    base, extension = os.path.splitext(filepath)
    counter = 1
    while os.path.exists(filepath):
        filepath = f"{base}_{counter}{extension}"
        counter += 1

    try:
        # Send a GET request to the URL with a timeout of 10 seconds
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raises an HTTPError for bad responses

        # Check if the content type is an image
        content_type = response.headers.get('content-type', '')
        if not content_type.startswith('image'):
            if verbose:
                warnings.warn(f"The URL does not point to an image. Content-Type: {content_type}")
            return False

        # Write the image content to the file
        with open(filepath, 'wb') as f:
            f.write(response.content)

        if verbose:
            print(f"Image successfully downloaded: {filepath}")
        return True

    except requests.exceptions.Timeout:
        if verbose: 
            warnings.warn(f"Download timed out for URL: {url}")
    except requests.exceptions.HTTPError as e:
        if verbose: 
            warnings.warn(f"HTTP error occurred: {e}")
    except requests.exceptions.RequestException as e:
        if verbose: 
            warnings.warn(f"An error occurred while downloading the image: {e}")
    except IOError as e:
        if verbose: 
            warnings.warn(f"An error occurred while writing the file: {e}")

    return False

Let us donwload all teddybears into separate folder.

In [44]:
from tqdm.notebook import tqdm

def download_batch(image_urls, folder, verbose=False):
    for i, url in enumerate(tqdm(image_urls)):
        download_image(url, folder, f'image {i:03}.jpg', verbose=verbose)

download_batch(bananas[:300], "./dataset/train/banana/", verbose=False)
download_batch(pears[:300], "./dataset/train/pear/", verbose=False)
download_batch(blueberries[:300], "./dataset/train/blueberry/", verbose=False)
download_batch(bananas[300:], "./dataset/test/banana/", verbose=False)
download_batch(pears[300:], "./dataset/test/pear/", verbose=False)
download_batch(blueberries[300:], "./dataset/test/blueberry/", verbose=False)


    # download_image(url, "./dataset/teddybear/", f'image {i:03}.jpg', verbose=False)

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

## How to resize all images to 28x28

In [1]:
# Cell 1 — imports & config
import torch, torch.nn as nn, torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from time import time

In [2]:
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
# Cell 2 — data loaders
mean, std = (0.5,), (0.5,)

train_tf = transforms.Compose([
    transforms.Resize((28, 28)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

test_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
    transforms.Resize((28, 28)),
])


train = datasets.ImageFolder(root="./dataset/train", transform=train_tf)
test  = datasets.ImageFolder(root="./dataset/test", transform=test_tf)

train_loader = DataLoader(train, pin_memory=True)
test_loader  = DataLoader(test, pin_memory=True)

print(len(train), len(test))

702 269


In [4]:
class BetterCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)   # 28x28 -> 28x28, 3 channels for RGB
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)  # 28x28 -> 28x28
        self.pool  = nn.MaxPool2d(2, 2)                           # 28->14, 14->7
        self.drop1 = nn.Dropout(0.25)
        self.fc1   = nn.Linear(64 * 7 * 7, 128)
        self.drop2 = nn.Dropout(0.5)
        self.fc2   = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(F.relu(self.conv2(x)))  # 28->14
        x = self.drop1(x)
        # add a tiny extra conv block without extra params by reusing conv2? keep small: skip.
        x = self.pool(x)                      # 14->7
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.drop2(x)
        return self.fc2(x)

model = BetterCNN().to(device)
model


BetterCNN(
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (drop1): Dropout(p=0.25, inplace=False)
  (fc1): Linear(in_features=3136, out_features=128, bias=True)
  (drop2): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)

In [5]:
lr = 1e-3
optimizer = Adam(model.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=5, gamma=0.5)  # halve LR every 5 epochs
criterion = nn.CrossEntropyLoss()

In [6]:
# Cell 5 — helpers
def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    total_loss, correct, n = 0.0, 0, 0
    for x, y in loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        optimizer.zero_grad(set_to_none=True)
        logits = model(x)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * y.size(0)
        correct += (logits.argmax(1) == y).sum().item()
        n += y.size(0)
    return total_loss / n, correct / n

@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss, correct, n = 0.0, 0, 0
    for x, y in loader:
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True)
        logits = model(x)
        loss = criterion(logits, y)
        total_loss += loss.item() * y.size(0)
        correct += (logits.argmax(1) == y).sum().item()
        n += y.size(0)
    return total_loss / n, correct / n


In [None]:

# Cell 6 — training loop
epochs = 11
best_acc = 0.0
patience, wait = 4, 0
best_state = None
start = time()

for epoch in range(1, epochs + 1):
    tr_loss, tr_acc = train_one_epoch(model, train_loader, optimizer, criterion, device)
    va_loss, va_acc = evaluate(model, test_loader, criterion, device)
    scheduler.step()

    if va_acc > best_acc:
        best_acc, wait = va_acc, 0
        best_state = {k: v.cpu() for k, v in model.state_dict().items()}
    else:
        wait += 1

    print(f"Epoch {epoch:02d} | train loss: {tr_loss:.4f}, accuracy: {tr_acc * 100:.2f}% "
          f"| test loss: {va_loss:.4f}, accuracy: {va_acc * 100:.2f}%")

    if wait >= patience:
        print("Early stopping.")
        break

print(f"Done in {(time()-start):.1f}s. Best test acc: {best_acc:.3f}")
if best_state is not None:
    model.load_state_dict({k: v.to(device) for k, v in best_state.items()})


In [93]:
# Cell 7 — final metrics
test_loss, test_acc = evaluate(model, test_loader, criterion, device)
print(f"Final test:\n"
      f"loss: {test_loss:.4f}\n"
      f"accuracy: {test_acc * 100:.2f}%")

Final test:
loss: 12.3547
accuracy: 33.09%
