In [1]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.24.0-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.26.2-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading selenium-4.24.0-py3-none-any.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.26.2-py3-none-any.whl (475 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m476.0/476.0 kB[0m [31m23.

In [146]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from pathlib import Path
from bs4 import BeautifulSoup
from urllib.parse import urljoin

import os
import requests
import time

# Configure Selenium options for running in Colab
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome(options=options)

def search_and_download_duckduckgo(query, num=100):
    # URL for DDG image search
    url = f"https://duckduckgo.com/?q={query}&t=h_&iax=images&ia=images"

    # Loads webpage at specified URL
    driver.get(url)

    # Wait for the page to load
    time.sleep(5)

    # Parse the page source
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Finds all images
    image_elements = soup.find_all('img', limit=num)

    # Setup path to a data folder
    image_dir = "data/animals"

    # Create new directory contingent on query
    os.makedirs(os.path.join(image_dir, query), exist_ok=True)

    page_num = 1
    while len(image_elements) < num:
        url = f"https://duckduckgo.com/?q={query}&t=h_&iax=images&ia=images&page={page_num}"
        driver.get(url)
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        new_image_elements = soup.find_all('img', limit=num - len(image_elements))
        image_elements.extend(new_image_elements)
        page_num += 1

    print(f"\nFound {len(image_elements)} images of {query}")

    for i, img in enumerate(image_elements):
      # Extract the image URL via attributes
      image_url = img.get('src') or img.get('data-src')

      # Handle relative URLs by converting them to absolute URLs
      if image_url and not image_url.startswith('http'):
        image_url = urljoin(url, image_url)

      # Check if the URL is a direct link to the image
      if image_url and image_url.startswith("http"):
        # Get the image URL
        try:
          response = requests.get(image_url)

          # Renames image based on query & position
          filename = f"{query}_{i}.jpg"

          # Saves image to proper directory
          with open(os.path.join(image_dir, query, filename), 'wb') as f:
              f.write(response.content)
          print(f"Downloaded {filename}")
        except Exception as e:
          print(f"Error downloading {image_url}: {e}")
      else:
        print(f"Skipping non-direct URL: {image_url}")

In [147]:
search_and_download_duckduckgo("cats", 250)
search_and_download_duckduckgo("dogs", 250)
search_and_download_duckduckgo("birds", 250)


Found 250 images of cats
Downloaded cats_0.jpg
Downloaded cats_1.jpg
Downloaded cats_2.jpg
Downloaded cats_3.jpg
Downloaded cats_4.jpg
Downloaded cats_5.jpg
Downloaded cats_6.jpg
Downloaded cats_7.jpg
Downloaded cats_8.jpg
Downloaded cats_9.jpg
Downloaded cats_10.jpg
Downloaded cats_11.jpg
Downloaded cats_12.jpg
Downloaded cats_13.jpg
Downloaded cats_14.jpg
Downloaded cats_15.jpg
Downloaded cats_16.jpg
Downloaded cats_17.jpg
Downloaded cats_18.jpg
Downloaded cats_19.jpg
Downloaded cats_20.jpg
Downloaded cats_21.jpg
Downloaded cats_22.jpg
Downloaded cats_23.jpg
Downloaded cats_24.jpg
Downloaded cats_25.jpg
Downloaded cats_26.jpg
Downloaded cats_27.jpg
Downloaded cats_28.jpg
Downloaded cats_29.jpg
Downloaded cats_30.jpg
Downloaded cats_31.jpg
Downloaded cats_32.jpg
Downloaded cats_33.jpg
Downloaded cats_34.jpg
Downloaded cats_35.jpg
Downloaded cats_36.jpg
Downloaded cats_37.jpg
Downloaded cats_38.jpg
Downloaded cats_39.jpg
Downloaded cats_40.jpg
Downloaded cats_41.jpg
Downloaded cats_42

In [115]:
# Set path of animals folder
data_dir = Path("data/")
image_dir = data_dir / "animals"

# Print path, # of sub-folders
image_dir, len(list(image_dir.iterdir()))

(PosixPath('data/animals'), 3)

In [116]:
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms

device = "cuda" if torch.cuda.is_available() else "cpu"

# Ensure reproducibility
torch.manual_seed(42)

# Augment data (assists CNN in learning minute details)
data_transforms = transforms.Compose([
    # Resize images to 64x64 for TinyVGG compatibility (basis of CNN)
    transforms.Resize(size=(64, 64)),

    # Flip the images randomly on the horizontal
    transforms.RandomHorizontalFlip(),

    transforms.RandomResizedCrop(64, scale=(.08, 1)),

    # Turn the image into a torch.Tensor
    transforms.ToTensor(),

    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load the images from subfolders
imageDataset = datasets.ImageFolder(root=image_dir, transform=data_transforms)

# 80/20 split between train & validation
train_size = int(0.8 * len(imageDataset))
validation_size = len(imageDataset) - train_size

# Randomly split the data
train_dataset, validation_dataset = random_split(imageDataset, [train_size, validation_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False)

In [8]:
device

'cuda'

In [9]:
imageDataset[0][0].shape

torch.Size([3, 64, 64])

In [10]:
# Verify lengths of dataset
len(train_dataset), len(validation_dataset)

(225, 57)

In [11]:
# Verify classes of folders in list & dict form
imageDataset.classes, imageDataset.class_to_idx

(['birds', 'cats', 'dogs'], {'birds': 0, 'cats': 1, 'dogs': 2})

In [158]:
# Building a CNN
class animalIdentifier(nn.Module):
  """
  Model architecture to determine animal
  Structure is as follows: For the two conv blocks: conv -> ReLU -> conv -> ReLU -> MaxPool
  MaxPool is especially important as it decreases the spatial size of an image,
  reducing the parameters & computation of the network. Essentially it allows for higher
  higher level of pattern recognition in images (i.e. from edges to parts to objects & onward)
  """

  def __init__(self, input_shape: int, hidden_units: int, output_shape: int):
    # Instantiate NN
    super().__init__()

    # First conv block
    self.conv_block_1 = nn.Sequential(
        nn.Conv2d(in_channels=input_shape,
                  out_channels=hidden_units,
                  kernel_size=3,
                  stride=1,
                  padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=hidden_units,
                  out_channels=hidden_units,
                  kernel_size=3,
                  stride=1,
                  padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )

    # Second conv block
    self.conv_block_2 = nn.Sequential(
        nn.Conv2d(in_channels=hidden_units,
                  out_channels=hidden_units,
                  kernel_size=3,
                  stride=1,
                  padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=hidden_units,
                  out_channels=hidden_units,
                  kernel_size=3,
                  stride=1,
                  padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )

    # Third conv block
    self.conv_block_3 = nn.Sequential(
        nn.Conv2d(in_channels=hidden_units,
                  out_channels=hidden_units,
                  kernel_size=3,
                  stride=1,
                  padding=1),
        nn.ReLU(),
        nn.Conv2d(in_channels=hidden_units,
                  out_channels=hidden_units,
                  kernel_size=3,
                  stride=1,
                  padding=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2)
    )

    self.classifier = nn.Sequential(
        nn.Flatten(),
        nn.Dropout(p=0.5),  # 50% dropout
        nn.Linear(in_features=hidden_units * 16 * 16, # Exact spatial dimension calculated
                  out_features=output_shape)
    )

  def forward(self, x):
    return self.classifier(self.conv_block_2(self.conv_block_1(x)))

In [118]:
# Instantiate the model with, in order: # of color channels, hidden_units, # of classes
animalModel = animalIdentifier(input_shape=3,
                               hidden_units=64,
                               output_shape=len(imageDataset.classes)).to(device)

animalModel

animalIdentifier(
  (conv_block_1): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block_2): Sequential(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv_block_3): Sequential(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU()
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Drop

In [119]:
# Retrieve a single batch of data (batch of images & corresponding labels)
image_batch, label_batch = next(iter(train_loader))
image_batch.shape, label_batch.shape

(torch.Size([32, 3, 64, 64]), torch.Size([32]))

In [29]:
# Try a forward pass
animalModel(image_batch)

tensor([[ 0.0252,  0.0194, -0.0065],
        [ 0.0253,  0.0195, -0.0068],
        [ 0.0252,  0.0196, -0.0070],
        [ 0.0258,  0.0202, -0.0075],
        [ 0.0257,  0.0196, -0.0068],
        [ 0.0251,  0.0197, -0.0066],
        [ 0.0256,  0.0193, -0.0070],
        [ 0.0251,  0.0197, -0.0074],
        [ 0.0261,  0.0195, -0.0069],
        [ 0.0257,  0.0198, -0.0070],
        [ 0.0257,  0.0200, -0.0068],
        [ 0.0254,  0.0201, -0.0076],
        [ 0.0253,  0.0193, -0.0069],
        [ 0.0254,  0.0193, -0.0070],
        [ 0.0254,  0.0193, -0.0067],
        [ 0.0258,  0.0201, -0.0070],
        [ 0.0252,  0.0197, -0.0068],
        [ 0.0252,  0.0196, -0.0068],
        [ 0.0259,  0.0193, -0.0071],
        [ 0.0252,  0.0195, -0.0066],
        [ 0.0257,  0.0196, -0.0067],
        [ 0.0252,  0.0196, -0.0068],
        [ 0.0254,  0.0196, -0.0065],
        [ 0.0253,  0.0199, -0.0071],
        [ 0.0255,  0.0198, -0.0068],
        [ 0.0253,  0.0197, -0.0071],
        [ 0.0255,  0.0196, -0.0066],
 

In [120]:
# Train step
def train_step(model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               device=device):

  # Put the model in train mode
  model.train()

  train_loss, train_acc = 0, 0

  # Loop through batches
  for batch in dataloader:
    # Extract values from batch
    images, labels = batch

    # Send to device
    images, labels = images.to(device), labels.to(device)

    # Forward pass
    predictions = model(images)

    # Calculate loss
    loss = loss_fn(predictions, labels) # Logits
    train_loss += loss.item() # Cumulative loss

    # Calculate accuracy
    prediction_classes = torch.argmax(predictions, dim=1) # Predicted class (logits to classes)
    train_acc += (prediction_classes == labels).sum().item() # Sum the number of correct predictions

    # Set gradients to zero
    optimizer.zero_grad()

    # Backpropagation
    loss.backward()

    # Step (learning rate)
    optimizer.step()

  # Average loss & accuracy per batch
  train_loss = train_loss / len(dataloader)
  train_acc = train_acc / len(dataloader.dataset) # Total correct / total samples

  return train_loss, train_acc

In [121]:
# Validation step
def validation_step(model: torch.nn.Module,
              dataloader: torch.utils.data.DataLoader,
              loss_fn: torch.nn.Module,
              device=device):

  # Put the model in eval mode
  model.eval()

  validation_loss, validation_acc = 0, 0

  # Disables gradient computation, reduces memory usage & increases speed
  with torch.inference_mode():

    # Loop through batches
    for batch in dataloader:
      # Extract values from batch
      images, labels = batch

      # Send to device
      images, labels = images.to(device), labels.to(device)

      # Forward pass
      predictions = model(images)

      # Calculate the loss
      loss = loss_fn(predictions, labels)
      validation_loss += loss.item()

      # Calculate accuracy
      prediction_classes = torch.argmax(predictions, dim=1) # Predicted class (logits to classes)
      validation_acc += (prediction_classes == labels).sum().item() # Sum the number of correct predictions

  # Average loss & accuracy per batch
  validation_loss = validation_loss / len(dataloader)
  validation_acc = validation_acc / len(dataloader.dataset)

  return validation_loss, validation_acc

In [122]:
# Combine both train_step() & validation_step() into one function
from tqdm.auto import tqdm

# Takes in train & validation dataloaders, as well as everything needed to compute
def train(model: torch.nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          validation_dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int = 5,
          device=device):

  # Create dictionary to store results
  results = {"train_loss": [],
             "train_acc": [],
             "validation_loss": [],
             "validation_acc": []}

  # Loop through train & validation steps for # of epochs
  for epoch in tqdm(range(epochs)):
    train_loss, train_acc = train_step(model=model,
                                       dataloader=train_dataloader,
                                       loss_fn=loss_fn,
                                       optimizer=optimizer,
                                       device=device)

    validation_loss, validation_acc = validation_step(model=model,
                                    dataloader=validation_dataloader,
                                    loss_fn=loss_fn,
                                    device=device)

    # Print data per epoch
    print(f"Epoch: {epoch} | Train loss: {train_loss:.4f} | Train acc: {train_acc:.4f} | Validation loss: {validation_loss:.4f} | Validation acc: {validation_acc:.4f}")

    # Update results dictionary
    results["train_loss"].append(train_loss)
    results["train_acc"].append(train_acc)
    results["validation_loss"].append(validation_loss)
    results["validation_acc"].append(validation_acc)

  # Return filled results
  return results

In [159]:
# Time to train

NUM_EPOCHS = 125

# Recreate model from above
animalModelV2 = animalIdentifier(input_shape=3,
                               hidden_units=64,
                               output_shape=len(imageDataset.classes)).to(device)

# Configure loss function & optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=animalModelV2.parameters(),
                            lr=0.001)

# Timer
from timeit import default_timer as timer
start_time = timer()

# Train
results = train(model=animalModelV2,
                train_dataloader=train_loader,
                validation_dataloader=validation_loader,
                optimizer=optimizer,
                loss_fn=loss_fn,
                epochs=NUM_EPOCHS,
                device=device)

# End the timer
end_time = timer()
print(f"Total training time: {end_time - start_time:.3f} seconds")

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 0 | Train loss: 1.1014 | Train acc: 0.3917 | Validation loss: 1.0889 | Validation acc: 0.4583
Epoch: 1 | Train loss: 0.9796 | Train acc: 0.4833 | Validation loss: 0.8601 | Validation acc: 0.5833
Epoch: 2 | Train loss: 0.8870 | Train acc: 0.5792 | Validation loss: 0.8581 | Validation acc: 0.5917
Epoch: 3 | Train loss: 0.8435 | Train acc: 0.6146 | Validation loss: 0.8356 | Validation acc: 0.5917
Epoch: 4 | Train loss: 0.8177 | Train acc: 0.6229 | Validation loss: 0.8722 | Validation acc: 0.5583
Epoch: 5 | Train loss: 0.8146 | Train acc: 0.6104 | Validation loss: 0.7911 | Validation acc: 0.6167
Epoch: 6 | Train loss: 0.7388 | Train acc: 0.6750 | Validation loss: 0.8026 | Validation acc: 0.6500
Epoch: 7 | Train loss: 0.7319 | Train acc: 0.7042 | Validation loss: 0.7412 | Validation acc: 0.6833
Epoch: 8 | Train loss: 0.7246 | Train acc: 0.6521 | Validation loss: 0.7565 | Validation acc: 0.6667
Epoch: 9 | Train loss: 0.6949 | Train acc: 0.6750 | Validation loss: 0.8198 | Validation ac

In [160]:
# Create model directory
MODEL_PATH = Path("models")
MODEL_PATH.mkdir(parents=True,
                 exist_ok=True)

# Create model name & save path
MODEL_NAME = "bcd_ImageClassifier.pth"
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

# Save model's weights
print(f"Saving model to: {MODEL_SAVE_PATH}")
torch.save(obj=animalModelV2.state_dict(),
           f=MODEL_SAVE_PATH)

Saving model to: models/bcd_ImageClassifier.pth
