In [None]:
# Import all libs.
import os
import requests
from duckduckgo_search import DDGS
from pathlib import Path
import time
import dask

# Constants.
DATASET_DIR = Path(os.getcwd()) / "datasets"

In [None]:

def download_images(query, output_dir,max_results=50):
    os.makedirs(output_dir, exist_ok=True)
    
    ddg = DDGS()
    results = ddg.images(query, max_results=max_results)
    
    for idx, result in enumerate(results):
        image_url = result["image"]
        try:
            response = requests.get(image_url, timeout=10)
            response.raise_for_status()
            ext = image_url.split(".")[-1].split("?")[0][:4]
            filename = os.path.join(output_dir, f"{query.replace(' ', '_')}_{idx}.{ext}")
            with open(filename, "wb") as f:
                f.write(response.content)
            print(f"Downloaded: {filename}")
        except Exception as e:
            print(f"Failed to download {image_url}: {e}")


In [None]:
from dask.distributed import Client
client = Client(threads_per_worker=os.cpu_count() // 2, n_workers=os.cpu_count())
gadgets = ["smartphone", "tablet", "smartwatch", "headphones", "camera"]
parallel_results = []
for gadget in gadgets:
    parallel_result= dask.delayed(download_images)(gadget, output_dir=DATASET_DIR / gadget, max_results=200)
    parallel_results.append(parallel_result)


In [None]:
parallel_results = dask.compute(*parallel_results)
print("All downloads completed.")

In [43]:
from PIL import Image

def validate_images(directory):
  invalid_images = []
  for root, _, files in os.walk(directory):
    for file in files:
      file_path = os.path.join(root, file)
      try:
        with Image.open(file_path) as img:
          img.verify()  # Verify if the file is a valid image
      except Exception as e:
        invalid_images.append(file_path)
        print(f"Invalid image: {file_path} - {e}")
  return invalid_images

invalid_files = validate_images(DATASET_DIR)
print(f"Number of invalid images: {len(invalid_files)}")

# Remove invalid images
for invalid_file in invalid_files:
    try:
        os.remove(invalid_file)
        print(f"Removed invalid image: {invalid_file}")
    except Exception as e:
        print(f"Failed to remove {invalid_file}: {e}")

Invalid image: /Users/priyanshuagarwal/Work/my-projects/ml-learning/gadget-predictor/datasets/tablet/tablet_142.jpg - cannot identify image file '/Users/priyanshuagarwal/Work/my-projects/ml-learning/gadget-predictor/datasets/tablet/tablet_142.jpg'
Invalid image: /Users/priyanshuagarwal/Work/my-projects/ml-learning/gadget-predictor/datasets/smartwatch/smartwatch_57.jpg - cannot identify image file '/Users/priyanshuagarwal/Work/my-projects/ml-learning/gadget-predictor/datasets/smartwatch/smartwatch_57.jpg'
Invalid image: /Users/priyanshuagarwal/Work/my-projects/ml-learning/gadget-predictor/datasets/smartphone/smartphone_150.svg - cannot identify image file '/Users/priyanshuagarwal/Work/my-projects/ml-learning/gadget-predictor/datasets/smartphone/smartphone_150.svg'
Invalid image: /Users/priyanshuagarwal/Work/my-projects/ml-learning/gadget-predictor/datasets/camera/camera_75.jpg - cannot identify image file '/Users/priyanshuagarwal/Work/my-projects/ml-learning/gadget-predictor/datasets/ca

In [35]:
import torch
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

In [44]:
# Step 1: Transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

In [45]:
# Step 2: Load dataset
dataset = datasets.ImageFolder(DATASET_DIR, transform=transform)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [46]:
# Step 3: Use pre-trained ResNet
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, len(dataset.classes))

In [None]:
# Step 4: Train
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [51]:
for epoch in range(5):  # Train for 5 epochs
    model.train()  # Set model to training mode
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Accuracy Calculation
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(dataloader)
    epoch_acc = 100 * correct / total
    print(f"Epoch {epoch+1} - Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%")

Epoch 1 - Loss: 0.6606, Accuracy: 76.94%
Epoch 2 - Loss: 0.4397, Accuracy: 83.87%
Epoch 3 - Loss: 0.3399, Accuracy: 88.24%
Epoch 4 - Loss: 0.2457, Accuracy: 90.80%
Epoch 5 - Loss: 0.1763, Accuracy: 94.78%
