In [None]:
import os
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time

class ImageCountExceededException(Exception):
    """Signal that a subspecies has gone past its allowed image count."""

def check_image_count(subspecies, subspecies_counters, threshold=1000):
    """Raise if the running image count for ``subspecies`` is above ``threshold``.

    Args:
        subspecies: Key to look up in ``subspecies_counters``.
        subspecies_counters: Mapping from subspecies key to images saved so far.
        threshold: Highest count that is still accepted (the check is strict,
            so a count equal to ``threshold`` does not raise).

    Raises:
        ImageCountExceededException: The stored count is greater than ``threshold``.
        KeyError: ``subspecies`` is not a key of ``subspecies_counters``.
    """
    current_total = subspecies_counters[subspecies]
    if not current_total > threshold:
        return
    raise ImageCountExceededException(
        f"Subspecies {subspecies} has more than {threshold} images downloaded."
    )

def download_images(query, num_images, output_dir, subspecies_counters, threshold=1000):
    """Scrape Google Images results for ``query`` and save them under ``output_dir``.

    Result pages are requested 20 results at a time. Every ``<img>`` tag after
    the first one whose ``src``/``data-src`` is an http(s) URL is fetched and
    written as ``"{query}_{index}.jpg"``. The loop ends when ``num_images``
    files were saved, a result page has no usable ``<img>`` tags, the search
    request fails, or the per-subspecies threshold is exceeded.

    Args:
        query: Search phrase; also used as the file-name prefix and as the key
            into ``subspecies_counters``.
        num_images: Maximum number of images to save for this query.
        output_dir: Destination directory (created if missing).
        subspecies_counters: Mutable mapping of query -> images saved so far.
            It is updated in place; a missing key is treated as 0.
        threshold: Passed to ``check_image_count``; once the counter for
            ``query`` is above it, downloading for this query stops.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    # Create the directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Set the headers to mimic a browser visit
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Format the query for URL encoding
    query_encoded = urllib.parse.quote(query)

    # Without a timeout a stalled connection would block the whole run forever.
    request_timeout = 30  # seconds

    downloaded = 0
    page = 0

    try:
        while downloaded < num_images:
            # Construct the Google Image search URL with pagination
            search_url = f"https://www.google.com/search?q={query_encoded}&tbm=isch&start={page*20}"

            # Get the HTML content of the search page; give up on this query
            # if the page cannot be retrieved (network error, 4xx/5xx, timeout).
            try:
                response = requests.get(search_url, headers=headers, timeout=request_timeout)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"Could not fetch search results for {query}: {e}")
                break
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find all image elements
            img_tags = soup.find_all('img')[1:]  # Skip the first image which is usually the Google logo

            if not img_tags:
                break

            for img in img_tags:
                if downloaded >= num_images:
                    break

                img_url = img.get('src') or img.get('data-src')
                if not img_url or not img_url.startswith('http'):
                    continue

                # Only the network fetch and the file write are best-effort.
                # The threshold check below must stay OUTSIDE this try block,
                # otherwise ImageCountExceededException would be swallowed here.
                try:
                    img_response = requests.get(img_url, timeout=request_timeout)
                    # Do not store an HTML error page under a .jpg name.
                    img_response.raise_for_status()
                    with open(os.path.join(output_dir, f"{query}_{downloaded}.jpg"), 'wb') as handler:
                        handler.write(img_response.content)
                except (requests.RequestException, OSError) as e:
                    print(f"Could not download image {downloaded}: {e}")
                    continue

                print(f"Downloaded {query}_{downloaded}.jpg")
                downloaded += 1
                subspecies_counters[query] = subspecies_counters.get(query, 0) + 1

                # Check if the number of images exceeds the threshold
                check_image_count(query, subspecies_counters, threshold)

            # Increase page count to move to the next set of images
            page += 1

            # Pause to avoid overwhelming the server and to mimic human-like behavior
            time.sleep(5)
    except ImageCountExceededException as e:
        # Threshold reached: report it and stop downloading for this query.
        print(e)

# Example usage: Download images related to plant subspecies
plant_species = {
    "rose": ["Rosa gallica", "Rosa chinensis", "Rosa rugosa", "Rosa damascena", "Rosa multiflora"],
    "orchid": ["Phalaenopsis orchids", "Dendrobium orchids", "Cattleya orchids", "Vanda orchids", "Oncidium orchids"],
    "oak": ["Quercus robur", "Quercus petraea", "Quercus suber", "Quercus alba", "Quercus coccinea"],
    "maple": ["Acer saccharum", "Acer rubrum", "Acer platanoides", "Acer palmatum", "Acer macrophyllum"],
    "pine": ["Pinus sylvestris", "Pinus nigra", "Pinus ponderosa", "Pinus strobus", "Pinus contorta"],
    "cactus": ["Carnegiea gigantea", "Echinocactus grusonii", "Opuntia ficus-indica", "Ferocactus wislizeni", "Mammillaria hahniana"],
    "lily": ["Lilium candidum", "Lilium lancifolium", "Lilium longiflorum", "Lilium martagon", "Lilium speciosum"],
    "fern": ["Polypodium vulgare", "Dryopteris filix-mas", "Athyrium filix-femina", "Pteridium aquilinum", "Adiantum pedatum"],
    "bamboo": ["Phyllostachys edulis", "Bambusa vulgaris", "Dendrocalamus asper", "Gigantochloa apus", "Chimonobambusa quadrangularis"],
    "palm": ["Phoenix dactylifera", "Cocos nucifera", "Elaeis guineensis", "Washingtonia robusta", "Areca catechu"]
}

# Image budget per species; it is divided evenly among that species' subspecies below.
num_images = 1000
output_dir = "/drive/My Drive/Colab Notebooks/plant_subspecies_dataset"

# Dictionary to keep track of the number of images downloaded for each subspecies.
# Keys are "<species> <subspecies>", the same string that is passed as the search query.
subspecies_counters = {
    f"{species} {subspecies}": 0
    for species, subspecies_list in plant_species.items()
    for subspecies in subspecies_list
}

# Download every subspecies into <output_dir>/<species>/<subspecies>/
for species, subspecies_list in plant_species.items():
    # The budget variable is `num_images`; the previous name `num_images_total`
    # was never defined and raised NameError on the first iteration.
    num_images_per_subspecies = num_images // len(subspecies_list)
    for subspecies in subspecies_list:
        download_images(
            f"{species} {subspecies}",
            num_images_per_subspecies,
            os.path.join(output_dir, species, subspecies),
            subspecies_counters,
        )



Downloaded cactus Ferocactus wislizeni_0.jpg
Downloaded cactus Ferocactus wislizeni_1.jpg
Downloaded cactus Ferocactus wislizeni_2.jpg
Downloaded cactus Ferocactus wislizeni_3.jpg
Downloaded cactus Ferocactus wislizeni_4.jpg
Downloaded cactus Ferocactus wislizeni_5.jpg
Downloaded cactus Ferocactus wislizeni_6.jpg
Downloaded cactus Ferocactus wislizeni_7.jpg
Downloaded cactus Ferocactus wislizeni_8.jpg
Downloaded cactus Ferocactus wislizeni_9.jpg
Downloaded cactus Ferocactus wislizeni_10.jpg
Downloaded cactus Ferocactus wislizeni_11.jpg
Downloaded cactus Ferocactus wislizeni_12.jpg
Downloaded cactus Ferocactus wislizeni_13.jpg
Downloaded cactus Ferocactus wislizeni_14.jpg
Downloaded cactus Ferocactus wislizeni_15.jpg
Downloaded cactus Ferocactus wislizeni_16.jpg
Downloaded cactus Ferocactus wislizeni_17.jpg
Downloaded cactus Ferocactus wislizeni_18.jpg
Downloaded cactus Ferocactus wislizeni_19.jpg
Downloaded cactus Ferocactus wislizeni_20.jpg
Downloaded cactus Ferocactus wislizeni_21.jpg

In [None]:
import os

# Root folder of the downloaded dataset; count_images() is called on it further down.
output_dir = "/drive/My Drive/Colab Notebooks/plant_subspecies_dataset"

def count_images(output_dir):
    """Count ``.jpg`` files below ``output_dir``, grouped by file-name prefix.

    The prefix is everything before the LAST underscore of the file name, so
    ``"oak Quercus alba_12.jpg"`` is counted under ``"oak Quercus alba"`` and a
    prefix that itself contains underscores is kept intact. A ``.jpg`` name
    without any underscore is counted under its full file name.

    Args:
        output_dir: Directory that is walked recursively.

    Returns:
        dict mapping prefix -> number of matching ``.jpg`` files. Empty when
        the directory does not exist or contains no ``.jpg`` files.
    """
    subspecies_counts = {}

    for _root, _dirs, files in os.walk(output_dir):
        for file in files:
            if not file.endswith(".jpg"):
                continue
            # Strip only the trailing "_<index>.jpg" part; splitting on the
            # first underscore would truncate names that contain one.
            subspecies_name = file.rsplit("_", 1)[0]
            subspecies_counts[subspecies_name] = subspecies_counts.get(subspecies_name, 0) + 1

    return subspecies_counts

# Tally the images currently on disk
subspecies_counts = count_images(output_dir)

# Report one line per name prefix, in the order the tally produced them
for subspecies in subspecies_counts:
    count = subspecies_counts[subspecies]
    print(f"{subspecies}: {count} images downloaded")



bamboo: 877 images downloaded
bamboo Dendrocalamus asper: 176 images downloaded
bamboo Bambusa vulgaris: 174 images downloaded
rose Rosa damascena: 174 images downloaded
pine Pinus sylvestris: 175 images downloaded
pine Pinus strobus: 174 images downloaded
pine Pinus ponderosa: 177 images downloaded
pine Pinus nigra: 174 images downloaded
pine Pinus contorta: 175 images downloaded
palm Elaeis guineensis: 174 images downloaded
palm Cocos nucifera: 192 images downloaded
orchid Phalaenopsis orchids: 174 images downloaded
orchid Oncidium orchids: 174 images downloaded
orchid Dendrobium orchids: 174 images downloaded
oak Quercus suber: 173 images downloaded
oak Quercus coccinea: 174 images downloaded
oak Quercus alba: 175 images downloaded
maple Acer palmatum: 184 images downloaded
maple Acer macrophyllum: 174 images downloaded
lily Lilium martagon: 174 images downloaded
lily Lilium speciosum: 174 images downloaded
fern Athyrium filix-femina: 174 images downloaded
cactus: 893 images downloaded

In [None]:
import os
import random
import shutil

def split_dataset(input_dir, output_dir, train_percent=0.7, val_percent=0.15, test_percent=0.15, seed=None):
    """Copy a ``<species>/<subspecies>/<image>`` tree into train/val/test folders.

    Each subspecies folder is shuffled and split on its own, and its images are
    copied (not moved) to ``<output_dir>/<split>/<species>/``, i.e. the
    subspecies level is flattened into the species folder.

    Entries of ``input_dir`` and of a species folder that are not directories,
    and entries of a subspecies folder that are not regular files, are skipped
    instead of raising.

    Args:
        input_dir: Root of the source tree.
        output_dir: Root under which ``train``, ``val`` and ``test`` are created.
        train_percent: Fraction of each subspecies copied to ``train``
            (rounded down to a whole number of images).
        val_percent: Fraction of each subspecies copied to ``val``
            (rounded down to a whole number of images).
        test_percent: Accepted for interface compatibility but not consulted:
            ``test`` always receives every image left over after the train and
            val slices.
        seed: Optional seed for a private random generator so the split is
            reproducible. With ``None`` the global ``random`` state is used.

    Returns:
        None.
    """
    # Create directories for train, validation, and test sets
    split_dirs = {
        'train': os.path.join(output_dir, 'train'),
        'val': os.path.join(output_dir, 'val'),
        'test': os.path.join(output_dir, 'test'),
    }
    for split_dir in split_dirs.values():
        os.makedirs(split_dir, exist_ok=True)

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    # Listings are sorted so that a given seed always yields the same split,
    # independent of the order the file system returns entries in.
    for species in sorted(os.listdir(input_dir)):
        species_dir = os.path.join(input_dir, species)
        if not os.path.isdir(species_dir):
            continue  # stray file next to the species folders

        # Create corresponding directories in train, val, test sets
        for split_dir in split_dirs.values():
            os.makedirs(os.path.join(split_dir, species), exist_ok=True)

        for subspecies in sorted(os.listdir(species_dir)):
            subspecies_dir = os.path.join(species_dir, subspecies)
            if not os.path.isdir(subspecies_dir):
                continue  # stray file directly inside a species folder

            # Regular files only: shutil.copy cannot copy a nested directory.
            images = sorted(
                name for name in os.listdir(subspecies_dir)
                if os.path.isfile(os.path.join(subspecies_dir, name))
            )
            shuffle(images)  # Shuffle to randomize the order

            # Split images into train, val, test sets
            num_images = len(images)
            num_train = int(train_percent * num_images)
            num_val = int(val_percent * num_images)

            assignments = {
                'train': images[:num_train],
                'val': images[num_train:num_train + num_val],
                'test': images[num_train + num_val:],  # whatever remains
            }

            # Copy images to respective directories
            for split_name, names in assignments.items():
                destination = os.path.join(split_dirs[split_name], species)
                for img in names:
                    shutil.copy(os.path.join(subspecies_dir, img), os.path.join(destination, img))

# Example usage: copy the downloaded dataset into train/val/test folders
# using the default 70/15/15 proportions.
input_dir = "/drive/My Drive/Colab Notebooks/plant_subspecies_dataset"
output_dir = "/drive/My Drive/Colab Notebooks/plant_subspecies_split"

split_dataset(input_dir=input_dir, output_dir=output_dir)

