<a href="https://colab.research.google.com/github/Fazlibeqir/Timski-Proekt/blob/main/fetch_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install dependencies

In [9]:
# Install required libraries (only once in Colab)
!pip install -q gdown pygbif pillow requests

In [10]:
# Mount Google Drive into the Colab filesystem; later cells read files
# from paths underneath this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Imports

In [11]:
import sys
import os
import sys
import shutil
import zipfile
import pandas as pd
from PIL import Image
from tqdm import tqdm

In [12]:
# Link utils: make the helper module stored on Google Drive importable.
module_path = '/content/drive/MyDrive/Insect Detection/utils'

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

# Project helpers; `common_funcs` is expected to be found in `module_path`.
from common_funcs import (
    get_gbif_images,
    download_image,
    folder_has_enough_images,
    count_images
)

Download and unzip dataset from Google Drive

https://drive.google.com/file/d/17F34dlZgpaYxy04nFqEQz2ffwMQUrfxA

In [30]:
# ===== Dataset acquisition =====
# OPTION 1 (load_from_zip = True):  download a prepared ZIP from Google Drive and extract it.
# OPTION 2 (load_from_zip = False): download images per class from GBIF using a CSV class list.
load_from_zip = True  # 👈 Set to False if you want to download via GBIF

dataset_dir = "/content/dataset/dataset/content/dataset"  # Final dataset path
os.makedirs(dataset_dir, exist_ok=True)

if load_from_zip:
    print("📦 Loading dataset from Drive ZIP")
    file_id = "17F34dlZgpaYxy04nFqEQz2ffwMQUrfxA"
    output_path = "dataset.zip"

    # The archive is several GB, so only download it when no valid copy is
    # already present (e.g. when this cell is re-run in the same session).
    if zipfile.is_zipfile(output_path):
        print(f"♻️ Reusing existing archive: {output_path}")
    else:
        # Download from Google Drive (gdown is only needed for this option)
        import gdown
        gdown.download(id=file_id, output=output_path, quiet=False)

    # Stop here when the archive is unusable: continuing would leave
    # `dataset_dir` empty and make the following cells fail in confusing ways.
    if not zipfile.is_zipfile(output_path):
        raise RuntimeError("❌ Downloaded file is not a valid ZIP archive.")

    # Extract into ./dataset (relative to the current working directory);
    # `dataset_dir` above assumes the working directory is /content.
    shutil.unpack_archive(output_path, "dataset", 'zip')
    print("✅ Dataset extracted successfully.")

else:
    print("🌐 Downloading dataset from GBIF using CSV class list")

    # ====== Load class list ======
    df = pd.read_csv("/content/drive/MyDrive/csv/0044847-241126133413365.csv")
    # Drop missing names (NaN has no .replace) and duplicates (which would
    # download the same class twice into the same folder); order is preserved.
    classes = df["Scientific name"].dropna().astype(str).unique().tolist()

    # NOTE(review): this branch writes images to <dataset_dir>/<class>/ with no
    # train/val/test subfolders, while the later cells iterate over those
    # split subfolders - confirm a split step exists before relying on it.

    # ====== Download Images from GBIF ======
    # Downloading for a class stops once this many images were saved successfully.
    min_images_per_class = 50
    for cls in tqdm(classes, desc="Downloading images"):
        folder_name = cls.replace(" ", "_")
        class_folder = os.path.join(dataset_dir, folder_name)
        os.makedirs(class_folder, exist_ok=True)

        # Request more candidate URLs than needed, since some downloads may fail.
        urls = get_gbif_images(cls, limit=200)
        count = 0
        for i, url in enumerate(urls):
            filename = f"{i}.jpg"
            save_path = os.path.join(class_folder, filename)
            success = download_image(url, save_path)
            if success:
                count += 1
            if count >= min_images_per_class:
                break


📦 Loading dataset from Drive ZIP


Downloading...
From (original): https://drive.google.com/uc?id=17F34dlZgpaYxy04nFqEQz2ffwMQUrfxA
From (redirected): https://drive.google.com/uc?id=17F34dlZgpaYxy04nFqEQz2ffwMQUrfxA&confirm=t&uuid=d30648f1-e274-49ba-8ce3-bd23af7b5644
To: /content/dataset.zip
100%|██████████| 4.42G/4.42G [01:20<00:00, 54.9MB/s]


✅ Dataset extracted successfully.


Clean corrupted images

Opens every image to verify it's not corrupted.

Deletes unreadable or broken images.

In [31]:

print("🧹 Cleaning corrupted images in:", dataset_dir)

# Walk <dataset_dir>/<split>/<class>/<file> and delete every file that
# Pillow cannot open and verify.
for split in ["train", "val", "test"]:
    split_dir = os.path.join(dataset_dir, split)
    if not os.path.isdir(split_dir):
        continue  # this split is not present in the dataset

    for class_folder in os.listdir(split_dir):
        class_path = os.path.join(split_dir, class_folder)
        if not os.path.isdir(class_path):
            continue

        for file in os.listdir(class_path):
            file_path = os.path.join(class_path, file)

            # Skip non-files
            if not os.path.isfile(file_path):
                continue

            try:
                with Image.open(file_path) as img:
                    img.verify()
            except Exception:
                # `except Exception` rather than a bare `except:` so that
                # interrupting the kernel (KeyboardInterrupt) while a file is
                # being checked does not delete a perfectly valid image.
                os.remove(file_path)
                print(f"❌ Removed corrupted image: {file_path}")

print("✅ Done cleaning corrupted images.")


🧹 Cleaning corrupted images in: /content/dataset/dataset/content/dataset
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_8.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_7.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_0.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_9.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_6.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_2.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_12.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_10.j

Filter out classes with too few images

Keeps only classes with enough valid images.

Deletes folders with too few images to ensure training quality.

In [32]:

print("🔍 Filtering classes with too few images based on 'train/' split...")
min_images=5
train_dir = os.path.join(dataset_dir, "train")
valid_classes = []

# Check each class folder in 'train'.
# Sorted so that `valid_classes` has a stable, alphabetical order on every run
# (os.listdir returns entries in arbitrary order).
for class_folder in sorted(os.listdir(train_dir)):
    class_path = os.path.join(train_dir, class_folder)

    # Stray files directly inside 'train/' are not classes.
    if not os.path.isdir(class_path):
        continue

    # presumably True when the folder holds at least `min_images` images -
    # verify against common_funcs.folder_has_enough_images
    if folder_has_enough_images(class_path, min_images):
        valid_classes.append(class_folder)
    else:
        print(f"❌ Removing class '{class_folder}' (not enough images)")
        # Remove class folder from all splits; ignore_errors covers splits
        # in which this class folder does not exist.
        for split in ["train", "val", "test"]:
            split_path = os.path.join(dataset_dir, split, class_folder)
            shutil.rmtree(split_path, ignore_errors=True)

print(f"✅ Valid classes remaining: {len(valid_classes)}")

🔍 Filtering classes with too few images based on 'train/' split...
❌ Removing class 'Bombus_defector_Skorikov,_1910' (not enough images)
❌ Removing class 'Bombus_hortorum_hortorum' (not enough images)
❌ Removing class 'Bombus_ruderatus_autumnalis_(Fabricius,_1793)' (not enough images)
❌ Removing class 'Bombus_pratorum_pratorum' (not enough images)
❌ Removing class 'Solenopsis_pollux_Forel,_1893' (not enough images)
❌ Removing class 'Bombus_jonellus_jonellus' (not enough images)
✅ Valid classes remaining: 58


Create 'data.yaml' for YOLOv8 classification

In [33]:
print("📝 Generating data.yaml...")

# Header of the YAML file: dataset root plus the split locations relative to it.
# The splits live in the 'train', 'val' and 'test' subfolders of `dataset_dir`,
# so each entry points at its own subfolder rather than at the root.
yaml_lines = [
    f"path: {dataset_dir}",
    "train: train",
    "val: val",
    "test: test",
    "",
    "names:",
]

# One "index: name" entry per remaining class. Sorting keeps the index
# assignment stable and alphabetical regardless of directory listing order.
for idx, cls in enumerate(sorted(valid_classes)):
    class_name = cls.replace(" ", "_")
    yaml_lines.append(f"  {idx}: {class_name}")

data_yaml_content = "\n".join(yaml_lines) + "\n"

data_yaml_path = os.path.join(dataset_dir, "data.yaml")
# Explicit UTF-8 so the output does not depend on the platform default encoding.
with open(data_yaml_path, "w", encoding="utf-8") as f:
    f.write(data_yaml_content)

print("✅ data.yaml created!")


📝 Generating data.yaml...
✅ data.yaml created!
