<a href="https://colab.research.google.com/github/Fazlibeqir/Timski-Proekt/blob/main/fetch_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install dependencies

In [None]:
# Install required libraries (only once in Colab)
!pip install -q gdown pillow requests

In [None]:
# Mount Google Drive into the Colab filesystem so later cells can use paths
# under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Imports

In [None]:
import sys
import os
import sys
import shutil
import zipfile
import pandas as pd
from PIL import Image
from tqdm import tqdm

In [None]:
# Link utils: make the helper module stored on Google Drive importable.
module_path = '/content/drive/MyDrive/Insect Detection/utils'

# Guard the append so re-running this cell does not keep adding the same
# entry to sys.path.
if module_path not in sys.path:
    sys.path.append(module_path)

# Helpers provided by the `common_funcs` module found under `module_path`
# (defined outside this notebook).
from common_funcs import (
    get_gbif_images,
    download_image,
    folder_has_enough_images,
    count_images,
    split_and_download_images
)

Download and unzip dataset from Google Drive

https://drive.google.com/file/d/17F34dlZgpaYxy04nFqEQz2ffwMQUrfxA


# 🎯 Configuration


In [None]:
# Data-source switch read by the loading cell: True unpacks the prepared ZIP
# from Drive, False downloads images via GBIF instead.
load_from_zip = True  # 👈 Set to False if you want to download via GBIF

# Root of the final dataset and one sub-folder per split.
dataset_dir = "/content/dataset/dataset/content/dataset"  # Final dataset path
train_dir, val_dir, test_dir = (
    os.path.join(dataset_dir, split_name)
    for split_name in ("train", "val", "test")
)

# Make sure the dataset root exists; a no-op when it is already there.
os.makedirs(dataset_dir, exist_ok=True)

In [None]:
# ===============================
# 📦 Option 1: Load from ZIP
# ===============================
if load_from_zip:
    print("📦 Loading dataset from Drive ZIP")
    file_id = "17F34dlZgpaYxy04nFqEQz2ffwMQUrfxA"
    output_path = "dataset.zip"

    # The archive is several GB, so reuse a valid copy left behind by an
    # earlier run instead of downloading it again. A file that is missing or
    # does not look like a ZIP archive is downloaded afresh.
    if os.path.isfile(output_path) and zipfile.is_zipfile(output_path):
        print(f"♻️ Reusing existing archive: {output_path}")
    else:
        # gdown is only needed on this branch, hence the local import.
        import gdown
        gdown.download(id=file_id, output=output_path, quiet=False)

    # Stop here when the archive is unusable: every later cell expects the
    # extracted train/val/test folders and would otherwise fail less clearly.
    if not zipfile.is_zipfile(output_path):
        print("❌ Downloaded file is not a valid ZIP archive.")
        raise RuntimeError(f"'{output_path}' is missing or is not a valid ZIP archive")

    # Extract into ./dataset (relative to the current working directory).
    shutil.unpack_archive(output_path, "dataset", 'zip')
    print("✅ Dataset extracted successfully.")

# ===============================
# 🌐 Option 2: Download from GBIF
# ===============================
else:
    print("🌐 Downloading dataset from GBIF using CSV class list")

    # ====== Load class list ======
    df = pd.read_csv("/content/drive/MyDrive/csv/0044847-241126133413365.csv")
    # Drop missing names and repeats so each species is requested exactly once.
    classes = df["Scientific name"].dropna().drop_duplicates().tolist()

    # ====== Download Images from GBIF ======
    min_images_per_class = 30   # species with fewer image URLs are skipped
    max_images_per_class = 200  # upper bound requested per species

    for cls in tqdm(classes, desc="Processing species"):
        urls = get_gbif_images(cls, limit=max_images_per_class)

        if len(urls) < min_images_per_class:
            print(f"⏭️ Skipping '{cls}' (only {len(urls)} images)")
            continue

        # The helper receives the three split folders and the per-species cap;
        # its return value is treated as a success flag.
        success = split_and_download_images(
            cls, urls,
            train_dir=train_dir,
            val_dir=val_dir,
            test_dir=test_dir,
            max_images_per_species=max_images_per_class
        )

        if success:
            print(f"✅ Processed '{cls}'")
        else:
            print(f"❌ Failed to process '{cls}'")


📦 Loading dataset from Drive ZIP


Downloading...
From (original): https://drive.google.com/uc?id=17F34dlZgpaYxy04nFqEQz2ffwMQUrfxA
From (redirected): https://drive.google.com/uc?id=17F34dlZgpaYxy04nFqEQz2ffwMQUrfxA&confirm=t&uuid=f5bca1af-61b5-4327-98bf-05adff93b2d0
To: /content/dataset.zip
100%|██████████| 4.42G/4.42G [00:51<00:00, 85.0MB/s]


✅ Dataset extracted successfully.


Clean corrupted images

Opens every image to verify it's not corrupted.

Deletes unreadable or broken images.

In [None]:

# Walk <dataset_dir>/<split>/<class>/<file> and delete every file that Pillow
# cannot open and verify as an image.
print("🧹 Cleaning corrupted images in:", dataset_dir)

for split in ["train", "val", "test"]:
    split_dir = os.path.join(dataset_dir, split)
    # A split folder may be absent; skip it rather than fail.
    if not os.path.isdir(split_dir):
        continue

    for class_folder in os.listdir(split_dir):
        class_path = os.path.join(split_dir, class_folder)
        if not os.path.isdir(class_path):
            continue

        for file in os.listdir(class_path):
            file_path = os.path.join(class_path, file)

            # Skip non-files
            if not os.path.isfile(file_path):
                continue

            try:
                with Image.open(file_path) as img:
                    img.verify()
            # Catch Exception rather than using a bare `except:` so that
            # interrupting the cell (KeyboardInterrupt) or a SystemExit does
            # not end up deleting a perfectly valid image.
            except Exception:
                os.remove(file_path)
                print(f"❌ Removed corrupted image: {file_path}")

print("✅ Done cleaning corrupted images.")


🧹 Cleaning corrupted images in: /content/dataset/dataset/content/dataset
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_12.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_6.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_0.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_4.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_13.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_7.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_3.jpg
❌ Removed corrupted image: /content/dataset/dataset/content/dataset/train/Solenopsis_pollux_Forel,_1893/image_8.j

Filter out classes with too few images

Keeps only classes with enough valid images.

Deletes folders with too few images to ensure training quality.

In [None]:

# Keep only classes whose 'train' folder passes folder_has_enough_images();
# every other class is deleted from all three splits.
print("🔍 Filtering classes with too few images based on 'train/' split...")
min_images = 5  # threshold passed to folder_has_enough_images()
valid_classes = []

# Check each class folder in 'train'.
# os.listdir() returns entries in arbitrary order; sorting makes the order of
# `valid_classes` deterministic and alphabetical.
# NOTE(review): alphabetical order presumably matches how ImageFolder-style
# loaders assign class indices — confirm against the training code.
for class_folder in sorted(os.listdir(train_dir)):
    class_path = os.path.join(train_dir, class_folder)

    # Stray files directly under train/ are not classes; leave them alone.
    if not os.path.isdir(class_path):
        continue

    if folder_has_enough_images(class_path, min_images):
        valid_classes.append(class_folder)
    else:
        print(f"❌ Removing class '{class_folder}' (not enough images)")
        # Remove class folder from all splits; ignore_errors covers splits
        # where the class folder does not exist.
        for split in ["train", "val", "test"]:
            split_path = os.path.join(dataset_dir, split, class_folder)
            shutil.rmtree(split_path, ignore_errors=True)

print(f"✅ Valid classes remaining: {len(valid_classes)}")

🔍 Filtering classes with too few images based on 'train/' split...
❌ Removing class 'Bombus_ruderatus_autumnalis_(Fabricius,_1793)' (not enough images)
❌ Removing class 'Bombus_pratorum_pratorum' (not enough images)
❌ Removing class 'Bombus_jonellus_jonellus' (not enough images)
❌ Removing class 'Bombus_defector_Skorikov,_1910' (not enough images)
❌ Removing class 'Solenopsis_pollux_Forel,_1893' (not enough images)
❌ Removing class 'Bombus_hortorum_hortorum' (not enough images)
✅ Valid classes remaining: 58


Create 'data.yaml' for YOLOv8 classification

In [None]:
# Write <dataset_dir>/data.yaml listing the split folders and an
# index -> class-name mapping for the classes that survived filtering.
print("📝 Generating data.yaml...")

data_yaml_content = f"""
path: {dataset_dir}
train: {train_dir}
val: {val_dir}
test: {test_dir}

names:
"""

# Sort so the index assigned to each class is deterministic (alphabetical)
# regardless of the order in which `valid_classes` was collected.
for idx, cls in enumerate(sorted(valid_classes)):
    # Emit the name as a single-quoted YAML scalar (a literal ' is escaped by
    # doubling it) so punctuation in a class name cannot change how the line
    # is parsed.
    quoted_name = "'" + cls.replace("'", "''") + "'"
    data_yaml_content += f"  {idx}: {quoted_name}\n"

# Explicit UTF-8 keeps non-ASCII class names intact whatever the platform's
# default encoding is; the file ends with a single trailing newline.
with open(os.path.join(dataset_dir, "data.yaml"), "w", encoding="utf-8") as f:
    f.write(data_yaml_content.strip() + "\n")

print("✅ data.yaml created!")


📝 Generating data.yaml...
✅ data.yaml created!


Compress the cleaned dataset and copy the archive to Google Drive (if mounted)

In [None]:
# Zip the contents of the local "dataset" folder into cleaned_dataset.zip;
# the call's return value (the archive path) is shown as the cell output.
shutil.make_archive(base_name="cleaned_dataset", format="zip", root_dir="dataset")

'/content/cleaned_dataset.zip'

In [None]:
# Copy the archive into the project folder on Google Drive.
# Done in Python rather than with `!cp` so that a failed copy raises instead
# of being followed by an unconditional success message.
drive_target_dir = "/content/drive/MyDrive/Insect Detection"

if os.path.isdir(drive_target_dir):
    shutil.copy("cleaned_dataset.zip", drive_target_dir)
    print("✅ Cleaned dataset zipped and saved to Drive.")
else:
    # Drive is not mounted (or the project folder is missing): nothing copied.
    print(f"⚠️ '{drive_target_dir}' not found; archive left at cleaned_dataset.zip.")

✅ Cleaned dataset zipped and saved to Drive.


In [None]:
# List the Drive project folder (human-readable sizes) to check that the
# copied archive is present.
!ls -lh /content/drive/MyDrive/Insect\ Detection/

total 4.2G
-rw------- 1 root root 4.2G May 30 18:03 cleaned_dataset.zip
-rw------- 1 root root  324 May 29 23:17 compare_all_versions.ipynb
-rw------- 1 root root  14K May 30 18:02 fetch_data.ipynb
-rw------- 1 root root  324 May 29 23:17 test_and_eval.ipynb
-rw------- 1 root root 1.6M May 30 17:59 train_v1_baseline.ipynb
-rw------- 1 root root  324 May 29 23:16 train_v2_augmented.ipynb
-rw------- 1 root root  324 May 29 23:16 train_v3_bigger_model.ipynb
drwx------ 3 root root 4.0K May 28 16:56 utils
-rw------- 1 root root 909K May 28 16:44 YOLO8.ipynb
