In [10]:
# Step 1: Mount Google Drive (if not already mounted)
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Define your download directory in Drive
import os
import zipfile
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm # For a nice progress bar

# Define the main directory in your Drive for all GBIF downloads
gbif_root_dir = '/content/drive/My Drive/gbif_raw_downloads2'
os.makedirs(gbif_root_dir, exist_ok=True)

# Define a specific directory for the Galapagos Fur Seal dataset
dataset_name = 'Arctocephalus_galapagoensis_DwC-A'
dataset_drive_path = os.path.join(gbif_root_dir, dataset_name)
os.makedirs(dataset_drive_path, exist_ok=True)
print(f"Dataset will be processed in: {dataset_drive_path}")

# --- IMPORTANT: Paste your GBIF Download Link here ---
GBIF_DOWNLOAD_URL = 'https://api.gbif.org/v1/occurrence/download/request/0010447-250515123054153.zip'


dwca_filename = os.path.join(dataset_drive_path, 'Arctocephalus_galapagoensis_dataset.zip')

print(f"Downloading DwC-A to: {dwca_filename}")
# Download the zip file directly to your Drive
# The -O flag saves the file with the specified name
!wget -O "{dwca_filename}" "{GBIF_DOWNLOAD_URL}"

print("\nDwC-A download complete. Starting extraction...")

# Unzip the downloaded DwC-A directly in Drive
try:
    with zipfile.ZipFile(dwca_filename, 'r') as zip_ref:
        zip_ref.extractall(dataset_drive_path)
    print("DwC-A extracted successfully.")
except zipfile.BadZipFile:
    print("Error: The downloaded file is not a valid zip file. Please check the GBIF_DOWNLOAD_URL.")
    exit()
except Exception as e:
    print(f"An error occurred during unzipping: {e}")
    exit()

# Path to the multimedia.txt file within the extracted archive
multimedia_file = os.path.join(dataset_drive_path, 'multimedia.txt')

if not os.path.exists(multimedia_file):
    print(f"Error: {multimedia_file} not found. Ensure the DwC-A contained it.")
    print("This usually means no media records were found for your search criteria on GBIF.")
else:
    # Read the multimedia.txt file into a Pandas DataFrame
    multimedia_df = pd.read_csv(multimedia_file, sep='\t', low_memory=False)

    print(f"Loaded {len(multimedia_df)} records from multimedia.txt")
    print("\nColumns available in multimedia.txt:")
    print(multimedia_df.columns.tolist()) # <--- IMPORTANT: Print available columns

    # Determine which column to use for organizing images
    # We prefer 'taxonKey', then 'scientificName', otherwise a generic folder
    image_label_column = None
    if 'taxonKey' in multimedia_df.columns:
        image_label_column = 'taxonKey'
    elif 'scientificName' in multimedia_df.columns: # As a fallback for organization
        image_label_column = 'scientificName'
    else:
        print("\nWarning: Neither 'taxonKey' nor 'scientificName' found in multimedia.txt.")
        print("Images will be saved to a single folder without species-specific subfolders.")

    # Filter for 'StillImage' (photos) and relevant columns
    # 'identifier' is the image URL
    # 'license' is crucial for legal use
    required_cols = ['identifier', 'license', 'type']
    if image_label_column:
        required_cols.append(image_label_column)

    # Ensure all required columns exist before selecting
    missing_cols = [col for col in required_cols if col not in multimedia_df.columns]
    if missing_cols:
        print(f"Error: Missing required columns in multimedia.txt: {missing_cols}")
        print("Please check your GBIF download settings or the file content.")
        exit()

    image_data = multimedia_df[
        (multimedia_df['type'] == 'StillImage') &
        (multimedia_df['identifier'].notna())
    ][required_cols].dropna(subset=['identifier'])

    # Filter to only keep images for the specific taxonKey if 'taxonKey' is present
    ARCTOCEPHALUS_GALAPAGOENSIS_TAXON_KEY = 2433473
    if image_label_column == 'taxonKey':
        image_data = image_data[image_data['taxonKey'] == ARCTOCEPHALUS_GALAPAGOENSIS_TAXON_KEY]
        print(f"Found {len(image_data)} 'StillImage' URLs for TaxonKey {ARCTOCEPHALUS_GALAPAGOENSIS_TAXON_KEY}.")
    elif image_label_column == 'scientificName':
        # If using scientificName, you might want to filter by the name as well
        image_data = image_data[image_data['scientificName'] == 'Arctocephalus galapagoensis']
        print(f"Found {len(image_data)} 'StillImage' URLs for scientificName 'Arctocephalus galapagoensis'.")
    else:
        print(f"Found {len(image_data)} 'StillImage' URLs (no specific taxonKey filter applied).")


    if len(image_data) == 0:
        print("No images found matching your criteria in the multimedia.txt file.")
    else:
        # Create a directory to store the actual image files
        image_output_dir = os.path.join(dataset_drive_path, 'images')
        os.makedirs(image_output_dir, exist_ok=True)

        # Define the final image saving directory
        if image_label_column:
            # Create a subfolder based on taxonKey or scientificName
            # Make sure to convert label to string for path creation
            save_dir_name = str(ARCTOCEPHALUS_GALAPAGOENSIS_TAXON_KEY) if image_label_column == 'taxonKey' else 'Arctocephalus_galapagoensis'
            final_image_save_dir = os.path.join(image_output_dir, save_dir_name)
        else:
            final_image_save_dir = os.path.join(image_output_dir, 'unlabeled_images') # Fallback

        os.makedirs(final_image_save_dir, exist_ok=True)
        print(f"Images will be saved to: {final_image_save_dir}")

        # --- Function to download a single image ---
        def download_image(row):
            image_url = row['identifier']
            image_name = os.path.basename(image_url).split('?')[0]
            if '.' not in image_name: # Add a default extension if none present
                image_name += '.jpg'

            filepath = os.path.join(final_image_save_dir, image_name)

            if os.path.exists(filepath):
                return f"Skipped: {filepath} already exists."

            try:
                response = requests.get(image_url, stream=True, timeout=15)
                response.raise_for_status()

                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                return f"Downloaded: {filepath}"
            except requests.exceptions.Timeout:
                return f"Failed to download {image_url}: Timeout."
            except requests.exceptions.ConnectionError:
                return f"Failed to download {image_url}: Connection error."
            except requests.exceptions.RequestException as e:
                status_code = response.status_code if 'response' in locals() else 'N/A'
                return f"Failed to download {image_url}: HTTP Error {status_code} - {e}"
            except Exception as e:
                return f"An unexpected error occurred for {image_url}: {e}"

        # --- Concurrent downloading ---
        max_workers = 10

        print(f"\nStarting concurrent image download with {max_workers} workers...")
        results = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(download_image, row) for index, row in image_data.iterrows()]

            for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading Images"):
                results.append(future.result())

        print("\nImage download process complete. Sample results:")
        for res in results[:min(5, len(results))]:
            print(res)
        if len(results) > 5:
            print("...")

        downloaded_count = sum(1 for r in results if r.startswith("Downloaded:"))
        skipped_count = sum(1 for r in results if r.startswith("Skipped:"))
        failed_count = len(results) - downloaded_count - skipped_count

        print(f"\nSummary:")
        print(f"  Successfully downloaded: {downloaded_count}")
        print(f"  Skipped (already exists): {skipped_count}")
        print(f"  Failed: {failed_count}")
        print(f"  Total URLs processed: {len(results)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset will be processed in: /content/drive/My Drive/gbif_raw_downloads2/Arctocephalus_galapagoensis_DwC-A
Downloading DwC-A to: /content/drive/My Drive/gbif_raw_downloads2/Arctocephalus_galapagoensis_DwC-A/Arctocephalus_galapagoensis_dataset.zip
--2025-05-20 21:21:53--  https://api.gbif.org/v1/occurrence/download/request/0010447-250515123054153.zip
Resolving api.gbif.org (api.gbif.org)... 130.225.43.2
Connecting to api.gbif.org (api.gbif.org)|130.225.43.2|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://occurrence-download.gbif.org/occurrence/download/request/0010447-250515123054153.zip [following]
--2025-05-20 21:21:53--  https://occurrence-download.gbif.org/occurrence/download/request/0010447-250515123054153.zip
Resolving occurrence-download.gbif.org (occurrence-download.gbif.org)... 130.225.43.36
Connecting to occurr

Downloading Images:   0%|          | 0/592 [00:00<?, ?it/s]


Image download process complete. Sample results:
Downloaded: /content/drive/My Drive/gbif_raw_downloads2/Arctocephalus_galapagoensis_DwC-A/images/unlabeled_images/original.jpg
Skipped: /content/drive/My Drive/gbif_raw_downloads2/Arctocephalus_galapagoensis_DwC-A/images/unlabeled_images/original.jpg already exists.
Downloaded: /content/drive/My Drive/gbif_raw_downloads2/Arctocephalus_galapagoensis_DwC-A/images/unlabeled_images/original.jpg
Downloaded: /content/drive/My Drive/gbif_raw_downloads2/Arctocephalus_galapagoensis_DwC-A/images/unlabeled_images/original.jpg
Downloaded: /content/drive/My Drive/gbif_raw_downloads2/Arctocephalus_galapagoensis_DwC-A/images/unlabeled_images/original.jpg
...

Summary:
  Successfully downloaded: 72
  Skipped (already exists): 520
  Failed: 0
  Total URLs processed: 592


In [11]:
# Step 1: Mount Google Drive (if not already mounted)
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Define your download directory in Drive
import os
import zipfile
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm # For a nice progress bar

# Define the main directory in your Drive for all GBIF downloads
gbif_root_dir = '/content/drive/My Drive/gbif_raw_downloads'
os.makedirs(gbif_root_dir, exist_ok=True)

# Define a specific directory for the Zalophus wollebaeki dataset
dataset_name = 'Zalophus_wollebaeki_DwC-A'
dataset_drive_path = os.path.join(gbif_root_dir, dataset_name)
os.makedirs(dataset_drive_path, exist_ok=True)
print(f"Dataset will be processed in: {dataset_drive_path}")

# --- IMPORTANT: PASTE YOUR NEW GBIF DOWNLOAD LINK FOR ZALOPHUS WOLLEBAEKI HERE ---
GBIF_DOWNLOAD_URL = 'https://api.gbif.org/v1/occurrence/download/request/0010490-250515123054153.zip'
# Example: GBIF_DOWNLOAD_URL = 'https://api.gbif.org/v1/occurrence/download/request/0012345-250502131345914.zip'


dwca_filename = os.path.join(dataset_drive_path, 'Zalophus_wollebaeki_dataset.zip')

print(f"Downloading DwC-A to: {dwca_filename}")
# Download the zip file directly to your Drive
!wget -O "{dwca_filename}" "{GBIF_DOWNLOAD_URL}"

print("\nDwC-A download complete. Starting extraction...")

# Unzip the downloaded DwC-A directly in Drive
try:
    with zipfile.ZipFile(dwca_filename, 'r') as zip_ref:
        zip_ref.extractall(dataset_drive_path)
    print("DwC-A extracted successfully.")
except zipfile.BadZipFile:
    print("Error: The downloaded file is not a valid zip file. Please check the GBIF_DOWNLOAD_URL.")
    exit()
except Exception as e:
    print(f"An error occurred during unzipping: {e}")
    exit()

# Path to the multimedia.txt file within the extracted archive
multimedia_file = os.path.join(dataset_drive_path, 'multimedia.txt')

if not os.path.exists(multimedia_file):
    print(f"Error: {multimedia_file} not found. Ensure the DwC-A contained it.")
    print("This usually means no media records were found for your search criteria on GBIF.")
else:
    # Read the multimedia.txt file into a Pandas DataFrame
    multimedia_df = pd.read_csv(multimedia_file, sep='\t', low_memory=False)

    print(f"Loaded {len(multimedia_df)} records from multimedia.txt")
    print("\nColumns available in multimedia.txt:")
    print(multimedia_df.columns.tolist()) # <--- IMPORTANT: Print available columns

    # Determine which column to use for organizing images
    image_label_column = None
    if 'taxonKey' in multimedia_df.columns:
        image_label_column = 'taxonKey'
    elif 'scientificName' in multimedia_df.columns:
        image_label_column = 'scientificName'
    else:
        print("\nWarning: Neither 'taxonKey' nor 'scientificName' found in multimedia.txt.")
        print("Images will be saved to a single folder without species-specific subfolders.")

    # Filter for 'StillImage' (photos) and relevant columns
    required_cols = ['identifier', 'license', 'type']
    if image_label_column:
        required_cols.append(image_label_column)

    # Ensure all required columns exist before selecting
    missing_cols = [col for col in required_cols if col not in multimedia_df.columns]
    if missing_cols:
        print(f"Error: Missing required columns in multimedia.txt: {missing_cols}")
        print("Please check your GBIF download settings or the file content.")
        exit()

    image_data = multimedia_df[
        (multimedia_df['type'] == 'StillImage') &
        (multimedia_df['identifier'].notna())
    ][required_cols].dropna(subset=['identifier'])

    # Filter to only keep images for the specific taxonKey if 'taxonKey' is present
    # This is the taxonKey for Zalophus wollebaeki
    ZALOPHUS_WOLLEBAEKI_TAXON_KEY = 5218765
    if image_label_column == 'taxonKey':
        image_data = image_data[image_data['taxonKey'] == ZALOPHUS_WOLLEBAEKI_TAXON_KEY]
        print(f"Found {len(image_data)} 'StillImage' URLs for TaxonKey {ZALOPHUS_WOLLEBAEKI_TAXON_KEY}.")
    elif image_label_column == 'scientificName':
        image_data = image_data[image_data['scientificName'] == 'Zalophus wollebaeki']
        print(f"Found {len(image_data)} 'StillImage' URLs for scientificName 'Zalophus wollebaeki'.")
    else:
        print(f"Found {len(image_data)} 'StillImage' URLs (no specific taxonKey/scientificName filter applied).")


    if len(image_data) == 0:
        print("No images found matching your criteria in the multimedia.txt file.")
    else:
        # Create a directory to store the actual image files
        image_output_dir = os.path.join(dataset_drive_path, 'images')
        os.makedirs(image_output_dir, exist_ok=True)

        # Define the final image saving directory
        if image_label_column:
            # Create a subfolder based on taxonKey or scientificName
            save_dir_name = str(ZALOPHUS_WOLLEBAEKI_TAXON_KEY) if image_label_column == 'taxonKey' else 'Zalophus_wollebaeki'
            final_image_save_dir = os.path.join(image_output_dir, save_dir_name)
        else:
            final_image_save_dir = os.path.join(image_output_dir, 'unlabeled_images') # Fallback

        os.makedirs(final_image_save_dir, exist_ok=True)
        print(f"Images will be saved to: {final_image_save_dir}")

        # --- Function to download a single image ---
        def download_image(row):
            image_url = row['identifier']
            image_name = os.path.basename(image_url).split('?')[0]
            if '.' not in image_name:
                image_name += '.jpg'

            filepath = os.path.join(final_image_save_dir, image_name)

            if os.path.exists(filepath):
                return f"Skipped: {filepath} already exists."

            try:
                response = requests.get(image_url, stream=True, timeout=15)
                response.raise_for_status()

                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                return f"Downloaded: {filepath}"
            except requests.exceptions.Timeout:
                return f"Failed to download {image_url}: Timeout."
            except requests.exceptions.ConnectionError:
                return f"Failed to download {image_url}: Connection error."
            except requests.exceptions.RequestException as e:
                status_code = response.status_code if 'response' in locals() else 'N/A'
                return f"Failed to download {image_url}: HTTP Error {status_code} - {e}"
            except Exception as e:
                return f"An unexpected error occurred for {image_url}: {e}"

        # --- Concurrent downloading ---
        max_workers = 10

        print(f"\nStarting concurrent image download with {max_workers} workers...")
        results = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(download_image, row) for index, row in image_data.iterrows()]

            for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading Images"):
                results.append(future.result())

        print("\nImage download process complete. Sample results:")
        for res in results[:min(5, len(results))]:
            print(res)
        if len(results) > 5:
            print("...")

        downloaded_count = sum(1 for r in results if r.startswith("Downloaded:"))
        skipped_count = sum(1 for r in results if r.startswith("Skipped:"))
        failed_count = len(results) - downloaded_count - skipped_count

        print(f"\nSummary:")
        print(f"  Successfully downloaded: {downloaded_count}")
        print(f"  Skipped (already exists): {skipped_count}")
        print(f"  Failed: {failed_count}")
        print(f"  Total URLs processed: {len(results)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset will be processed in: /content/drive/My Drive/gbif_raw_downloads/Zalophus_wollebaeki_DwC-A
Downloading DwC-A to: /content/drive/My Drive/gbif_raw_downloads/Zalophus_wollebaeki_DwC-A/Zalophus_wollebaeki_dataset.zip
--2025-05-20 21:48:21--  https://api.gbif.org/v1/occurrence/download/request/0010490-250515123054153.zip
Resolving api.gbif.org (api.gbif.org)... 130.225.43.2
Connecting to api.gbif.org (api.gbif.org)|130.225.43.2|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://occurrence-download.gbif.org/occurrence/download/request/0010490-250515123054153.zip [following]
--2025-05-20 21:48:22--  https://occurrence-download.gbif.org/occurrence/download/request/0010490-250515123054153.zip
Resolving occurrence-download.gbif.org (occurrence-download.gbif.org)... 130.225.43.36
Connecting to occurrence-download.gbif.org (oc

Downloading Images:   0%|          | 0/6793 [00:00<?, ?it/s]


Image download process complete. Sample results:
Skipped: /content/drive/My Drive/gbif_raw_downloads/Zalophus_wollebaeki_DwC-A/images/unlabeled_images/original.jpeg already exists.
Skipped: /content/drive/My Drive/gbif_raw_downloads/Zalophus_wollebaeki_DwC-A/images/unlabeled_images/original.jpg already exists.
Skipped: /content/drive/My Drive/gbif_raw_downloads/Zalophus_wollebaeki_DwC-A/images/unlabeled_images/original.jpg already exists.
Skipped: /content/drive/My Drive/gbif_raw_downloads/Zalophus_wollebaeki_DwC-A/images/unlabeled_images/original.jpeg already exists.
Skipped: /content/drive/My Drive/gbif_raw_downloads/Zalophus_wollebaeki_DwC-A/images/unlabeled_images/original.jpeg already exists.
...

Summary:
  Successfully downloaded: 326
  Skipped (already exists): 6467
  Failed: 0
  Total URLs processed: 6793


In [12]:
import os
import shutil
import random
from sklearn.model_selection import train_test_split # A convenient way to split data

# Ensure Google Drive is mounted
from google.colab import drive
drive.mount('/content/drive')

print("Starting dataset organization...")

# --- 1. Define Paths and Class Mappings (UPDATED SOURCE PATHS) ---

# Base directory where your downloaded images are
base_source_dir = '/content/drive/My Drive/gbif_raw_downloads/'

# Mapping of taxonKey (used for organization) to human-readable class name
# The 'source_path' now points to the 'unlabeled_images' folder within each species' download
species_info = {
    '2433473': { # TaxonKey for Arctocephalus galapagoensis
        'name': 'Arctocephalus_galapagoensis',
        'source_path': os.path.join(base_source_dir, 'Arctocephalus_galapagoensis_DwC-A', 'images', 'unlabeled_images')
    },
    '5218765': { # TaxonKey for Zalophus wollebaeki
        'name': 'Zalophus_wollebaeki',
        'source_path': os.path.join(base_source_dir, 'Zalophus_wollebaeki_DwC-A', 'images', 'unlabeled_images')
    }
}

# New organized dataset base directory in your Drive
organized_dataset_base_dir = '/content/drive/My Drive/my_galapagos_seals_dataset'
os.makedirs(organized_dataset_base_dir, exist_ok=True)


# --- 2. Define Split Ratios ---
train_split_ratio = 0.8
val_split_ratio = 0.1
test_split_ratio = 0.1


# --- 3. Create the New Dataset Structure ---
subdirs = ['train', 'validation', 'test']

for subdir in subdirs:
    for species_key, info in species_info.items():
        class_name = info['name']
        target_path = os.path.join(organized_dataset_base_dir, subdir, class_name)
        os.makedirs(target_path, exist_ok=True)
        print(f"Created directory: {target_path}")

print("\nNew dataset directory structure created.")

# --- 4. Populate the Dataset ---

print("\nStarting to copy and split images...")

for species_key, info in species_info.items():
    species_name = info['name']
    source_folder = info['source_path']
    print(f"\nProcessing {species_name} from: {source_folder}")

    # Get all image file paths for the current species
    image_files = [f for f in os.listdir(source_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff'))]
    random.shuffle(image_files) # Shuffle to ensure random split

    total_images = len(image_files)
    print(f"Found {total_images} images for {species_name}.")

    if total_images == 0:
        print(f"No images found in {source_folder}. Skipping {species_name}.")
        continue

    # Perform the split using sklearn's train_test_split
    # First split: train vs. (validation + test)
    train_files, val_test_files = train_test_split(
        image_files,
        train_size=train_split_ratio,
        random_state=42, # For reproducibility
        shuffle=True
    )

    # Second split: validation vs. test from the remaining files
    if len(val_test_files) > 0:
        # Calculate test_size relative to the remaining val_test_files
        test_size_relative = test_split_ratio / (val_split_ratio + test_split_ratio)
        val_files, test_files = train_test_split(
            val_test_files,
            test_size=test_size_relative,
            random_state=42,
            shuffle=True
        )
    else:
        val_files = []
        test_files = []

    print(f"  Train: {len(train_files)} images")
    print(f"  Validation: {len(val_files)} images")
    print(f"  Test: {len(test_files)} images")


    # Copy files to their respective directories
    for filename in train_files:
        src = os.path.join(source_folder, filename)
        dst = os.path.join(organized_dataset_base_dir, 'train', species_name, filename)
        shutil.copyfile(src, dst)

    for filename in val_files:
        src = os.path.join(source_folder, filename)
        dst = os.path.join(organized_dataset_base_dir, 'validation', species_name, filename)
        shutil.copyfile(src, dst)

    for filename in test_files:
        src = os.path.join(source_folder, filename)
        dst = os.path.join(organized_dataset_base_dir, 'test', species_name, filename)
        shutil.copyfile(src, dst)

print("\nDataset organization complete!")

# --- 5. Verify Counts (Optional but Recommended) ---
print("\nVerifying final counts in the new dataset structure:")
for subdir in subdirs:
    print(f"\n--- {subdir.upper()} Set ---")
    for species_key, info in species_info.items():
        species_name = info['name']
        target_path = os.path.join(organized_dataset_base_dir, subdir, species_name)
        count = len(os.listdir(target_path))
        print(f"  {species_name}: {count} images")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Starting dataset organization...
Created directory: /content/drive/My Drive/my_galapagos_seals_dataset/train/Arctocephalus_galapagoensis
Created directory: /content/drive/My Drive/my_galapagos_seals_dataset/train/Zalophus_wollebaeki
Created directory: /content/drive/My Drive/my_galapagos_seals_dataset/validation/Arctocephalus_galapagoensis
Created directory: /content/drive/My Drive/my_galapagos_seals_dataset/validation/Zalophus_wollebaeki
Created directory: /content/drive/My Drive/my_galapagos_seals_dataset/test/Arctocephalus_galapagoensis
Created directory: /content/drive/My Drive/my_galapagos_seals_dataset/test/Zalophus_wollebaeki

New dataset directory structure created.

Starting to copy and split images...

Processing Arctocephalus_galapagoensis from: /content/drive/My Drive/gbif_raw_downloads/Arctocephalus_galapagoensis_DwC-A/images/unlabeled_images
Fou

In [13]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os

print("TensorFlow Version:", tf.__version__)

# Ensure Google Drive is mounted
from google.colab import drive
drive.mount('/content/drive')

print("Starting data loading and augmentation setup...")

# --- 1. Define Dataset Paths and Parameters ---

# Base directory where your organized dataset is located
organized_dataset_base_dir = '/content/drive/My Drive/my_galapagos_seals_dataset'

# Image parameters
IMG_HEIGHT = 224 # Common size for many pre-trained models (e.g., MobileNetV2, ResNet)
IMG_WIDTH = 224  # Keep height and width consistent
BATCH_SIZE = 32  # Number of images to process at once during training

# --- 2. Load Datasets using image_dataset_from_directory ---

# Load Training Data
print("\nLoading Training Dataset...")
train_ds = tf.keras.utils.image_dataset_from_directory(
    directory=os.path.join(organized_dataset_base_dir, 'train'),
    labels='inferred',       # Labels are inferred from the directory structure (folder names)
    label_mode='int',        # Labels will be integers (0, 1, ...)
    image_size=(IMG_HEIGHT, IMG_WIDTH), # Resize images to this uniform size
    interpolation='nearest', # Interpolation method for resizing
    batch_size=BATCH_SIZE,
    shuffle=True,            # Shuffle training data
    seed=123                 # For reproducibility
)

# Load Validation Data
print("Loading Validation Dataset...")
val_ds = tf.keras.utils.image_dataset_from_directory(
    directory=os.path.join(organized_dataset_base_dir, 'validation'),
    labels='inferred',
    label_mode='int',
    image_size=(IMG_HEIGHT, IMG_WIDTH),
    interpolation='nearest',
    batch_size=BATCH_SIZE,
    shuffle=False,           # No need to shuffle validation data
    seed=123
)

# Load Test Data
print("Loading Test Dataset...")
test_ds = tf.keras.utils.image_dataset_from_directory(
    directory=os.path.join(organized_dataset_base_dir, 'test'),
    labels='inferred',
    label_mode='int',
    image_size=(IMG_HEIGHT, IMG_WIDTH),
    interpolation='nearest',
    batch_size=BATCH_SIZE,
    shuffle=False,           # No need to shuffle test data
    seed=123
)

# Get class names (inferred from folder names)
class_names = train_ds.class_names
print(f"\nDiscovered Class Names: {class_names}")
print(f"Number of Classes: {len(class_names)}")


# --- 3. Preprocessing (Rescaling Pixels) ---

# All Keras Applications models expect pixel values in `[0, 1]` or `[-1, 1]` range.
# For simplicity, we'll scale to `[0, 1]` here.
# Some pre-trained models have their own `preprocess_input` function if they expect `[-1, 1]`.
# We'll apply this as part of our data pipeline.

# Scaling factor to convert pixel values from [0, 255] to [0, 1]
normalization_layer = layers.Rescaling(1./255)

# Apply normalization to all datasets
# The .map() method applies a function to each element of the dataset
train_ds = train_ds.map(lambda x, y: (normalization_layer(x), y))
val_ds = val_ds.map(lambda x, y: (normalization_layer(x), y))
test_ds = test_ds.map(lambda x, y: (normalization_layer(x), y))

print("Images normalized to [0, 1] range.")


# --- 4. Data Augmentation (Applied ONLY to Training Data) ---

# Define the augmentation layers
# These layers are applied randomly during training to each image
data_augmentation = keras.Sequential([
    layers.RandomFlip("horizontal"),        # Randomly flip images horizontally
    layers.RandomRotation(0.1),             # Randomly rotate images by up to 10% (36 degrees)
    layers.RandomZoom(0.1),                 # Randomly zoom in/out by up to 10%
    layers.RandomContrast(0.2),             # Randomly adjust contrast
    # layers.RandomTranslation(height_factor=0.1, width_factor=0.1), # Randomly shift images
    # layers.RandomBrightness(factor=0.2), # Randomly adjust brightness
], name="data_augmentation")

print("\nData augmentation pipeline defined.")
print("Example augmentation layers: RandomFlip, RandomRotation, RandomZoom, RandomContrast.")


# Apply augmentation to the training dataset
# We put augmentation AFTER normalization in the pipeline for consistency.
# However, for performance, it can sometimes be beneficial to apply augmentation on the GPU.
# In TensorFlow 2.x, these layers run on the CPU by default, which is usually fine.
train_ds = train_ds.map(lambda x, y: (data_augmentation(x, training=True), y),
                        num_parallel_calls=tf.data.AUTOTUNE)

print("Data augmentation applied to the training dataset.")


# --- 5. Configure Datasets for Performance ---

# Use .cache() to keep images in memory after the first epoch, speeding up subsequent epochs.
# Use .prefetch() to overlap data preprocessing and model execution.
# AUTOTUNE lets TensorFlow determine the optimal buffer size.
train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

print("\nDatasets configured for optimal performance (caching and prefetching).")

# --- Verify a Batch (Optional) ---
# You can uncomment the following lines to inspect a batch of images and labels

# import matplotlib.pyplot as plt
# import numpy as np

# for images, labels in train_ds.take(1): # Take one batch from the training dataset
#     plt.figure(figsize=(10, 10))
#     for i in range(min(9, len(images))): # Display up to 9 images
#         ax = plt.subplot(3, 3, i + 1)
#         # Denormalize for display if necessary, or just display [0,1] images
#         plt.imshow(images[i].numpy())
#         plt.title(class_names[labels[i]])
#         plt.axis("off")
#     plt.show()

print("\nDataset loading and augmentation setup complete!")
print("Your datasets (train_ds, val_ds, test_ds) are now ready for model training.")
print("Next, we can look at building your neural network using transfer learning.")

TensorFlow Version: 2.18.0
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Starting data loading and augmentation setup...

Loading Training Dataset...
Found 281 files belonging to 2 classes.
Loading Validation Dataset...
Found 35 files belonging to 2 classes.
Loading Test Dataset...
Found 36 files belonging to 2 classes.

Discovered Class Names: ['Arctocephalus_galapagoensis', 'Zalophus_wollebaeki']
Number of Classes: 2
Images normalized to [0, 1] range.

Data augmentation pipeline defined.
Example augmentation layers: RandomFlip, RandomRotation, RandomZoom, RandomContrast.
Data augmentation applied to the training dataset.

Datasets configured for optimal performance (caching and prefetching).

Dataset loading and augmentation setup complete!
Your datasets (train_ds, val_ds, test_ds) are now ready for model training.
Next, we can look at building your neural network using transfer learning.


In [14]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Change: Import ResNet50V2 instead of MobileNetV2
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.models import Model
import os

print("TensorFlow Version:", tf.__version__)

# Ensure Google Drive is mounted (if not already)
from google.colab import drive
drive.mount('/content/drive', force_remount=True) # Force remount just in case

print("\nStarting neural network model building with Transfer Learning (using ResNet50V2)...")

# --- Re-define parameters from previous step (important for model input) ---
IMG_HEIGHT = 224
IMG_WIDTH = 224
BATCH_SIZE = 32
num_classes = 2 # We have 2 classes: Arctocephalus_galapagoensis, Zalophus_wollebaeki

# --- Load the pre-trained base model (ResNet50V2) ---
# Change: Use ResNet50V2
base_model = ResNet50V2(
    input_shape=(IMG_HEIGHT, IMG_WIDTH, 3), # 3 for RGB channels
    include_top=False, # Remove the original classification head
    weights='imagenet' # Load weights pre-trained on ImageNet
)

print(f"\nBase model ({base_model.name}) loaded successfully.")
print(f"Base model output shape: {base_model.output_shape}")

# --- Freeze the base model ---
base_model.trainable = False
print(f"Base model layers set to non-trainable.")

# --- Build the custom classification head ---
inputs = keras.Input(shape=(IMG_HEIGHT, IMG_WIDTH, 3))

# Change: Use ResNet V2 specific preprocessing function
# This layer will scale input pixels to the range expected by ResNet50V2
x = tf.keras.applications.resnet_v2.preprocess_input(inputs)

# Pass the preprocessed inputs through the base model
# training=False ensures base model layers (like BatchNorm) run in inference mode
x = base_model(x, training=False)

# Add a GlobalAveragePooling2D layer to flatten the feature maps
x = layers.GlobalAveragePooling2D()(x)

# Add a Dense layer for classification
x = layers.Dropout(0.2)(x) # Dropout to prevent overfitting

# Output layer for 2 classes with 'softmax' activation
outputs = layers.Dense(num_classes, activation='softmax')(x)

# Create the full model
model = Model(inputs, outputs)

print("\nCustom classification head built and attached to the base model.")

# --- Compile the model ---
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001), # Small learning rate for transfer learning
    loss=keras.losses.SparseCategoricalCrossentropy(), # For integer labels
    metrics=['accuracy']
)

print("\nModel compiled successfully.")

# --- Display Model Summary ---
model.summary()

print("\nNeural network model setup complete with ResNet50V2!")
print("Your model is now ready for training.")
print("Next, we can proceed to train the model on your prepared datasets.")

TensorFlow Version: 2.18.0
Mounted at /content/drive

Starting neural network model building with Transfer Learning (using ResNet50V2)...
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50v2_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94668760/94668760[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step

Base model (resnet50v2) loaded successfully.
Base model output shape: (None, 7, 7, 2048)
Base model layers set to non-trainable.

Custom classification head built and attached to the base model.

Model compiled successfully.



Neural network model setup complete with ResNet50V2!
Your model is now ready for training.
Next, we can proceed to train the model on your prepared datasets.


In [15]:
import tensorflow as tf
from tensorflow import keras
import os

print("TensorFlow Version:", tf.__version__)

# Ensure Google Drive is mounted (if not already)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

print("\nStarting model training...")

# --- Define Training Parameters ---
EPOCHS = 30 # Number of times to iterate over the entire training dataset
            # This is an initial guess; EarlyStopping will help prevent overfitting.

# --- Define Callbacks ---

# Directory to save your trained models
model_save_dir = '/content/drive/My Drive/my_galapagos_seals_model'
os.makedirs(model_save_dir, exist_ok=True) # Create the directory if it doesn't exist

# Model Checkpoint: Saves the best model based on validation accuracy
checkpoint_filepath = os.path.join(model_save_dir, 'best_galapagos_seals_resnet_model.h5')
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False, # Save the entire model
    monitor='val_accuracy',  # Monitor validation accuracy
    mode='max',              # We want to maximize validation accuracy
    save_best_only=True,     # Only save when it's the best so far
    verbose=1                # Print messages when saving
)

# Early Stopping: Stops training if validation accuracy doesn't improve
early_stopping_callback = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=10,             # Number of epochs with no improvement after which training will be stopped.
    restore_best_weights=True, # Restore model weights from the epoch with the best value of the monitored quantity.
    verbose=1
)

# List of callbacks to use during training
callbacks_list = [model_checkpoint_callback, early_stopping_callback]

# --- Train the Model ---

print(f"\nTraining for {EPOCHS} epochs with patience of {early_stopping_callback.patience}...")
print(f"Best model will be saved to: {checkpoint_filepath}")

history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    callbacks=callbacks_list
)

print("\nModel training complete!")

# --- Evaluate the Model on the Test Set ---
print("\nEvaluating model on the test dataset...")
loss, accuracy = model.evaluate(test_ds)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

print("\nTraining summary:")
print(f"Final training accuracy: {history.history['accuracy'][-1]:.4f}")
print(f"Final validation accuracy: {history.history['val_accuracy'][-1]:.4f}")

print("\nNext, you can:")
print("1. Load the best saved model using `tf.keras.models.load_model()`.")
print("2. Visualize training history (loss and accuracy curves).")
print("3. Perform fine-tuning to potentially achieve higher accuracy.")
print("4. Make predictions on new images.")

TensorFlow Version: 2.18.0
Mounted at /content/drive

Starting model training...

Training for 30 epochs with patience of 10...
Best model will be saved to: /content/drive/My Drive/my_galapagos_seals_model/best_galapagos_seals_resnet_model.h5
Epoch 1/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.1838 - loss: 0.8981
Epoch 1: val_accuracy improved from -inf to 0.14286, saving model to /content/drive/My Drive/my_galapagos_seals_model/best_galapagos_seals_resnet_model.h5




[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 7s/step - accuracy: 0.1854 - loss: 0.8962 - val_accuracy: 0.1429 - val_loss: 0.8326
Epoch 2/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7s/step - accuracy: 0.2311 - loss: 0.8161
Epoch 2: val_accuracy improved from 0.14286 to 0.20000, saving model to /content/drive/My Drive/my_galapagos_seals_model/best_galapagos_seals_resnet_model.h5




[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 8s/step - accuracy: 0.2339 - loss: 0.8143 - val_accuracy: 0.2000 - val_loss: 0.7508
Epoch 3/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - accuracy: 0.3443 - loss: 0.7392
Epoch 3: val_accuracy improved from 0.20000 to 0.65714, saving model to /content/drive/My Drive/my_galapagos_seals_model/best_galapagos_seals_resnet_model.h5




[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 7s/step - accuracy: 0.3490 - loss: 0.7377 - val_accuracy: 0.6571 - val_loss: 0.6811
Epoch 4/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.5882 - loss: 0.6714
Epoch 4: val_accuracy improved from 0.65714 to 0.82857, saving model to /content/drive/My Drive/my_galapagos_seals_model/best_galapagos_seals_resnet_model.h5




[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 7s/step - accuracy: 0.5916 - loss: 0.6706 - val_accuracy: 0.8286 - val_loss: 0.6231
Epoch 5/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.7938 - loss: 0.6110
Epoch 5: val_accuracy improved from 0.82857 to 0.85714, saving model to /content/drive/My Drive/my_galapagos_seals_model/best_galapagos_seals_resnet_model.h5




[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 7s/step - accuracy: 0.7952 - loss: 0.6102 - val_accuracy: 0.8571 - val_loss: 0.5766
Epoch 6/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.8543 - loss: 0.5563
Epoch 6: val_accuracy did not improve from 0.85714
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 6s/step - accuracy: 0.8529 - loss: 0.5568 - val_accuracy: 0.8571 - val_loss: 0.5397
Epoch 7/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.8641 - loss: 0.5181
Epoch 7: val_accuracy did not improve from 0.85714
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 6s/step - accuracy: 0.8628 - loss: 0.5188 - val_accuracy: 0.8571 - val_loss: 0.5106
Epoch 8/30
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - accuracy: 0.8679 - loss: 0.4842
Epoch 8: val_accuracy did not improve from