### Progress:
1. Split the whole original dataset (9,000) images into train (80%), val (10%), test (10%)
2. Downsize Wildlife_Animals_Split --> Wildlife_Animals_Downsized: train (1000 images), val (20%), test (20%)
3. Create ground truth labels (.txt files) for the Dataset_Downsized images and place inside ./Wildlife_Animals_Downsized/labels
4. Create data.yaml file for yolov8 model

### TO-DO: 
1. Train model and run inference to gather annotated images + .txt files
2. Pass validation data and run inference to gather annotated images + .txt files. If predicted boxes and is classified well, move on
3. Pass test data and run inference to gather annoted images + .txt files. This set is used for the professor to run but double check to see if model runs good on unseen data
4. Figure out way that once the animal is classified, return is animal is harmful or harmless. Probably hardcode this and use a mapping
5. Export model and logic and figure out android simulator

### Flow:
1. Generate ground truth labels for the downsized dataset: train, val, test
2. Train the model on training set. Run inference on the training set to manually view how well the model detected and classified the animal in the image. Hypertune parameters if needed
3. Validate model by passing validation data to model. Evaluate the Precision, Recall, F1-Score, mAP, IoU. Run inference on validation data to view how well model detected and classified on unseen data. Hypertune paramters if needed
4. Test the model by passing testing data to model. Evaluate the Precision, Recall, F1-Score, mAP, IoU. Run inference on test data to view how well model detected and classified on unseen data.

In [2]:
import os
import random
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shutil
from sklearn.model_selection import train_test_split
from ultralytics import YOLO
import seaborn as sns

## Do not need to run. Check below for starting point

#### Splitting into train, val, test and downsizing to train (1000 images), val (20%), test(20%)

In [None]:
# Path to the folder containing class folders
dataset_path = './CMPE258_Wildlife_Animals/Dataset'
output_path = './CMPE258_Wildlife_Animals/Wildlife_Animals_Split/images'
os.makedirs(output_path, exist_ok=True)

In [4]:
# split original dataset into 80-10-10 : train, val, test

import os
import shutil
from sklearn.model_selection import train_test_split

# Ratios for train, validation, and test sets
train_ratio = 0.8
val_ratio = 0.10
test_ratio = 0.10

# Create train, val, and test directories
train_dir = os.path.join(output_path, 'train')
val_dir = os.path.join(output_path, 'val')
test_dir = os.path.join(output_path, 'test')
os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Split each class folder
for class_name in os.listdir(dataset_path):
    class_path = os.path.join(dataset_path, class_name)
    if not os.path.isdir(class_path):
        continue

    # Get all image file paths
    images = [os.path.join(class_path, img) for img in os.listdir(class_path) if img.endswith(('jpg', 'jpeg', 'png'))]

    # First split into train and temp (validation + test)
    train_images, temp_images = train_test_split(images, test_size=(val_ratio + test_ratio), random_state=42)
    # Then split temp into validation and test sets
    val_images, test_images = train_test_split(temp_images, test_size=test_ratio / (val_ratio + test_ratio), random_state=42)

    # Create class directories in train, val, and test
    train_class_dir = os.path.join(train_dir, class_name)
    val_class_dir = os.path.join(val_dir, class_name)
    test_class_dir = os.path.join(test_dir, class_name)
    os.makedirs(train_class_dir, exist_ok=True)
    os.makedirs(val_class_dir, exist_ok=True)
    os.makedirs(test_class_dir, exist_ok=True)

    # Move images to respective directories
    for img in train_images:
        shutil.copy(img, train_class_dir)
    for img in val_images:
        shutil.copy(img, val_class_dir)
    for img in test_images:
        shutil.copy(img, test_class_dir)

print("Dataset split into train, validation, and test complete!")


Dataset split into train, validation, and test complete!


In [None]:
# downsize training set to 1000 images only

# Define paths
parent_dir = './Wildlife_Animals_Split/images/train'

# Collect all valid image paths
image_files = []
for class_name in os.listdir(parent_dir):
    class_path = os.path.join(parent_dir, class_name)
    if not os.path.isdir(class_path):
        continue
    for img_name in os.listdir(class_path):
        img_path = os.path.join(class_path, img_name)
        if img_path.lower().endswith(('jpg', 'jpeg', 'png')):
            image_files.append(img_path)

# Randomly select 1,000 images
selected_images = random.sample(image_files, 1000)

# Clear the existing train folder and downsized dataset folder
downsized_images_dir = './Wildlife_Animals_Downsized/images/train'
os.makedirs(downsized_images_dir, exist_ok=True)

# Move selected images and their labels to downsized dataset
for img_path in selected_images:
    img_name = os.path.basename(img_path)
    class_name = os.path.basename(os.path.dirname(img_path))

    # Create class directories
    os.makedirs(os.path.join(downsized_images_dir, class_name), exist_ok=True)

    # Move image
    shutil.copy(img_path, os.path.join(downsized_images_dir, class_name, img_name))

In [None]:
# downsize validation set to take 20% of original validation set

# Define paths
parent_dir = './Wildlife_Animals_Split/images/val'

# Collect all valid image paths
image_files = []
for class_name in os.listdir(parent_dir):
    class_path = os.path.join(parent_dir, class_name)
    if not os.path.isdir(class_path):
        continue
    for img_name in os.listdir(class_path):
        img_path = os.path.join(class_path, img_name)
        if img_path.lower().endswith(('jpg', 'jpeg', 'png')):
            image_files.append(img_path)


subset_size = int(0.2 * len(image_files))  # 20% of the original validation set
selected_images = random.sample(image_files, subset_size)

# Clear the existing train folder and downsized dataset folder
downsized_images_dir = './Wildlife_Animals_Downsized/images/val'
os.makedirs(downsized_images_dir, exist_ok=True)

# Move selected images and their labels to downsized dataset
for img_path in selected_images:
    img_name = os.path.basename(img_path)
    class_name = os.path.basename(os.path.dirname(img_path))

    # Create class directories
    os.makedirs(os.path.join(downsized_images_dir, class_name), exist_ok=True)

    # Move image
    shutil.copy(img_path, os.path.join(downsized_images_dir, class_name, img_name))

In [None]:
# downsize test set to take 20% of original test set

# Define paths
parent_dir = './Wildlife_Animals_Split/images/test'

# Collect all valid image paths
image_files = []
for class_name in os.listdir(parent_dir):
    class_path = os.path.join(parent_dir, class_name)
    if not os.path.isdir(class_path):
        continue
    for img_name in os.listdir(class_path):
        img_path = os.path.join(class_path, img_name)
        if img_path.lower().endswith(('jpg', 'jpeg', 'png')):
            image_files.append(img_path)


subset_size = int(0.2 * len(image_files))  # 20% of the original validation set
selected_images = random.sample(image_files, subset_size)

# Clear the existing train folder and downsized dataset folder
downsized_images_dir = './Wildlife_Animals_Downsized/images/test'
os.makedirs(downsized_images_dir, exist_ok=True)

# Move selected images and their labels to downsized dataset
for img_path in selected_images:
    img_name = os.path.basename(img_path)
    class_name = os.path.basename(os.path.dirname(img_path))

    # Create class directories
    os.makedirs(os.path.join(downsized_images_dir, class_name), exist_ok=True)

    # Move image
    shutil.copy(img_path, os.path.join(downsized_images_dir, class_name, img_name))

In [None]:
labels_dir = './Wildlife_Animals_Downsized/labels'
os.makedirs(labels_dir, exist_ok=True)

# create directories for labels

# List of animal names
animals = [
    "bald_eagle", "black_bear", "bobcat", "cheetah", "cougar", "deer", "elk", 
    "gray_fox", "Horse", "hyena", "lion", "raccoon", "red_fox", "rhino", "tiger", 
    "wolf", "zebra"
]

# Define the base directory where the folders will be created
labels_dir = './Wildlife_Animals_Downsized/labels'

# Create the main directories for train, val, and test
for folder in ['train', 'val', 'test']:
    folder_path = os.path.join(labels_dir, folder)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    
    # Create subdirectories for each animal in train, val, and test folders
    for animal in animals:
        animal_folder = os.path.join(folder_path, animal)
        if not os.path.exists(animal_folder):
            os.makedirs(animal_folder)

print("Directories created successfully!")


Directories created successfully!


#### Counting # of images after split and downsizing

In [52]:
import os

def count_images_in_folders(dataset_folder):
    # Initialize a counter for images
    image_count = 0
    
    # Walk through the directory to count images in subfolders
    for root, dirs, files in os.walk(dataset_folder):
        for file in files:
            # You can adjust the image file extensions based on your dataset (e.g., '.jpg', '.png')
            if file.endswith('.jpg') or file.endswith('.png'):
                image_count += 1
    
    return image_count

# Paths to your validation and test sets
original_train_folder = "./Wildlife_Animals_Split/images/train"
original_val_folder = "./Wildlife_Animals_Split/images/val"
original_test_folder = "./Wildlife_Animals_Split/images/test"

downsized_train_folder = "./Wildlife_Animals_Downsized/images/train"
downsized_val_folder = "./Wildlife_Animals_Downsized/images/val"
downsized_test_folder = "./Wildlife_Animals_Downsized/images/test"

# Count the images in each set
train_image_count = count_images_in_folders(original_train_folder)
val_image_count = count_images_in_folders(original_val_folder)
test_image_count = count_images_in_folders(original_test_folder)

downsized_train_folder_count = count_images_in_folders(downsized_train_folder)
downsized_val_folder_count = count_images_in_folders(downsized_val_folder)
downsized_test_folder_count = count_images_in_folders(downsized_test_folder)

# Output the results
print(f"Number of images in original_train_folder set: {train_image_count}")
print(f"Number of images in original_val_folder set: {val_image_count}")
print(f"Number of images in original_test_folder set: {test_image_count}")

print(f"\nNumber of images in downsized_train_folder set: {downsized_train_folder_count}")
print(f"Number of images in downsized_val_folder set: {downsized_val_folder_count}")
print(f"Number of images in downsized_test_folder set: {downsized_test_folder_count}")


Number of images in original_train_folder set: 6836
Number of images in original_val_folder set: 855
Number of images in original_test_folder set: 857

Number of images in downsized_train_folder set: 873
Number of images in downsized_val_folder set: 161
Number of images in downsized_test_folder set: 167


## DO NOT NEED TO RUN CODE ABOVE AS IT SPLITS AND DOWNSIZES DATA. Already have split and downsized data in GitHub
## Start here

#### Created ground truth labels for Wildlife_Animals_Downsized. Placed in 'labels_dir'
#### Use cell below to change the labels for .txt files in each animal folder
Use .yaml file for the corresponding animal labels

In [None]:
# use this code to change the class label numbers to its respective label animal number
# change the annotations_folder path to where you saved the txt files and change the mapping

import os

# Define the label mapping (old label -> new label)
label_mapping = {
    0: 11,  # Change '0' to '1'
}

# Path to your annotations folder
annotations_folder = 'C:/Users/britn/CMPE258_Wildlife_Animals/Wildlife_Animals_Downsized/labels/train/raccoon'

# Iterate through each annotation file
for filename in os.listdir(annotations_folder):
    if filename.endswith(".txt"):
        file_path = os.path.join(annotations_folder, filename)
        
        with open(file_path, 'r') as file:
            lines = file.readlines()
        
        # Update the class label in each line
        new_lines = []
        for line in lines:
            parts = line.split()
            old_label = int(parts[0])
            new_label = label_mapping.get(old_label, old_label)  # Get the new label, or keep old if not mapped
            parts[0] = str(new_label)
            new_lines.append(" ".join(parts))
        
        # Save the updated lines
        with open(file_path, 'w') as file:
            file.writelines(new_lines)

print("Class labels updated.")


Class labels updated.


#### Training YOLOv8 Model

In [None]:
# Load a pre-trained YOLO model or a custom model
model = YOLO("yolov8m.pt")  # You can use a pre-trained model like yolov8m.pt

downsized_yaml = './Wildlife_Animals_Downsized/dataset.yaml'
# Directory where the annotated images will be saved
output_dir = './Wildlife_Animals_Downsized/output_images/train'

# Training
try:
    # Train the model on the bald eagle dataset
    results = model.train(
    data=downsized_yaml,
    epochs=25,
    imgsz=320,
    batch=16,
    lr0=0.001,
    lrf=0.5,
    save=True,
    patience=5   # Early stopping patience: stop training if no improvement after 5 epochs
)

    print("Training completed successfully.")
except Exception as e:
    print(f"Training failed: {e}")


#### Access the trained model, run inference on the TRAINED DATA to save TRAIN annotated images and .txt files in directory
TO:DO - after training, check the annotated images in output_images/train to see if the bounding boxes + labels are good

In [None]:
# Load the trained YOLO model
model = YOLO('C:/Users/britn/CMPE258FinalProject/runs/detect/train3/weights/best.pt')  # Path to the best trained model

downsized_train_folder = "./Wildlife_Animals_Downsized/images/train"
downsized_val_folder = "./Wildlife_Animals_Downsized/images/val"
downsized_test_folder = "./Wildlife_Animals_Downsized/images/test"

# Path to images
train_images_dir = './Wildlife_Animals_Downsized/images/train'
output_images_dir = './Wildlife_Animals_Downsized/output_images/train'
output_labels_dir = './Wildlife_Animals_Downsized/output_labels/train'  # Directory to save .txt files

# Ensure the output directory exists
os.makedirs(output_images_dir, exist_ok=True)
os.makedirs(output_labels_dir, exist_ok=True)

# Collect all image paths from the bald_eagle folder
image_files = []
for img_name in os.listdir(train_images_dir):
    img_path = os.path.join(train_images_dir, img_name)
    if img_path.lower().endswith(('jpg', 'jpeg', 'png')):  # Check if the file is an image
        image_files.append(img_path)

print(f"Found {len(image_files)} images.")

# Run inference and save annotated images and .txt files
for img_path in image_files:
    img_name = os.path.basename(img_path)
    print(f"Running inference on {img_name}")

    # Run inference on the image
    results = model(img_path)

    # Ensure results is not a list before calling save
    if isinstance(results, list):
        results = results[0]  # YOLOv8 inference results are wrapped in a list, access the first item

    # Save the annotated image
    save_img_dir = os.path.join(output_images_dir, img_name)
    results.save(save_img_dir)  # Saves to the output directory with bounding boxes and class labels

    # Extract the bounding box details and save them as a .txt file
    txt_file_path = os.path.join(output_labels_dir, img_name.replace('.jpg', '.txt').replace('.jpeg', '.txt').replace('.png', '.txt'))

    # Get the bounding boxes and class labels
    labels = results.names  # Get class names (labels) from the model
    boxes = results.boxes.xywh  # Get the box coordinates in x, y, width, height format
    confs = results.boxes.conf  # Get the confidence scores
    classes = results.boxes.cls  # Get the predicted class indices

    with open(txt_file_path, 'w') as txt_file:
        for i in range(len(boxes)):
            # Write each detection as a line in the .txt file
            class_id = int(classes[i])  # Convert class to integer ID
            x_center, y_center, width, height = boxes[i]  # Bounding box values
            conf = confs[i]  # Confidence score

            # YOLO format: class_id x_center y_center width height (normalized)
            txt_file.write(f"{class_id} {x_center} {y_center} {width} {height} {conf}\n")

    print(f"Annotated image and .txt file saved for: {img_name}")
    print(f"Saved to: {save_img_dir}")
    print(f"Saved .txt to: {txt_file_path}")
    print(f"Contents of image output directory: {os.listdir(output_images_dir)}")

#### After training, pass the VALIDATION DATA to the model

#### Access the trained model, run inference on the VALIDATION DATA to save VAL annotated images and .txt files in directory
TO-DO: Manually check the annotated images in output_images/val to see if the bounding boxes + labels are good
    - If bounding boxes and labels are not good on val data, go back to hypertune the model then train again. *Make sure model is not fixated on training and val sets

#### Once model runs well for val dataset, check on test set (maybe)
Note: using test set for the professor to run when assignment is turned in so not sure if it's a good idea to run the test set already. probably is though to double check the model is working well