In [1]:
import os
import tensorflow as tf
from tensorflow.keras import layers, models, applications
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from sklearn.model_selection import train_test_split
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import ultralytics

ModuleNotFoundError: No module named 'ultralytics'

In [None]:
dataset_dir = "dataset"
raw_img_dir = os.path.join(dataset_dir, "raw-img")

# The dataset counts as present only when raw-img is a non-empty directory.
# isdir (rather than exists) keeps os.listdir from raising on a stray file.
if os.path.isdir(raw_img_dir) and len(os.listdir(raw_img_dir)) > 0:
    print(f"Dataset already exists at {raw_img_dir}. Skipping download.")
else:
    print("Dataset not found. Attempting to download...")
    try:
        import kagglehub

        # kagglehub's `path` argument selects a single file *inside* the
        # dataset; it is not a destination directory. Download the whole
        # dataset into kagglehub's cache, then copy the images to raw_img_dir.
        dataset_path = kagglehub.dataset_download("alessiocorrado99/animals10")
        print(f"Dataset downloaded to: {dataset_path}")

        # NOTE(review): assumes the archive has a top-level "raw-img" folder
        # holding one sub-folder per class -- confirm against the Kaggle page.
        downloaded_raw_img_dir = os.path.join(dataset_path, "raw-img")
        if os.path.isdir(downloaded_raw_img_dir):
            os.makedirs(dataset_dir, exist_ok=True)
            shutil.copytree(downloaded_raw_img_dir, raw_img_dir, dirs_exist_ok=True)
            print(f"Dataset copied to: {raw_img_dir}")
        else:
            print(f"No 'raw-img' folder found in {dataset_path}.")
            print(f"Please copy the class folders into {raw_img_dir} manually.")
    except ImportError:
        # Nothing can be downloaded without kagglehub, so say so plainly
        # instead of announcing a manual download that never happens.
        print("kagglehub not found. Please install it using 'pip install kagglehub'")
        print(f"You can also manually download the dataset from Kaggle to {dataset_dir}")
    except Exception as e:
        # Best-effort cell: report the failure and let the reader decide
        # how to obtain the data instead of aborting the notebook.
        print(f"Error downloading dataset: {e}")
        print("Please ensure kagglehub is properly configured and the dataset exists.")
        print(f"You can also manually download the dataset from Kaggle to {dataset_dir}")

In [None]:
# Build one annotation row per image: the image path plus its class label,
# where the label is the name of the folder the image sits in.
data = []
# os.listdir returns entries in arbitrary order; sorting makes the CSV (and
# therefore any seeded train/valid/test split built from it) reproducible.
for class_name in sorted(os.listdir(f"{dataset_dir}/raw-img")):
    # Skip stray files sitting next to the class folders; listing a file
    # would raise NotADirectoryError.
    if not os.path.isdir(f"{dataset_dir}/raw-img/{class_name}"):
        continue
    for filename in sorted(os.listdir(f"{dataset_dir}/raw-img/{class_name}")):
        data.append({"filename": f"{dataset_dir}/raw-img/{class_name}/{filename}", "class": class_name})

# Explicit columns keep the CSV header present even when no image was found,
# so a later pd.read_csv does not fail on a header-less file.
df = pd.DataFrame(data, columns=["filename", "class"])
df.to_csv(f"{dataset_dir}/_annotations.csv", index=False)

In [None]:
# Load the annotation table (columns: filename, class).
annotations_path = f"{dataset_dir}/_annotations.csv"
df = pd.read_csv(annotations_path)

# Make sure every split has a destination folder under the dataset root.
for split in ('train', 'test', 'valid'):
    split_path = os.path.join(dataset_dir, split)
    os.makedirs(split_path, exist_ok=True)

# First cut: 70 % train, 30 % held out. Second cut: the held-out part is
# halved into validation and test. Both cuts are stratified by class and
# seeded so that re-running yields the same partition.
train_df, test_valid_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['class'],
    random_state=42,
)
valid_df, test_df = train_test_split(
    test_valid_df,
    test_size=0.5,
    stratify=test_valid_df['class'],
    random_state=42,
)

# Copy a split's images into its folder and write the matching annotations CSV
def process_split(split_df, split_name):
    """Materialise one split on disk.

    Every image listed in ``split_df['filename']`` is copied into
    ``<dataset_dir>/<split_name>/`` under its base name, and a CSV named
    ``<split_name>_annotations.csv`` is written to ``dataset_dir`` with the
    base name and class of each copied image.

    Args:
        split_df: DataFrame with ``filename`` (source path) and ``class`` columns.
        split_name: Name of the split; used for the folder and the CSV name.
    """
    target_dir = os.path.join(dataset_dir, split_name)
    rows = []

    # NOTE(review): images are flattened into one folder by base name, so two
    # classes that share a file name would overwrite each other -- confirm
    # that names are unique across classes.
    for _, record in split_df.iterrows():
        source_path = record['filename']
        image_name = os.path.basename(source_path)
        shutil.copy(source_path, os.path.join(target_dir, image_name))
        rows.append({'filename': image_name, 'class': record['class']})

    csv_path = os.path.join(dataset_dir, f'{split_name}_annotations.csv')
    pd.DataFrame(rows).to_csv(csv_path, index=False)

# Materialise the three splits in the same order as before: train, valid, test.
for split_label, split_frame in (('train', train_df), ('valid', valid_df), ('test', test_df)):
    process_split(split_frame, split_label)

print("Data split and reorganized into train, test, and valid directories.")


In [3]:
# Set up paths
dataset_dir = "dataset"
raw_img_dir = os.path.join(dataset_dir, "raw-img")
annotations_file = os.path.join(dataset_dir, "_annotations.csv")

# Create the annotations file only when it is missing, so an existing file
# (and the split derived from it) is never silently replaced.
if not os.path.exists(annotations_file):
    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order -- and hence the seeded split below -- reproducible.
    for class_name in sorted(os.listdir(raw_img_dir)):
        class_dir = os.path.join(raw_img_dir, class_name)
        # Ignore stray files next to the class folders; listing a file
        # would raise NotADirectoryError.
        if not os.path.isdir(class_dir):
            continue
        for filename in sorted(os.listdir(class_dir)):
            data.append({"filename": os.path.join(class_dir, filename), "class": class_name})

    # Explicit columns keep the header in the CSV even when nothing was found,
    # so the read below does not fail on a header-less file.
    df = pd.DataFrame(data, columns=["filename", "class"])
    df.to_csv(annotations_file, index=False)

# Read the annotations file
df = pd.read_csv(annotations_file)

# Ensure the per-split folders exist under the dataset root.
for split in ('train', 'test', 'valid'):
    split_folder = os.path.join(dataset_dir, split)
    os.makedirs(split_folder, exist_ok=True)

# Stratified, seeded partition: 70 % train, then the remaining 30 % is
# divided evenly into validation and test.
train_df, test_valid_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['class'],
    random_state=42,
)
valid_df, test_df = train_test_split(
    test_valid_df,
    test_size=0.5,
    stratify=test_valid_df['class'],
    random_state=42,
)

# Function to create new annotations
def create_split_annotations(split_df, split_name):
    """Write ``<split_name>_annotations.csv`` with paths pointing into the split folder.

    Each ``filename`` is rewritten to ``<dataset_dir>/<split_name>/<basename>``;
    all other columns are written unchanged. No image is copied here, so the
    files are expected to be placed in the split folder by another step.

    Args:
        split_df: DataFrame with at least a ``filename`` column. It is left
            untouched; the rewritten paths live in a new frame.
        split_name: Name of the split; used for the folder and the CSV name.
    """
    split_dir = os.path.join(dataset_dir, split_name)
    # assign() returns a new frame. Writing into split_df directly would
    # mutate the caller's frame (a product of train_test_split) and can raise
    # SettingWithCopyWarning.
    relocated_df = split_df.assign(
        filename=split_df['filename'].apply(
            lambda x: os.path.join(split_dir, os.path.basename(x))
        )
    )
    relocated_df.to_csv(os.path.join(dataset_dir, f'{split_name}_annotations.csv'), index=False)

# Write one annotations CSV per split, in the order train, valid, test.
for split_label, split_frame in (('train', train_df), ('valid', valid_df), ('test', test_df)):
    create_split_annotations(split_frame, split_label)

batch_size = 64

# Training generator: pixel values are scaled by 1/255 and each image is
# randomly rotated, shifted and flipped to augment the training set.
# `validation_split` is intentionally not set: the validation data comes from
# its own CSV, and no `subset=` was ever requested, so the option had no effect.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True
)

# Validation generator: rescaling only. Validation images must not be randomly
# augmented, otherwise the validation metrics change from run to run and do
# not reflect performance on unmodified images.
val_datagen = ImageDataGenerator(rescale=1./255)

train_annotations = pd.read_csv(os.path.join(dataset_dir, 'train_annotations.csv'))
valid_annotations = pd.read_csv(os.path.join(dataset_dir, 'valid_annotations.csv'))

# Load and preprocess the data
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_annotations,
    x_col='filename',
    y_col='class',
    target_size=(224, 224),
    batch_size=batch_size,
    class_mode='categorical'
)

validation_generator = val_datagen.flow_from_dataframe(
    dataframe=valid_annotations,
    x_col='filename',
    y_col='class',
    target_size=(224, 224),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False  # fixed order so predictions can be lined up with the CSV rows
)

print(f"Found {len(train_generator.filenames)} images belonging to {len(train_generator.class_indices)} classes in the training set.")
print(f"Found {len(validation_generator.filenames)} images belonging to {len(validation_generator.class_indices)} classes in the validation set.")

Found 18325 validated image filenames belonging to 10 classes.
Found 3927 validated image filenames belonging to 10 classes.
Found 18325 images belonging to 10 classes in the training set.
Found 3927 images belonging to 10 classes in the validation set.


In [None]:
import torch
from ultralytics import YOLO
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import shutil

# Absolute path of the working directory; every path below is anchored to it.
current_dir = os.path.abspath(os.getcwd())

# Root of the folder-per-class layout built for the classification run.
# NOTE(review): this rebinds `dataset_dir`, which earlier cells set to
# "dataset"; re-running those cells after this one will use this new value
# unless they reassign it first.
dataset_dir = os.path.join(current_dir, 'dataset_cls')
os.makedirs(dataset_dir, exist_ok=True)

# One sub-folder per split.
for split in ('train', 'val', 'test'):
    split_root = os.path.join(dataset_dir, split)
    os.makedirs(split_root, exist_ok=True)

# Annotation table (filename, class) produced by the earlier cells.
df = pd.read_csv('dataset/_annotations.csv')

# Stratified, seeded partition: 70 % train, then the remaining 30 % is
# divided evenly into validation and test.
train_df, temp_df = train_test_split(
    df,
    test_size=0.3,
    stratify=df['class'],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['class'],
    random_state=42,
)

# Function to copy images to classification structure
def copy_images(split_df, split_name):
    """Copy a split's images into ``<dataset_dir>/<split_name>/<class>/``.

    Source paths are resolved against ``current_dir``. Images already present
    at the destination are left alone, so the function can be re-run safely.
    Source images that do not exist are skipped, and a summary of how many
    were skipped is printed so a shrunken split does not go unnoticed.

    Args:
        split_df: DataFrame with ``filename`` and ``class`` columns.
        split_name: Name of the split folder to populate.
    """
    split_dir = os.path.join(dataset_dir, split_name)
    prepared_class_dirs = set()
    missing_sources = []

    for _, row in split_df.iterrows():
        class_dir = os.path.join(split_dir, row['class'])
        # Create each class folder once instead of once per image.
        if class_dir not in prepared_class_dirs:
            os.makedirs(class_dir, exist_ok=True)
            prepared_class_dirs.add(class_dir)

        src = os.path.join(current_dir, row['filename'])
        dst = os.path.join(class_dir, os.path.basename(row['filename']))

        if not os.path.exists(src):
            # Still best-effort (no exception), but remember it for the report.
            missing_sources.append(src)
            continue
        if not os.path.exists(dst):
            shutil.copy2(src, dst)

    if missing_sources:
        print(
            f"Warning: {len(missing_sources)} of {len(split_df)} source images for "
            f"'{split_name}' were not found and were skipped "
            f"(first missing: {missing_sources[0]})"
        )

# Populate the train / val / test folders, in that order.
for split_label, split_frame in (('train', train_df), ('val', val_df), ('test', test_df)):
    copy_images(split_frame, split_label)

# Folder that receives the training run and the final weights.
checkpoint_dir = os.path.join(current_dir, 'yolo_checkpoints')
os.makedirs(checkpoint_dir, exist_ok=True)

# YOLOv8 classification model, initialised from the 'yolov8n-cls.pt' weights.
model = YOLO('yolov8n-cls.pt')

# Use the GPU when torch can see one, otherwise fall back to the CPU.
train_device = 'cuda' if torch.cuda.is_available() else 'cpu'

train_settings = dict(
    data=dataset_dir,
    epochs=30,
    imgsz=224,
    batch=64,
    device=train_device,
    project=checkpoint_dir,
    name='train',
    exist_ok=True,
    patience=5,      # early-stopping patience
    save_period=5,   # save a checkpoint every 5 epochs
    resume=False,    # start a fresh training run
)
results = model.train(**train_settings)

# Persist the final model next to the run folder.
final_model_path = os.path.join(checkpoint_dir, 'final_model.pt')
model.save(final_model_path)

print("\nTraining completed!")

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n-cls.pt to 'yolov8n-cls.pt'...


100%|██████████| 5.31M/5.31M [00:00<00:00, 17.4MB/s]


Ultralytics 8.3.40  Python-3.11.9 torch-2.5.1+cpu CPU (AMD Ryzen 9 9950X 16-Core Processor)
[34m[1mengine\trainer: [0mtask=classify, mode=train, model=yolov8n-cls.pt, data=d:\Github\MachineLearningFinalProject\dataset_cls, epochs=30, time=None, patience=5, batch=64, imgsz=224, save=True, save_period=5, cache=False, device=cpu, workers=8, project=d:\Github\MachineLearningFinalProject\yolo_checkpoints, name=train, exist_ok=True, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=Fals

[34m[1mtrain: [0mScanning D:\Github\MachineLearningFinalProject\dataset_cls\train... 14950 images, 0 corrupt: 100%|██████████| 14950/14950 [00:05<00:00, 2655.78it/s]


[34m[1mtrain: [0mNew cache created: D:\Github\MachineLearningFinalProject\dataset_cls\train.cache


[34m[1mval: [0mScanning D:\Github\MachineLearningFinalProject\dataset_cls\val... 3204 images, 0 corrupt: 100%|██████████| 3204/3204 [00:01<00:00, 2656.88it/s]


[34m[1mval: [0mNew cache created: D:\Github\MachineLearningFinalProject\dataset_cls\val.cache
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.000714, momentum=0.9) with parameter groups 26 weight(decay=0.0), 27 weight(decay=0.0005), 27 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added 
Image sizes 224 train, 224 val
Using 0 dataloader workers
Logging results to [1md:\Github\MachineLearningFinalProject\yolo_checkpoints\train[0m
Starting training for 30 epochs...

      Epoch    GPU_mem       loss  Instances       Size


       1/30         0G      1.245         38        224: 100%|██████████| 234/234 [01:46<00:00,  2.20it/s]
               classes   top1_acc   top5_acc: 100%|██████████| 26/26 [00:09<00:00,  2.65it/s]

                   all      0.903      0.995






      Epoch    GPU_mem       loss  Instances       Size


       2/30         0G     0.3363         38        224: 100%|██████████| 234/234 [01:46<00:00,  2.20it/s]
               classes   top1_acc   top5_acc: 100%|██████████| 26/26 [00:08<00:00,  2.93it/s]

                   all      0.927      0.994






      Epoch    GPU_mem       loss  Instances       Size


       3/30         0G      0.263         38        224: 100%|██████████| 234/234 [01:37<00:00,  2.39it/s]
               classes   top1_acc   top5_acc: 100%|██████████| 26/26 [00:08<00:00,  2.95it/s]

                   all      0.941      0.996






      Epoch    GPU_mem       loss  Instances       Size


       4/30         0G     0.2381         38        224: 100%|██████████| 234/234 [01:38<00:00,  2.38it/s]
               classes   top1_acc   top5_acc: 100%|██████████| 26/26 [00:08<00:00,  2.97it/s]

                   all      0.941      0.996






      Epoch    GPU_mem       loss  Instances       Size


       5/30         0G     0.1936         64        224:  15%|█▍        | 35/234 [00:14<01:23,  2.40it/s]