# Crocodile Identification Pipeline

This notebook implements a complete pipeline for automated biometric identification of Mugger Crocodiles using UAV images. The pipeline includes:
- Dataset cleaning and balancing
- Feature extraction using SIFT, HOG, LBP, and ORB
- Dimensionality reduction using PCA
- Multiple model training and evaluation
- Reporting of results and model performance metrics (accuracy, precision, recall, F1)

## 1. Import Required Libraries

In [None]:
import os
import cv2
import numpy as np
import glob
import shutil
import random
import time
import xml.etree.ElementTree as ET
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from joblib import Memory
from tqdm import tqdm
from sklearn.decomposition import IncrementalPCA
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from skimage.feature import hog, local_binary_pattern
import xgboost as xgb
import joblib

# Reached only when every import above resolved without raising.
print("All required libraries imported successfully!")

## 2. Utility Functions

In [None]:
def create_directory(path):
    """
    Make sure a directory exists, creating any missing parents on the way.

    Calling this on a directory that already exists is a no-op.

    Args:
        path (str): Directory path to create
    """
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)

def parse_voc_xml(xml_path):
    """
    Parse a Pascal VOC XML file and return the first object's bounding box.

    Coordinates written as floats (e.g. ``"110.0"``) are accepted and rounded
    to the nearest integer, so every file that passes ``float()``-based
    validation can also be parsed here.

    Args:
        xml_path (str): Path to XML file

    Returns:
        tuple: (xmin, ymin, xmax, ymax) integer coordinates

    Raises:
        ValueError: If the file has no ``<object>``/``<bndbox>`` element or a
            coordinate is missing or not a finite number.
    """
    root = ET.parse(xml_path).getroot()

    # Prefer a direct child (standard VOC layout); fall back to any depth.
    obj = root.find('object')
    if obj is None:
        obj = root.find('.//object')
    if obj is None:
        raise ValueError(f"No <object> element in {xml_path}")

    bbox = obj.find('bndbox')
    if bbox is None:
        raise ValueError(f"No <bndbox> element in {xml_path}")

    coords = []
    for tag in ('xmin', 'ymin', 'xmax', 'ymax'):
        node = bbox.find(tag)
        if node is None:
            raise ValueError(f"Missing <{tag}> in {xml_path}")
        try:
            # float() first so values such as "12.0" do not break int().
            coords.append(int(round(float(node.text))))
        except (TypeError, ValueError, OverflowError) as exc:
            raise ValueError(
                f"Invalid <{tag}> value {node.text!r} in {xml_path}"
            ) from exc

    return tuple(coords)

def crop_image(image_path, bbox=None, output_size=(224, 224)):
    """
    Crop an image to a bounding box (or a centered window) and resize it.

    Args:
        image_path (str): Path to input image
        bbox (tuple, optional): (xmin, ymin, xmax, ymax) coordinates. When
            omitted or empty, a window of ``output_size`` centered in the
            image is used instead.
        output_size (tuple): Desired output size (width, height)

    Returns:
        numpy.ndarray: Cropped image resized to ``output_size``

    Raises:
        ValueError: If the image cannot be read, or the bounding box does not
            overlap the image at all.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    h, w = img.shape[:2]

    if bbox:
        xmin, ymin, xmax, ymax = (int(v) for v in bbox)
        # Clamp to the image: a negative index would otherwise wrap around in
        # NumPy slicing and silently select the wrong region.
        xmin, ymin = max(0, xmin), max(0, ymin)
        xmax, ymax = min(w, xmax), min(h, ymax)
    else:
        # Center crop
        center_x, center_y = w // 2, h // 2
        xmin = max(0, center_x - output_size[0] // 2)
        ymin = max(0, center_y - output_size[1] // 2)
        xmax = min(w, xmin + output_size[0])
        ymax = min(h, ymin + output_size[1])

    if xmax <= xmin or ymax <= ymin:
        # Fail with a readable message instead of an OpenCV assertion from
        # resizing an empty array.
        raise ValueError(
            f"Empty crop region ({xmin}, {ymin}, {xmax}, {ymax}) "
            f"for image: {image_path}"
        )

    # Resize to standard size
    return cv2.resize(img[ymin:ymax, xmin:xmax], output_size)

def extract_croc_id_from_filename(filename):
    """
    Return the crocodile ID encoded at the start of a filename.

    The ID is everything before the first underscore; a name without an
    underscore is returned unchanged.

    Args:
        filename (str): Input filename (e.g., 'Croc1_1.jpg')

    Returns:
        str: Crocodile ID (e.g., 'Croc1')
    """
    croc_id, _separator, _remainder = filename.partition('_')
    return croc_id

## 3. Dataset Cleaning Functions

In [None]:
def validate_xml(xml_path):
    """
    Check that an annotation file carries usable bounding boxes.

    A file is valid when it parses, contains at least one ``object`` element
    (at any depth), and every object has a ``bndbox`` whose xmin/ymin/xmax/ymax
    children are all present and parse as floats.

    Args:
        xml_path (str): Path to the XML file

    Returns:
        bool: True if XML is valid, False otherwise
    """
    required_tags = ('xmin', 'ymin', 'xmax', 'ymax')

    try:
        annotated = ET.parse(xml_path).getroot().findall('.//object')

        # A file without any annotated object is useless for cropping
        if len(annotated) == 0:
            return False

        for entry in annotated:
            box = entry.find('bndbox')
            if box is None:
                return False

            for tag in required_tags:
                node = box.find(tag)
                if node is None:
                    return False
                try:
                    float(node.text)
                except (ValueError, TypeError):
                    # Empty or non-numeric coordinate
                    return False
    except Exception as e:
        # Unreadable or malformed file: report it and treat as invalid
        print(f"Error validating XML {xml_path}: {str(e)}")
        return False

    return True

def get_min_folder_size(directory):
    """
    Find the smallest count of valid image-XML pairs among the folders.

    Every folder under ``directory`` is visited except those whose path
    contains "removed". Folders without any valid pair are ignored when
    computing the minimum.

    Args:
        directory (str): Path to the training dataset directory

    Returns:
        int: Minimum number of valid pairs, or 0 if no folder has any
    """
    print("\n[STEP 1] Analyzing folders to find minimum number of valid pairs...")
    smallest = float('inf')
    counts_by_folder = {}

    for current, _subdirs, names in os.walk(directory):
        if "removed" in current:
            continue

        # Annotation file name expected next to each JPG
        expected_xml = {name.replace('.jpg', '.xml') for name in names if name.endswith('.jpg')}

        pair_total = 0
        for xml_name in expected_xml:
            # Only validate annotations that actually exist in this folder
            if xml_name in names and validate_xml(os.path.join(current, xml_name)):
                pair_total += 1

        if pair_total > 0:
            if pair_total < smallest:
                smallest = pair_total
            counts_by_folder[os.path.basename(current)] = pair_total

    # Print folder statistics
    print(f"\nFound {len(counts_by_folder)} folders with valid image-XML pairs:")
    for folder, count in sorted(counts_by_folder.items()):
        print(f"  - {folder}: {count} valid pairs")

    if smallest == float('inf'):
        return 0
    return smallest

def _move_to_dir(src_path, dst_dir):
    """Move ``src_path`` into ``dst_dir``, keeping its original file name."""
    shutil.move(src_path, os.path.join(dst_dir, os.path.basename(src_path)))


def clean_and_balance_dataset(directory):
    """
    Clean the dataset by moving invalid files to a removed directory and balance the number of images.
    Recursively checks all subfolders within the given directory.

    For every folder under ``directory`` (paths containing "removed" are
    skipped) this moves, in order:

    1. JPG files that have no XML with the same base name,
    2. XML files that fail validation or have no JPG, together with their JPG,
    3. randomly chosen valid pairs in excess of the smallest folder's count.

    Files go to ``<grandparent of directory>/removed/Training/<folder name>``.
    Files are processed in sorted order, so seeding ``random`` beforehand makes
    the choice of excess pairs reproducible. A failed move is reported and
    skipped rather than aborting the run.

    Args:
        directory (str): Path to the root training dataset directory
    """
    print("\n[STEP 2] Setting up removed directory structure...")

    # Use the existing removed directory
    removed_dir = os.path.join(os.path.dirname(os.path.dirname(directory)), "removed", "Training")
    os.makedirs(removed_dir, exist_ok=True)
    print(f"Using existing removed directory: {removed_dir}")

    # First, get the minimum folder size
    min_size = get_min_folder_size(directory)
    print(f"\n[STEP 3] Minimum number of valid pairs across all folders: {min_size}")

    total_moved_jpg = 0
    total_moved_xml = 0

    print("\n[STEP 4] Processing each folder to clean and balance the dataset...")

    # Walk through all subdirectories
    for root, _dirs, _files in os.walk(directory):
        # Skip the removed directory
        if "removed" in root:
            continue

        # Get the CrocID from the current directory path
        croc_id = os.path.basename(root)
        print(f"\n  Processing folder: {croc_id}")

        # Get all jpg and xml files in current directory
        jpg_files = glob.glob(os.path.join(root, "*.jpg"))
        xml_files = glob.glob(os.path.join(root, "*.xml"))

        print(f"    Found {len(jpg_files)} JPG files and {len(xml_files)} XML files")

        # Convert to sets of filenames without extensions
        jpg_bases = {os.path.splitext(os.path.basename(f))[0] for f in jpg_files}
        xml_bases = {os.path.splitext(os.path.basename(f))[0] for f in xml_files}

        # Create corresponding removed directory
        removed_croc_dir = os.path.join(removed_dir, croc_id)
        os.makedirs(removed_croc_dir, exist_ok=True)

        # Find orphaned JPG files (no corresponding XML)
        orphaned_jpg = sorted(jpg_bases - xml_bases)

        # Move orphaned JPG files
        if orphaned_jpg:
            print(f"    Found {len(orphaned_jpg)} orphaned JPG files (no corresponding XML)")
            for base in orphaned_jpg:
                try:
                    _move_to_dir(os.path.join(root, f"{base}.jpg"), removed_croc_dir)
                    print(f"      Moved orphaned image: {base}.jpg")
                    total_moved_jpg += 1
                except OSError as e:
                    print(f"      Error moving {base}.jpg: {str(e)}")

        # Split annotated files into valid and invalid pairs. Sorted so the
        # order no longer depends on set iteration (string hash seed).
        valid_pairs = []
        invalid_pairs = []

        for base in sorted(xml_bases):
            xml_path = os.path.join(root, f"{base}.xml")
            jpg_path = os.path.join(root, f"{base}.jpg")

            # Check if both files exist and XML is valid
            if os.path.exists(jpg_path) and validate_xml(xml_path):
                valid_pairs.append((base, xml_path, jpg_path))
            else:
                invalid_pairs.append((base, xml_path, jpg_path))

        # Move invalid pairs
        if invalid_pairs:
            print(f"    Found {len(invalid_pairs)} invalid pairs (missing or invalid XML)")
            for base, xml_path, jpg_path in invalid_pairs:
                try:
                    if os.path.exists(xml_path):
                        _move_to_dir(xml_path, removed_croc_dir)
                        print(f"      Moved invalid XML: {base}.xml")
                        total_moved_xml += 1

                    if os.path.exists(jpg_path):
                        _move_to_dir(jpg_path, removed_croc_dir)
                        print(f"      Moved corresponding image: {base}.jpg")
                        total_moved_jpg += 1
                except OSError as e:
                    print(f"      Error moving files for {base}: {str(e)}")

        print(f"    Found {len(valid_pairs)} valid image-XML pairs")

        # Number of intact pairs left in this folder; updated as pairs move.
        remaining = len(valid_pairs)

        # If we have more valid pairs than the minimum size, randomly select pairs to move
        if remaining > min_size:
            excess_count = remaining - min_size
            print(f"    Need to move {excess_count} excess pairs to balance the dataset")
            for base, xml_path, jpg_path in random.sample(valid_pairs, excess_count):
                try:
                    # Move XML file
                    _move_to_dir(xml_path, removed_croc_dir)
                    print(f"      Moved excess XML: {base}.xml")
                    total_moved_xml += 1
                    # Once the XML is gone the pair is no longer intact,
                    # whether or not the image move below succeeds.
                    remaining -= 1

                    # Move corresponding JPG
                    _move_to_dir(jpg_path, removed_croc_dir)
                    print(f"      Moved excess image: {base}.jpg")
                    total_moved_jpg += 1
                except OSError as e:
                    print(f"      Error moving excess files for {base}: {str(e)}")

        # Report the real count: folders with fewer than min_size pairs (or
        # with failed moves) must not be described as holding min_size.
        print(f"    Folder {croc_id} now has {remaining} valid pairs")

    # Print summary
    print("\n[STEP 5] Cleaning and balancing complete!")
    total_moved = total_moved_jpg + total_moved_xml
    if total_moved == 0:
        print("\nNo files needed to be moved. Dataset is clean and balanced!")
    else:
        print(f"\nMoved {total_moved} files to removed directory:")
        print(f"- {total_moved_jpg} JPG files")
        print(f"- {total_moved_xml} XML files")
        print(f"\nFiles have been moved to: {removed_dir}")
        print(f"Each folder now contains {min_size} valid image-XML pairs")

## 4. Feature Extraction Class

In [None]:
class FeatureExtractor:
    """
    Extract a fixed-layout feature vector (SIFT + HOG + LBP + ORB) from images.

    Each image is converted to grayscale once and described by the mean SIFT
    descriptor, the HOG vector, a normalized uniform-LBP histogram and the
    mean ORB descriptor, concatenated in that order. Results of the four
    extractors are cached on disk through ``joblib.Memory``.
    """

    def __init__(self):
        """
        Initialize feature extractors with caching
        """
        print("[DEBUG] Initializing FeatureExtractor...")
        # Initialize feature extractors
        self.sift = cv2.SIFT_create()
        self.orb = cv2.ORB_create()

        # Parameters
        self.lbp_radius = 3
        self.lbp_n_points = 8 * self.lbp_radius
        self.hog_orientations = 9
        self.hog_pixels_per_cell = (8, 8)
        self.hog_cells_per_block = (2, 2)

        # Setup caching in current directory
        cache_dir = 'feature_cache'
        os.makedirs(cache_dir, exist_ok=True)
        print(f"[DEBUG] Cache directory: {cache_dir}")
        self.memory = Memory(cache_dir, verbose=0)

        # Cache the feature extraction methods
        print("[DEBUG] Setting up feature caching...")
        self.cached_sift = self.memory.cache(self._extract_sift)
        self.cached_hog = self.memory.cache(self._extract_hog)
        self.cached_lbp = self.memory.cache(self._extract_lbp)
        self.cached_orb = self.memory.cache(self._extract_orb)
        print("[DEBUG] FeatureExtractor initialization complete")

    def _extract_sift(self, gray):
        """Return the mean SIFT descriptor, or 128 zeros when none is found."""
        _keypoints, descriptors = self.sift.detectAndCompute(gray, None)
        if descriptors is None:
            print("[DEBUG] No SIFT features found, returning zero vector")
            return np.zeros(128)
        return np.mean(descriptors, axis=0)

    def _extract_hog(self, gray):
        """Return the HOG descriptor computed with the configured parameters."""
        return hog(gray,
                   orientations=self.hog_orientations,
                   pixels_per_cell=self.hog_pixels_per_cell,
                   cells_per_block=self.hog_cells_per_block,
                   block_norm='L2-Hys')

    def _extract_lbp(self, gray):
        """Return the normalized histogram of uniform LBP codes."""
        lbp = local_binary_pattern(gray,
                                   self.lbp_n_points,
                                   self.lbp_radius,
                                   method='uniform')
        # Bin edges 0 .. n_points + 2 give n_points + 2 bins.
        hist, _ = np.histogram(lbp.ravel(),
                               bins=np.arange(0, self.lbp_n_points + 3),
                               density=True)
        return hist

    def _extract_orb(self, gray):
        """Return the mean ORB descriptor, or 32 zeros when none is found."""
        _keypoints, descriptors = self.orb.detectAndCompute(gray, None)
        if descriptors is None:
            print("[DEBUG] No ORB features found, returning zero vector")
            return np.zeros(32)
        return np.mean(descriptors, axis=0)

    def extract_all_features(self, image):
        """
        Extract all features from image using parallel processing

        Args:
            image (numpy.ndarray): Input image; converted with
                ``cv2.COLOR_BGR2GRAY``, so a BGR image is expected

        Returns:
            numpy.ndarray: Concatenated features in SIFT, HOG, LBP, ORB order
        """
        # Convert to grayscale once
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Extract features in parallel
        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = {
                'sift': executor.submit(self.cached_sift, gray),
                'hog': executor.submit(self.cached_hog, gray),
                'lbp': executor.submit(self.cached_lbp, gray),
                'orb': executor.submit(self.cached_orb, gray),
            }

            # Get results; .result() re-raises any extractor exception here
            features = {name: future.result() for name, future in futures.items()}

        # Concatenate in a fixed order so the vector layout stays stable
        return np.concatenate([
            features['sift'],
            features['hog'],
            features['lbp'],
            features['orb'],
        ])

    def extract_features_batch(self, images, desc="Extracting features"):
        """
        Extract features from a batch of images with progress bar

        Args:
            images (list): List of images
            desc (str): Description for progress bar

        Returns:
            numpy.ndarray: Array of features, one row per image. An empty
                batch yields an empty array.
        """
        print(f"\n[DEBUG] Starting batch processing of {len(images)} images")

        if len(images) == 0:
            # Nothing to do; also avoids dividing by zero in the timing report.
            print("[DEBUG] No images supplied, returning empty feature array")
            return np.array([])

        start_time = time.time()

        features = [
            self.extract_all_features(img)
            for img in tqdm(images, desc=desc, unit="img")
        ]
        # Convert once and reuse for both the report and the return value
        feature_array = np.array(features)

        total_time = time.time() - start_time
        print(f"\n[DEBUG] Batch processing complete:")
        print(f"[DEBUG] Total time: {total_time:.2f}s")
        print(f"[DEBUG] Average time per image: {total_time/len(images):.2f}s")
        print(f"[DEBUG] Final feature array shape: {feature_array.shape}")

        return feature_array

## 5. Crocodile Classifier Class

In [None]:
class CrocodileClassifier:
    """
    Train, persist and apply SVM, Random Forest and XGBoost models on
    PCA-reduced features.

    The PCA projection and the label mapping are learned from the training
    data only and reused unchanged for evaluation and prediction. Labels are
    encoded to integers 0..n-1 before fitting (XGBoost requires this) and
    decoded back to the original labels on prediction.
    """

    def __init__(self):
        """Initialize the classifier with memory-efficient components"""
        # Requested upper bound on PCA components; the effective number is
        # clamped to the data size when the PCA is fitted.
        self.pca_components = 1500
        self.pca = IncrementalPCA(n_components=self.pca_components, batch_size=1500)

        # Initialize models with memory-efficient settings
        self.models = {
            'svm': SVC(
                kernel='rbf',
                C=10.0,
                gamma='scale',
                probability=True,
                cache_size=500,  # 500MB cache
                decision_function_shape='ovr',
                random_state=42
            ),
            'rf': RandomForestClassifier(
                n_estimators=100,
                max_depth=None,
                min_samples_split=2,
                min_samples_leaf=1,
                random_state=42
            ),
            'xgb': xgb.XGBClassifier(
                n_estimators=100,
                max_depth=6,
                learning_rate=0.1,
                random_state=42
            )
        }

        # Initialize feature extractor
        self.feature_extractor = FeatureExtractor()

        # Create cache directory
        self.cache_dir = 'model_cache'
        create_directory(self.cache_dir)

        # Sorted unique training labels; position i is the integer class the
        # models are trained on. Set by train() or restored from disk.
        self.classes_ = None

    def _state_path(self):
        """Path of the file holding the fitted PCA and the label mapping."""
        return os.path.join(self.cache_dir, 'preprocessing.joblib')

    def _ensure_ready(self):
        """
        Make sure a fitted PCA and label mapping are available.

        Restores both from the cache directory when this instance was not
        trained in the current process.

        Raises:
            RuntimeError: If nothing has been trained or saved yet.
        """
        if hasattr(self.pca, 'components_') and self.classes_ is not None:
            return
        state_path = self._state_path()
        if not os.path.exists(state_path):
            raise RuntimeError(
                "Classifier is not trained: call train() before evaluate() or predict()"
            )
        # NOTE: joblib.load unpickles; only load cache files this pipeline wrote.
        state = joblib.load(state_path)
        self.pca = state['pca']
        self.classes_ = state['classes']

    def _fitted_model(self, name):
        """Return the fitted model ``name``, loading it from disk if needed."""
        model = self.models[name]
        if not hasattr(model, 'classes_'):
            model = joblib.load(os.path.join(self.cache_dir, f'{name}_model.joblib'))
            self.models[name] = model
        return model

    def _decode_labels(self, encoded):
        """
        Map integer class indices back to the original training labels.

        Non-integer predictions (e.g. from a model that was trained directly
        on the original labels) are returned unchanged.
        """
        encoded = np.asarray(encoded)
        if self.classes_ is None or not np.issubdtype(encoded.dtype, np.integer):
            return encoded
        return np.asarray(self.classes_)[encoded]

    def preprocess_features(self, features, fit=True):
        """
        Replace NaNs and project features with PCA.

        Args:
            features (array-like): 2-D feature matrix, one row per sample
            fit (bool): When True (default) the PCA is (re)fitted on
                ``features`` before transforming. Pass False for test data so
                it is projected with the PCA learned from the training set.

        Returns:
            numpy.ndarray: PCA-reduced features

        Raises:
            ValueError: If ``features`` is not a non-empty 2-D matrix.
        """
        # Convert to numpy array if not already
        features = np.array(features)
        if features.ndim != 2 or features.shape[0] == 0:
            raise ValueError(
                f"Expected a non-empty 2-D feature matrix, got shape {features.shape}"
            )

        # Handle NaN values
        features = np.nan_to_num(features, nan=0.0)

        if fit:
            # IncrementalPCA rejects n_components larger than the number of
            # samples in a batch or the number of features, so clamp it.
            n_samples, n_features = features.shape
            self.pca.set_params(
                n_components=min(self.pca_components, n_samples, n_features)
            )
            reduced_features = self.pca.fit_transform(features)
        else:
            reduced_features = self.pca.transform(features)

        # Print memory reduction (input size minus output size)
        saved_mb = (features.nbytes - reduced_features.nbytes) / (1024**2)
        print(f"Memory usage reduced by {saved_mb:.2f} MB")
        print(f"Explained variance ratio: {np.sum(self.pca.explained_variance_ratio_):.4f}")

        return reduced_features

    def train(self, X_train, y_train):
        """
        Fit the PCA and every model on the training data and save them.

        Args:
            X_train (array-like): Training feature matrix
            y_train (array-like): Training labels (any hashable, sortable type)
        """
        # Preprocess features; this is the only place the PCA is fitted
        X_train_processed = self.preprocess_features(X_train, fit=True)

        # Encode labels as 0..n-1
        self.classes_, y_encoded = np.unique(np.asarray(y_train), return_inverse=True)
        y_encoded = np.ravel(y_encoded)

        # Persist preprocessing so saved models stay usable in a new session
        joblib.dump({'pca': self.pca, 'classes': self.classes_}, self._state_path())

        # Train each model
        for name, model in self.models.items():
            print(f"Training {name}...")
            model.fit(X_train_processed, y_encoded)

            # Save model
            model_path = os.path.join(self.cache_dir, f'{name}_model.joblib')
            joblib.dump(model, model_path)

    def evaluate(self, X_test, y_test):
        """
        Evaluate all models on labelled test data.

        Args:
            X_test (array-like): Test feature matrix
            y_test (array-like): True labels, in the same form as the
                training labels

        Returns:
            dict: model name -> {'accuracy', 'precision', 'recall', 'f1'},
                with weighted averaging for the last three
        """
        self._ensure_ready()

        # Project with the training PCA; never refit on test data
        X_test_processed = self.preprocess_features(X_test, fit=False)
        y_test = np.asarray(y_test)

        results = {}
        for name in list(self.models):
            model = self._fitted_model(name)

            # Make predictions
            y_pred = self._decode_labels(model.predict(X_test_processed))

            # Calculate metrics; zero_division=0 keeps the default value for
            # classes that are never predicted without emitting warnings
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
            recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
            f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

            results[name] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1
            }

            print(f"{name} Results:")
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Precision: {precision:.4f}")
            print(f"Recall: {recall:.4f}")
            print(f"F1 Score: {f1:.4f}")
            print()

        return results

    def predict(self, X):
        """
        Make predictions using all models.

        Args:
            X (array-like): Feature matrix

        Returns:
            dict: model name -> array of predicted labels
        """
        self._ensure_ready()

        # Project with the training PCA; never refit on new data
        X_processed = self.preprocess_features(X, fit=False)

        predictions = {}
        for name in list(self.models):
            model = self._fitted_model(name)
            predictions[name] = self._decode_labels(model.predict(X_processed))

        return predictions

## 6. Main Pipeline Class

In [None]:
class CrocodilePipeline:
    """
    End-to-end pipeline: clean the dataset, crop images, extract features,
    train the classifiers and run them on the known and unknown test sets.
    """

    def __init__(self):
        """
        Initialize the crocodile identification pipeline
        """
        # Initialize classifier
        self.classifier = CrocodileClassifier()

        # Reuse the classifier's extractor rather than building a second set
        # of SIFT/ORB detectors and a second cache handle.
        self.feature_extractor = self.classifier.feature_extractor

        # Create output directories
        self.output_dirs = {
            'training': 'cropped/Training',
            'test_known': 'cropped/Test/Known',
            'test_unknown': 'cropped/Test/Unknown'
        }
        for dir_path in self.output_dirs.values():
            create_directory(dir_path)

    def _load_cropped_folder(self, cropped_dir, img_files, label, features, labels):
        """Extract features from already-cropped images, appending to ``features``/``labels``."""
        for img_file in img_files:
            # Load cropped image
            img_path = os.path.join(cropped_dir, img_file)
            cropped_img = cv2.imread(img_path)

            if cropped_img is None:
                print(f"  Warning: Could not read image {img_path}")
                continue

            # Extract features; one bad image must not stop the folder
            try:
                img_features = self.feature_extractor.extract_all_features(cropped_img)
            except Exception as e:
                print(f"  Error extracting features from {img_file}: {str(e)}")
                continue

            features.append(img_features)
            labels.append(label)

    def _crop_raw_folder(self, croc_path, img_files, label, features, labels):
        """Crop raw images by their VOC boxes, save the crops, and extract features."""
        for img_file in img_files:
            # Get image and XML paths
            img_path = os.path.join(croc_path, img_file)
            xml_path = os.path.join(croc_path, os.path.splitext(img_file)[0] + '.xml')

            try:
                # Parse bounding box
                bbox = parse_voc_xml(xml_path)

                # Crop image
                cropped_img = crop_image(img_path, bbox)

                # Save cropped image
                output_path = os.path.join(self.output_dirs['training'], label, img_file)
                create_directory(os.path.dirname(output_path))
                cv2.imwrite(output_path, cropped_img)

                # Extract features
                img_features = self.feature_extractor.extract_all_features(cropped_img)
            except Exception as e:
                print(f"  Error processing {img_file}: {str(e)}")
                continue

            features.append(img_features)
            labels.append(label)

    def process_training_data(self, training_dir):
        """
        Process training data: extract bounding boxes and features

        Each sub-folder of ``training_dir`` is one class, labelled by the
        folder name. When cropped images for a folder already exist in the
        training output directory they are reused; otherwise the raw images
        are cropped by their XML bounding boxes and the crops are saved.

        Args:
            training_dir (str): Path to training data directory

        Returns:
            tuple: (features, labels)

        Raises:
            ValueError: If no features could be extracted at all.
        """
        features = []
        labels = []
        total_images = 0
        processed_folders = 0

        print("\n=== Starting Training Data Processing ===")
        print(f"Training directory: {training_dir}")

        # Process each crocodile folder; sorted for a reproducible sample order
        for croc_dir in sorted(os.listdir(training_dir)):
            croc_path = os.path.join(training_dir, croc_dir)
            if not os.path.isdir(croc_path):
                continue

            processed_folders += 1

            # Check if this folder has already been processed. An existing but
            # empty crop folder (e.g. from an interrupted run) is not a usable
            # cache, so fall through to reprocessing in that case.
            cropped_dir = os.path.join(self.output_dirs['training'], croc_dir)
            cached_files = []
            if os.path.isdir(cropped_dir):
                cached_files = sorted(
                    f for f in os.listdir(cropped_dir) if f.endswith('.jpg')
                )

            if cached_files:
                print(f"\n[Folder {processed_folders}] Loading features from processed folder: {croc_dir}")
                self._load_cropped_folder(cropped_dir, cached_files, croc_dir, features, labels)
                total_images += len(cached_files)
                print(f"  Loaded {len(cached_files)} images from {croc_dir}")
                continue

            print(f"\n[Folder {processed_folders}] Processing new folder: {croc_dir}...")
            raw_files = sorted(f for f in os.listdir(croc_path) if f.endswith('.jpg'))
            self._crop_raw_folder(croc_path, raw_files, croc_dir, features, labels)
            total_images += len(raw_files)
            print(f"  Processed {len(raw_files)} images from {croc_dir}")

        if len(features) == 0:
            raise ValueError("No features extracted! Check if the dataset directories are correct.")

        print("\n=== Training Data Processing Summary ===")
        print(f"Total folders processed: {processed_folders}")
        print(f"Total images processed: {total_images}")
        print(f"Total features extracted: {len(features)}")
        print(f"Feature dimension: {len(features[0])}")
        print("=====================================\n")

        return np.array(features), np.array(labels)

    def process_test_data(self, test_dir, is_known=True):
        """
        Process test data: crop images and extract features

        Test images have no annotations, so each one is center-cropped. For
        known crocodiles the label is the part of the file name before the
        first underscore.

        Args:
            test_dir (str): Path to test data directory
            is_known (bool): Whether the test data is for known crocodiles

        Returns:
            tuple: (features, labels) if is_known else features alone. Both
                are empty arrays when no image could be processed.
        """
        features = []
        labels = [] if is_known else None
        total_images = 0

        print(f"\n=== Processing {'Known' if is_known else 'Unknown'} Test Data ===")
        print(f"Test directory: {test_dir}")

        # Process each image; sorted so predictions line up with a stable order
        for img_file in sorted(os.listdir(test_dir)):
            if not img_file.endswith('.jpg'):
                continue

            total_images += 1
            print(f"\nProcessing image {total_images}: {img_file}")

            try:
                # Get image path
                img_path = os.path.join(test_dir, img_file)

                # Crop image (center crop, no bounding box available)
                cropped_img = crop_image(img_path)

                # Save cropped image
                output_dir = self.output_dirs['test_known' if is_known else 'test_unknown']
                output_path = os.path.join(output_dir, img_file)
                cv2.imwrite(output_path, cropped_img)

                # Extract features
                img_features = self.feature_extractor.extract_all_features(cropped_img)
            except Exception as e:
                print(f"  Error processing {img_file}: {str(e)}")
                continue

            features.append(img_features)
            if is_known:
                labels.append(extract_croc_id_from_filename(img_file))

        print("\n=== Test Data Processing Summary ===")
        print(f"Total images processed: {total_images}")
        print(f"Total features extracted: {len(features)}")
        if len(features) > 0:
            print(f"Feature dimension: {len(features[0])}")
        print("=====================================\n")

        if is_known:
            return np.array(features), np.array(labels)
        return np.array(features)

    def run_pipeline(self, training_dir, test_known_dir, test_unknown_dir):
        """
        Run the complete pipeline

        Args:
            training_dir (str): Path to training data directory
            test_known_dir (str): Path to known test data directory
            test_unknown_dir (str): Path to unknown test data directory

        Returns:
            tuple: (known_results, unknown_predictions). Each is a dict keyed
                by model name, or an empty dict when the corresponding test
                set produced no features.
        """
        print("\n=== Starting Crocodile Identification Pipeline ===")
        print(f"Training directory: {training_dir}")
        print(f"Known test directory: {test_known_dir}")
        print(f"Unknown test directory: {test_unknown_dir}")
        print("=============================================\n")

        # Clean and balance the dataset
        print("Cleaning and balancing dataset...")
        clean_and_balance_dataset(training_dir)

        # Process training data
        print("\nProcessing training data...")
        X_train, y_train = self.process_training_data(training_dir)

        # Train and evaluate models
        print("\nTraining and evaluating models...")
        self.classifier.train(X_train, y_train)

        # Process and evaluate test data
        print("\nProcessing known test data...")
        X_test_known, y_test_known = self.process_test_data(test_known_dir, is_known=True)
        if len(X_test_known) == 0:
            # An empty matrix cannot be projected or scored; skip cleanly.
            print("Warning: no known test features extracted, skipping evaluation")
            known_results = {}
        else:
            known_results = self.classifier.evaluate(X_test_known, y_test_known)

        print("\nProcessing unknown test data...")
        X_test_unknown = self.process_test_data(test_unknown_dir, is_known=False)
        if len(X_test_unknown) == 0:
            print("Warning: no unknown test features extracted, skipping prediction")
            unknown_predictions = {}
        else:
            unknown_predictions = self.classifier.predict(X_test_unknown)

        print("\n=== Pipeline Completed Successfully ===")

        return known_results, unknown_predictions

## 7. Run the Pipeline

In [None]:
# Dataset locations, keyed by the run_pipeline parameter they feed
dataset_dirs = {
    "training_dir": "dataset/Training",
    "test_known_dir": "dataset/Test/Known",
    "test_unknown_dir": "dataset/Test/Unknown",
}

# Build the pipeline, then run every stage end to end
pipeline = CrocodilePipeline()
known_results, unknown_predictions = pipeline.run_pipeline(**dataset_dirs)

## 8. Conclusion

In [None]:
# Closing summary, one entry per printed line (a leading "\n" adds a blank line)
summary_lines = [
    "The pipeline has completed successfully! Here's what was accomplished:",
    "\n1. Dataset Preparation:",
    "   - Cleaned and balanced the training dataset",
    "   - Removed invalid and orphaned files",
    "   - Ensured equal number of samples per class",
    "\n2. Feature Extraction:",
    "   - Extracted SIFT, HOG, LBP, and ORB features",
    "   - Implemented parallel processing for efficiency",
    "   - Cached features to speed up processing",
    "\n3. Model Training:",
    "   - Trained SVM, Random Forest, and XGBoost models",
    "   - Applied PCA for dimensionality reduction",
    "   - Optimized memory usage",
    "\n4. Evaluation:",
    "   - Evaluated models on known test data",
    "   - Generated predictions for unknown test data",
    "   - Calculated performance metrics",
    "\nThe results are stored in `known_results` and `unknown_predictions` variables. You can analyze these results further or use the trained models for new predictions.",
]
for summary_line in summary_lines:
    print(summary_line)