# YOLO Frame Classification: Test-Set Evaluation

In [1]:
from ultralytics import YOLO
import os
import torch
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from PIL import Image
from tqdm.notebook import tqdm

### Configuration

In [2]:
# Dataset layout: root folder of the processed frames and its "test" split.
YOLO_DATASET_ROOT = os.path.join("SignLanguage_Processed_Frames", "YOLO_Frames_Dataset")
TEST_SPLIT_DIR = os.path.join(YOLO_DATASET_ROOT, "test")

# Weights file to evaluate; point this at the 'best.pt' produced by your own training run.
MODEL_PATH = "runs/slr_yolo_frame_cls_train/exp_frames/weights/best.pt"

# Image size and batch size passed to the evaluation call.
IMG_SIZE = 224
BATCH_SIZE_TEST = 32

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Evaluation Function

In [3]:
def _collect_frame_predictions(model, test_data_dir, class_to_idx, img_size, batch_size):
    """Predict every image under ``test_data_dir`` and gather per-frame labels.

    Each sub-directory of ``test_data_dir`` is treated as one class; its name
    is looked up in ``class_to_idx`` to obtain the ground-truth index.

    Returns:
        A ``(true_indices, predicted_indices)`` pair of equal-length lists.
    """
    true_indices = []
    predicted_indices = []

    for class_name in tqdm(os.listdir(test_data_dir), desc="Processing classes"):
        class_dir_path = os.path.join(test_data_dir, class_name)
        if not os.path.isdir(class_dir_path):
            continue
        if class_name not in class_to_idx:
            print(f"Warning: Class '{class_name}' from test set not in model's classes. Skipping.")
            continue

        true_label_idx = class_to_idx[class_name]
        image_files = [
            os.path.join(class_dir_path, file_name)
            for file_name in os.listdir(class_dir_path)
            if file_name.lower().endswith(('.png', '.jpg', '.jpeg'))
        ]

        # An empty folder yields an empty range, so no separate check is needed.
        for start in range(0, len(image_files), batch_size):
            batch_files = image_files[start:start + batch_size]
            # Best effort: a failing batch is reported and skipped so that one
            # bad batch does not abort the whole evaluation.
            try:
                results = model.predict(
                    source=batch_files,
                    imgsz=img_size,
                    verbose=False,
                    device=DEVICE,
                    stream=False,
                )
                for res in results:
                    if getattr(res, 'probs', None) is not None:
                        predicted_indices.append(res.probs.top1)
                        true_indices.append(true_label_idx)
                    else:
                        print(f"Warning: No probabilities for an image in {class_name}. Result obj: {type(res)}")
            except Exception as e:
                print(f"Error during prediction for batch in {class_name}: {e}")

    return true_indices, predicted_indices


def _plot_confusion_matrix(cm, class_names):
    """Draw ``cm`` as a heatmap, save it as a PNG, show it, then close the figure."""
    num_classes = len(class_names)
    fig = plt.figure(figsize=(max(8, num_classes), max(6, num_classes * 0.8)))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title("Confusion Matrix (Per Frame on Test Set)")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig("confusion_matrix_yolo_frames_test.png")
    plt.show()
    # Release the figure so repeated evaluations do not accumulate open figures.
    plt.close(fig)


def test_and_score_model(model_path, test_data_dir, img_size, batch_size):
    """Evaluate a trained YOLO classification model frame-by-frame on a test split.

    The test directory is expected to contain one sub-directory per class,
    named like the model's class names, holding ``.png``/``.jpg``/``.jpeg``
    images. Every image is predicted in batches and the top-1 class is
    compared with the class implied by its folder.

    Side effects: prints progress and metrics, saves the confusion matrix to
    ``confusion_matrix_yolo_frames_test.png`` in the current working directory
    and displays it.

    Args:
        model_path: Path to the trained weights file (e.g. ``best.pt``).
        test_data_dir: Directory holding one sub-directory of images per class.
        img_size: Inference image size forwarded to ``model.predict``.
        batch_size: Number of images per prediction call; must be >= 1.

    Returns:
        ``None`` if an input is invalid or no prediction could be made.
        Otherwise a dict with keys ``"accuracy"``, ``"f1_macro"``,
        ``"f1_weighted"``, ``"classification_report"`` (the dict form of the
        scikit-learn report) and ``"confusion_matrix"`` (computed over all of
        the model's classes).
    """
    if not os.path.exists(model_path):
        print(f"ERROR: Trained model not found at {model_path}")
        return None
    if not os.path.exists(test_data_dir):
        print(f"ERROR: Test data directory not found at {test_data_dir}")
        return None
    # Reject this before loading the model: a step of 0 would make range()
    # raise, and a negative step would silently produce no batches at all.
    if batch_size < 1:
        print(f"ERROR: Batch size must be a positive integer, got {batch_size}")
        return None

    model = YOLO(model_path)
    model.to(DEVICE)

    print("\n--- Evaluating Model on Test Set ---")
    print(f"Model: {model_path}")
    print(f"Test Data: {test_data_dir}")
    print(f"Image Size: {img_size}")
    print(f"Batch Size: {batch_size}")

    class_names_from_model = model.names
    class_to_idx = {name: idx for idx, name in class_names_from_model.items()}

    print("Collecting predictions on the test set...")
    all_true_labels_indices, all_preds_indices = _collect_frame_predictions(
        model, test_data_dir, class_to_idx, img_size, batch_size
    )

    if not all_true_labels_indices:
        print("No predictions were made.")
        return None

    num_model_classes = len(class_names_from_model)
    all_class_names = [class_names_from_model[i] for i in range(num_model_classes)]
    # The report and the F1 scores cover only classes that actually occur in
    # the test set; the confusion matrix below covers every model class.
    evaluated_labels = sorted(set(all_true_labels_indices))
    report_options = {
        "labels": evaluated_labels,
        "target_names": [class_names_from_model[i] for i in evaluated_labels],
        "digits": 4,
        "zero_division": 0,
    }

    print("\n📊 Classification Report (per frame):")
    print(classification_report(all_true_labels_indices, all_preds_indices, **report_options))

    accuracy = accuracy_score(all_true_labels_indices, all_preds_indices)
    # Use the same label set as the report so the overall macro F1 agrees with
    # the report's "macro avg" instead of being diluted by classes that were
    # only ever predicted (zero support).
    f1_macro = f1_score(
        all_true_labels_indices, all_preds_indices,
        labels=evaluated_labels, average='macro', zero_division=0,
    )
    f1_weighted = f1_score(
        all_true_labels_indices, all_preds_indices,
        labels=evaluated_labels, average='weighted', zero_division=0,
    )

    print(f"\n🔍 Overall Test Accuracy (per frame): {accuracy:.4f}")
    print(f"🔍 Overall Test F1-Score (Macro, per frame): {f1_macro:.4f}")
    print(f"🔍 Overall Test F1-Score (Weighted, per frame): {f1_weighted:.4f}")

    cm = confusion_matrix(
        all_true_labels_indices, all_preds_indices, labels=list(range(num_model_classes))
    )
    _plot_confusion_matrix(cm, all_class_names)

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "f1_weighted": f1_weighted,
        "classification_report": classification_report(
            all_true_labels_indices, all_preds_indices, output_dict=True, **report_options
        ),
        "confusion_matrix": cm,
    }


# Despite the ``test_`` prefix this is an evaluation helper, not a pytest test
# case; mark it so test collectors do not try to run it.
test_and_score_model.__test__ = False

### Execute Evaluation

In [None]:
# Run the evaluation with the notebook-level configuration, then report
# whether it produced a result.
results = test_and_score_model(
    model_path=MODEL_PATH, test_data_dir=TEST_SPLIT_DIR,
    img_size=IMG_SIZE, batch_size=BATCH_SIZE_TEST,
)
print("\nEvaluation Finished." if results else "\nEvaluation could not be completed.")