# Face Detection & Recognition

## Importing Libraries

In [None]:
# Standard libraries
import os
import random

# Data manipulation
import numpy as np
import pandas as pd

# PyTorch core
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# PyTorch utilities
from torch.utils.data import Dataset, DataLoader, random_split

# TorchVision
import torchvision
from torchvision import transforms
from torchvision.io import read_image
from torchvision import datasets


# Image processing
from PIL import Image

# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as patches


# Preparing The Dataset

In [None]:
class FaceDataset(Dataset):
    """Face-detection dataset yielding an image tensor and fixed-size box targets.

    The label file is a sequence of records: a header line starting with ``#``
    that holds the image path (relative to ``images_dir``), followed by zero or
    more lines of four whitespace-separated integers ``x1 y1 x2 y2``.

    Each item is ``(image, target)``. ``image`` is the output of the transform
    pipeline. ``target`` has shape ``(max_boxes, 5)``: four box coordinates
    divided by the original image width/height, plus a flag that is 1.0 for a
    real box and 0.0 for a padding slot (padding coordinates are -1).

    Args:
        labels_file: Path to the annotation file described above.
        images_dir: Directory the image paths are relative to.
        new_size: Size handed to ``transforms.Resize``.
        max_boxes: Number of box slots per image; extra boxes are dropped.
        augment: When true, apply a random horizontal flip (boxes are mirrored
            together with the image), a small random rotation and colour jitter.
    """

    def __init__(self, labels_file, images_dir, new_size=(256, 256), max_boxes=6, augment=False):
        self.images_dir = images_dir
        self.new_size = new_size
        self.max_boxes = max_boxes
        self.augment = augment

        transform_list = [transforms.Resize(self.new_size)]

        if self.augment:
            # The horizontal flip is done in __getitem__ so the boxes can be
            # mirrored with the image; a flip inside this pipeline would leave
            # the labels pointing at the wrong side of the picture.
            # NOTE(review): the boxes are not rotated along with the image, so
            # after RandomRotation they are only approximately aligned.
            transform_list.extend([
                transforms.RandomRotation(10),
                transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2)
            ])

        transform_list.append(transforms.ToTensor())

        self.transform = transforms.Compose(transform_list)

        self.img_paths, self.img_coords = self._parse_labels(labels_file)

    @staticmethod
    def _parse_labels(labels_file):
        """Parse the label file into parallel lists of paths and box lists.

        Every header line gets its own (possibly empty) box list, so the two
        returned lists always have the same length and stay index-aligned.

        Raises:
            ValueError: If a coordinate line appears before the first header.
        """
        paths = []
        coords = []
        with open(labels_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                if line.startswith('#'):
                    paths.append(line.lstrip('# ').strip())
                    coords.append([])
                elif not coords:
                    raise ValueError(
                        f"coordinate line before any '#' image header in {labels_file!r}"
                    )
                else:
                    coords[-1].append(list(map(int, line.split())))
        return paths, coords

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        path = os.path.join(self.images_dir, self.img_paths[idx])
        # The context manager closes the file handle once the RGB copy exists.
        with Image.open(path) as raw:
            img = raw.convert('RGB')
        ow, oh = img.size

        flipped = bool(self.augment) and torch.rand(1).item() < 0.5
        if flipped:
            img = img.transpose(Image.FLIP_LEFT_RIGHT)
        img_t = self.transform(img)

        bboxes = []
        for x1, y1, x2, y2 in self.img_coords[idx]:
            # Normalising by the original size is independent of new_size.
            nx1, ny1, nx2, ny2 = x1 / ow, y1 / oh, x2 / ow, y2 / oh
            if flipped:
                # Mirror around the vertical axis; swap so that x1 <= x2 holds.
                nx1, nx2 = 1.0 - nx2, 1.0 - nx1
            bboxes.append([nx1, ny1, nx2, ny2])

        bboxes = bboxes[:self.max_boxes]
        n_pad = self.max_boxes - len(bboxes)
        padded = bboxes + [[-1] * 4] * n_pad
        confs = [1.0] * len(bboxes) + [0.0] * n_pad

        boxes_t = torch.tensor(padded, dtype=torch.float32)
        confs_t = torch.tensor(confs, dtype=torch.float32).unsqueeze(-1)
        target = torch.cat([boxes_t, confs_t], dim=-1)

        return img_t, target


In [None]:
# Build the detection dataset from the FDDB annotation file and image folder.
dataset = FaceDataset(
    images_dir='/kaggle/input/dataset-for-face-detection/Dataset_FDDB/Dataset_FDDB/images',
    labels_file='/kaggle/input/dataset-for-face-detection/Dataset_FDDB/Dataset_FDDB/label.txt',
)

## Plotting The Dataset

In [None]:
# Show the first samples in a 5x5 grid with their ground-truth boxes drawn on top.
n_rows, n_cols = 5, 5
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 15))

# Hide every axis up front so unused cells stay blank on small datasets.
for ax in axs.flat:
    ax.axis('off')

for i in range(min(n_rows * n_cols, len(dataset))):
    ax = axs[i // n_cols, i % n_cols]
    image_tensor, target = dataset[i]

    # The tensor is permuted from channels-first to channels-last for imshow;
    # its own height/width are used to denormalise instead of a fixed 256.
    height, width = image_tensor.shape[1:]
    ax.imshow(image_tensor.permute(1, 2, 0).numpy())

    # Each target row is [x1, y1, x2, y2, confidence] with normalised coords.
    for x1, y1, x2, y2, conf in target.tolist():
        if conf <= 0.5:  # padding slot, not a real face
            continue
        rect = patches.Rectangle(
            (x1 * width, y1 * height), (x2 - x1) * width, (y2 - y1) * height,
            linewidth=2, edgecolor='r', facecolor='none'
        )
        ax.add_patch(rect)

plt.tight_layout()
plt.show()

In [None]:
# 80/10/10 split; the test split receives whatever is left after rounding down.
total_size = len(dataset)
train_size, val_size = int(0.8 * total_size), int(0.1 * total_size)
test_size = total_size - (train_size + val_size)

train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size]
)

# Only the training loader reshuffles between epochs.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Custom CNN Architecture

In [None]:
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers with an additive skip connection.

    The skip path is the identity unless the channel count or the stride
    changes; in that case a 1x1 convolution followed by batch-norm projects
    the input so it can be added to the main path.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

        projection = []
        if stride != 1 or in_channels != out_channels:
            projection = [
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels),
            ]
        # An empty Sequential passes its input through unchanged.
        self.shortcut = nn.Sequential(*projection)

    def forward(self, x):
        main = self.conv1(x)
        main = self.bn1(main)
        main = self.relu(main)
        main = self.conv2(main)
        main = self.bn2(main)
        main += self.shortcut(x)
        return self.relu(main)

class FaceDetectorCNN(nn.Module):
    """Residual CNN that regresses ``max_boxes`` boxes, each with a confidence logit.

    ``forward`` returns ``(boxes, conf_logits)`` with shapes
    ``(batch, max_boxes, 4)`` and ``(batch, max_boxes)``.
    """

    def __init__(self, grid_size=8, max_boxes=6):
        super().__init__()
        self.max_boxes = max_boxes

        # Feature extractor: residual stages separated by 2x2 max-pooling,
        # finished by pooling to a fixed grid_size x grid_size map.
        feature_layers = [
            ResidualBlock(3, 16), nn.MaxPool2d(2),
            ResidualBlock(16, 32), nn.MaxPool2d(2),
            ResidualBlock(32, 64), ResidualBlock(64, 64), nn.MaxPool2d(2),
            ResidualBlock(64, 128), ResidualBlock(128, 128),
            nn.AdaptiveAvgPool2d((grid_size, grid_size)),
        ]
        self.backbone = nn.Sequential(*feature_layers)

        # Shared fully connected trunk on the flattened feature map.
        flat_features = 128 * grid_size * grid_size
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_features, 512), nn.ReLU(),
            nn.Linear(512, 256), nn.ReLU(),
        )

        # Five outputs per box slot: [x1, y1, x2, y2, confidence].
        self.box_head = nn.Linear(256, max_boxes * 5)

    def forward(self, x):
        hidden = self.fc(self.backbone(x))
        per_box = self.box_head(hidden).view(-1, self.max_boxes, 5)

        boxes = per_box[..., :4]       # coordinates
        conf_logits = per_box[..., 4]  # raw confidence score

        return boxes, conf_logits


In [None]:
def detection_loss(pred_boxes, pred_conf_logits, targets,
                   lambda_box=5.0, lambda_conf=1.0):
    """Combine a box-regression loss with a class-balanced confidence loss.

    Args:
        pred_boxes: Predicted coordinates, shape ``(B, M, 4)``.
        pred_conf_logits: Predicted confidence logits, shape ``(B, M)``.
        targets: Shape ``(B, M, 5)``; the last channel is 1 for a real box and
            0 for a padding slot, the first four are the box coordinates.
        lambda_box: Weight of the box term in the total.
        lambda_conf: Weight of the confidence term in the total.

    Returns:
        ``(total_loss, parts)`` where ``parts`` holds the unweighted
        ``'box_loss'`` and ``'conf_loss'`` tensors.
    """
    true_boxes = targets[..., :4]         # (B, M, 4)
    true_conf = targets[..., 4].float()   # (B, M)

    pos_mask = true_conf == 1
    neg_mask = true_conf == 0
    num_pos = pos_mask.sum()
    num_neg = neg_mask.sum()

    # Box loss: Smooth L1 on real boxes only, averaged per box. With no real
    # box the masked tensors are empty, the sum is 0 and the clamp avoids 0/0,
    # so no separate branch is needed.
    box_loss = F.smooth_l1_loss(
        pred_boxes[pos_mask], true_boxes[pos_mask], reduction='sum'
    ) / num_pos.float().clamp(min=1.0)

    # Confidence loss: BCE with padding slots down-weighted against real boxes.
    pos_loss = F.binary_cross_entropy_with_logits(
        pred_conf_logits[pos_mask], true_conf[pos_mask], reduction='sum'
    )
    neg_loss = F.binary_cross_entropy_with_logits(
        pred_conf_logits[neg_mask], true_conf[neg_mask], reduction='sum'
    )

    neg_weight = 0.25
    conf_loss = (pos_loss + neg_weight * neg_loss) / (num_pos + neg_weight * num_neg).clamp(min=1.0)

    total_loss = lambda_box * box_loss + lambda_conf * conf_loss

    return total_loss, {'box_loss': box_loss, 'conf_loss': conf_loss}


In [None]:
# Smoke test: push one training batch through an untrained detector and print
# the first sample's predictions next to its targets.
model = FaceDetectorCNN().eval()

imgs, targets = next(iter(train_loader))
with torch.no_grad():
    pred_boxes, pred_conf_logits = model(imgs)

# The head emits one independent logit per box slot, so the probability is a
# sigmoid (as in the validation and test cells), not a softmax across slots.
pred_conf_probs = torch.sigmoid(pred_conf_logits)

print("Pred boxes:",        pred_boxes[0])
print("Target boxes:",      targets[0, :, :4])
print("Pred conf (logits):", pred_conf_logits[0])
print("Pred conf (probs):",  pred_conf_probs[0])
print("Target conf:",        targets[0, :, 4])

## Computing Accuracy

In [None]:
def compute_iou(box1, box2):
    """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

    The result is 0 when the union area is not positive.
    """
    ax1, ay1, ax2, ay2 = box1
    bx1, by1, bx2, by2 = box2

    # Overlap extent along each axis, clamped at zero for disjoint boxes.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    inter_area = overlap_w * overlap_h

    first_area = (ax2 - ax1) * (ay2 - ay1)
    second_area = (bx2 - bx1) * (by2 - by1)
    union_area = first_area + second_area - inter_area

    if union_area > 0:
        return inter_area / union_area
    return 0

## Model Training

In [None]:
from torch.nn.utils import clip_grad_norm_
from torchvision.ops import generalized_box_iou_loss

# Train the custom detector, tracking per-epoch train/validation loss and a
# validation IoU score.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FaceDetectorCNN().to(device)
opt = optim.Adam(model.parameters(), lr=1e-3)

train_loss_list, val_loss_list, val_iou_list = [], [], []
epochs = 15
for ep in range(1, epochs+1):
    model.train()
    total_train_loss = 0.0
    for imgs, targs in train_loader:
        imgs, targs = imgs.to(device), targs.to(device)
        boxes, conf_logits = model(imgs)
        loss, _ = detection_loss(boxes, conf_logits, targs)

        opt.zero_grad()
        loss.backward()
        # Prevent exploding gradients
        clip_grad_norm_(model.parameters(), max_norm=1.0)
        opt.step()

        total_train_loss += loss.item()

    avg_train = total_train_loss / len(train_loader)

    # Validation
    model.eval()
    total_val_loss, iou_sum, iou_count = 0.0, 0.0, 0
    with torch.no_grad():
        for imgs, targs in val_loader:
            imgs, targs = imgs.to(device), targs.to(device)
            boxes, conf_logits = model(imgs)
            loss_v, _ = detection_loss(boxes, conf_logits, targs)
            total_val_loss += loss_v.item()

            # Copy the batch to host memory once, rather than once per
            # (prediction, ground-truth) pair inside the loops below.
            probs_cpu = torch.sigmoid(conf_logits).cpu()
            boxes_cpu = boxes.cpu()
            targs_cpu = targs.cpu()

            # IoU score: for every confident prediction take its best IoU with
            # any real box of the same image.
            # NOTE(review): only matches above 0.5 enter the average, so the
            # reported value is either 0 or greater than 0.5.
            for b, c, t in zip(boxes_cpu, probs_cpu, targs_cpu):
                preds = b[c > 0.5].numpy()
                trues = t[t[..., 4] > 0][:, :4].numpy()
                for pb in preds:
                    best = max((compute_iou(pb, tb) for tb in trues), default=0)
                    if best > 0.5:
                        iou_sum += best
                        iou_count += 1

    avg_val = total_val_loss / len(val_loader)
    avg_iou = iou_sum / max(iou_count, 1)

    train_loss_list.append(avg_train)
    val_loss_list.append(avg_val)
    val_iou_list.append(avg_iou)

    print(f"Epoch {ep:02d}  Train: {avg_train:.4f}  Val: {avg_val:.4f}  IoU: {avg_iou:.4f}")


In [None]:
# Detector loss curves, one point per epoch.
epochs_range = range(1, epochs + 1)

plt.figure(figsize=(10, 5))
for curve, curve_label in (
    (train_loss_list, 'Train Loss'),
    (val_loss_list, 'Validation Loss'),
):
    plt.plot(epochs_range, curve, label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training vs Validation Loss')
plt.show()

## Predicting On Test Dataset

In [None]:
# Run the detector on one test batch, draw its confident predictions and count
# how many ground-truth faces are matched by a prediction.
model.eval()
imgs, targets = next(iter(test_loader))
imgs, targets = imgs.to(device), targets.to(device)

with torch.no_grad():
    pred_boxes, pred_conf_logits = model(imgs)
pred_conf = torch.sigmoid(pred_conf_logits)  # Convert logits to probabilities

correct = 0
total = 0

# Boxes are normalised, so scale by the batch's image size rather than a
# hard-coded 256.
height, width = imgs.shape[-2:]
scale = torch.tensor([width, height, width, height], dtype=torch.float32)

# Visualization setup
fig, axs = plt.subplots(4, 4, figsize=(16, 16))
for ax in axs.flat:
    ax.axis('off')

for i in range(min(16, len(imgs))):
    ax = axs[i//4, i%4]
    ax.imshow(imgs[i].cpu().permute(1,2,0).numpy())

    gt = targets[i].cpu()
    img_boxes = pred_boxes[i].cpu()
    img_conf = pred_conf[i].cpu()

    # Real boxes are flagged by the confidence channel; padding slots hold -1
    # coordinates, so a "!= 0" test on the coordinates would not exclude them.
    gt_boxes = (gt[gt[:, 4] > 0.5][:, :4] * scale).tolist()

    keep = img_conf > 0.5
    pr_boxes = list(zip((img_boxes[keep] * scale).tolist(), img_conf[keep].tolist()))

    # A ground-truth box counts as found when any kept prediction overlaps it
    # with IoU above 0.5.
    total += len(gt_boxes)
    correct += sum(
        1 for gt_box in gt_boxes
        if any(compute_iou(gt_box, box) > 0.5 for box, _ in pr_boxes)
    )

    # Predictions
    for (x1, y1, x2, y2), conf in pr_boxes:
        # Draw bounding box
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                             edgecolor='red', facecolor='none',
                             linestyle='--', linewidth=2)
        ax.add_patch(rect)
        # Draw confidence score
        ax.text(x1, y1 - 5, f'{conf:.2f}', color='red',
                fontsize=10, fontweight='bold',
                bbox=dict(facecolor='white', alpha=0.5, edgecolor='none'))

plt.tight_layout()
plt.show()

print(f"Matched {correct}/{total} ground-truth boxes at IoU > 0.5")

In [None]:
# Distribution of the predicted confidences over every box slot in the batch.
all_confidences = pred_conf.cpu().flatten().numpy()

plt.hist(all_confidences, bins=50)
plt.xlabel("Confidence")
plt.ylabel("Frequency")
plt.title("Confidence score distribution")
plt.show()

# Using Pretrained Model (YOLOv8) For Detection

In [None]:
# Notebook shell command: install the ultralytics package, with pip's output
# redirected to /dev/null to keep the cell quiet.
!pip install ultralytics &> /dev/null

In [None]:
from ultralytics import YOLO
import cv2

In [None]:
# Load YOLO weights from the attached Kaggle input. This rebinds `model`,
# which until now referred to the trained FaceDetectorCNN.
model = YOLO("/kaggle/input/yolo/pytorch/default/1/yolov8l_100e.pt")

In [None]:
# Run the YOLO detector on the first 16 test images and show its annotated output.
rows, cols = 4, 4

plt.figure(figsize=(cols * 4, rows * 4))

for idx in range(16):
    img_t, _ = test_dataset[idx]

    # Channels-last uint8 copy of the image tensor, made contiguous in memory.
    channels_last = img_t.permute(1, 2, 0).numpy()
    img_np = np.ascontiguousarray((channels_last * 255).astype(np.uint8))

    results = model(img_np, conf=0.25)
    annotated = results[0].plot()

    plt.subplot(rows, cols, idx + 1)
    plt.imshow(annotated)
    plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
from pathlib import Path

# Root of the recognition dataset; the cropping loop below treats each
# sub-directory as one person.
input_root = Path("/kaggle/input/facedataset/facesDS")

In [None]:
# Crop the largest detected face out of every image and save it under
# cropped_faces/<person>/<original file name>.
output_root = Path("cropped_faces")
output_root.mkdir(parents=True, exist_ok=True)

skipped = 0
processed = 0

for person_folder in input_root.iterdir():
    if not person_folder.is_dir():
        continue
    person_name = person_folder.name
    (output_root / person_name).mkdir(parents=True, exist_ok=True)

    for img_file in person_folder.glob("*"):
        if img_file.suffix.lower() not in [".jpg", ".jpeg", ".png"]:
            continue

        img = cv2.imread(str(img_file))
        if img is None:
            # cv2.imread returns None for unreadable or corrupt files.
            skipped += 1
            continue

        boxes = model(img)[0].boxes
        if len(boxes) == 0:
            skipped += 1
            continue

        # Get largest box only
        largest_box = max(boxes, key=lambda b: (b.xyxy[0][2] - b.xyxy[0][0]) * (b.xyxy[0][3] - b.xyxy[0][1]))
        x1, y1, x2, y2 = map(int, largest_box.xyxy[0])

        # Clamp to the image so a negative coordinate cannot wrap around when
        # slicing, and skip boxes that collapse to nothing.
        img_h, img_w = img.shape[:2]
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, img_w), min(y2, img_h)
        face_crop = img[y1:y2, x1:x2]
        if face_crop.size == 0:
            skipped += 1
            continue

        # Save cropped face; count it only once there is something to write.
        save_path = output_root / person_name / img_file.name
        cv2.imwrite(str(save_path), face_crop)
        processed += 1

In [None]:
# Compare the number of image files under input_root with the crop counters.
image_count = len([
    f for f in input_root.rglob("*")
    if f.suffix.lower() in [".jpg", ".jpeg", ".png"]
])
print("Total images:", image_count)
print(f"Processed: {processed} | Skipped: {skipped}")

In [None]:
# Preview up to `cols` crops per person in a rows x cols grid.
cropped_dir = "/kaggle/working/cropped_faces"

rows, cols = 3, 5
fig, axs = plt.subplots(rows, cols, figsize=(15, 9))

people = os.listdir(cropped_dir)  # one folder per person

i = 0  # index of the next free grid cell
for person in people:
    person_folder = os.path.join(cropped_dir, person)

    for img_file in os.listdir(person_folder)[:cols]:
        if i >= rows * cols:
            break
        grid_row, grid_col = divmod(i, cols)
        ax = axs[grid_row, grid_col]
        ax.imshow(Image.open(os.path.join(person_folder, img_file)))
        ax.set_title(person)
        ax.axis('off')
        i += 1

plt.tight_layout()
plt.show()

In [None]:
# Index the cropped faces as an image-classification dataset.
data_dir = "/kaggle/working/cropped_faces"
dataset = datasets.ImageFolder(root=data_dir)

In [None]:
# Plain resize + tensor conversion, attached to the dataset while its channel
# statistics are measured.
simple_transform = transforms.Compose(
    [transforms.Resize((224, 224)), transforms.ToTensor()]
)
dataset.transform = simple_transform

def compute_mean_std(dataset, batch_size=64, num_workers=2):
    """Estimate per-channel mean and standard deviation of an image dataset.

    The mean is the average of the per-image channel means. The returned
    standard deviation is the average of the per-image channel standard
    deviations, which is not the same as the standard deviation over all
    pixels of the dataset.

    Args:
        dataset: Dataset yielding ``(image, label)`` pairs; images must be
            tensors that the default collate function can stack.
        batch_size: Number of images loaded per step.
        num_workers: Worker processes used by the ``DataLoader``.

    Returns:
        ``(mean, std)``, one value per channel.

    Raises:
        ValueError: If the dataset yields no images.
    """
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    mean = 0.
    std = 0.
    total_images = 0

    for images, _ in loader:
        batch_samples = images.size(0)
        # Flatten the spatial dimensions: (batch, channels, pixels).
        images = images.view(batch_samples, images.size(1), -1)
        mean += images.mean(2).sum(0)
        std += images.std(2).sum(0)
        total_images += batch_samples

    if total_images == 0:
        # Without this guard the division below fails with ZeroDivisionError.
        raise ValueError("cannot compute mean/std of an empty dataset")

    mean /= total_images
    std /= total_images
    return mean, std


In [None]:
# Measure the channel statistics that the Normalize transforms will use.
mean, std = compute_mean_std(dataset)
for caption, value in (("Mean:", mean), ("Std:", std)):
    print(caption, value)

# Detach the measuring transform again so the dataset returns raw images.
dataset.transform = None

In [None]:
# 80/10/10 split; the test split receives whatever is left after rounding down.
total_size = len(dataset)
train_size, val_size = int(0.8 * total_size), int(0.1 * total_size)
test_size = total_size - (train_size + val_size)

train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size]
)

# Training pipeline: random augmentation, then tensor conversion and
# normalisation with the measured statistics.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(degrees=15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    transforms.RandomPerspective(distortion_scale=0.2, p=0.5),
    transforms.RandomResizedCrop(size=(224, 224), scale=(0.8, 1.0)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

# Evaluation pipeline: resize only, same normalisation.
default_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

def apply_transform(batch, transform):
    """Collate ``(image, label)`` pairs into a batch, transforming each image.

    Returns ``(images, labels)``: the transformed images stacked along a new
    first dimension, and the labels as a tensor.
    """
    images, labels = [], []
    for img, label in batch:
        images.append(transform(img))
        labels.append(label)
    return torch.stack(images), torch.tensor(labels)

# The splits share one underlying dataset, so the per-split transform is
# applied while collating each batch.
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=lambda samples: apply_transform(samples, train_transform),
)

val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=lambda samples: apply_transform(samples, default_transform),
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=lambda samples: apply_transform(samples, default_transform),
)

In [None]:
# Dataset summary followed by the size of each split.
print(dataset)
for split in (train_dataset, val_dataset, test_dataset):
    print(len(split))

# Custom CNN Model For Recognition

In [None]:
class classifierCNN(nn.Module):
    """Plain convolutional classifier: conv encoder, global pooling, small MLP head.

    ``forward`` maps a batch of 3-channel images to ``num_classes`` logits.
    """

    def __init__(self, num_classes=3):
        super().__init__()

        layers = []
        # Blocks 1-3: conv -> batch-norm -> ReLU, each followed by 2x2 max-pool.
        for c_in, c_out in ((3, 64), (64, 128), (128, 256)):
            layers.extend(self._conv_bn_relu(c_in, c_out))
            layers.append(nn.MaxPool2d(2))
        # Block 4: two conv units without pooling, then global average pooling.
        layers.extend(self._conv_bn_relu(256, 512))
        layers.extend(self._conv_bn_relu(512, 512))
        layers.append(nn.AdaptiveAvgPool2d((1, 1)))
        self.encoder = nn.Sequential(*layers)

        self.classifier = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )

    @staticmethod
    def _conv_bn_relu(in_channels, out_channels):
        """Return the layers of one 3x3 conv -> batch-norm -> ReLU unit."""
        return [
            nn.Conv2d(in_channels, out_channels, 3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
        ]

    def forward(self, x):
        pooled = self.encoder(x)
        return self.classifier(pooled.flatten(1))

In [None]:
import time 

In [None]:
from sklearn.metrics import precision_score, recall_score

# Train the custom classifier with mixed precision, gradient clipping,
# LR reduction on plateau and early stopping on the validation loss.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = classifierCNN(num_classes=3).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Halve the learning rate when the monitored value stops decreasing.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

# Per-epoch history used by the plotting cells.
train_loss_list = []
val_loss_list   = []
train_acc_list  = []
val_acc_list    = []
precision_list  = []
recall_list     = []
epochs = 40

scaler = torch.cuda.amp.GradScaler()

# Early-stopping state: stop after `patience` epochs without a new best loss.
best_val_loss = float('inf')
early_stop_counter = 0
patience = 5

total_start_time_1 = time.time()

for epoch in range(epochs):
    # Training
    model.train()
    train_loss, train_correct, train_total = 0, 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(images)
            loss = criterion(logits, labels)
        scaler.scale(loss).backward()
        # Unscale before clipping so the norm threshold applies to the real gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        # Weight the batch loss by batch size so the epoch value is a per-sample mean.
        train_loss += loss.item() * labels.size(0)
        train_correct += (logits.argmax(1) == labels).sum().item()
        train_total += labels.size(0)

    train_loss /= train_total
    train_acc = train_correct / train_total

    # Validation
    model.eval()
    # Reset every epoch, so after the loop these hold the last epoch's predictions.
    all_labels, all_preds = [], []
    val_loss, val_correct, val_total = 0, 0, 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            logits = model(images)
            loss = criterion(logits, labels)

            val_loss += loss.item() * labels.size(0)
            _, preds = logits.max(1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    val_loss /= val_total
    val_acc = val_correct / val_total
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)

    # Scheduler step on validation loss
    scheduler.step(val_loss)

    # Record metrics
    train_loss_list.append(train_loss)
    train_acc_list.append(train_acc)
    val_loss_list.append(val_loss)
    val_acc_list.append(val_acc)
    precision_list.append(precision)
    recall_list.append(recall)

    # Print epoch summary
    print(f"Epoch {epoch+1:02d}/{epochs} | "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | "
          f"Val Prec: {precision:.4f} | Val Rec: {recall:.4f}")

    # Early stopping: checkpoint on every new best validation loss.
    # NOTE(review): the saved weights are not loaded back after the loop, so
    # `model` keeps the last epoch's weights — confirm that this is intended.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break


total_time_1 = time.time() - total_start_time_1
print(f"\nTotal training time: {total_time_1:.2f} seconds")


In [None]:
# One figure per metric family for the custom classifier.
epochs = range(1, len(train_loss_list) + 1)

figure_specs = (
    # (title, y-axis label, ((values, legend label), ...))
    ('Loss vs. Epoch', 'Loss',
     ((train_loss_list, 'Train Loss'), (val_loss_list, 'Val Loss'))),
    ('Accuracy vs. Epoch', 'Accuracy',
     ((train_acc_list, 'Train Acc'), (val_acc_list, 'Val Acc'))),
    ('Precision/Recall vs. Epoch', 'Score',
     ((precision_list, 'Precision'), (recall_list, 'Recall'))),
)

for figure_title, y_label, series in figure_specs:
    plt.figure()
    for values, legend_label in series:
        plt.plot(epochs, values, label=legend_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(figure_title)
    plt.legend()
    plt.show()

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix of the validation predictions left over from the last
# completed training epoch.
cm = confusion_matrix(all_labels, all_preds)

# Heatmap with raw counts in each cell.
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Per-class precision / recall / F1.
report = classification_report(all_labels, all_preds, zero_division=0)
print(report)

In [None]:
def denormalize(tensor, mean, std):
    """Undo per-channel normalisation on a copy of ``tensor``.

    Each leading-dimension slice is multiplied by its ``std`` entry and then
    shifted by its ``mean`` entry; the input tensor is left untouched.
    """
    restored = tensor.clone()
    for channel, channel_mean, channel_std in zip(restored, mean, std):
        # `channel` is a view into `restored`, so the in-place ops update it.
        channel.mul_(channel_std)
        channel.add_(channel_mean)
    return restored

In [None]:
# Evaluate the custom classifier on the test split and show sample predictions.
# NOTE(review): these names are indexed by label id, so their order must match
# the class order of the dataset — confirm against the dataset's class list.
class_names = ["Ahmed Zewail", "Farouk El-Baz", "Magdi Yacoub"]


model.eval()
test_correct, test_total = 0, 0

test_images = []
test_labels = []
test_preds = []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        logits = model(images)
        preds = logits.argmax(1)

        test_correct += (preds == labels).sum().item()
        test_total += labels.size(0)

        # Keep CPU copies for the preview grid below.
        test_images.append(images.cpu())
        test_labels.append(labels.cpu())
        test_preds.append(preds.cpu())

test_images = torch.cat(test_images, dim=0)
test_labels = torch.cat(test_labels, dim=0)
test_preds = torch.cat(test_preds, dim=0)

def imshow(img, title):
    """Show one normalised image tensor with ``title`` on the current axes."""
    img = denormalize(img,
                      mean=mean,
                      std=std)

    np_img = img.numpy().transpose((1, 2, 0))  # (C, H, W) → (H, W, C)

    # Clip to the range imshow accepts for float images.
    np_img = np.clip(np_img, 0, 1)

    plt.imshow(np_img)
    plt.title(title)
    plt.axis('off')

plt.figure(figsize=(12, 12))

# Show at most 16 samples; a smaller test split must not raise IndexError.
for i in range(min(16, len(test_images))):
    plt.subplot(4, 4, i+1)
    img = test_images[i]
    true_label = test_labels[i].item()
    pred_label = test_preds[i].item()

    true_name = f"True: {class_names[true_label]}"
    pred_name = f"Pred: {class_names[pred_label]}"

    imshow(img, f"{true_name}\n{pred_name}")

plt.show()

test_acc = test_correct / test_total
print(f"Test Accuracy: {test_acc:.4f}")

# Using Pretrained Model (ResNet50) For Recognition

In [None]:
import torchvision.models as models

# Transfer learning: pretrained ResNet-50 backbone, frozen, with a new
# trainable classification layer.
model_2 = models.resnet50(pretrained=True)

# Freeze every existing parameter.
for param in model_2.parameters():
    param.requires_grad_(False)

# The replacement layer is created after freezing, so it stays trainable.
num_classes = 3
head_in_features = model_2.fc.in_features
model_2.fc = nn.Linear(head_in_features, num_classes)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_2 = model_2.to(device)

# Only the new head's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(model_2.fc.parameters(), lr=1e-3)

criterion = nn.CrossEntropyLoss()

In [None]:
# Fine-tune the ResNet-50 head with the same recipe as the custom classifier.
train_loss_list_2 = []
val_loss_list_2   = []
train_acc_list_2  = []
val_acc_list_2    = []
precision_list_2  = []
recall_list_2     = []
epochs = 30

scaler = torch.cuda.amp.GradScaler()

# The scheduler left over from the previous model is bound to that model's
# optimizer, so stepping it would never change this optimizer's learning rate.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

# Early-stopping state: stop after `patience` epochs without a new best loss.
best_val_loss = float('inf')
early_stop_counter = 0
patience = 5

total_start_time_2 = time.time()

for epoch in range(epochs):
    # Training
    model_2.train()
    train_loss, train_correct, train_total = 0, 0, 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model_2(images)
            loss = criterion(logits, labels)
        scaler.scale(loss).backward()
        # Unscale before clipping so the norm threshold applies to the real gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model_2.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        # Weight the batch loss by batch size so the epoch value is a per-sample mean.
        train_loss += loss.item() * labels.size(0)
        train_correct += (logits.argmax(1) == labels).sum().item()
        train_total += labels.size(0)

    train_loss /= train_total
    train_acc = train_correct / train_total

    # Validation: switch the model being evaluated (model_2, not the earlier
    # `model`) to eval mode.
    model_2.eval()
    all_labels, all_preds = [], []
    val_loss, val_correct, val_total = 0, 0, 0

    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            logits = model_2(images)
            loss = criterion(logits, labels)

            val_loss += loss.item() * labels.size(0)
            _, preds = logits.max(1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            val_correct += (preds == labels).sum().item()
            val_total += labels.size(0)

    val_loss /= val_total
    val_acc = val_correct / val_total
    precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)

    # Scheduler step on validation loss
    scheduler.step(val_loss)

    # Record metrics
    train_loss_list_2.append(train_loss)
    train_acc_list_2.append(train_acc)
    val_loss_list_2.append(val_loss)
    val_acc_list_2.append(val_acc)
    precision_list_2.append(precision)
    recall_list_2.append(recall)

    # Print epoch summary
    print(f"Epoch {epoch+1:02d}/{epochs} | "
          f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
          f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | "
          f"Val Prec: {precision:.4f} | Val Rec: {recall:.4f}")

    # Early stopping. A separate file name keeps this checkpoint from
    # overwriting the custom classifier's 'best_model.pth'.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        early_stop_counter = 0
        torch.save(model_2.state_dict(), 'best_model_2.pth')
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print("Early stopping triggered.")
            break


total_time_2 = time.time() - total_start_time_2
print(f"\nTotal training time: {total_time_2:.2f} seconds")


In [None]:
# One figure per metric family for the fine-tuned ResNet-50.
epochs = range(1, len(train_loss_list_2) + 1)

figure_specs_2 = (
    # (title, y-axis label, ((values, legend label), ...))
    ('Loss vs. Epoch', 'Loss',
     ((train_loss_list_2, 'Train Loss'), (val_loss_list_2, 'Val Loss'))),
    ('Accuracy vs. Epoch', 'Accuracy',
     ((train_acc_list_2, 'Train Acc'), (val_acc_list_2, 'Val Acc'))),
    ('Precision/Recall vs. Epoch', 'Score',
     ((precision_list_2, 'Precision'), (recall_list_2, 'Recall'))),
)

for figure_title, y_label, series in figure_specs_2:
    plt.figure()
    for values, legend_label in series:
        plt.plot(epochs, values, label=legend_label)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title(figure_title)
    plt.legend()
    plt.show()


In [None]:
# Evaluate the fine-tuned ResNet-50 on the test split and show sample predictions.
# NOTE(review): these names are indexed by label id, so their order must match
# the class order of the dataset — confirm against the dataset's class list.
class_names = ["Ahmed Zewail", "Farouk El-Baz", "Magdi Yacoub"]


model_2.eval()
test_correct, test_total = 0, 0

test_images = []
test_labels = []
test_preds = []

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        logits = model_2(images)
        preds = logits.argmax(1)

        test_correct += (preds == labels).sum().item()
        test_total += labels.size(0)

        # Keep CPU copies for the preview grid below.
        test_images.append(images.cpu())
        test_labels.append(labels.cpu())
        test_preds.append(preds.cpu())

test_images = torch.cat(test_images, dim=0)
test_labels = torch.cat(test_labels, dim=0)
test_preds = torch.cat(test_preds, dim=0)

def imshow(img, title):
    """Show one normalised image tensor with ``title`` on the current axes."""
    img = denormalize(img,
                      mean=mean,
                      std=std)

    np_img = img.numpy().transpose((1, 2, 0))  # (C, H, W) → (H, W, C)

    # Clip to the range imshow accepts for float images.
    np_img = np.clip(np_img, 0, 1)

    plt.imshow(np_img)
    plt.title(title)
    plt.axis('off')

plt.figure(figsize=(12, 12))
# Show at most 16 samples; a smaller test split must not raise IndexError.
for i in range(min(16, len(test_images))):
    plt.subplot(4, 4, i+1)
    img = test_images[i]
    true_label = test_labels[i].item()
    pred_label = test_preds[i].item()

    true_name = f"True: {class_names[true_label]}"
    pred_name = f"Pred: {class_names[pred_label]}"

    imshow(img, f"{true_name}\n{pred_name}")

plt.show()

test_acc = test_correct / test_total
print(f"Test Accuracy: {test_acc:.4f}")


In [None]:
# Components of the video pipeline: YOLO locates faces and the fine-tuned
# ResNet-50 names them. The detector is bound to `detector`, the name that
# both the print below and the video loop use.
detector = YOLO("/kaggle/input/yolo/pytorch/default/1/yolov8l_100e.pt")
recognizer = model_2
print(detector)
print(recognizer)

In [None]:
# Annotate every frame of the test video with detected faces and predicted names.

# Use the preprocessing the recognizer was validated with (224x224, dataset
# mean/std); a different size or normalisation makes its predictions unreliable.
transform = default_transform

video_path = "/kaggle/input/testvid1/test1.mp4"
output_path = "/kaggle/working/output1.avi"

cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    raise IOError(f"Cannot open video: {video_path}")
fps    = cap.get(cv2.CAP_PROP_FPS)
w      = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h      = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out    = cv2.VideoWriter(output_path, fourcc, fps, (w, h))

# Inference mode for batch-norm layers, independent of earlier cells.
recognizer.eval()

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        results = detector(frame)[0]
        boxes   = results.boxes.xyxy.cpu().numpy()

        for x1, y1, x2, y2 in boxes.astype(int).tolist():
            # Negative coordinates would wrap around when slicing the frame.
            x1, y1 = max(x1, 0), max(y1, 0)
            crop = frame[y1:y2, x1:x2]
            if crop.size == 0:
                continue

            # OpenCV frames are BGR; the recognizer was trained on RGB images.
            crop_rgb   = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
            crop_pil   = Image.fromarray(crop_rgb)
            # Follow the recognizer's device instead of assuming CUDA.
            tensor     = transform(crop_pil).unsqueeze(0).to(device)

            with torch.no_grad():
                logits = recognizer(tensor)
            idx    = logits.argmax(dim=1).item()
            label  = class_names[idx]

            cv2.rectangle(frame, (x1, y1), (x2, y2), (0,0,255), 2)
            cv2.putText(frame, label, (x1, y1-10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0,0,255), 2)

        out.write(frame)
finally:
    # Release both handles even if a frame fails, so the output file is finalised.
    cap.release()
    out.release()

print(f" Done — video saved to {output_path}")