<a href="https://colab.research.google.com/github/ELiTE0005/DeepLearningTechniques/blob/main/EXP_14_Feature_extractor_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
"""
Feature extractor using a pre-trained ResNet-50 (PyTorch)

What this script does:
- Loads a pretrained ResNet-50 model from torchvision
- Removes the final fully-connected layer and uses the remaining network as a feature extractor
- Loads images from a directory with torchvision.datasets.ImageFolder (class subfolders)
- Extracts features (global-average pooled 2048-d vectors) for every image and saves them to .npy files

Usage example:
    python feature_extractor_resnet50.py --data_dir /path/to/data --out_prefix ./features/resnet50

Output:
- <out_prefix>_features.npy  -> float32 array, shape (N, 2048)
- <out_prefix>_labels.npy    -> int32 array, shape (N,)
- <out_prefix>_paths.npy     -> object array of file paths (for traceability)

Requirements:
- Python 3.8+
- PyTorch and torchvision
- numpy
- pillow

This script is written to be robust and easily modifiable for other backbones (e.g., vgg16, efficientnet) or for different pooling strategies.
"""

import argparse
import os
from pathlib import Path
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import models, transforms, datasets


def get_resnet50_feature_extractor(pretrained=True, device='cpu'):
    """Return a nn.Module that maps input images to a 2048-d feature vector.

    Args:
        pretrained: If True, load ImageNet weights; if False, the network is
            randomly initialised.
        device: Device (string or ``torch.device``) the module is moved to.

    Returns:
        A ``nn.Sequential`` made of every ResNet-50 child except the final
        fully-connected layer, followed by ``nn.Flatten(1)``, in eval mode.
    """
    # torchvision >= 0.13 deprecates the boolean ``pretrained`` argument in
    # favour of ``weights``. Use the new API when it exists and fall back to
    # the legacy keyword on older installations.
    weights_enum = getattr(models, 'ResNet50_Weights', None)
    if weights_enum is not None:
        # IMAGENET1K_V1 is the checkpoint the legacy ``pretrained=True`` loads,
        # so extracted features stay comparable across torchvision versions.
        weights = weights_enum.IMAGENET1K_V1 if pretrained else None
        resnet = models.resnet50(weights=weights)
    else:
        resnet = models.resnet50(pretrained=pretrained)
    resnet.eval()

    # Remove the final fully-connected layer (fc) and keep everything up to
    # the final avgpool layer. The network yields a (batch, 2048, 1, 1) tensor
    # just before fc; the flatten turns that into (batch, 2048).
    modules = list(resnet.children())[:-1]  # remove fc
    feature_extractor = nn.Sequential(*modules, nn.Flatten(1))
    feature_extractor.to(device)
    # Put the wrapper itself in eval mode too, so its ``training`` flag agrees
    # with the sub-modules that were already switched by ``resnet.eval()``.
    feature_extractor.eval()
    return feature_extractor


def build_dataloader(data_dir, batch_size=32, input_size=224, num_workers=4):
    """Build an ImageFolder dataset over ``data_dir`` and a sequential DataLoader for it.

    Args:
        data_dir: Root directory laid out with one sub-folder per class.
        batch_size: Number of images per batch.
        input_size: Side length of the centre crop fed to the network.
        num_workers: Number of DataLoader worker processes.

    Returns:
        A ``(loader, dataset)`` tuple. The loader never shuffles, so batches
        follow the order of ``dataset.samples``.
    """
    # Resize slightly larger than the crop, then take the centre crop.
    resize_to = int(input_size * 1.15)
    # Per-channel statistics commonly used with ImageNet-pretrained models.
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    preprocessing_steps = [
        transforms.Resize(resize_to),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]
    preprocessing = transforms.Compose(preprocessing_steps)

    image_folder = datasets.ImageFolder(str(data_dir), transform=preprocessing)
    batch_loader = DataLoader(
        image_folder,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )
    return batch_loader, image_folder


def extract_features(model, loader, device='cpu', max_images=None):
    """Iterate through the dataloader and extract features.

    Args:
        model: Module mapping a batch of images to a (B, D) feature tensor.
        loader: Iterable yielding ``(images, labels)`` batches. It must not
            shuffle, because paths are recovered later by sequential position.
        device: Device the image batches are moved to before the forward pass.
        max_images: Optional cap on the number of images processed. ``None``
            or a non-positive value means "no limit". The final batch is
            truncated so that at most ``max_images`` rows are returned.

    Returns:
      features: numpy array (N, D)
      labels: numpy array (N,)
      path_ranges: list of ``(start, end)`` index ranges, one per batch, that
          index into the dataset's sample list in loader order (turn them
          into file paths with ``resolve_paths_from_dataset``)

    Raises:
        ValueError: If the loader yields no batches at all.
    """
    # Normalise the limit once so the loop only has to test ``limit is not None``.
    limit = max_images if (max_images is not None and max_images > 0) else None

    model.eval()
    features_list = []
    labels_list = []
    path_ranges = []
    count = 0  # number of images collected so far

    with torch.no_grad():
        for images, labels in tqdm(loader, desc='Extracting features'):
            feats = model(images.to(device)).cpu().numpy()  # shape (B, D)
            batch_labels = labels.numpy()

            if limit is not None:
                # Trim the batch that crosses the limit so features, labels and
                # the recorded range all stop exactly at ``limit``.
                remaining = limit - count
                feats = feats[:remaining]
                batch_labels = batch_labels[:remaining]

            batch_size = feats.shape[0]
            features_list.append(feats)
            labels_list.append(batch_labels)

            # The loader walks the dataset sequentially, so a running counter
            # gives the positions of this batch within the dataset's samples.
            path_ranges.append((count, count + batch_size))
            count += batch_size

            if limit is not None and count >= limit:
                break

    if not features_list:
        # np.vstack([]) would raise an opaque ValueError; fail with a clear one.
        raise ValueError('The data loader produced no batches; nothing to extract.')

    features = np.vstack(features_list)
    labels = np.concatenate(labels_list)
    return features, labels, path_ranges


def resolve_paths_from_dataset(dataset, paths_ranges):
    """Expand ``(start, end)`` index ranges into file paths taken from ``dataset.samples``.

    Args:
        dataset: Object exposing ``samples``, a sequence whose items carry the
            file path at index 0.
        paths_ranges: Iterable of ``(start, end)`` pairs used as half-open
            slices into the sample list.

    Returns:
        1-D numpy object array holding the selected paths in range order.
    """
    every_path = [entry[0] for entry in dataset.samples]
    selected = [
        path
        for lo, hi in paths_ranges
        for path in every_path[lo:hi]
    ]
    return np.array(selected, dtype=object)


def main(argv=None):
    """Command-line entry point: extract ResNet-50 features and save them as .npy files.

    Args:
        argv: Optional list of argument strings. When ``None`` argparse reads
            ``sys.argv[1:]`` as usual. Passing an explicit list, e.g.
            ``main(['--data_dir', '/path/to/data'])``, lets the function be
            called from a notebook, where ``sys.argv`` belongs to the kernel
            launcher rather than to this script.

    Raises:
        FileNotFoundError: If ``--data_dir`` does not exist.
    """
    parser = argparse.ArgumentParser(description='ResNet50 feature extraction')
    parser.add_argument('--data_dir', type=str, required=True, help='Path to dataset directory (ImageFolder format)')
    parser.add_argument('--out_prefix', type=str, default='./resnet50', help='Prefix for output .npy files')
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--input_size', type=int, default=224)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--no_cuda', action='store_true', help='Disable GPU even if available')
    # A bare ``store_true`` flag defaults to False, which silently ran the
    # extractor with random weights. Default to True as the help text promises
    # and offer an explicit opt-out; ``--pretrained`` is still accepted.
    parser.add_argument('--pretrained', dest='pretrained', action='store_true', default=True,
                        help='Use pretrained weights (default: True)')
    parser.add_argument('--no_pretrained', dest='pretrained', action='store_false',
                        help='Use randomly initialised weights instead of pretrained ones')
    parser.add_argument('--max_images', type=int, default=None, help='Optional: stop after this many images')

    args = parser.parse_args(argv)

    device = 'cuda' if (torch.cuda.is_available() and not args.no_cuda) else 'cpu'
    print(f'Using device: {device}')

    data_dir = Path(args.data_dir)
    if not data_dir.exists():
        raise FileNotFoundError(f'Data directory not found: {data_dir}')

    loader, dataset = build_dataloader(data_dir, batch_size=args.batch_size,
                                      input_size=args.input_size, num_workers=args.num_workers)

    model = get_resnet50_feature_extractor(pretrained=args.pretrained, device=device)

    features, labels, path_ranges = extract_features(model, loader, device=device, max_images=args.max_images)
    paths = resolve_paths_from_dataset(dataset, path_ranges)

    out_prefix = args.out_prefix
    # dirname is '' when the prefix has no directory part; fall back to the cwd.
    os.makedirs(os.path.dirname(out_prefix) or '.', exist_ok=True)
    np.save(f'{out_prefix}_features.npy', features.astype(np.float32))
    np.save(f'{out_prefix}_labels.npy', labels.astype(np.int32))
    # ``paths`` is an object array, so it is stored pickled; reading it back
    # requires ``np.load(..., allow_pickle=True)``.
    np.save(f'{out_prefix}_paths.npy', paths)

    print('Saved:')
    print(f'  {out_prefix}_features.npy ->', features.shape)
    print(f'  {out_prefix}_labels.npy   ->', labels.shape)
    print(f'  {out_prefix}_paths.npy    ->', paths.shape)

# Run the command-line entry point only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


usage: colab_kernel_launcher.py [-h] --data_dir DATA_DIR
                                [--out_prefix OUT_PREFIX]
                                [--batch_size BATCH_SIZE]
                                [--input_size INPUT_SIZE]
                                [--num_workers NUM_WORKERS] [--no_cuda]
                                [--pretrained] [--max_images MAX_IMAGES]
colab_kernel_launcher.py: error: the following arguments are required: --data_dir
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/lib/python3.12/argparse.py", line 1943, in _parse_known_args2
    namespace, args = self._parse_known_args(args, namespace, intermixed)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/argparse.py", line 2230, in _parse_known_args
    raise ArgumentError(None, _('the following arguments are required: %s') %
argparse.ArgumentError: the following arguments are required: --data_dir

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-4046083714.py", line 160, in <cell line: 0>
    main()
  File "/tmp/ipython-input-4046083714.py", line 130, in main
    args = parser.parse_args()
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/argparse.py", line 

TypeError: object of type 'NoneType' has no len()