# Notebook 1: Feature Extraction and Dimensionality Reduction

This notebook performs the first two major steps of the image classification pipeline:

1.  **Feature Extraction**: It reads the raw images and applies two different visual descriptors to extract features:
    *   **Local Binary Patterns (LBP)**: Captures texture information.
    *   **Histogram of Oriented Gradients (HOG)**: Captures shape and edge information.
2.  **Dimensionality Reduction**: It applies **Principal Component Analysis (PCA)** to the generated feature sets to reduce their dimensionality.

All resulting datasets are saved as `.csv` files in the `../results/` directory.

## 1. Setup and Configuration

In [1]:
import os
import numpy as np
import pandas as pd
from skimage.io import imread
from skimage.color import rgb2gray
from skimage.transform import resize
from skimage.feature import hog, local_binary_pattern
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings

# Silence every warning so the notebook output stays readable.
# NOTE(review): this is a global filter, so deprecation and numerical warnings
# raised by the imaging/ML libraries are hidden too; narrow it when debugging.
warnings.filterwarnings('ignore')

# --- Configuration ---
# Both paths are relative to the notebook's working directory.
IMAGE_DIR = '../data/images/'
RESULTS_DIR = '../results/'

# Ensure the results directory exists. exist_ok=True makes this a single call
# with no check-then-create race if the directory appears in between.
os.makedirs(RESULTS_DIR, exist_ok=True)

# Image and feature parameters from the paper
IMAGE_SIZES = [128, 256]  # edge length (pixels) of the square images are resized to
LBP_RADIUS_OPTIONS = [3, 6, 12]  # LBP radii; one dataset is generated per radius
HOG_PIXELS_PER_CELL_OPTIONS = [(8, 8), (16, 16), (20, 20), (32, 32)]  # HOG cell sizes
PCA_VAR_THRESHOLDS = [0.90, 0.75]  # fraction of variance each PCA run must retain

print(f'Image directory: {os.path.abspath(IMAGE_DIR)}')
print(f'Results directory: {os.path.abspath(RESULTS_DIR)}')


Image directory: /home/jonjo/Code/jonjo/cat_dog_ml/data/images
Results directory: /home/jonjo/Code/jonjo/cat_dog_ml/results


## 2. Image Loading and Preprocessing

This function loads all images from the `dogs/` and `cats/` subfolders of the specified directory, resizes them to a square of the requested size, and assigns labels based on the parent folder name. Following the paper, the 'cat' classes (Birman, Ragdoll) are label 1 and the 'dog' classes (Miniature Pinscher, English Setter) are label 0.

In [None]:
def load_images(image_dir, image_size):
    """Load, resize and label every image under ``image_dir``.

    Images are read from the ``dogs`` and ``cats`` subdirectories of
    ``image_dir`` (in that order). Only files whose name ends in ``.jpg``,
    ``.jpeg`` or ``.png`` (case-insensitive) are considered. A missing
    subdirectory is reported with a warning and skipped.

    Args:
        image_dir: Directory containing the ``dogs`` and ``cats`` subfolders.
        image_size: Edge length of the square every image is resized to.

    Returns:
        A tuple ``(images, labels, filenames)`` of three parallel lists:
        the resized 3-channel images, the integer labels (0 for dogs,
        1 for cats) and the bare file names.
    """
    images = []
    labels = []
    filenames = []

    # 0 for dogs, 1 for cats
    for class_name, label in (('dogs', 0), ('cats', 1)):
        class_dir = os.path.join(image_dir, class_name)
        if not os.path.isdir(class_dir):
            print(f"Warning: Directory {class_dir} not found. Skipping.")
            continue

        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order of the generated datasets reproducible across runs/machines.
        for filename in sorted(os.listdir(class_dir)):
            if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
                continue

            image = imread(os.path.join(class_dir, filename))

            # The feature extractors in this notebook call rgb2gray and
            # hog(..., channel_axis=-1), both of which need a 3-channel image.
            if image.ndim == 2:
                # Grayscale file: replicate the single channel.
                image = np.stack([image] * 3, axis=-1)
            elif image.ndim == 3 and image.shape[-1] == 4:
                # RGBA file: drop the alpha channel.
                image = image[..., :3]

            images.append(resize(image, (image_size, image_size)))
            labels.append(label)
            filenames.append(filename)

    print(f'Loaded {len(images)} images of size {image_size}x{image_size}.')
    return images, labels, filenames

## 3. LBP Feature Extraction

In [None]:
def extract_lbp_features(images, radius):
    """Compute a uniform-LBP histogram for every image.

    Each image is converted to grayscale, encoded with
    ``local_binary_pattern(..., method='uniform')`` using ``8 * radius``
    sampling points, and summarised as a histogram of raw (unnormalised)
    counts with one unit-width bin per integer code ``0 .. n_points + 1``.

    Args:
        images: Iterable of RGB images (as accepted by ``rgb2gray``).
        radius: LBP radius; also determines the number of sampling points.

    Returns:
        Integer array of shape ``(len(images), 8 * radius + 2)``. For an
        empty ``images`` the shape is ``(0, 8 * radius + 2)`` so callers
        always get a 2-D feature matrix.
    """
    n_points = 8 * radius
    n_bins = n_points + 2
    # Explicit unit-width edges 0, 1, ..., n_bins. With an edge array,
    # np.histogram ignores `range`, so none is passed.
    bin_edges = np.arange(0, n_bins + 1)

    lbp_features_list = []
    for img in images:
        img_gray = rgb2gray(img)
        # NOTE(review): img_gray is presumably floating point here; confirm
        # that local_binary_pattern behaves as intended on float input.
        lbp = local_binary_pattern(img_gray, n_points, radius, method='uniform')
        hist, _ = np.histogram(lbp.ravel(), bins=bin_edges)
        lbp_features_list.append(hist)

    # reshape is a no-op for non-empty input and fixes the column count
    # when there are no images at all.
    return np.array(lbp_features_list, dtype=np.intp).reshape(-1, n_bins)

## 4. HOG Feature Extraction

In [None]:
def extract_hog_features(images, pixels_per_cell):
    """Compute one HOG descriptor per image.

    Every image is passed to ``hog`` with 9 orientations, 2x2 cells per
    block and the given cell size. The last axis is declared as the
    channel axis, so colour images are used directly without a prior
    grayscale conversion.

    Args:
        images: Iterable of images whose last axis holds the channels.
        pixels_per_cell: ``(rows, cols)`` size of a HOG cell in pixels.

    Returns:
        ``np.array`` built from the list of per-image descriptors.
    """
    descriptors = [
        hog(
            picture,
            orientations=9,
            pixels_per_cell=pixels_per_cell,
            cells_per_block=(2, 2),
            visualize=False,
            channel_axis=-1,  # colour input: channels live on the last axis
        )
        for picture in images
    ]
    return np.array(descriptors)

## 5. Generate and Save Feature Datasets

In [None]:
# For every configured image size: load the images once, then write one CSV
# per LBP radius and one per HOG cell size. Each CSV holds the feature columns
# followed by a final 'label' column (the PCA step relies on that position).
for size in IMAGE_SIZES:
    images, labels, _ = load_images(IMAGE_DIR, size)

    # Without images the extractors would produce label-only CSVs that the
    # PCA step cannot process, so skip this size instead of writing them.
    if len(images) == 0:
        print(f'No images loaded for image size {size}; skipping dataset generation.')
        continue

    # Generate LBP datasets
    for radius in LBP_RADIUS_OPTIONS:
        print(f'Generating LBP dataset for image size {size} and radius {radius}...')
        features = extract_lbp_features(images, radius)
        df = pd.DataFrame(features)
        df['label'] = labels
        filename = f'LBP_{size}_{radius}r.csv'
        df.to_csv(os.path.join(RESULTS_DIR, filename), index=False)
        print(f'Saved {filename}')

    # Generate HOG datasets
    for ppc in HOG_PIXELS_PER_CELL_OPTIONS:
        print(f'Generating HOG dataset for image size {size} and pixels per cell {ppc}...')
        features = extract_hog_features(images, ppc)
        df = pd.DataFrame(features)
        df['label'] = labels
        filename = f'HOG_{size}_{ppc[0]}x{ppc[1]}.csv'
        df.to_csv(os.path.join(RESULTS_DIR, filename), index=False)
        print(f'Saved {filename}')

## 6. Apply PCA to Generated Datasets

In [None]:
# Every CSV in the results directory that is not itself a PCA output is
# treated as a feature dataset. Sorting makes the processing order (and the
# log output) deterministic, since os.listdir order is arbitrary.
csv_files = sorted(
    f for f in os.listdir(RESULTS_DIR) if f.endswith('.csv') and 'PCA' not in f
)

for file in csv_files:
    print(f'Applying PCA to {file}...')
    dataset = pd.read_csv(os.path.join(RESULTS_DIR, file))

    # The label is expected to be the last column; everything before it is a feature.
    X = dataset.iloc[:, :-1] # Features
    y = dataset.iloc[:, -1]  # Target

    # Standardize the features before PCA
    # NOTE(review): scaler and PCA are fitted on all rows of the file; if a
    # train/test split happens later, confirm this is the intended protocol.
    X_std = StandardScaler().fit_transform(X)

    # Loop-invariant: base name used for every output file of this dataset.
    file_without_ext = os.path.splitext(file)[0]

    for var_thresh in PCA_VAR_THRESHOLDS:
        # A float n_components asks PCA to keep as many components as needed
        # to reach that fraction of explained variance.
        pca = PCA(n_components=var_thresh, whiten=True)
        pca_result = pca.fit_transform(X_std)

        print(f'  - PCA with {var_thresh*100}% variance retained {X.shape[1]} features into {pca.n_components_}.')

        # Create a new DataFrame with the PCA results
        pca_cols = [f'pca_{i+1}' for i in range(pca_result.shape[1])]
        pca_df = pd.DataFrame(data=pca_result, columns=pca_cols)
        # Both frames carry the default 0..n-1 index, so join aligns row by row.
        final_df = pca_df.join(y)

        # Save the new dataset
        # round() rather than int(): float error (e.g. 0.29 * 100 ==
        # 28.999999999999996) must not truncate the percentage in the name.
        output_filename = f'{file_without_ext}_PCA-{round(var_thresh*100)}.csv'
        final_df.to_csv(os.path.join(RESULTS_DIR, output_filename), index=False)
        print(f'  - Saved {output_filename}')