# VGG19 Feature Extraction and Classification for CIFAR-10

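This notebook extracts features from three convolutional layers of an ImageNet-pretrained VGG19 network and trains two linear classifiers (logistic regression and a linear SVM) on them for the CIFAR-10 dataset. The extracted features are written to HDF5 files on disk batch by batch, so the full dataset can be processed without keeping every activation map in memory.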

## 1. Import Required Libraries

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG19
from tensorflow.keras.models import Model
from sklearn.preprocessing import StandardScaler, LabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import pandas as pd
from time import time
import gc
import h5py
import os

## 2. Define Feature Extractor Class

In [None]:
class FeatureExtractor:
    """Extract pooled VGG19 activations and store them in HDF5 files.

    An ImageNet-pretrained VGG19 without its classifier head is loaded for
    32x32x3 inputs. For every layer named in ``layers_to_extract`` the
    activation map is averaged over axes 1 and 2, which yields one feature
    vector per image and layer.

    Images are fed to the network exactly as they are passed in; any pixel
    preprocessing is the caller's responsibility.

    Attributes:
        batch_size: Number of images per forward pass.
        feature_store_path: Directory that receives the ``features_*.h5``
            files. It is created on construction if it does not exist.
        base_model: The VGG19 convolutional base.
        layers_to_extract: Names of the VGG19 layers whose outputs are stored.
        feature_extractors: Mapping from layer name to a single-output model
            ending at that layer.
    """

    def __init__(self, batch_size=32, feature_store_path='./features'):
        """Load VGG19 and build one sub-model per extracted layer.

        Args:
            batch_size: Number of images per forward pass; must be >= 1.
            feature_store_path: Directory in which feature files are written.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        self.batch_size = batch_size
        self.feature_store_path = feature_store_path
        os.makedirs(feature_store_path, exist_ok=True)

        print("Loading VGG19 model...")
        self.base_model = VGG19(
            weights='imagenet',
            include_top=False,
            input_shape=(32, 32, 3)
        )

        self.layers_to_extract = [
            'block3_conv3',
            'block4_conv3',
            'block5_conv3'
        ]

        print("Creating feature extractors...")
        self.feature_extractors = {
            layer: Model(
                inputs=self.base_model.input,
                outputs=self.base_model.get_layer(layer).output
            )
            for layer in self.layers_to_extract
        }

    def _build_joint_extractor(self, layer_names):
        """Return one model that emits the outputs of all ``layer_names``."""
        return Model(
            inputs=self.base_model.input,
            outputs=[
                self.base_model.get_layer(name).output for name in layer_names
            ]
        )

    def process_dataset(self, x_data, y_data, dataset_type='train'):
        """Extract features for ``x_data`` and save them with ``y_data``.

        Writes ``features_<dataset_type>.h5`` inside ``feature_store_path``,
        overwriting any existing file. The file holds a ``labels`` dataset
        (a copy of ``y_data``) and one float32 ``features_<layer>`` dataset
        per extracted layer with one row per sample.

        All layers are computed in a single forward pass per batch, so the
        shared early VGG19 blocks are evaluated only once per image.

        Args:
            x_data: Images, sliceable along the first axis.
            y_data: Labels, one entry per image.
            dataset_type: Tag used in the output file name.

        Raises:
            ValueError: If ``x_data`` and ``y_data`` differ in length.
        """
        num_samples = len(x_data)
        if len(y_data) != num_samples:
            raise ValueError(
                f"x_data has {num_samples} samples but y_data has "
                f"{len(y_data)}"
            )

        h5_path = os.path.join(
            self.feature_store_path,
            f'features_{dataset_type}.h5'
        )
        layer_names = list(self.layers_to_extract)

        with h5py.File(h5_path, 'w') as h5f:
            h5f.create_dataset('labels', data=y_data)

            if not layer_names:
                # Nothing to extract; the file then only carries the labels.
                return

            print(
                f"Processing layers {', '.join(layer_names)} "
                f"for {dataset_type}"
            )
            joint_extractor = self._build_joint_extractor(layer_names)
            datasets = {}

            for start_idx in range(0, num_samples, self.batch_size):
                end_idx = min(start_idx + self.batch_size, num_samples)

                # predict_on_batch avoids the per-call overhead that
                # Model.predict incurs when it is invoked in a tight loop.
                outputs = joint_extractor.predict_on_batch(
                    x_data[start_idx:end_idx]
                )
                # A model with a single output may return a bare array.
                if not isinstance(outputs, (list, tuple)):
                    outputs = [outputs]

                for layer_name, activations in zip(layer_names, outputs):
                    # Average over axes 1 and 2: one vector per image.
                    pooled = np.mean(activations, axis=(1, 2))

                    if layer_name not in datasets:
                        # The feature width is only known once the first
                        # batch has been computed.
                        datasets[layer_name] = h5f.create_dataset(
                            f'features_{layer_name}',
                            shape=(num_samples, pooled.shape[1]),
                            dtype='float32'
                        )

                    datasets[layer_name][start_idx:end_idx] = pooled

                print(f"Processed {end_idx}/{num_samples} samples", end='\r')
            print()

## 3. Define Training and Evaluation Functions

In [None]:
def train_evaluate_model(h5_train_path, h5_test_path, layer_name, classifier):
    """Fit ``classifier`` on one layer's stored features and score it.

    Features are read from the ``features_<layer_name>`` dataset and labels
    from the ``labels`` dataset of each HDF5 file. Labels are reduced to
    class indices with ``argmax`` over axis 1. A ``StandardScaler`` is fitted
    on the training features in chunks of 1000 rows and applied to both
    splits. ``classifier`` is fitted in place.

    Args:
        h5_train_path: Path to the HDF5 file with the training features.
        h5_test_path: Path to the HDF5 file with the test features.
        layer_name: Layer whose feature dataset is used.
        classifier: Estimator exposing ``fit`` and ``predict``.

    Returns:
        Dict with ``accuracy`` and the weighted-average ``precision``,
        ``recall`` and ``f1`` measured on the test split.
    """
    print(f"Training model on features from {layer_name}...")

    feature_key = f'features_{layer_name}'
    chunk_rows = 1000
    scaler = StandardScaler()

    with h5py.File(h5_train_path, 'r') as train_file:
        stored_features = train_file[feature_key]
        stored_labels = train_file['labels']

        # Accumulate the scaling statistics one chunk at a time.
        total_rows = len(stored_features)
        lower = 0
        while lower < total_rows:
            upper = min(lower + chunk_rows, total_rows)
            scaler.partial_fit(stored_features[lower:upper])
            lower = upper

        train_matrix = stored_features[:]
        train_onehot = stored_labels[:]

    train_scaled = scaler.transform(train_matrix)
    classifier.fit(train_scaled, train_onehot.argmax(axis=1))

    print("Evaluating model...")
    with h5py.File(h5_test_path, 'r') as test_file:
        test_matrix = test_file[feature_key][:]
        test_onehot = test_file['labels'][:]

    predicted = classifier.predict(scaler.transform(test_matrix))

    accuracy = accuracy_score(test_onehot.argmax(axis=1), predicted)
    weighted = classification_report(
        test_onehot.argmax(axis=1),
        predicted,
        output_dict=True
    )['weighted avg']

    summary = {'accuracy': accuracy}
    summary['precision'] = weighted['precision']
    summary['recall'] = weighted['recall']
    summary['f1'] = weighted['f1-score']
    return summary

## 4. Load and Preprocess Data

In [None]:
print("Loading CIFAR-10 dataset...")
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()

# Apply the preprocessing the ImageNet VGG19 weights expect (RGB -> BGR and
# per-channel ImageNet mean subtraction on the 0-255 scale) instead of
# scaling pixels to [0, 1], which does not match the pretrained filters.
x_train = tf.keras.applications.vgg19.preprocess_input(
    x_train.astype('float32')
)
x_test = tf.keras.applications.vgg19.preprocess_input(
    x_test.astype('float32')
)

# Convert labels to one-hot encoding; the binarizer is fitted on the
# training labels only and reused for the test labels.
lb = LabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

print("Training set shape:", x_train.shape)
print("Test set shape:", x_test.shape)

## 5. Extract Features

In [None]:
# Build the VGG19-based extractor once and reuse it for both splits.
extractor = FeatureExtractor(batch_size=32)

# Each entry: (progress banner, images, labels, tag used in the file name).
for banner, split_images, split_labels, split_tag in (
    ("\nProcessing training data...", x_train, y_train, 'train'),
    ("\nProcessing test data...", x_test, y_test, 'test'),
):
    print(banner)
    extractor.process_dataset(split_images, split_labels, split_tag)

## 6. Train and Evaluate Models

In [None]:
# Candidate classifiers, keyed by the name shown in the results table.
classifiers = {}
classifiers['LogisticRegression'] = LogisticRegression(
    solver='sag',
    max_iter=100,
    n_jobs=-1
)
classifiers['LinearSVC'] = LinearSVC(
    dual=False,
    max_iter=1000
)

# One row per (layer, classifier) pair is collected here.
results = []

# Both feature files live in the extractor's storage directory.
train_features_path = os.path.join(
    extractor.feature_store_path, 'features_train.h5'
)
test_features_path = os.path.join(
    extractor.feature_store_path, 'features_test.h5'
)

# Evaluate every classifier on the features of every extracted layer.
for layer in extractor.layers_to_extract:
    for estimator_name, estimator in classifiers.items():
        print(f"\nEvaluating {estimator_name} on {layer}")

        scores = train_evaluate_model(
            train_features_path,
            test_features_path,
            layer,
            estimator
        )

        row = {'Layer': layer, 'Classifier': estimator_name}
        row.update(scores)
        results.append(row)

## 7. Display Results

In [None]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)
print("\nResults Summary:")
display(results_df.round(4))

if results_df.empty:
    # Without any evaluated combination there is no 'accuracy' column.
    print("\nNo results to summarize.")
else:
    # idxmax() returns an index label, so the row is looked up with .loc
    # (label-based) rather than .iloc (position-based).
    best_idx = results_df['accuracy'].idxmax()
    best_combination = results_df.loc[best_idx]

    print("\nBest Combination:")
    print(f"Layer: {best_combination['Layer']}")
    print(f"Classifier: {best_combination['Classifier']}")
    print(f"Accuracy: {best_combination['accuracy']:.4f}")
    print(f"F1 Score: {best_combination['f1']:.4f}")

## 8. Clean Up

In [None]:
# Optional: delete the stored feature files to reclaim disk space.
import shutil

feature_dir = './features'
if os.path.exists(feature_dir):
    # Removes the directory together with everything inside it.
    shutil.rmtree(feature_dir)