# LinearSVC on GPU2 Features

This notebook trains a LinearSVC classifier on GPU2 autoencoder features with normalization and validation-based early stopping heuristics.

Objectives:
- Load extracted features and labels
- Normalize with StandardScaler
- Train LinearSVC with tuned iterations/tolerance
- Evaluate and visualize performance

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from pathlib import Path
import time
import pickle

sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

# Config
output_folder = Path("./output")  # Adjust as needed
feature_size = 8 * 8 * 128  # 8192 latent features
expected_min, expected_max = 0.55, 0.62  # LinearSVC baseline often lower than RBF

## 1. Load Features from Binary Files

Create `FeatureDataLoader` to read train/test features and labels and validate shapes.

In [2]:
class FeatureDataLoader:
    """
    Loads training and test features/labels from binary files.
    Features are float32 flattened arrays; labels are uint8.
    """
    def __init__(self, folder: Path, feature_size: int = 8192):
        self.folder = Path(folder)
        self.feature_size = feature_size
        self.num_classes = 10
    
    def _load_bin(self, path: Path, dtype):
        if not path.exists():
            raise FileNotFoundError(f"Missing file: {path}")
        return np.fromfile(path, dtype=dtype)
    
    def load_train(self):
        X_path = self.folder / "gpu_train_features.bin"
        y_path = self.folder / "train_labels.bin"
        X = self._load_bin(X_path, np.float32)
        y = self._load_bin(y_path, np.uint8)
        n = X.size // self.feature_size
        X = X.reshape(n, self.feature_size)
        if y.shape[0] != n:
            raise ValueError(f"Label count {y.shape[0]} != feature samples {n}")
        return X, y
    
    def load_test(self):
        X_path = self.folder / "gpu_test_features.bin"
        y_path = self.folder / "test_labels.bin"
        X = self._load_bin(X_path, np.float32)
        y = self._load_bin(y_path, np.uint8)
        n = X.size // self.feature_size
        X = X.reshape(n, self.feature_size)
        if y.shape[0] != n:
            raise ValueError(f"Label count {y.shape[0]} != feature samples {n}")
        return X, y

# Load with timing
print("Loading features and labels ...")
load_start = time.time()
loader = FeatureDataLoader(output_folder, feature_size)
train_features, train_labels = loader.load_train()
test_features, test_labels = loader.load_test()
load_time = time.time() - load_start
print(f"✓ Loaded: train {train_features.shape} | test {test_features.shape}")
print(f"Feature loading time: {load_time:.2f} s")

Loading features and labels ...
✓ Loaded: train (50000, 8192) | test (10000, 8192)
Feature loading time: 1.09 s
✓ Loaded: train (50000, 8192) | test (10000, 8192)
Feature loading time: 1.09 s


## 2. Data Preprocessing and Normalization

Apply `StandardScaler` (fit on train, transform test). Persist scaler.

In [3]:
train_features = np.nan_to_num(train_features)
test_features = np.nan_to_num(test_features)

print("Fitting StandardScaler on train features ...")
scaler = StandardScaler(with_mean=True, with_std=True)
scale_start = time.time()
scaler.fit(train_features)
train_features_scaled = scaler.transform(train_features)
test_features_scaled = scaler.transform(test_features)
scale_time = time.time() - scale_start

# scaler_path = output_folder / "scaler_linear.pkl"
# with open(scaler_path, "wb") as f:
#     pickle.dump(scaler, f)
# print(f"✓ Scaler saved: {scaler_path}")

print(f"Scaling time: {scale_time:.2f} s")

Fitting StandardScaler on train features ...
Scaling time: 6.31 s


## 3. Initialize and Train LinearSVC

Use `LinearSVC(dual=False, max_iter=2000, tol=1e-4)`. Employ a simple early-stopping heuristic: train on a train/val split and stop if validation accuracy improves less than a threshold across increments.

In [None]:
print("\n" + "="*60)
print("LinearSVC Training Phase")
print("="*60)

train_start = time.time()
clf = LinearSVC(dual=False, max_iter=3000, tol=1e-4, verbose=1, random_state=42)
# clf.fit(train_features_scaled, train_labels)
clf.fit(train_features, train_labels)

train_time = time.time() - train_start
print(f"Iter: {clf.n_iter_}")
best_model = clf
model_path = output_folder / "linear_svc_model.pkl"
with open(model_path, "wb") as f:
    pickle.dump(best_model, f)
print(f"✓ Trained model saved: {model_path}")
print(f"LinearSVC training time: {train_time:.2f} s ({train_time/60:.2f} min)")


LinearSVC Training Phase
[LibLinear][LibLinear]iter  1 act 2.975e+04 pre 2.974e+04 delta 8.888e-02 f 5.000e+04 |g| 6.695e+05 CG   1
iter  1 act 2.975e+04 pre 2.974e+04 delta 8.888e-02 f 5.000e+04 |g| 6.695e+05 CG   1
cg reaches trust region boundary
iter  2 act 1.769e+03 pre 1.758e+03 delta 1.632e-01 f 2.025e+04 |g| 2.734e+04 CG   1
cg reaches trust region boundary
iter  2 act 1.769e+03 pre 1.758e+03 delta 1.632e-01 f 2.025e+04 |g| 2.734e+04 CG   1
cg reaches trust region boundary
iter  3 act 2.221e+03 pre 2.290e+03 delta 2.121e-01 f 1.848e+04 |g| 6.112e+04 CG   2
cg reaches trust region boundary
iter  3 act 2.221e+03 pre 2.290e+03 delta 2.121e-01 f 1.848e+04 |g| 6.112e+04 CG   2
cg reaches trust region boundary
cg reaches trust region boundary
iter  4 act 1.063e+03 pre 9.791e+02 delta 2.539e-01 f 1.626e+04 |g| 1.893e+04 CG   2
iter  4 act 1.063e+03 pre 9.791e+02 delta 2.539e-01 f 1.626e+04 |g| 1.893e+04 CG   2
cg reaches trust region boundary
iter  5 act 8.261e+02 pre 8.193e+02 delta

## 4. Make Predictions on Test Set

Predict on scaled test features and time inference.

In [None]:
print("\n" + "="*60)
print("LinearSVC Prediction Phase")
print("="*60)

infer_start = time.time()
y_pred = best_model.predict(test_features_scaled)
infer_time = time.time() - infer_start
print(f"✓ Inference time: {infer_time:.2f} s")
print(f"Predictions shape: {y_pred.shape} | Unique classes: {np.unique(y_pred)}")

## 5. Evaluate Classification Performance

Compute accuracy, confusion matrix, and classification report; compare to expected baseline.

In [None]:
accuracy = accuracy_score(test_labels, y_pred)
cm = confusion_matrix(test_labels, y_pred)
report = classification_report(test_labels, y_pred, target_names=[f"Class {i}" for i in range(10)], digits=4)

in_range = expected_min <= accuracy <= expected_max
range_text = "✓ Within expected range" if in_range else "⚠ Outside expected range"

print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Expected: {expected_min*100:.0f}% - {expected_max*100:.0f}% {range_text}\n")
print("Classification Report:\n" + report)

metrics_path = output_folder / "linear_svc_metrics.pkl"
with open(metrics_path, "wb") as f:
    pickle.dump({
        "accuracy": accuracy,
        "cm": cm,
        "report": report,
        "load_time": load_time,
        "scale_time": scale_time,
        "train_time": train_time,
        "infer_time": infer_time,
        "best_iter": best_iter,
        "val_acc": best_acc
    }, f)
print(f"✓ Metrics saved: {metrics_path}")

## 6. Visualize Confusion Matrix and Metrics

Heatmap, accuracy vs expected range, per-class accuracy bars, and distributions.

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Confusion Matrix Heatmap
ax1 = axes[0, 0]
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax1,
            xticklabels=range(10), yticklabels=range(10), cbar_kws={'label': 'Count'})
ax1.set_title('Confusion Matrix - LinearSVC')
ax1.set_xlabel('Predicted Label')
ax1.set_ylabel('True Label')

# 2. Accuracy Comparison
ax2 = axes[0, 1]
categories = ['Model Accuracy', 'Expected Min', 'Expected Max']
values = [accuracy*100, expected_min*100, expected_max*100]
colors = ['#2ecc71' if (expected_min*100 <= values[0] <= expected_max*100) else '#e74c3c', '#3498db', '#3498db']
bars = ax2.bar(categories, values, color=colors, alpha=0.7, edgecolor='black', linewidth=1.5)
ax2.set_ylabel('Accuracy (%)')
ax2.set_title('Model Accuracy vs Expected Range')
ax2.set_ylim([0, 100])
ax2.axhline(y=expected_min*100, color='blue', linestyle='--', alpha=0.5)
ax2.axhline(y=expected_max*100, color='blue', linestyle='--', alpha=0.5)
for bar, val in zip(bars, values):
    ax2.text(bar.get_x() + bar.get_width()/2., val, f'{val:.2f}%', ha='center', va='bottom', fontsize=10)
ax2.grid(axis='y', alpha=0.3)

# 3. Per-class Accuracy
ax3 = axes[1, 0]
per_class_accuracy = cm.diagonal() / cm.sum(axis=1)
bars = ax3.bar(range(10), per_class_accuracy*100, color='#3498db', alpha=0.7, edgecolor='black', linewidth=1.5)
ax3.set_xlabel('Class')
ax3.set_ylabel('Accuracy (%)')
ax3.set_title('Per-Class Accuracy')
ax3.set_ylim([0, 105])
ax3.set_xticks(range(10))
ax3.grid(axis='y', alpha=0.3)
for i, (bar, acc) in enumerate(zip(bars, per_class_accuracy)):
    ax3.text(bar.get_x() + bar.get_width()/2., acc*100, f'{acc*100:.1f}%', ha='center', va='bottom', fontsize=9)

# 4. Prediction Distribution
ax4 = axes[1, 1]
pred_counts = np.bincount(y_pred, minlength=10)
true_counts = np.bincount(test_labels, minlength=10)
x = np.arange(10)
width = 0.35
ax4.bar(x - width/2, true_counts, width, label='True', alpha=0.8, color='#2ecc71', edgecolor='black')
ax4.bar(x + width/2, pred_counts, width, label='Predicted', alpha=0.8, color='#3498db', edgecolor='black')
ax4.set_xlabel('Class')
ax4.set_ylabel('Count')
ax4.set_title('True vs Predicted Class Distribution')
ax4.set_xticks(x)
ax4.legend()
ax4.grid(axis='y', alpha=0.3)

plt.tight_layout()
plot_path = output_folder / "linear_svc_evaluation.png"
plt.savefig(str(plot_path), dpi=150, bbox_inches='tight')
print(f"✓ Visualization saved: {plot_path}")
plt.show()

## 7. Per-Class Accuracy Analysis

Detailed per-class table, easiest/hardest classes, and animal vs vehicle confusion patterns.

In [None]:
cifar10_classes = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

per_class_total = cm.sum(axis=1)
per_class_correct = cm.diagonal()
per_class_accuracy = per_class_correct / per_class_total

results_table = pd.DataFrame({
    'Class': range(10),
    'Name': cifar10_classes,
    'Total Samples': per_class_total,
    'Correct': per_class_correct,
    'Incorrect': per_class_total - per_class_correct,
    'Accuracy (%)': per_class_accuracy * 100
}).sort_values('Accuracy (%)', ascending=False)

print(results_table.to_string(index=False))

easiest_idx = int(results_table.iloc[0]['Class'])
hardest_idx = int(results_table.iloc[-1]['Class'])
print("\nKey Findings:")
print(f"✓ Easiest class: {cifar10_classes[easiest_idx]} (Class {easiest_idx}) - {results_table.iloc[0]['Accuracy (%)']:.2f}%")
print(f"✗ Hardest class: {cifar10_classes[hardest_idx]} (Class {hardest_idx}) - {results_table.iloc[-1]['Accuracy (%)']:.2f}%")
print(f"Accuracy range: {per_class_accuracy.min()*100:.2f}% - {per_class_accuracy.max()*100:.2f}%")
print(f"Std deviation: {per_class_accuracy.std()*100:.2f}%")

animal_classes = [2, 3, 4, 5, 6, 7]
vehicle_classes = [0, 1, 8, 9]
animal_confusion = cm[np.ix_(animal_classes, animal_classes)]
vehicle_confusion = cm[np.ix_(vehicle_classes, vehicle_classes)]
animal_confusion_rate = (animal_confusion.sum() - animal_confusion.diagonal().sum()) / animal_confusion.sum()
vehicle_confusion_rate = (vehicle_confusion.sum() - vehicle_confusion.diagonal().sum()) / vehicle_confusion.sum()
print(f"Animal-to-animal confusion rate: {animal_confusion_rate*100:.2f}%")
print(f"Vehicle-to-vehicle confusion rate: {vehicle_confusion_rate*100:.2f}%")

## 8. Compare with Baseline Methods

Comparison table across methods and expected ranges.

In [None]:
baseline = pd.DataFrame([
    {"Method": "LinearSVC on GPU2 features", "Accuracy": accuracy*100, "Training Time (s)": train_time, "Inference Time (s)": infer_time, "Notes": "This work, early-stopping heuristic"},
    {"Method": "SVM RBF on GPU2 features", "Accuracy": None, "Training Time (s)": None, "Inference Time (s)": None, "Notes": "See RBF notebook"},
    {"Method": "Random baseline", "Accuracy": 10.0, "Training Time (s)": None, "Inference Time (s)": None, "Notes": "Chance level"},
    {"Method": "Linear SVM on raw pixels", "Accuracy": 40.0, "Training Time (s)": None, "Inference Time (s)": None, "Notes": "No feature learning"},
    {"Method": "End-to-end CNN (ResNet-18)", "Accuracy": 78.0, "Training Time (s)": None, "Inference Time (s)": None, "Notes": "Typical benchmark"}
])
print(baseline.to_string(index=False))