In [None]:
# --- 2. Import all necessary packages ---
import os
import cv2
import numpy as np
import pandas as pd
import joblib
import time
from tqdm import tqdm
from datetime import datetime

# Image Processing & Feature Extraction
from skimage.feature import hog

# Modeling, evaluation, and hyperparameter tuning
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb

# Metrics and Reporting
from sklearn.metrics import accuracy_score, classification_report

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns



In [None]:
# Central settings read by the preprocessing, feature-extraction and training code below.
CONFIG = dict(
    # --- Input dataset directory and output artifact file (both relative paths) ---
    dataset_path="caltech_15_classes",
    output_artifacts_path="image_classifier_artifacts_v1.joblib",

    # --- Preprocessing: target size handed to cv2.resize for every image ---
    image_size=(128, 128),

    # --- Feature extraction: keyword arguments forwarded to skimage's hog() ---
    features=dict(
        hog_orientations=9,
        hog_pixels_per_cell=(16, 16),
        hog_cells_per_block=(2, 2),
    ),

    # --- Model development ---
    training=dict(
        test_size=0.25,      # fraction of samples held out by train_test_split
        random_state=42,     # seed shared by the split, the models and the search
        tuning_n_iter=10,    # n_iter requested from RandomizedSearchCV
        tuning_cv=3,         # number of cross-validation folds during tuning
    ),
)




In [None]:
# ==============================================================================
# CELL 3: FEATURE EXTRACTION PIPELINE
# ==============================================================================

def preprocess_image(image_path):
    """Load one image from disk and bring it to the configured size.

    Args:
        image_path: Path of the image file, passed straight to ``cv2.imread``.

    Returns:
        The image converted from BGR to RGB and resized to
        ``CONFIG["image_size"]``, or ``None`` when ``cv2.imread`` yields
        ``None`` or any step raises an exception (the exception is printed).
    """
    try:
        bgr_pixels = cv2.imread(image_path)
        if bgr_pixels is not None:
            rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
            # NOTE(review): cv2.resize presumably reads the tuple as (width, height);
            # irrelevant while image_size is square - confirm before using other shapes.
            return cv2.resize(rgb_pixels, CONFIG["image_size"])
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
    # Reached for unreadable files and for any failure reported above.
    return None

def extract_hog_features(image):
    """Compute the HOG descriptor of an RGB image.

    Args:
        image: RGB image array (as produced by ``preprocess_image``).

    Returns:
        Whatever ``skimage.feature.hog`` returns for the grayscale version of
        ``image`` using the orientations / cell / block settings stored under
        ``CONFIG["features"]`` - presumably a flat 1-D vector, since the caller
        stacks the results into a 2-D feature matrix.
    """
    # HOG is computed on a single channel, so drop colour first.
    grayscale = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    hog_settings = CONFIG["features"]
    return hog(
        grayscale,
        orientations=hog_settings["hog_orientations"],
        pixels_per_cell=hog_settings["hog_pixels_per_cell"],
        cells_per_block=hog_settings["hog_cells_per_block"],
        block_norm='L2-Hys',
        transform_sqrt=True,
    )

# Visible confirmation that this cell ran and both helper functions above are now defined.
print("✅ Optimized feature extraction pipeline defined.")


In [None]:
# ==============================================================================
# CELL 4: MAIN EXECUTION - DATA LOADING, TRAINING, AND SAVING
# ==============================================================================

# --- Load Data & Extract HOG Features ---
# Expected layout: <dataset_path>/<class_name>/<image file>. Each sub-directory name
# becomes the label of every image inside it.
print("\n--- Starting Data Processing ---")
features_list = []
labels_list = []
skipped_images = 0  # files for which preprocess_image() returned None
class_names = sorted(os.listdir(CONFIG["dataset_path"]))

for class_name in tqdm(class_names, desc="Extracting HOG Features"):
    class_path = os.path.join(CONFIG["dataset_path"], class_name)
    # Stray files at the dataset root are not classes.
    if not os.path.isdir(class_path):
        continue

    # os.listdir() returns entries in arbitrary order. Sorting keeps the sample order,
    # and therefore the seeded train/test split performed later, reproducible across
    # machines and filesystems.
    for image_name in sorted(os.listdir(class_path)):
        image_path = os.path.join(class_path, image_name)
        processed_image = preprocess_image(image_path)
        if processed_image is None:
            skipped_images += 1
            continue

        features = extract_hog_features(processed_image)
        features_list.append(features)
        labels_list.append(class_name)

# Surface silently dropped files so a corrupt or mis-populated dataset is noticed.
if skipped_images:
    print(f"⚠️  Skipped {skipped_images} file(s) that could not be read as images.")

# --- Prepare Data for Modeling ---
# Fail early with an actionable message: with no samples, X would be an empty 1-D array
# and the X.shape[1] lookup below would raise an opaque IndexError instead.
if not features_list:
    raise ValueError(
        f"No readable images were found under {CONFIG['dataset_path']!r}. "
        "Expected one sub-directory per class containing image files."
    )

# One row per image, one column per HOG feature.
X = np.array(features_list)
# Map the string class names to integer ids; `le` is kept to map predictions back.
le = LabelEncoder()
y = le.fit_transform(labels_list)
print(f"\nExtracted {X.shape[1]} HOG features for {len(X)} images.")

# stratify=y asks train_test_split to preserve the class proportions in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=CONFIG["training"]["test_size"],
    random_state=CONFIG["training"]["random_state"], stratify=y
)

# --- Scale Features ---
# Fit the scaler on the training split only, then reuse it unchanged on the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Features have been scaled.")

# --- Model Training & Tuning ---
print("\n--- Starting Model Training & Hyperparameter Tuning ---")

# Candidate classifiers, all seeded from the shared random_state.
models = {
    "SVM": SVC(probability=True, random_state=CONFIG["training"]["random_state"]),
    "RandomForest": RandomForestClassifier(random_state=CONFIG["training"]["random_state"], n_jobs=-1),
    "LightGBM": lgb.LGBMClassifier(random_state=CONFIG["training"]["random_state"], n_jobs=-1)
}

# Discrete search grids, keyed by the same names as `models`.
param_spaces = {
    "SVM": {'C': [1, 10, 100], 'kernel': ['rbf'], 'gamma': [0.001, 0.01]},
    "RandomForest": {'n_estimators': [100, 200], 'max_depth': [20, None], 'min_samples_split': [2, 5]},
    "LightGBM": {'n_estimators': [100, 200], 'learning_rate': [0.1, 0.2], 'num_leaves': [31, 50]}
}

best_estimators = {}
model_iterator = tqdm(models.items(), desc="Overall Model Tuning", total=len(models))

for name, model in model_iterator:
    model_iterator.set_description(f"Tuning {name}")
    print("-" * 60)
    print(f"▶️  [{datetime.now().strftime('%H:%M:%S')}] Starting RandomizedSearch for {name}...")

    # Every space above is a list-only grid with fewer combinations than the configured
    # tuning_n_iter. Requesting more iterations than the grid holds makes scikit-learn
    # emit a UserWarning and try every combination anyway, so cap n_iter at the grid
    # size: same candidates, no warning.
    grid_size = int(np.prod([len(values) for values in param_spaces[name].values()]))
    n_iter = min(CONFIG["training"]["tuning_n_iter"], grid_size)

    rs = RandomizedSearchCV(
        model, param_spaces[name],
        n_iter=n_iter,
        cv=CONFIG["training"]["tuning_cv"],
        n_jobs=-1,
        random_state=CONFIG["training"]["random_state"],
        verbose=1
    )
    rs.fit(X_train_scaled, y_train)

    print(f"✅ [{datetime.now().strftime('%H:%M:%S')}] Finished tuning for {name}.")
    # Log what the search found so the tuning outcome is visible, not just the winner.
    print(f"   Best CV score: {rs.best_score_:.4f} | Best params: {rs.best_params_}")
    # best_estimator_ is the model refit on the full training split with the best params.
    best_estimators[name] = rs.best_estimator_

# --- Evaluation & Saving ---
print("\n" + "="*60)
print(f"🏁 [{datetime.now().strftime('%H:%M:%S')}] All models tuned. Evaluating test set performance...")

# Score each tuned model on the held-out split.
results = []
for name, model in best_estimators.items():
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    results.append({"Model": name, "Test Accuracy": accuracy})

results_df = pd.DataFrame(results).sort_values(by="Test Accuracy", ascending=False)
print("\n--- Final Model Performance ---")
print(results_df)

# NOTE(review): the winner is chosen using test-set accuracy, so the accuracy reported
# for it is an optimistic estimate. Consider selecting on cross-validation score and
# keeping the test split purely for the final report.
best_model_name = results_df.iloc[0]["Model"]
best_model = best_estimators[best_model_name]
print(f"\n🏆 Best Performing Model: {best_model_name}")

# --- Save Final Artifacts ---
# Bundle everything needed to reproduce inference: the fitted model, the label mapping,
# the fitted scaler and the settings used for preprocessing / feature extraction.
artifacts = {
    "model": best_model,
    "label_encoder": le,
    "scaler": scaler,
    "config": CONFIG
}
joblib.dump(artifacts, CONFIG["output_artifacts_path"])
# The configured path is relative and nothing here targets Google Drive, so report the
# location the file was actually written to instead of claiming it is on Drive.
saved_path = os.path.abspath(CONFIG["output_artifacts_path"])
print(f"\n✅ All artifacts saved to: {saved_path}")