# Car Type Classification - Assignment

## System Information
- **Development Environment**: WSL2 Ubuntu on Windows
- **Python Version**: 3.11+
- **TensorFlow Version**: 2.19.0 with GPU support
- **Platform**: Linux (WSL2) - Recommended for optimal performance

## Overview
This notebook implements a car type classification system using TensorFlow 2.19 and the Stanford Cars dataset. It automatically checks for a previously trained model (`best_car_model.keras`) and skips training if one is found.

---

# 1. Environment Setup and Dependencies

In [None]:
# ==========================
# ENVIRONMENT SETUP
# ==========================

import os
import sys
import platform
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Suppress TensorFlow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

# Core ML libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras import mixed_precision

# Data processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Dataset handling
import kagglehub

# Report the runtime so that results can be tied to a concrete environment.
print("=== System Information ===")
print(f"Platform: {platform.platform()}")
print(f"Python Version: {sys.version.split()[0]}")
print(f"TensorFlow Version: {tf.__version__}")

# Configure GPU: ask TensorFlow to grow GPU memory on demand.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    print(f"✅ GPU Available: {len(physical_devices)} device(s)")
    try:
        for gpu in physical_devices:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Memory growth can only be changed before the GPUs are initialized.
        # Re-running this cell in a live kernel would otherwise abort here.
        print(f"⚠️ GPU memory growth left unchanged: {err}")
    else:
        print("✅ GPU memory growth configured")
else:
    print("⚠️ No GPU detected - using CPU")

# Set mixed precision for better performance.
# NOTE(review): the policy is applied even when no GPU was detected; float16
# compute is typically only a win on GPUs - confirm this is intended on CPU.
mixed_precision.set_global_policy('mixed_float16')
print("✅ Mixed precision enabled")

print("\n✅ Environment setup complete!")

2025-07-22 23:29:36.149005: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753198176.220165     821 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753198176.241410     821 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753198176.402448     821 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753198176.402469     821 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753198176.402470     821 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.

=== System Information ===
Platform: Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39
Python Version: 3.12.3
TensorFlow Version: 2.19.0
✅ GPU Available: 1 device(s)
✅ GPU memory growth configured
✅ Mixed precision enabled

✅ Environment setup complete!


# 2. Dataset Configuration and Detection

In [2]:
# ==========================
# DATASET DETECTION
# ==========================

def _locate_split_dirs(base_path, min_classes=150):
    """Find a usable train/test folder pair below *base_path*.

    Layouts are probed in this order: ``base_path`` itself, a nested
    ``stanford-cars-by-classes-folder`` directory, then every numeric
    ``versions/<n>`` directory (highest number first).

    Returns:
        ``(train_dir, test_dir, class_count)`` for the first layout whose
        train folder has at least *min_classes* sub-directories, else ``None``.
    """
    roots = [base_path, base_path / "stanford-cars-by-classes-folder"]

    versions_root = base_path / "versions"
    if versions_root.is_dir():
        numbered = [
            entry for entry in versions_root.iterdir()
            if entry.is_dir() and entry.name.isdecimal()
        ]
        # Accept any version number, not one hard-coded value; newest first.
        roots.extend(sorted(numbered, key=lambda entry: int(entry.name), reverse=True))

    for root in roots:
        train_candidate = root / "train"
        test_candidate = root / "test"
        # is_dir() rather than exists(): iterdir() below needs real directories.
        if not (train_candidate.is_dir() and test_candidate.is_dir()):
            continue
        class_count = sum(1 for entry in train_candidate.iterdir() if entry.is_dir())
        if class_count >= min_classes:  # Should be ~196 classes
            return train_candidate, test_candidate, class_count

    return None


def detect_stanford_cars_dataset():
    """
    Locate the Stanford Cars dataset on disk, downloading it if necessary.

    Several local directories are probed first (relative to the current
    working directory and the kagglehub cache under the home directory).
    If none contains a usable train/test pair, the dataset is requested via
    ``kagglehub.dataset_download`` and the download is probed the same way,
    so both ``train`` and ``test`` must exist before it is accepted.

    Returns:
        Tuple ``(dataset_ready, train_dir, test_dir)``. ``dataset_ready`` is a
        bool; the two directories are ``Path`` objects, or ``None`` when the
        dataset is not available.
    """
    print("🔍 DETECTING STANFORD CARS DATASET...")
    print("=" * 45)

    # Possible dataset locations
    search_paths = [
        Path.cwd() / "stanford-cars-by-classes-folder",
        Path.cwd() / "data",
        Path.cwd(),
        Path.home() / ".cache" / "kagglehub" / "datasets" / "cyizhuo" / "stanford-cars-by-classes-folder"
    ]

    located = None

    # Check existing locations
    for base_path in search_paths:
        if not base_path.exists():
            continue
        print(f"📁 Checking: {base_path}")
        located = _locate_split_dirs(base_path)
        if located is not None:
            print(f"✅ Found dataset: {located[2]} classes")
            break

    # If not found, try to download
    if located is None:
        print("📥 Dataset not found locally, downloading...")
        try:
            download_path = Path(
                kagglehub.dataset_download("cyizhuo/stanford-cars-by-classes-folder")
            )
        except Exception as e:
            # Best-effort boundary: report any download failure and fall
            # through to the "not available" result.
            print(f"❌ Download failed: {e}")
        else:
            located = _locate_split_dirs(download_path)
            if located is not None:
                print(f"✅ Downloaded to: {download_path}")
            else:
                print(f"❌ Downloaded to {download_path}, but no usable train/test folders were found")

    dataset_ready = located is not None
    train_dir = test_dir = None

    # Results
    if dataset_ready:
        train_dir, test_dir, class_count = located
        train_count = len(list(train_dir.glob("*/*.jpg")))
        test_count = len(list(test_dir.glob("*/*.jpg")))

        print(f"\n✅ DATASET READY!")
        print(f"📊 Classes: {class_count}")
        print(f"🖼️ Training images: {train_count}")
        print(f"🖼️ Test images: {test_count}")
        print(f"📁 Train dir: {train_dir}")
        print(f"📁 Test dir: {test_dir}")
    else:
        print("❌ Stanford Cars dataset not available")
        print("💡 Please download manually or check internet connection")

    return dataset_ready, train_dir, test_dir

# Run detection once; the results are shared by every later cell
DATASET_READY, TRAIN_DIR, TEST_DIR = detect_stanford_cars_dataset()
USING_REAL_DATASET = DATASET_READY

# Class count comes from the train folder when present, otherwise the
# Stanford Cars default of 196 is used
NUM_CLASSES = (
    sum(1 for entry in TRAIN_DIR.iterdir() if entry.is_dir())
    if DATASET_READY
    else 196
)

# Training hyper-parameters
BATCH_SIZE = 32
IMG_SIZE = (224, 224)
EPOCHS = 50
AUTOTUNE = tf.data.AUTOTUNE

print("\n📋 Configuration:")
print("\n".join([
    f"   Classes: {NUM_CLASSES}",
    f"   Batch size: {BATCH_SIZE}",
    f"   Image size: {IMG_SIZE}",
    f"   Max epochs: {EPOCHS}",
]))

🔍 DETECTING STANFORD CARS DATASET...
📁 Checking: /home/alph/Car-Type-Classification-Service/stanford-cars-by-classes-folder
✅ Found dataset: 196 classes

✅ DATASET READY!
📊 Classes: 196
🖼️ Training images: 8144
🖼️ Test images: 8041
📁 Train dir: /home/alph/Car-Type-Classification-Service/stanford-cars-by-classes-folder/train
📁 Test dir: /home/alph/Car-Type-Classification-Service/stanford-cars-by-classes-folder/test

📋 Configuration:
   Classes: 196
   Batch size: 32
   Image size: (224, 224)
   Max epochs: 50


# 3. Data Preprocessing and Pipeline

In [3]:
# ==========================
# DATA PREPROCESSING
# ==========================

def create_data_pipeline():
    """
    Build the train / validation / test ``tf.data`` pipelines.

    Images are read from ``TRAIN_DIR`` (split 80/20 into training and
    validation with a fixed seed so both subsets are consistent) and from
    ``TEST_DIR``. All datasets are batched with ``BATCH_SIZE``, resized to
    ``IMG_SIZE`` and use ``label_mode='categorical'``.

    Returns:
        ``(train_ds, val_ds, test_ds)``, or ``(None, None, None)`` when
        ``DATASET_READY`` is false.
    """
    if not DATASET_READY:
        print("⚠️ Real dataset not available - using demo data")
        return None, None, None

    print("🔧 Creating data pipeline with TensorFlow 2.19...")

    # Training dataset with validation split
    train_ds = tf.keras.utils.image_dataset_from_directory(
        TRAIN_DIR,
        validation_split=0.2,
        subset="training",
        seed=123,
        image_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        label_mode='categorical'
    )

    # Validation dataset (same split fraction and seed as the training subset)
    val_ds = tf.keras.utils.image_dataset_from_directory(
        TRAIN_DIR,
        validation_split=0.2,
        subset="validation",
        seed=123,
        image_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        label_mode='categorical'
    )

    # Test dataset; not shuffled so that predictions stay aligned with files
    test_ds = tf.keras.utils.image_dataset_from_directory(
        TEST_DIR,
        image_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        label_mode='categorical',
        shuffle=False
    )

    # Data augmentation for training
    data_augmentation = keras.Sequential([
        layers.RandomFlip("horizontal"),
        layers.RandomRotation(0.1),
        layers.RandomZoom(0.1),
        layers.RandomContrast(0.1),
    ])

    # Cache the decoded images BEFORE augmenting. Caching after the random
    # map would store the first epoch's augmented images and replay them in
    # every later epoch, which turns augmentation off after epoch one.
    train_ds = (
        train_ds
        .cache()
        .shuffle(1000)
        .map(
            lambda images, labels: (data_augmentation(images, training=True), labels),
            num_parallel_calls=AUTOTUNE,
        )
        .prefetch(buffer_size=AUTOTUNE)
    )

    # Validation and test data are deterministic, so caching them is safe
    val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
    test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

    print(f"✅ Data pipeline created:")
    print(f"   Training batches: {len(train_ds)}")
    print(f"   Validation batches: {len(val_ds)}")
    print(f"   Test batches: {len(test_ds)}")

    return train_ds, val_ds, test_ds

# Build the tf.data pipelines only when a dataset was detected
if not DATASET_READY:
    train_ds, val_ds, test_ds = None, None, None
    print("\n⚠️ Data pipeline skipped - no dataset available")
else:
    train_ds, val_ds, test_ds = create_data_pipeline()
    print("\n📊 Data pipeline ready for training!")

🔧 Creating data pipeline with TensorFlow 2.19...
Found 8144 files belonging to 196 classes.
Using 6516 files for training.


I0000 00:00:1753198180.262720     821 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060, pci bus id: 0000:01:00.0, compute capability: 8.9


Found 8144 files belonging to 196 classes.
Using 1628 files for validation.
Found 8041 files belonging to 196 classes.
✅ Data pipeline created:
   Training batches: 204
   Validation batches: 51
   Test batches: 252

📊 Data pipeline ready for training!


# 4. Model Architecture

In [4]:
# ==========================
# MODEL ARCHITECTURE
# ==========================

def create_car_classification_model():
    """
    Build and compile the ResNet50-based car classifier.

    The network takes ``(*IMG_SIZE, 3)`` images, rescales them by 1/255,
    runs an ImageNet-initialised ResNet50 backbone (all but the last 20
    layers frozen) and ends in a pooled dense head with ``NUM_CLASSES``
    softmax outputs.

    Returns:
        The compiled ``keras.Model`` (Adam at 1e-4, categorical
        cross-entropy, accuracy and top-5 accuracy metrics).
    """
    print("🏗️ Creating car classification model...")

    # Input layer
    inputs = keras.Input(shape=(*IMG_SIZE, 3))

    # Preprocessing happens inside the model, so callers feed raw pixels.
    # NOTE(review): Keras documents `resnet.preprocess_input` as the input
    # preprocessing for ResNet50 ImageNet weights; this model scales to
    # [0, 1] instead. Switching would be incompatible with checkpoints that
    # were already saved - confirm before changing.
    x = layers.Rescaling(1./255)(inputs)

    # Pre-trained ResNet50 backbone
    backbone = ResNet50(
        weights='imagenet',
        include_top=False,
        input_tensor=x
    )

    # Freeze early layers, unfreeze later ones for fine-tuning
    backbone.trainable = True
    for layer in backbone.layers[:-20]:
        layer.trainable = False

    # Classification head
    x = backbone.output
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(512, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.5)(x)

    # Output layer - explicit float32 for mixed precision compatibility
    predictions = layers.Dense(
        NUM_CLASSES,
        activation='softmax',
        dtype='float32',  # Important for mixed precision
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
        name='predictions'
    )(x)

    # Create model
    model = keras.Model(inputs, predictions, name='car_classifier')

    # Compile with TF 2.19 compatible metrics
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-4),
        loss='categorical_crossentropy',
        metrics=[
            'accuracy',
            tf.keras.metrics.TopKCategoricalAccuracy(k=5, name='top_5_accuracy', dtype='float32')
        ]
    )

    # Keras variables have no `numel()` (that is a PyTorch tensor method);
    # derive the element count from each variable's shape instead.
    trainable_params = sum(int(np.prod(w.shape)) for w in model.trainable_weights)

    print("✅ Model created and compiled successfully!")
    print(f"📊 Total parameters: {model.count_params():,}")
    print(f"📊 Trainable parameters: {trainable_params:,}")

    return model

def setup_callbacks():
    """
    Build the list of Keras callbacks used during training.

    Returns:
        ``[EarlyStopping, ReduceLROnPlateau, ModelCheckpoint]`` in that order.
    """
    # Stop when validation accuracy stalls for 10 epochs and roll back to the
    # best weights seen so far
    early_stop = EarlyStopping(
        monitor='val_accuracy',
        patience=10,
        restore_best_weights=True,
        verbose=1,
    )

    # Multiply the learning rate by 0.2 after 5 epochs without val_loss
    # improvement, never going below 1e-7
    lr_schedule = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.2,
        patience=5,
        min_lr=1e-7,
        verbose=1,
    )

    # Keep only the best checkpoint (by validation accuracy) on disk
    checkpoint = ModelCheckpoint(
        'best_car_model.keras',  # TF 2.19 .keras format
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1,
    )

    callbacks = [early_stop, lr_schedule, checkpoint]

    print("⚙️ Training callbacks configured")
    return callbacks

def _try_load_model(path):
    """Load the saved model at *path*; return ``None`` (after reporting) on any failure."""
    try:
        loaded = keras.models.load_model(str(path))
        print("✅ Pre-trained model loaded successfully!")
        print(f"📊 Model has {loaded.count_params():,} parameters")
        return loaded
    except Exception as e:
        print(f"❌ Failed to load model: {e}")
        print("🔧 Creating new model instead...")
        return None


# Reuse a previously trained checkpoint when one is on disk
model_path = Path('best_car_model.keras')
model = None
if not model_path.exists():
    print("📋 No existing model found - creating new model")
else:
    print("🔍 Found existing trained model!")
    print(f"📁 Loading model from: {model_path}")
    model = _try_load_model(model_path)

# Training is skipped only when a checkpoint was actually loaded
SKIP_TRAINING = model is not None
if model is None:
    model = create_car_classification_model()

# Callbacks are built on every path, whether or not training will run
callbacks = setup_callbacks()

print(f"\n🎯 Model ready! Skip training: {SKIP_TRAINING}")

🔍 Found existing trained model!
📁 Loading model from: best_car_model.keras
✅ Pre-trained model loaded successfully!
📊 Model has 24,820,548 parameters
⚙️ Training callbacks configured

🎯 Model ready! Skip training: True


# 5. Model Training (Conditional)

In [5]:
# ==========================
# MODEL TRAINING (CONDITIONAL)
# ==========================

if SKIP_TRAINING:
    # A checkpoint was loaded earlier, so there is nothing to fit
    print("🔄 SKIPPING TRAINING - Pre-trained model available")
    print("=" * 50)
    print(f"✅ Using existing model: best_car_model.keras")
    print(f"📊 Model parameters: {model.count_params():,}")
    print("💡 To retrain, delete the .keras file and rerun this cell")
    history = None

elif not DATASET_READY:
    # No checkpoint and no data: training is impossible
    print("⚠️ SKIPPING TRAINING - No dataset available")
    print("=" * 40)
    print("❌ Cannot train without Stanford Cars dataset")
    print("💡 Please ensure dataset is available and rerun")
    history = None

else:
    print("🚀 STARTING MODEL TRAINING")
    print("=" * 30)
    print(f"📊 Training on {NUM_CLASSES} car classes")
    print(f"🎯 Max epochs: {EPOCHS} (early stopping enabled)")
    print(f"⚡ Using mixed precision for optimal performance")

    # Fit on the training pipeline, validating after every epoch
    history = model.fit(
        train_ds,
        epochs=EPOCHS,
        validation_data=val_ds,
        callbacks=callbacks,
        verbose=1
    )

    print("\n✅ Training completed successfully!")
    print(f"💾 Best model saved as: best_car_model.keras")

    # Plot training history: one panel per metric, training vs. validation
    if history:
        curves = history.history
        panels = [
            ('accuracy', 'Model Accuracy', 'Accuracy'),
            ('loss', 'Model Loss', 'Loss'),
        ]
        # The top-5 panel is drawn only when that metric was recorded
        if 'top_5_accuracy' in curves:
            panels.append(('top_5_accuracy', 'Top-5 Accuracy', 'Accuracy'))

        plt.figure(figsize=(15, 5))
        for position, (metric, title, y_label) in enumerate(panels, start=1):
            plt.subplot(1, 3, position)
            plt.plot(curves[metric], label='Training')
            plt.plot(curves[f'val_{metric}'], label='Validation')
            plt.title(title)
            plt.xlabel('Epoch')
            plt.ylabel(y_label)
            plt.legend()
            plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# One-line recap of which branch above was taken
if SKIP_TRAINING:
    training_status = 'Skipped (model exists)'
elif DATASET_READY:
    training_status = 'Completed'
else:
    training_status = 'Skipped (no dataset)'
print(f"\n📋 Training Status: {training_status}")

🔄 SKIPPING TRAINING - Pre-trained model available
✅ Using existing model: best_car_model.keras
📊 Model parameters: 24,820,548
💡 To retrain, delete the .keras file and rerun this cell

📋 Training Status: Skipped (model exists)


# 6. Model Evaluation

In [6]:
# ==========================
# MODEL EVALUATION
# ==========================

def evaluate_model():
    """
    Evaluate the global ``model`` on the validation and test pipelines.

    Prints loss, accuracy and (when reported) top-5 accuracy for each split.

    Returns:
        ``(val_results, test_results)`` as metric dictionaries, or ``None``
        when ``DATASET_READY`` is false.
    """
    print("📊 EVALUATING MODEL PERFORMANCE")
    print("=" * 35)

    if not DATASET_READY:
        print("⚠️ No dataset available for evaluation")
        return

    def _score(heading, dataset):
        # Evaluate one split and print its metrics under the given heading
        print(heading)
        metrics = model.evaluate(dataset, verbose=0, return_dict=True)
        print(f"   Loss: {metrics['loss']:.4f}")
        print(f"   Accuracy: {metrics['accuracy']:.4f} ({metrics['accuracy']*100:.2f}%)")
        if 'top_5_accuracy' in metrics:
            print(f"   Top-5 Accuracy: {metrics['top_5_accuracy']:.4f} ({metrics['top_5_accuracy']*100:.2f}%)")
        return metrics

    val_results = _score("🎯 Validation Set Results:", val_ds)
    test_results = _score("\n🎯 Test Set Results:", test_ds)

    return val_results, test_results

# Evaluate only when there is data to evaluate on
if not DATASET_READY:
    print("⚠️ Skipping evaluation - no dataset available")
    print("💡 Model is ready for inference with external data")
else:
    val_results, test_results = evaluate_model()
    print("\n✅ Model evaluation completed!")

📊 EVALUATING MODEL PERFORMANCE
🎯 Validation Set Results:


I0000 00:00:1753198184.973795     997 service.cc:152] XLA service 0x754d40002860 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1753198184.973831     997 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4060, Compute Capability 8.9
I0000 00:00:1753198185.581472     997 cuda_dnn.cc:529] Loaded cuDNN version 91100
I0000 00:00:1753198190.945880     997 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


   Loss: 2.1096
   Accuracy: 0.5405 (54.05%)
   Top-5 Accuracy: 0.7869 (78.69%)

🎯 Test Set Results:
   Loss: 2.1031
   Accuracy: 0.5381 (53.81%)
   Top-5 Accuracy: 0.7855 (78.55%)

✅ Model evaluation completed!


# 7. Model Summary and Export

In [7]:
# ==========================
# MODEL SUMMARY AND EXPORT
# ==========================

print("📋 FINAL MODEL SUMMARY")
print("=" * 25)

# Model architecture summary
print(f"🏗️ Architecture: ResNet50-based transfer learning")
print(f"📊 Total parameters: {model.count_params():,}")
print(f"🎯 Output classes: {NUM_CLASSES}")
print(f"📐 Input shape: {IMG_SIZE + (3,)}")
print(f"⚡ Mixed precision: Enabled")

# Create the class mapping BEFORE listing the files, so that a first run does
# not report class_mapping.json as missing and then create it right after.
mapping_path = Path('class_mapping.json')
if DATASET_READY and not mapping_path.exists():
    print("\n📝 Creating class mapping...")
    # Index -> class folder name, in sorted folder order
    class_names = sorted([d.name for d in TRAIN_DIR.iterdir() if d.is_dir()])
    class_mapping = {i: name for i, name in enumerate(class_names)}

    # JSON object keys are strings, so the integer indices are written as
    # "0", "1", ... and must be converted back by whoever reads the file.
    with open(mapping_path, 'w', encoding='utf-8') as f:
        json.dump(class_mapping, f, indent=2)
    print(f"✅ Class mapping saved with {len(class_mapping)} classes")

# Model files
model_files = {
    'best_car_model.keras': 'Main model file (TensorFlow 2.19 format)',
    'class_mapping.json': 'Class name mappings',
}

print(f"\n💾 Model Files:")
for filename, description in model_files.items():
    file_path = Path(filename)
    if file_path.exists():
        size = file_path.stat().st_size / (1024*1024)  # MB
        print(f"   ✅ {filename} ({size:.1f}MB) - {description}")
    else:
        print(f"   ❌ {filename} - {description} (missing)")

# Usage instructions
# The model has its own Rescaling(1./255) layer and was trained on the raw
# pixel values from image_dataset_from_directory, so callers must NOT
# normalize to [0, 1] themselves - that would scale the input twice.
print(f"\n🚀 USAGE INSTRUCTIONS:")
print(f"   1. Load model: tf.keras.models.load_model('best_car_model.keras')")
print(f"   2. Preprocess image: Resize to {IMG_SIZE}, keep pixel values in [0, 255] (the model rescales internally)")
print(f"   3. Predict: model.predict(preprocessed_image)")
print(f"   4. Get class name: Use class_mapping.json for label lookup")

print(f"\n✅ Model ready for deployment and inference!")

# Display model architecture
print(f"\n🏗️ Model Architecture:")
model.summary()

📋 FINAL MODEL SUMMARY
🏗️ Architecture: ResNet50-based transfer learning
📊 Total parameters: 24,820,548
🎯 Output classes: 196
📐 Input shape: (224, 224, 3)
⚡ Mixed precision: Enabled

💾 Model Files:
   ✅ best_car_model.keras (250.4MB) - Main model file (TensorFlow 2.19 format)
   ✅ class_mapping.json (0.0MB) - Class name mappings

🚀 USAGE INSTRUCTIONS:
   1. Load model: tf.keras.models.load_model('best_car_model.keras')
   2. Preprocess image: Resize to (224, 224), normalize to [0,1]
   3. Predict: model.predict(preprocessed_image)
   4. Get class name: Use class_mapping.json for label lookup

✅ Model ready for deployment and inference!

🏗️ Model Architecture:
