Pick any image-based dataset from the list, implement the preprocessing and justify the
preprocessing steps, extract features and justify the methods used, and select features and justify the
methods used. Some of this has already been done in one of the previous assignments, so you can reuse
that work.

In [None]:
import os
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Silence every warning category so library notices do not clutter the output
import warnings
warnings.filterwarnings(action='ignore')

# Seed both NumPy and TensorFlow with one shared value for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Preprocessing function
def preprocess_image(image_path, downsample_size=(64, 64)):
    """
    Convert image to grayscale, resize, and normalize pixel values.

    Args:
        image_path: Path of the image file to open.
        downsample_size: Target size passed to ``Image.resize``
            (Pillow interprets it as ``(width, height)``).

    Returns:
        A float32 NumPy array holding the resized grayscale image with
        every pixel value divided by 255.
    """
    # Image.open is lazy and keeps the underlying file open; the context
    # manager releases the handle even when convert/resize raises.
    with Image.open(image_path) as source:
        grayscale = source.convert('L')              # Grayscale conversion
        resized = grayscale.resize(downsample_size)  # Resizing
    pixels = np.array(resized, dtype=np.float32)
    return pixels / 255.0                            # Normalization

# Load and preprocess images
image_dir = 'vehicles'  # Path to the vehicles dataset
images = []
labels = []

# Assuming images are organized in directories per vehicle type.
# os.listdir returns entries in arbitrary order, so the listings are sorted:
# the seeded train/test split only reproduces when the sample order is stable.
vehicle_labels = sorted(os.listdir(image_dir))

for vehicle_label in vehicle_labels:
    vehicle_dir = os.path.join(image_dir, vehicle_label)
    if not os.path.isdir(vehicle_dir):  # Ensure it's a directory
        print(f"Directory not found: {vehicle_dir}")
        continue

    for image_file in sorted(os.listdir(vehicle_dir)):
        image_path = os.path.join(vehicle_dir, image_file)
        if not os.path.isfile(image_path):
            print(f"File not found: {image_path}")
            continue

        try:
            image = preprocess_image(image_path)
        except OSError as exc:
            # Pillow reports undecodable files (hidden OS metadata, truncated
            # downloads, ...) via OSError subclasses; skip them instead of
            # aborting the whole load.
            print(f"Skipping unreadable image {image_path}: {exc}")
            continue

        images.append(image)
        labels.append(vehicle_label)

# Stack the per-image arrays and the directory-name labels into NumPy arrays
images = np.array(images)
labels = np.array(labels)

# Display sample images
def display_sample_images(images, labels, label_encoder, n_samples=5, image_shape=(64, 64)):
    """
    Show a row of randomly chosen images titled with their decoded labels.

    Args:
        images: Indexable collection of image arrays; each one must be
            reshapeable to ``image_shape``.
        labels: Encoded labels aligned with ``images``; each is decoded with
            ``label_encoder.inverse_transform``.
        label_encoder: Fitted encoder used to turn an encoded label back into
            its original name.
        n_samples: Number of images to draw (with replacement) and display.
        image_shape: 2-D shape each image is reshaped to before plotting.
            Defaults to the 64x64 size used previously.
    """
    plt.figure(figsize=(10, 5))
    for position in range(1, n_samples + 1):
        # Indices come from NumPy's global RNG, so the picks follow np.random.seed
        idx = np.random.randint(0, len(images))
        plt.subplot(1, n_samples, position)
        plt.imshow(images[idx].reshape(image_shape), cmap='gray')
        plt.title(label_encoder.inverse_transform([labels[idx]])[0])
        plt.axis('off')
    plt.suptitle('Sample Images from Dataset')
    plt.show()

# Map the string class names to integer ids (the fitted encoder is reused later for reports)
label_encoder = LabelEncoder()
label_encoder.fit(labels)
labels_encoded = label_encoder.transform(labels)

# Preview a handful of preprocessed images together with their class names
display_sample_images(images=images, labels=labels_encoded, label_encoder=label_encoder)

# Hold out 20% of the samples for testing, stratified on the encoded labels
X_train, X_test, y_train, y_test = train_test_split(
    images,
    labels_encoded,
    stratify=labels_encoded,
    random_state=42,
    test_size=0.2,
)

# Collapse every image into one row vector for the classical ML algorithms
X_train_flat = X_train.reshape(len(X_train), -1)
X_test_flat = X_test.reshape(len(X_test), -1)

# Standardize features: statistics are learned on the training rows only
# and then applied unchanged to the test rows
scaler = StandardScaler().fit(X_train_flat)
X_train_flat_scaled = scaler.transform(X_train_flat)
X_test_flat_scaled = scaler.transform(X_test_flat)

# Feature Extraction using PCA: fit with all components to inspect the variance spectrum
pca = PCA().fit(X_train_flat_scaled)

# Per-component share of variance and its running total
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = explained_variance_ratio.cumsum()

# Only the leading components are plotted to keep the chart readable
max_components_to_plot = 100
explained_variance_ratio_subset = explained_variance_ratio[:max_components_to_plot]
cumulative_variance_subset = cumulative_variance[:max_components_to_plot]

# Bars show the individual ratios, the red step line shows the cumulative ratio
plt.figure(figsize=(10, 6))
plt.bar(
    range(1, len(explained_variance_ratio_subset) + 1),
    explained_variance_ratio_subset,
    alpha=0.6,
    align='center',
    label='Individual Explained Variance',
)
plt.step(
    range(1, len(cumulative_variance_subset) + 1),
    cumulative_variance_subset,
    where='mid',
    label='Cumulative Explained Variance',
    color='red',
)
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.title('PCA Explained Variance (Top 100 Components)')
plt.legend(loc='best')
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()


# Select number of components explaining 95% variance.
# argmax over the boolean mask gives the index of the first component at which
# the cumulative ratio reaches 0.95; +1 turns that index into a count.
k = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of components explaining 95% variance: {k}")

# Transform data using PCA.
# With a reduced n_components scikit-learn's 'auto' solver may choose the
# randomized SVD, so random_state is pinned to keep the projection (and every
# score computed from it) identical from run to run.
pca = PCA(n_components=k, random_state=42)
X_train_pca = pca.fit_transform(X_train_flat_scaled)
X_test_pca = pca.transform(X_test_flat_scaled)


# Basic ML Algorithm: K-Nearest Neighbors
# The timer spans model construction, fitting and prediction on the test set.
start_time = time.time()
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_pca, y_train)
y_pred_knn = knn.predict(X_test_pca)
knn_time = time.time() - start_time

knn_accuracy = accuracy_score(y_test, y_pred_knn)
print(f"KNN Accuracy: {knn_accuracy*100:.2f}%, Time: {knn_time:.2f}s")

print("KNN Classification Report:")
knn_report = classification_report(
    y_test,
    y_pred_knn,
    target_names=label_encoder.classes_,
    zero_division=0,
)
print(knn_report)

# Confusion Matrix for KNN, labelled with the original class names
disp_knn = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred_knn,
    xticks_rotation='vertical',
    display_labels=label_encoder.classes_,
)
plt.title('KNN Confusion Matrix')
plt.show()

# Advanced ML Algorithm: Support Vector Machine (RBF kernel)
# The timer spans model construction, fitting and prediction on the test set.
start_time = time.time()
svm = SVC(kernel='rbf', gamma='scale').fit(X_train_pca, y_train)
y_pred_svm = svm.predict(X_test_pca)
svm_time = time.time() - start_time

svm_accuracy = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {svm_accuracy*100:.2f}%, Time: {svm_time:.2f}s")

print("SVM Classification Report:")
svm_report = classification_report(
    y_test,
    y_pred_svm,
    target_names=label_encoder.classes_,
    zero_division=0,
)
print(svm_report)

# Confusion Matrix for SVM, labelled with the original class names
disp_svm = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred_svm,
    xticks_rotation='vertical',
    display_labels=label_encoder.classes_,
)
plt.title('SVM Confusion Matrix')
plt.show()

# CNN Model
# The timer spans data reshaping, model construction, training and prediction.
start_time = time.time()
num_classes = len(label_encoder.classes_)

# Add a trailing channel axis of size 1 to match the model's input_shape
X_train_cnn = X_train.reshape(-1, 64, 64, 1)
X_test_cnn = X_test.reshape(-1, 64, 64, 1)
# One-hot encode the integer labels to match the categorical cross-entropy loss
y_train_cnn = to_categorical(y_train, num_classes=num_classes)
y_test_cnn = to_categorical(y_test, num_classes=num_classes)

# Two convolution/pooling stages followed by a dense classifier head;
# dropout after every stage is the only regularization used.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 1)),
    MaxPooling2D((2, 2)),
    Dropout(0.3),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Dropout(0.3),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs. restore_best_weights rolls
# the model back to the best epoch; without it the model is evaluated with the
# weights from the last (already degraded) epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(
    X_train_cnn, y_train_cnn,
    epochs=20, batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate CNN: the predicted class is the index of the largest softmax output
y_pred_cnn = model.predict(X_test_cnn)
y_pred_cnn_classes = np.argmax(y_pred_cnn, axis=1)
cnn_time = time.time() - start_time
cnn_accuracy = accuracy_score(y_test, y_pred_cnn_classes)
# Report in the same format as the KNN and SVM sections (accuracy, time, per-class report)
print(f"CNN Accuracy: {cnn_accuracy*100:.2f}%, Time: {cnn_time:.2f}s")
print("CNN Classification Report:")
print(classification_report(y_test, y_pred_cnn_classes, target_names=label_encoder.classes_, zero_division=0))

def plot_training_history(history):
    """
    Plot training vs. validation curves for accuracy and loss side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the per-epoch series 'accuracy', 'val_accuracy', 'loss' and
            'val_loss' (such as the value returned by ``model.fit``).
    """
    metrics = history.history
    epochs = range(1, len(metrics['accuracy']) + 1)

    # (training key, validation key, display name) for each panel, left to right
    panels = (
        ('accuracy', 'val_accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Loss'),
    )

    plt.figure(figsize=(14, 6))

    for position, (train_key, val_key, display_name) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, metrics[train_key], 'bo-', label=f'Training {display_name.lower()}')
        plt.plot(epochs, metrics[val_key], 'ro-', label=f'Validation {display_name.lower()}')
        plt.title(f'Training and Validation {display_name}')
        plt.xlabel('Epochs')
        plt.ylabel(display_name)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Call the function to plot the results
plot_training_history(history)

# Confusion Matrix for CNN
disp_cnn = ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred_cnn_classes, display_labels=label_encoder.classes_, xticks_rotation='vertical'
)
plt.title('CNN Confusion Matrix')
plt.show()

# Comparison of Results
# All rows share one format string so the three summaries stay aligned
# (the CNN row previously lacked the space before "Time:").
print("\nComparison of Algorithms:")
comparison_rows = (
    ("KNN", knn_accuracy, knn_time),
    ("SVM", svm_accuracy, svm_time),
    ("CNN", cnn_accuracy, cnn_time),
)
for algorithm_name, algorithm_accuracy, algorithm_time in comparison_rows:
    print(f"{algorithm_name} - Accuracy: {algorithm_accuracy*100:.2f}%, Time: {algorithm_time:.2f}s")
