# Assignment 4: Deep learning and unsupervised learning
For this assignment you are allowed to use data augmentation.

# Task 1
Pick any image-based dataset from the list, implement the preprocessing and justify the preprocessing steps, extract features and justify the methods used, and select features and justify the methods used. Some of this was already done in one of the previous assignments, so you can reuse things.

- [ ] Implement (using the selected features) one basic machine learning algorithm for classification and justify your choice (20; 10 without justification).

- [ ] Implement (using the selected features) one advanced machine learning algorithm for classification and justify your choice (20; 10 without justification).

- [ ] Implement a CNN with hyperparameter tuning (for this you can directly use the data after the preprocessing) (30).

- [ ] Compare and explain the results in terms of both the computation time and the performance of the classification algorithms (30).

In [None]:
# Preprocessing, extract features, select features.
import os
import cv2
import numpy as np

# Locations of the raw dataset and of the preprocessed output
dataset_path = "image_dataset"
preprocessed_path = "preprocessed_arrays"

# Target size used when resizing, and the class sub-folders to process
image_size = (224, 224)
class_names = [
    "hatchback",
    "motorcycle",
    "pickup",
    "sedan",
    "suv",
]

# Make sure the output directory for the preprocessed arrays exists
os.makedirs(preprocessed_path, exist_ok=True)

def preprocess_image(img):
    """
    Preprocess the image: convert to grayscale, resize, and normalize.

    Parameters:
        img: Image array as returned by cv2.imread (BGR), or an image that
            is already single-channel (2-D).

    Returns:
        The grayscale image resized to `image_size` and divided by 255
        (i.e. values in [0, 1] for 8-bit input).

    Raises:
        ValueError: If `img` is None, which is what cv2.imread returns for
            a file it cannot read or decode.
    """
    # cv2.imread signals failure by returning None instead of raising;
    # fail early with a readable message rather than a cryptic cv2 assertion.
    if img is None:
        raise ValueError("image could not be read (cv2.imread returned None)")

    # Step 1: Convert to grayscale (skipped when the input is already 2-D)
    gray = img if img.ndim == 2 else cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Step 2: Resize the image
    img_resized = cv2.resize(gray, image_size)
    # Step 3: Normalize the image
    img_normalized = img_resized / 255.0
    return img_normalized

# Preprocess images and save as NumPy arrays
for class_name in class_names:
    folder_path = os.path.join(dataset_path, class_name)

    # Check the input first so a missing class does not leave an empty
    # output folder behind.
    if not os.path.isdir(folder_path):
        print(f"Folder not found: {folder_path}")
        continue

    class_output_path = os.path.join(preprocessed_path, class_name)
    os.makedirs(class_output_path, exist_ok=True)

    # Sorted so the processing order is reproducible across runs/platforms
    for file_name in sorted(os.listdir(folder_path)):
        # Only image files are processed; everything else is ignored
        if not file_name.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        file_path = os.path.join(folder_path, file_name)
        try:
            # Read and preprocess the image; cv2.imread returns None
            # (it does not raise) when the file cannot be decoded.
            img = cv2.imread(file_path)
            if img is None:
                raise ValueError("could not read or decode image")
            img_preprocessed = preprocess_image(img)

            # Strip only the final extension: split('.')[0] would map
            # "car.01.jpg" and "car.02.jpg" to the same "car.npy" and
            # silently overwrite one of them.
            base_name = os.path.splitext(file_name)[0]
            save_path = os.path.join(class_output_path, base_name + ".npy")
            np.save(save_path, img_preprocessed)
        except Exception as e:
            # Best effort: report the failing file and keep going
            print(f"Error processing file {file_path}: {e}")

print(f"Preprocessing completed. Preprocessed arrays saved in '{preprocessed_path}'")

In [None]:
# Feature Extraction
# Define paths
# Root output directory for the edge maps; the loop below writes one
# sub-folder per class into it.
edge_detected_path = "edge_detected_arrays"

# Create directory to save edge-detected arrays
# (exist_ok=True makes re-running this cell safe if it already exists)
os.makedirs(edge_detected_path, exist_ok=True)

def apply_edge_detection(img):
    """
    Produce a Canny edge map from a preprocessed grayscale image.

    The input is smoothed with a 5x5 Gaussian kernel, scaled by 255 and
    cast to uint8 for cv2.Canny (thresholds 50 and 150); the resulting
    edge image is divided by 255 before being returned.
    """
    smoothed = cv2.GaussianBlur(img, (5, 5), 0)
    # Canny operates on an 8-bit image, so scale up and cast first
    smoothed_u8 = (smoothed * 255).astype(np.uint8)
    edge_map = cv2.Canny(smoothed_u8, threshold1=50, threshold2=150)
    # Normalize to [0, 1]
    return edge_map / 255.0

# Run edge detection over every preprocessed array, mirroring the
# per-class folder layout in the output directory
for class_name in class_names:
    class_input_path = os.path.join(preprocessed_path, class_name)
    class_output_path = os.path.join(edge_detected_path, class_name)
    os.makedirs(class_output_path, exist_ok=True)

    if not os.path.exists(class_input_path):
        print(f"Folder not found: {class_input_path}")
        continue

    for file_name in os.listdir(class_input_path):
        file_path = os.path.join(class_input_path, file_name)

        # Anything that is not a saved NumPy array is ignored
        if not file_name.lower().endswith('.npy'):
            continue

        try:
            # Load, detect edges, and store under the same file name
            img_edges = apply_edge_detection(np.load(file_path))
            np.save(os.path.join(class_output_path, file_name), img_edges)
        except Exception as e:
            print(f"Error processing file {file_path}: {e}")

print(f"Edge detection completed. Edge-detected arrays saved in '{edge_detected_path}'")

In [None]:
# Feature Selection
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline


# Define paths
# The edge maps are read through their own name instead of overwriting the
# global `preprocessed_path`; clobbering it made a re-run of the
# preprocessing / edge-detection cells read from and write to the wrong folder.
edge_detected_path = "edge_detected_arrays"
feature_selected_path = "feature_selected_arrays"

# Create directory to save feature-selected arrays
os.makedirs(feature_selected_path, exist_ok=True)

# Load edge-detected data
X = []  # Feature data (one flattened image per row)
y = []  # Labels (class folder names)

for class_name in sorted(os.listdir(edge_detected_path)):  # Sort class directories
    class_input_path = os.path.join(edge_detected_path, class_name)

    # Entries come from listdir, so they always exist; what matters is that
    # they are directories (stray files such as .DS_Store would otherwise
    # crash the inner listdir).
    if not os.path.isdir(class_input_path):
        print(f"Skipping non-directory entry: {class_input_path}")
        continue

    for file_name in sorted(os.listdir(class_input_path)):  # Sort files within each class
        if not file_name.lower().endswith('.npy'):
            continue

        # Built outside the try so the error message can always reference it
        file_path = os.path.join(class_input_path, file_name)
        try:
            img_array = np.load(file_path).flatten()  # Flatten image array
        except Exception as e:
            print(f"Error loading file {file_path}: {e}")
            continue

        X.append(img_array)
        y.append(class_name)  # Add corresponding class label

# Convert to NumPy arrays
X = np.array(X)
y = np.array(y)

# Fail with a clear message instead of an obscure scikit-learn error
if X.size == 0:
    raise ValueError(f"No .npy arrays found under '{edge_detected_path}'")

# Define the range of components to test
components_range = [5, 15, 16, 17, 18, 19, 20]#, 50, 100, 300, 900]

# Dictionary to store cross-validation results
results = {}

# Loop over different numbers of PCA components
for n_components in components_range:
    # PCA lives inside the pipeline so it is re-fitted on each training fold.
    # random_state pins the randomized SVD solver that PCA may select for
    # large inputs, keeping the comparison reproducible.
    pipeline = Pipeline([
        ('pca', PCA(n_components=n_components, random_state=42)),
        ('classifier', RandomForestClassifier(random_state=42))
    ])

    # Perform cross-validation and compute the mean accuracy
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    results[n_components] = np.mean(cv_scores)
    print(f"Components: {n_components}, CV Accuracy: {results[n_components]:.4f}")

# Find the optimal number of components based on cross-validation accuracy
best_n_components = max(results, key=results.get)
print(f"Optimal number of components: {best_n_components}")
print(f"Best cross-validated accuracy: {results[best_n_components]:.4f}")

# NOTE(review): this PCA is fitted on ALL samples, and the train/test split
# is only applied afterwards to `image_data_reduced`, so the projection has
# seen the test rows. Consider splitting first and fitting PCA on the
# training portion only.
pca = PCA(n_components=best_n_components, random_state=42)
image_data_reduced = pca.fit_transform(X)

In [None]:
# Hold out 30% of the PCA-reduced samples for testing, keeping the class
# proportions equal in both parts (stratified) and the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    image_data_reduced,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Implement Basic Classification (Random Forest)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a 100-tree random forest on the training split
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# This cell evaluates the random forest, so the score is labelled as such
# (it was previously printed as "SVM Model Accuracy").
print(f"Random Forest Model Accuracy: {accuracy * 100:.2f}%")

# Optional: Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Optional: Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


In [None]:
# Implement Advanced Classification (SVM)
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Search space for the RBF-kernel SVM: values of C and gamma to try
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[0.001, 0.01, 0.1, 1],
    kernel=['rbf'],
)

# 5-fold cross-validated grid search on the training split
# (fit returns the fitted search object itself)
grid_search = GridSearchCV(SVC(), param_grid, cv=5).fit(X_train, y_train)

# Report the winning combination
print(f"Best parameters: {grid_search.best_params_}")

# Predict on the held-out split with the refitted best estimator
best_svm = grid_search.best_estimator_
y_pred_best = best_svm.predict(X_test)

# Overall accuracy of the tuned model
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"Optimized SVM Model Accuracy: {accuracy_best * 100:.2f}%")

# Per-class metrics followed by the confusion matrix; sep="\n" puts each
# heading on its own line directly above its table
print("\nClassification Report:", classification_report(y_test, y_pred_best), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_best), sep="\n")

In [None]:
# import os
# import numpy as np
# import tensorflow as tf
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
# from tensorflow.keras.utils import to_categorical
# from tensorflow.keras.preprocessing.image import ImageDataGenerator

# # Define paths
# preprocessed_path = "preprocessed_arrays"

# # Load preprocessed data
# X = []  # Feature data
# y = []  # Labels

# for class_name in sorted(os.listdir(preprocessed_path)):  # Sort class directories
#     class_input_path = os.path.join(preprocessed_path, class_name)

#     if not os.path.exists(class_input_path):
#         print(f"Folder not found: {class_input_path}")
#         continue

#     for file_name in sorted(os.listdir(class_input_path)):  # Sort files within each class
#         if file_name.lower().endswith('.npy'):
#             try:
#                 # Load preprocessed image
#                 file_path = os.path.join(class_input_path, file_name)
#                 img_array = np.load(file_path)  # Do not flatten for CNN
#                 X.append(img_array)
#                 y.append(class_name)  # Add corresponding class label
#             except Exception as e:
#                 print(f"Error loading file {file_path}: {e}")

# # Convert to NumPy arrays
# X = np.array(X)
# y = np.array(y)

# # Add channel dimension
# X = np.expand_dims(X, axis=-1)  # Shape becomes (batch_size, height, width, 1)

# # Encode labels
# unique_classes = sorted(set(y))
# label_to_index = {label: idx for idx, label in enumerate(unique_classes)}
# y = np.array([label_to_index[label] for label in y])

# # Convert labels to one-hot encoding
# y = to_categorical(y, num_classes=len(unique_classes))

# # Split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# # Define data augmentation
# datagen = ImageDataGenerator(
#     rotation_range=20,       # Rotate images by up to 20 degrees
#     width_shift_range=0.2,   # Shift images horizontally by 20% of the width
#     height_shift_range=0.2,  # Shift images vertically by 20% of the height
#     zoom_range=0.2,          # Randomly zoom in/out
#     horizontal_flip=True,    # Flip images horizontally
#     fill_mode='nearest'      # Fill missing pixels with nearest values
# )

# # Fit the data generator to the training data
# datagen.fit(X_train)

# # Hyperparameters
# num_filters = 64
# kernel_size = (3, 3)
# pool_size = (2, 2)
# dense_units = 128
# dropout_rate = 0.3
# input_shape = X_train.shape[1:]  # Shape of a single image

# # Build CNN model
# model = Sequential([
#     Conv2D(num_filters, kernel_size, activation='relu', input_shape=input_shape),
#     MaxPooling2D(pool_size=pool_size),
#     Conv2D(num_filters * 2, kernel_size, activation='relu'),
#     MaxPooling2D(pool_size=pool_size),
#     Flatten(),
#     Dense(dense_units, activation='relu'),
#     Dropout(dropout_rate),
#     Dense(len(unique_classes), activation='softmax')  # Output layer
# ])

# # Compile the model
# model.compile(optimizer='adam',
#               loss='categorical_crossentropy',
#               metrics=['accuracy'])

# # Train the model with data augmentation
# batch_size = 32
# epochs = 20
# history = model.fit(
#     datagen.flow(X_train, y_train, batch_size=batch_size),
#     validation_data=(X_test, y_test),
#     epochs=epochs
# )

# # Evaluate the model
# test_loss, test_accuracy = model.evaluate(X_test, y_test)
# print(f"Test Accuracy: {test_accuracy:.4f}")

# # Save the model
# model.save("cnn_model_with_augmentation.h5")

# Task 2
Pick any dataset from the list, implement the preprocessing and justify the preprocessing steps, extract features and justify the methods used, select features and justify the methods used. Some of this is done already in one of the previous assignments. You can reuse things.

Implement three clustering methods out of the following and justify your choices (30)

- K-means
- Hierarchical Clustering
- Fuzzy-C-means
- DBSCAN
- Gaussian mixture models
- Self-organizing maps

Compare and Explain the results (30).

In [None]:
# Preprocessing, extract features, select features. (Can reuse)

In [None]:
# Implement cluster method 1


In [None]:
# Implement cluster method 2

In [None]:
# Implement cluster method 3