In [5]:
import pandas as pd
import numpy as np
from PIL import Image
import os
from skimage.feature import hog
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

# Load and preprocess dataset
def load_data(csv_file, base_folder, image_size=(64, 64)):
    """Load the images listed in a CSV index as normalized RGB arrays.

    The CSV is expected to provide a ``path`` column (joined onto
    ``base_folder``) and a ``label`` column. Rows whose label is ``'Ahegao'``
    are dropped before any image is opened.

    Args:
        csv_file: Path of the CSV index file.
        base_folder: Directory the ``path`` values are relative to.
        image_size: Size passed to ``Image.resize`` for every image.

    Returns:
        A ``(images, labels)`` tuple of NumPy arrays of equal length. Each
        image is converted to RGB, resized, and scaled to ``[0, 1]``. Rows
        whose image cannot be read are left out of both arrays.
    """
    df = pd.read_csv(csv_file)

    # Remove "Ahegao" class if present
    df = df[df['label'] != 'Ahegao']

    images = []
    labels = []
    skipped = 0

    for rel_path, label in zip(df['path'], df['label']):
        # A missing path cell is read as NaN (a float); skip it instead of
        # crashing on ``.strip()``.
        if not isinstance(rel_path, str):
            skipped += 1
            continue

        img_path = os.path.join(base_folder, rel_path.strip())
        try:
            # The context manager releases the file handle as soon as the
            # pixel data has been copied out by convert()/resize().
            with Image.open(img_path) as img:
                pixels = np.array(img.convert('RGB').resize(image_size)) / 255.0
        except (OSError, ValueError):
            # Missing, unidentified or truncated files surface as OSError
            # (PIL's UnidentifiedImageError subclasses it). Anything else,
            # including KeyboardInterrupt, is allowed to propagate.
            skipped += 1
            continue

        images.append(pixels)
        labels.append(label)

    if skipped:
        # Make data loss visible instead of silently shrinking the dataset.
        print(f"Skipped {skipped} unreadable image(s) out of {len(df)} rows")

    return np.array(images), np.array(labels)

# Dataset locations. These are machine-specific absolute paths; point them at
# your own copy of the data before running.
#   csv_file    - index file read by load_data ('path' and 'label' columns)
#   base_folder - directory that each CSV 'path' value is joined onto
csv_file = r"C:/Users/kasse/Documents/faceRecog/archive/data.csv"
base_folder = r"C:/Users/kasse/Documents/faceRecog/archive/dataset"

# Load every readable image listed in the CSV (resized to load_data's default
# 64x64) together with its label.
X, y = load_data(csv_file, base_folder)

### Feature Engineering ###
# Step 1: Convert RGB images to grayscale
def rgb_to_grayscale(images):
    """Collapse the trailing channel axis of RGB images into one luminance value.

    Only the first three channels are used, so an extra alpha channel is
    ignored. The result has the input's shape minus its last axis.
    """
    luma_weights = [0.2989, 0.5870, 0.1140]  # weights for R, G, B in that order
    rgb_channels = images[..., :3]
    return np.dot(rgb_channels, luma_weights)

# Grayscale version of every loaded image; this is the input to HOG extraction.
X_gray = rgb_to_grayscale(X)

# Step 2: Extract HOG features
def extract_hog_features(images):
    """Compute one HOG descriptor per image and stack them into an array.

    Each image is passed to ``hog`` with 8x8-pixel cells and 2x2-cell blocks,
    and the descriptor is requested as a flat feature vector.
    """
    descriptors = [
        hog(image, pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True)
        for image in images
    ]
    return np.array(descriptors)

# One HOG feature vector per grayscale image (rows line up with the labels in y).
X_hog = extract_hog_features(X_gray)

# Step 3: Apply PCA for dimensionality reduction
def apply_pca(features, n_components=100):
    """Fit a PCA on ``features`` and return the same rows projected onto it.

    Args:
        features: 2-D array of samples by features.
        n_components: Number of principal components to keep.

    Returns:
        The transformed feature matrix. The fitted PCA object is discarded,
        so the projection cannot be re-applied to new data afterwards.
    """
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(features)
    return projected

# NOTE(review): PCA is fit on the full HOG matrix here, i.e. before the
# train/test split further down, so the projection is learned from rows that
# later end up in the test set. Consider fitting on the training portion only.
X_pca = apply_pca(X_hog, n_components=100)

# Step 4: Normalize the features
def normalize_features(features):
    """Standardize ``features`` with a freshly fitted ``StandardScaler``.

    The scaler is fit and applied on the same data and then discarded, so the
    learned statistics are not available for transforming other data later.
    """
    return StandardScaler().fit_transform(features)

# NOTE(review): like the PCA step, the scaler is fit on all rows before the
# train/test split, so test rows contribute to the scaling statistics.
X_normalized = normalize_features(X_pca)

### Prepare labels ###
# Encode labels to numerical values
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# One-hot encode the labels for softmax regression:
# indexing an identity matrix by class id picks that class's one-hot row.
y_one_hot = np.eye(len(np.unique(y_encoded)))[y_encoded]

### Train/test split ###
# Split into training, validation, and test sets.
# First split: 80% train_full / 20% test. Second split: the 80% is divided
# again 80/20, giving roughly 64% train / 16% validation of the whole dataset.
# Both splits use random_state=42 so they are repeatable.
# NOTE(review): X_train, X_val, y_train and y_val are not used again in the
# visible part of this file.
X_train_full, X_test, y_train_full, y_test = train_test_split(X_normalized, y_one_hot, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# NOTE(review): y_test_labels is not referenced again in the visible code; the
# final evaluation recomputes the same argmax inline.
y_test_labels = np.argmax(y_test, axis=1)  # True labels for test data

### Logistic Regression Implementation ###
class LogisticRegression:
    """Softmax (multinomial logistic) regression trained by full-batch gradient descent.

    Attributes:
        W: Weight matrix of shape (input_size, num_classes), initialised with
            small Gaussian noise from NumPy's global random state.
        b: Bias row of shape (1, num_classes), initialised to zeros.
        learning_rate: Step size used for every parameter update.
        regularization: L2 penalty coefficient applied to W (the bias is not
            penalised).
    """

    def __init__(self, input_size, num_classes, learning_rate=0.1, regularization=0.001):  # Best hyperparameters applied directly
        self.learning_rate = learning_rate
        self.regularization = regularization
        self.W = 0.01 * np.random.randn(input_size, num_classes)
        self.b = np.zeros((1, num_classes))

    def _probabilities(self, X):
        """Return the class probabilities for each row of X (linear scores -> softmax)."""
        scores = np.dot(X, self.W) + self.b
        return self.softmax(scores)

    def softmax(self, z):
        """Row-wise softmax; the row maximum is subtracted first to avoid overflow in exp."""
        shifted = z - np.max(z, axis=1, keepdims=True)
        exponentials = np.exp(shifted)
        return exponentials / np.sum(exponentials, axis=1, keepdims=True)

    def compute_loss(self, y, y_hat):
        """Mean cross-entropy of one-hot ``y`` against ``y_hat`` plus the L2 penalty on W."""
        n_samples = y.shape[0]
        # 1e-8 keeps log() finite when a predicted probability is exactly zero.
        data_term = -np.sum(y * np.log(y_hat + 1e-8)) / n_samples
        penalty = self.regularization * np.sum(np.square(self.W)) / 2  # L2 regularization term
        return data_term + penalty

    def compute_gradients(self, X, y, y_hat):
        """Return ``(dW, db)``, the loss gradients with respect to W and b."""
        n_samples = X.shape[0]
        residual = y_hat - y
        dW = np.dot(X.T, residual) / n_samples + self.regularization * self.W  # L2 regularization gradient
        db = np.sum(residual, axis=0, keepdims=True) / n_samples
        return dW, db

    def train(self, X, y, epochs=1000):  # Best number of epochs applied directly
        """Run ``epochs`` full-batch gradient steps, printing the loss every 100 epochs.

        The printed loss is the one computed before that epoch's update.
        """
        for epoch in range(epochs):
            y_hat = self._probabilities(X)

            loss = self.compute_loss(y, y_hat)
            grad_W, grad_b = self.compute_gradients(X, y, y_hat)

            self.W -= self.learning_rate * grad_W
            self.b -= self.learning_rate * grad_b

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss}")

    def predict(self, X):
        """Return the index of the most probable class for each row of X."""
        return np.argmax(self._probabilities(X), axis=1)

### Hyperparameter Selection Step - Commented Out ###
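# The random search below is disabled (wrapped in a string literal) because the best hyperparameters have already been determined.
# To re-enable it, add `import random` and define `train_and_evaluate(params)`; neither appears in the visible part of this file.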

"""
# Random Search for Hyperparameter Tuning
def random_search(param_dist, n_iter=10):
    best_accuracy = 0
    best_params = None
    
    for _ in range(n_iter):
        params = {k: random.choice(v) for k, v in param_dist.items()}
        accuracy = train_and_evaluate(params)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = params
    
    return best_params, best_accuracy

# Random Search hyperparameter ranges
param_dist = {
    'learning_rate': [0.1, 0.01, 0.001, 0.0001],
    'regularization': [0.1, 0.01, 0.001],
    'epochs': [500, 1000, 1500]
}

# Perform random search with 10 iterations
best_params_random, best_accuracy_random = random_search(param_dist, n_iter=10)
print(f"Best Hyperparameters from Random Search: {best_params_random}")
print(f"Best Validation Accuracy from Random Search: {best_accuracy_random}")

# Save the best hyperparameters to reuse them later without hyperparameter tuning
best_hyperparameters = best_params_random
"""

### K-Fold Cross-Validation ###
def cross_validate_model(X, y, num_folds=5, learning_rate=0.1, regularization=0.001, epochs=1000):
    """Estimate accuracy with shuffled K-fold cross-validation.

    A fresh ``LogisticRegression`` is trained on each fold's training rows and
    scored on its held-out rows. Fold assignment is fixed by
    ``random_state=42``; the model's weight initialisation is not seeded here.

    Args:
        X: 2-D feature matrix, one row per sample.
        y: One-hot label matrix with the same number of rows as ``X``.
        num_folds: Number of folds.
        learning_rate: Step size passed to each fold's model.
        regularization: L2 coefficient passed to each fold's model.
        epochs: Number of gradient-descent epochs per fold.

    Returns:
        The mean validation accuracy across the folds.
    """
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    fold_accuracies = []

    # Class ids are the same for every fold, so decode the one-hot rows once.
    true_labels = np.argmax(y, axis=1)

    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train = y[train_index]

        # Train the model
        model = LogisticRegression(
            input_size=X_train.shape[1],
            num_classes=y_train.shape[1],
            learning_rate=learning_rate,
            regularization=regularization,
        )
        model.train(X_train, y_train, epochs=epochs)

        # Validate the model
        y_val_pred = model.predict(X_val)
        val_accuracy = np.mean(true_labels[val_index] == y_val_pred)
        fold_accuracies.append(val_accuracy)
        print(f"Validation Accuracy for fold: {val_accuracy}")

    avg_accuracy = np.mean(fold_accuracies)
    print(f"Average Cross-Validation Accuracy: {avg_accuracy}")
    return avg_accuracy

# Perform cross-validation with 5 folds
# NOTE(review): this runs over all of X_normalized, which includes the rows
# held out as X_test, so the CV score is not independent of the test set.
cross_validation_accuracy = cross_validate_model(X_normalized, y_one_hot, num_folds=5)

### Final Model Training with the Best Hyperparameters ###
# Use the best hyperparameters directly.
# The final model is trained on X_train_full (training + validation rows).
final_model = LogisticRegression(input_size=X_train_full.shape[1], num_classes=y_train_full.shape[1], learning_rate=0.1, regularization=0.001)
final_model.train(X_train_full, y_train_full, epochs=1000)

# Test the final model: decode the one-hot test labels to class ids and
# compare them with the predicted class ids.
y_test_pred = final_model.predict(X_test)
final_accuracy = np.mean(np.argmax(y_test, axis=1) == y_test_pred)
print(f"Final Test Accuracy: {final_accuracy}")


Epoch 0, Loss: 1.6105440751826163
Epoch 100, Loss: 1.0228165208175526
Epoch 200, Loss: 0.980322380079138
Epoch 300, Loss: 0.9681703572995907
Epoch 400, Loss: 0.9631022070100799
Epoch 500, Loss: 0.9605844746694882
Epoch 600, Loss: 0.9591927284396055
Epoch 700, Loss: 0.9583637319736434
Epoch 800, Loss: 0.9578413627332292
Epoch 900, Loss: 0.957497301812286
Validation Accuracy for fold: 0.6112280701754386
Epoch 0, Loss: 1.613511604226043
Epoch 100, Loss: 1.0182859052352555
Epoch 200, Loss: 0.9749652276755921
Epoch 300, Loss: 0.9623647473011693
Epoch 400, Loss: 0.957025567325256
Epoch 500, Loss: 0.9543380534983229
Epoch 600, Loss: 0.9528367406568513
Epoch 700, Loss: 0.9519348457247151
Epoch 800, Loss: 0.9513624300794101
Epoch 900, Loss: 0.9509829269611436
Validation Accuracy for fold: 0.5992982456140351
Epoch 0, Loss: 1.6101373401164965
Epoch 100, Loss: 1.015201390729419
Epoch 200, Loss: 0.9722600903119383
Epoch 300, Loss: 0.9598866630287781
Epoch 400, Loss: 0.9546685842129263
Epoch 500, Lo